160 files changed, 16674 insertions, 9171 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 6333a0fe87..f467d6a64b 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -52,13 +52,13 @@ Includes some patches in the `patches` folder which have been sent upstream.
 
 ## cvtt
 
-- Upstream: https://github.com/elasota/cvtt
-- Version: 1.0.0-beta4 (cc8472a04ba110fe999c686d07af40f7839051fd, 2018)
+- Upstream: https://github.com/elasota/ConvectionKernels
+- Version: git (dc2dbbe0ae2cf2be06ef56d1021e2222a56c7fe2, 2021)
 - License: MIT
 
 Files extracted from upstream source:
 
-- all .cpp, .h, and .txt files in ConvectionKernels/
+- all .cpp, .h, and .txt files, except those in the `MakeTables` and `etc2packer` folders.
 
 
 ## doctest
@@ -206,7 +206,7 @@ Files extracted from upstream source:
 ## harfbuzz
 
 - Upstream: https://github.com/harfbuzz/harfbuzz
-- Version: 3.2.0 (be91d2917d9860326cb5fd1d03ffe1042a72f6d3, 2021)
+- Version: 3.3.2 (ac46c3248e8b0316235943175c4d4a11c24dd4a9, 2022)
 - License: MIT
 
 Files extracted from upstream source:
@@ -309,7 +309,7 @@ Files extracted from upstream source:
 ## libwebp
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 1.2.1 (9ce5843dbabcfd3f7c39ec7ceba9cbeb213cbfdf, 2021)
+- Version: 1.2.2 (b0a860891dcd4c0c2d7c6149e5cccb6eb881cc21, 2022)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
@@ -317,10 +317,6 @@ Files extracted from upstream source:
 - `src/*` except from: `.am`, `.rc` and `.in` files
 - `AUTHORS`, `COPYING`, `PATENTS`
 
-Important: The files `utils/bit_reader_utils.{c,h}` have Godot-made
-changes to ensure they build for Javascript/HTML5. Those
-changes are marked with `// -- GODOT --` comments.
-
 
 ## mbedtls
 
@@ -469,7 +465,7 @@ Collection of single-file libraries used in Godot components.
 ## msdfgen
 
 - Upstream: https://github.com/Chlumsky/msdfgen
-- Version: 1.9.1 (1b3b6b985094e6f12751177490add3ad11dd91a9, 2010)
+- Version: 1.9.2 (64a91eec3ca3787e6f78b4c99fcd3052ad3e37c0, 2021)
 - License: MIT
 
 Files extracted from the upstream source:
@@ -606,7 +602,7 @@ instead of `miniz.h` as an external dependency.
 ## thorvg
 
 - Upstream: https://github.com/Samsung/thorvg
-- Version: 0.7.0 (e527f565b770f0a41df821e6618ccaeea94f465e, 2021)
+- Version: 0.7.1 (d53eb2a880002cb770ace1c1ace9c5dfcfc28252, 2022)
 - License: MIT
 
 Files extracted from upstream source:
@@ -614,8 +610,6 @@ Files extracted from upstream source:
 See `thorvg/update-thorvg.sh` for extraction instructions. Set the version
 number and run the script.
 
-Patches in the `patches` directory should be re-applied after updates.
-
 
 ## vhacd
 
diff --git a/thirdparty/cvtt/ConvectionKernels.cpp b/thirdparty/cvtt/ConvectionKernels.cpp
deleted file mode 100644
index 8d379344e1..0000000000
--- a/thirdparty/cvtt/ConvectionKernels.cpp
+++ /dev/null
@@ -1,7586 +0,0 @@
-/*
-Convection Texture Tools
-Copyright (c) 2018 Eric Lasota
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject
-to the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
--------------------------------------------------------------------------------------
-
-Portions based on DirectX Texture Library (DirectXTex)
-
-Copyright (c) Microsoft Corporation. All rights reserved.
-Licensed under the MIT License.
-
-http://go.microsoft.com/fwlink/?LinkId=248926
-*/
-#include "ConvectionKernels.h"
-#include "ConvectionKernels_BC7_SingleColor.h"
-
-#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__)
-#define CVTT_USE_SSE2
-#endif
-
-#ifdef CVTT_USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#include <float.h>
-#include <assert.h>
-#include <string.h>
-#include <algorithm>
-#include <math.h>
-
-#define UNREFERENCED_PARAMETER(n) ((void)n)
-
-namespace cvtt
-{
-#ifdef CVTT_USE_SSE2
-    // SSE2 version
-    struct ParallelMath
-    {
-        typedef uint16_t ScalarUInt16;
-        typedef int16_t ScalarSInt16;
-
-        template<unsigned int TRoundingMode>
-        struct RoundForScope
-        {
-            unsigned int m_oldCSR;
-
-            RoundForScope()
-            {
-                m_oldCSR = _mm_getcsr();
-                _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));
-            }
-
-            ~RoundForScope()
-            {
-                _mm_setcsr(m_oldCSR);
-            }
-        };
-
-        struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
-        {
-        };
-
-        struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
-        {
-        };
-
-        struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
-        {
-        };
-
-        struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
-        {
-        };
-
-        static const int ParallelSize = 8;
-
-        enum Int16Subtype
-        {
-            IntSubtype_Signed,
-            IntSubtype_UnsignedFull,
-            IntSubtype_UnsignedTruncated,
-            IntSubtype_Abstract,
-        };
-
-        template<int TSubtype>
-        struct VInt16
-        {
-            __m128i m_value;
-
-            inline VInt16 operator+(int16_t other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
-                return result;
-            }
-
-            inline VInt16 operator+(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_add_epi16(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator|(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_or_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator&(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_and_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator-(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_sub_epi16(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator<<(int bits) const
-            {
-                VInt16 result;
-                result.m_value = _mm_slli_epi16(m_value, bits);
-                return result;
-            }
-        };
-
-        typedef VInt16<IntSubtype_Signed> SInt16;
-        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
-        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
-        typedef VInt16<IntSubtype_Abstract> AInt16;
-
-        template<int TSubtype>
-        struct VInt32
-        {
-            __m128i m_values[2];
-
-            inline VInt32 operator+(const VInt32& other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline VInt32 operator-(const VInt32& other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline VInt32 operator<<(const int other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
-                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
-                return result;
-            }
-        };
-
-        typedef VInt32<IntSubtype_Signed> SInt32;
-        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
-        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
-        typedef VInt32<IntSubtype_Abstract> AInt32;
-
-        template<class TTargetType>
-        struct LosslessCast
-        {
-#ifdef CVTT_PERMIT_ALIASING
-            template<int TSrcSubtype>
-            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
-            {
-                return reinterpret_cast<VInt32<TSubtype>&>(src);
-            }
-
-            template<int TSrcSubtype>
-            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
-            {
-                return reinterpret_cast<VInt16<TSubtype>&>(src);
-            }
-#else
-            template<int TSrcSubtype>
-            static TTargetType Cast(const VInt32<TSrcSubtype> &src)
-            {
-                TTargetType result;
-                result.m_values[0] = src.m_values[0];
-                result.m_values[1] = src.m_values[1];
-                return result;
-            }
-
-            template<int TSrcSubtype>
-            static TTargetType Cast(const VInt16<TSrcSubtype> &src)
-            {
-                TTargetType result;
-                result.m_value = src.m_value;
-                return result;
-            }
-#endif
-        };
-
-        struct Int64
-        {
-            __m128i m_values[4];
-        };
-
-        struct Float
-        {
-            __m128 m_values[2];
-
-            inline Float operator+(const Float &other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator+(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-
-            inline Float operator-(const Float& other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator-() const
-            {
-                Float result;
-                result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
-                result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
-                return result;
-            }
-
-            inline Float operator*(const Float& other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator*(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-
-            inline Float operator/(const Float &other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator/(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-        };
-
-        struct Int16CompFlag
-        {
-            __m128i m_value;
-
-            inline Int16CompFlag operator&(const Int16CompFlag &other) const
-            {
-                Int16CompFlag result;
-                result.m_value = _mm_and_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline Int16CompFlag operator|(const Int16CompFlag &other) const
-            {
-                Int16CompFlag result;
-                result.m_value = _mm_or_si128(m_value, other.m_value);
-                return result;
-            }
-        };
-
-        struct FloatCompFlag
-        {
-            __m128 m_values[2];
-        };
-
-        template<int TSubtype>
-        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_add_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_and_si128(flag.m_value, a.m_value);
-            return result;
-        }
-
-        template<int TSubtype>
-        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
-        {
-            dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
-        }
-
-        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
-        {
-            SInt16 result;
-            result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
-            return result;
-        }
-
-        template<int TSubtype>
-        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
-        {
-            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
-        }
-
-        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
-        {
-            for (int i = 0; i < 2; i++)
-                dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
-        }
-
-        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
-        {
-            for (int i = 0; i < 2; i++)
-                dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
-        }
-
-        static void MakeSafeDenominator(Float& v)
-        {
-            ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
-        }
-
-        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
-        {
-            int lostBits = 16 - precision;
-            if (lostBits == 0)
-                return v;
-
-            SInt16 result;
-            result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
-            return result;
-        }
-
-        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
-        {
-            int lostBits = 16 - precision;
-            if (lostBits == 0)
-                return v;
-
-            UInt16 result;
-            result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
-            return result;
-        }
-
-        static UInt16 Min(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i bitFlip = _mm_set1_epi16(-32768);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
-            return result;
-        }
-
-        static SInt16 Min(const SInt16 &a, const SInt16 &b)
-        {
-            SInt16 result;
-            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt15 Min(const UInt15 &a, const UInt15 &b)
-        {
-            UInt15 result;
-            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Min(const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static UInt16 Max(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i bitFlip = _mm_set1_epi16(-32768);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
-            return result;
-        }
-
-        static SInt16 Max(const SInt16 &a, const SInt16 &b)
-        {
-            SInt16 result;
-            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt15 Max(const UInt15 &a, const UInt15 &b)
-        {
-            UInt15 result;
-            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Max(const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static Float Clamp(const Float &v, float min, float max)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
-            return result;
-        }
-
-        static Float Reciprocal(const Float &v)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
-            return result;
-        }
-
-        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
-        {
-            int16_t values[8];
-            for (int i = 0; i < 8; i++)
-                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
-
-            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
-        }
-
-        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
-        {
-            int16_t values[8];
-            for (int i = 0; i < 8; i++)
-                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
-
-            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
-        }
-
-        static Float MakeFloat(float v)
-        {
-            Float f;
-            f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);
-            return f;
-        }
-
-        static Float MakeFloatZero()
-        {
-            Float f;
-            f.m_values[0] = f.m_values[1] = _mm_setzero_ps();
-            return f;
-        }
-
-        static UInt16 MakeUInt16(uint16_t v)
-        {
-            UInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static SInt16 MakeSInt16(int16_t v)
-        {
-            SInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static AInt16 MakeAInt16(int16_t v)
-        {
-            AInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static UInt15 MakeUInt15(uint16_t v)
-        {
-            UInt15 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static SInt32 MakeSInt32(int32_t v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_set1_epi32(v);
-            result.m_values[1] = _mm_set1_epi32(v);
-            return result;
-        }
-
-        static UInt31 MakeUInt31(uint32_t v)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_set1_epi32(v);
-            result.m_values[1] = _mm_set1_epi32(v);
-            return result;
-        }
-
-        static uint16_t Extract(const UInt16 &v, int offset)
-        {
-            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
-        }
-
-        static int16_t Extract(const SInt16 &v, int offset)
-        {
-            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
-        }
-
-        static uint16_t Extract(const UInt15 &v, int offset)
-        {
-            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
-        }
-
-        static int16_t Extract(const AInt16 &v, int offset)
-        {
-            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
-        }
-
-        static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
-        {
-            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
-        }
-
-        static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
-        {
-            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
-        }
-
-        static void PutSInt16(SInt16 &dest, int offset, int16_t v)
-        {
-            reinterpret_cast<int16_t*>(&dest)[offset] = v;
-        }
-
-        static float ExtractFloat(const Float& v, int offset)
-        {
-            return reinterpret_cast<const float*>(&v)[offset];
-        }
-
-        static void PutFloat(Float &dest, int offset, float v)
-        {
-            reinterpret_cast<float*>(&dest)[offset] = v;
-        }
-
-        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static FloatCompFlag Less(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        template<int TSubtype>
-        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static FloatCompFlag Equal(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static Float ToFloat(const UInt16 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
-            return result;
-        }
-
-        static UInt31 ToUInt31(const UInt16 &v)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
-            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
-            return result;
-        }
-
-        static SInt32 ToInt32(const UInt16 &v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
-            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
-            return result;
-        }
-
-        static SInt32 ToInt32(const SInt16 &v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
-            result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
-            return result;
-        }
-
-        static Float ToFloat(const SInt16 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
-            return result;
-        }
-
-        static Float ToFloat(const UInt15 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
-            return result;
-        }
-
-        static Float ToFloat(const UInt31 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);
-            result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);
-            return result;
-        }
-
-        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
-        {
-            __m128i lo = _mm_castps_si128(v.m_values[0]);
-            __m128i hi = _mm_castps_si128(v.m_values[1]);
-
-            Int16CompFlag result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
-        {
-            __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
-            __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
-
-            FloatCompFlag result;
-            result.m_values[0] = _mm_castsi128_ps(lo);
-            result.m_values[1] = _mm_castsi128_ps(hi);
-            return result;
-        }
-
-        static Int16CompFlag MakeBoolInt16(bool b)
-        {
-            Int16CompFlag result;
-            if (b)
-                result.m_value = _mm_set1_epi16(-1);
-            else
-                result.m_value = _mm_setzero_si128();
-            return result;
-        }
-
-        static FloatCompFlag MakeBoolFloat(bool b)
-        {
-            FloatCompFlag result;
-            if (b)
-                result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
-            else
-                result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
-            return result;
-        }
-
-        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
-            return result;
-        }
-
-        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
-            __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));
-            return result;
-        }
-
-        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
-            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            UInt15 result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
-            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            SInt16 result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static Float Sqrt(const Float &f)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
-            return result;
-        }
-
-        static UInt16 Abs(const SInt16 &a)
-        {
-            __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
-            __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
-
-            UInt16 result;
-            result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
-            return result;
-        }
-
-        static Float Abs(const Float& a)
-        {
-            __m128 invMask = _mm_set1_ps(-0.0f);
-
-            Float result;
-            result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
-            result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
-            return result;
-        }
-
-        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
-        {
-            __m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
-
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(diff, diff);
-            return result;
-        }
-
-        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
-        {
-            __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
-
-            __m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
-            __m128i mulLo = _mm_mullo_epi16(diffU, diffU);
-            __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
-            __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
-
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);
-            result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
-
-            return result;
-        }
-
-        static Float TwosCLHalfToFloat(const SInt16 &v)
-        {
-            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
-
-            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
-            __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
-            __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
-
-            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
-
-            // Convert exponent to high-bits 
-            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
-
-            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
-
-            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
-            __m128i lowBits = _mm_slli_epi16(mantissa, 13);
-
-            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
-            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
-
-            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
-            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
-
-            Float result;
-            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
-            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
-
-            return result;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-            Float fb = TwosCLHalfToFloat(b);
-
-            Float diff = fa - fb;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a) * aWeight;
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static UInt16 RightShift(const UInt16 &v, int bits)
-        {
-            UInt16 result;
-            result.m_value = _mm_srli_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static UInt31 RightShift(const UInt31 &v, int bits)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);
-            result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);
-            return result;
-        }
-
-        static SInt16 RightShift(const SInt16 &v, int bits)
-        {
-            SInt16 result;
-            result.m_value = _mm_srai_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static UInt15 RightShift(const UInt15 &v, int bits)
-        {
-            UInt15 result;
-            result.m_value = _mm_srli_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static SInt32 RightShift(const SInt32 &v, int bits)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);
-            result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);
-            return result;
-        }
-
-        static SInt16 ToSInt16(const SInt32 &v)
-        {
-            SInt16 result;
-            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
-            return result;
-        }
-
-        static UInt16 ToUInt16(const UInt32 &v)
-        {
-            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
-            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
-
-            UInt16 result;
-            result.m_value = _mm_packs_epi32(low, high);
-            return result;
-        }
-
-        static UInt16 ToUInt16(const UInt31 &v)
-        {
-            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
-            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
-
-            UInt16 result;
-            result.m_value = _mm_packs_epi32(low, high);
-            return result;
-        }
-
-        static UInt15 ToUInt15(const UInt31 &v)
-        {
-            UInt15 result;
-            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
-            return result;
-        }
-
-        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
-        {
-            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
-        {
-            return XMultiply(b, a);
-        }
-
-        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
-        {
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
-        {
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
-        {
-            return XMultiply(b, a);
-        }
-
-        static bool AnySet(const Int16CompFlag &v)
-        {
-            return _mm_movemask_epi8(v.m_value) != 0;
-        }
-
-        static bool AllSet(const Int16CompFlag &v)
-        {
-            return _mm_movemask_epi8(v.m_value) == 0xffff;
-        }
-
-        static bool AnySet(const FloatCompFlag &v)
-        {
-            return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
-        }
-
-        static bool AllSet(const FloatCompFlag &v)
-        {
-            return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
-        }
-    };
-
-#else
-    // Scalar version
-    struct ParallelMath
-    {
-        struct RoundTowardZeroForScope
-        {
-        };
-
-        struct RoundTowardNearestForScope
-        {
-        };
-
-        struct RoundUpForScope
-        {
-        };
-
-        struct RoundDownForScope
-        {
-        };
-
-        static const int ParallelSize = 1;
-
-        enum Int16Subtype
-        {
-            IntSubtype_Signed,
-            IntSubtype_UnsignedFull,
-            IntSubtype_UnsignedTruncated,
-            IntSubtype_Abstract,
-        };
-
-        typedef int32_t SInt16;
-        typedef int32_t UInt15;
-        typedef int32_t UInt16;
-        typedef int32_t AInt16;
-
-        typedef int32_t SInt32;
-        typedef int32_t UInt31;
-        typedef int32_t UInt32;
-        typedef int32_t AInt32;
-
-        typedef int32_t ScalarUInt16;
-        typedef int32_t ScalarSInt16;
-
-        typedef float Float;
-
-        template<class TTargetType>
-        struct LosslessCast
-        {
-            static const int32_t& Cast(const int32_t &src)
-            {
-                return src;
-            }
-        };
-
-        typedef bool Int16CompFlag;
-        typedef bool FloatCompFlag;
-
-        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
-        {
-            return a + b;
-        }
-
-        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
-        {
-            return a - b;
-        }
-
-        static float Select(bool flag, float a, float b)
-        {
-            return flag ? a : b;
-        }
-
-        static int32_t Select(bool flag, int32_t a, int32_t b)
-        {
-            return flag ? a : b;
-        }
-
-        static int32_t SelectOrZero(bool flag, int32_t a)
-        {
-            return flag ? a : 0;
-        }
-
-        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
-        {
-            if (flag)
-                dest = src;
-        }
-
-        static int32_t ConditionalNegate(bool flag, int32_t v)
-        {
-            return (flag) ? -v : v;
-        }
-
-        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
-        {
-            if (!flag)
-                dest = src;
-        }
-
-        static void ConditionalSet(float& dest, bool flag, float src)
-        {
-            if (flag)
-                dest = src;
-        }
-
-        static void NotConditionalSet(float& dest, bool flag, float src)
-        {
-            if (!flag)
-                dest = src;
-        }
-
-        static void MakeSafeDenominator(float& v)
-        {
-            if (v == 0.0f)
-                v = 1.0f;
-        }
-
-        static int32_t SignedRightShift(int32_t v, int bits)
-        {
-            return v >> bits;
-        }
-
-        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
-        {
-            v = (v << (32 - precision)) & 0xffffffff;
-            return SignedRightShift(v, 32 - precision);
-        }
-
-        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
-        {
-            return v & ((1 << precision) - 1);
-        }
-
-        static int32_t Min(int32_t a, int32_t b)
-        {
-            if (a < b)
-                return a;
-            return b;
-        }
-
-        static float Min(float a, float b)
-        {
-            if (a < b)
-                return a;
-            return b;
-        }
-
-        static int32_t Max(int32_t a, int32_t b)
-        {
-            if (a > b)
-                return a;
-            return b;
-        }
-
-        static float Max(float a, float b)
-        {
-            if (a > b)
-                return a;
-            return b;
-        }
-
-        static float Abs(float a)
-        {
-            return fabsf(a);
-        }
-
-        static int32_t Abs(int32_t a)
-        {
-            if (a < 0)
-                return -a;
-            return a;
-        }
-
-        static float Clamp(float v, float min, float max)
-        {
-            if (v < min)
-                return min;
-            if (v > max)
-                return max;
-            return v;
-        }
-
-        static float Reciprocal(float v)
-        {
-            return 1.0f / v;
-        }
-
-        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
-        {
-            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
-        }
-
-        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
-        {
-            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
-        }
-
-        static float MakeFloat(float v)
-        {
-            return v;
-        }
-
-        static float MakeFloatZero()
-        {
-            return 0.0f;
-        }
-
-        static int32_t MakeUInt16(uint16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeSInt16(int16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeAInt16(int16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeUInt15(uint16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeSInt32(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeUInt31(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t Extract(int32_t v, int offset)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            return v;
-        }
-
-        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static float ExtractFloat(float v, int offset)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            return v;
-        }
-
-        static void PutFloat(float &dest, int offset, float v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static bool Less(int32_t a, int32_t b)
-        {
-            return a < b;
-        }
-
-        static bool Less(float a, float b)
-        {
-            return a < b;
-        }
-
-        static bool LessOrEqual(int32_t a, int32_t b)
-        {
-            return a < b;
-        }
-
-        static bool LessOrEqual(float a, float b)
-        {
-            return a < b;
-        }
-
-        static bool Equal(int32_t a, int32_t b)
-        {
-            return a == b;
-        }
-
-        static bool Equal(float a, float b)
-        {
-            return a == b;
-        }
-
-        static float ToFloat(int32_t v)
-        {
-            return static_cast<float>(v);
-        }
-
-        static int32_t ToUInt31(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToInt32(int32_t v)
-        {
-            return v;
-        }
-
-        static bool FloatFlagToInt16(bool v)
-        {
-            return v;
-        }
-
-        static bool Int16FlagToFloat(bool v)
-        {
-            return v;
-        }
-
-        static bool MakeBoolInt16(bool b)
-        {
-            return b;
-        }
-
-        static bool MakeBoolFloat(bool b)
-        {
-            return b;
-        }
-
-        static bool AndNot(bool a, bool b)
-        {
-            return a && !b;
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
-        {
-            UNREFERENCED_PARAMETER(rtz);
-            return static_cast<int>(v);
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
-        {
-            UNREFERENCED_PARAMETER(ru);
-            return static_cast<int>(ceilf(v));
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
-        {
-            UNREFERENCED_PARAMETER(rd);
-            return static_cast<int>(floorf(v));
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
-        {
-            UNREFERENCED_PARAMETER(rtn);
-            return static_cast<int>(floorf(v + 0.5f));
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        static float Sqrt(float f)
-        {
-            return sqrtf(f);
-        }
-
-        static int32_t SqDiffUInt8(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static int32_t SqDiffInt16(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static int32_t SqDiffSInt16(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static float TwosCLHalfToFloat(int32_t v)
-        {
-            int32_t absV = (v < 0) ? -v : v;
-
-            int32_t signBits = (absV & -32768);
-            int32_t mantissa = (absV & 0x03ff);
-            int32_t exponent = (absV & 0x7c00);
-
-            bool isDenormal = (exponent == 0);
-
-            // Convert exponent to high-bits
-            exponent = (exponent >> 3) + 14336;
-
-            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
-
-            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
-
-            float f, correction;
-            memcpy(&f, &fBits, 4);
-            memcpy(&correction, &denormalCorrection, 4);
-
-            return f - correction;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-            Float fb = TwosCLHalfToFloat(b);
-
-            Float diff = fa - fb;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a) * aWeight;
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static int32_t RightShift(int32_t v, int bits)
-        {
-            return SignedRightShift(v, bits);
-        }
-
-        static int32_t ToSInt16(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToUInt16(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToUInt15(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t XMultiply(int32_t a, int32_t b)
-        {
-            return a * b;
-        }
-
-        static int32_t CompactMultiply(int32_t a, int32_t b)
-        {
-            return a * b;
-        }
-
-        static bool AnySet(bool v)
-        {
-            return v;
-        }
-
-        static bool AllSet(bool v)
-        {
-            return v;
-        }
-    };
-
-#endif
-
-    namespace Internal
-    {
-        namespace BC7Data
-        {
-            enum AlphaMode
-            {
-                AlphaMode_Combined,
-                AlphaMode_Separate,
-                AlphaMode_None,
-            };
-
-            enum PBitMode
-            {
-                PBitMode_PerEndpoint,
-                PBitMode_PerSubset,
-                PBitMode_None
-            };
-
-            struct BC7ModeInfo
-            {
-                PBitMode m_pBitMode;
-                AlphaMode m_alphaMode;
-                int m_rgbBits;
-                int m_alphaBits;
-                int m_partitionBits;
-                int m_numSubsets;
-                int m_indexBits;
-                int m_alphaIndexBits;
-                bool m_hasIndexSelector;
-            };
-
-            BC7ModeInfo g_modes[] =
-            {
-                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
-                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
-                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
-                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)
-
-                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
-                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
-                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
-                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
-            };
-
-			const int g_weight2[] = { 0, 21, 43, 64 };
-			const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
-			const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
-
-			const int *g_weightTables[] =
-			{
-				NULL,
-				NULL,
-				g_weight2,
-				g_weight3,
-				g_weight4
-			};
-
-            struct BC6HModeInfo
-            {
-                uint16_t m_modeID;
-                bool m_partitioned;
-                bool m_transformed;
-                int m_aPrec;
-                int m_bPrec[3];
-            };
-
-            // [partitioned][precision]
-            bool g_hdrModesExistForPrecision[2][17] =
-            {
-                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
-                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
-                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
-            };
-
-            BC6HModeInfo g_hdrModes[] =
-            {
-                { 0x00, true,  true,  10,{ 5, 5, 5 } },
-                { 0x01, true,  true,  7,{ 6, 6, 6 } },
-                { 0x02, true,  true,  11,{ 5, 4, 4 } },
-                { 0x06, true,  true,  11,{ 4, 5, 4 } },
-                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
-                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
-                { 0x12, true,  true,  8,{ 6, 5, 5 } },
-                { 0x16, true,  true,  8,{ 5, 6, 5 } },
-                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
-                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
-                { 0x03, false, false, 10,{ 10, 10, 10 } },
-                { 0x07, false, true,  11,{ 9, 9, 9 } },
-                { 0x0b, false, true,  12,{ 8, 8, 8 } },
-                { 0x0f, false, true,  16,{ 4, 4, 4 } },
-            };
-
-            const int g_maxHDRPrecision = 16;
-
-            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);
-
-            static uint16_t g_partitionMap[64] =
-            {
-                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
-                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
-                0xC800, 0xFFEC, 0xFE80, 0xE800,
-                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
-                0xF710, 0x008E, 0x7100, 0x08CE,
-                0x008C, 0x7310, 0x3100, 0x8CCE,
-                0x088C, 0x3110, 0x6666, 0x366C,
-                0x17E8, 0x0FF0, 0x718E, 0x399C,
-                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
-                0x3c3c, 0x55aa, 0x9696, 0xa55a,
-                0x73ce, 0x13c8, 0x324c, 0x3bdc,
-                0x6996, 0xc33c, 0x9966, 0x660,
-                0x272, 0x4e4, 0x4e40, 0x2720,
-                0xc936, 0x936c, 0x39c6, 0x639c,
-                0x9336, 0x9cc6, 0x817e, 0xe718,
-                0xccf0, 0xfcc, 0x7744, 0xee22,
-            };
-
-            static uint32_t g_partitionMap2[64] =
-            {
-                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
-                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
-                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
-                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
-                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
-                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
-                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
-                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
-                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
-                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
-                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
-                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
-                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
-                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
-                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
-                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
-            };
-
-            static int g_fixupIndexes2[64] =
-            {
-                15,15,15,15,
-                15,15,15,15,
-                15,15,15,15,
-                15,15,15,15,
-                15, 2, 8, 2,
-                2, 8, 8,15,
-                2, 8, 2, 2,
-                8, 8, 2, 2,
-
-                15,15, 6, 8,
-                2, 8,15,15,
-                2, 8, 2, 2,
-                2,15,15, 6,
-                6, 2, 6, 8,
-                15,15, 2, 2,
-                15,15,15,15,
-                15, 2, 2,15,
-            };
-
-            static int g_fixupIndexes3[64][2] =
-            {
-                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
-                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
-                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
-                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
-                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
-                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
-                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
-                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
-
-                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
-                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
-                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
-                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
-                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
-                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
-                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
-                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
-            };
-
-            static const unsigned char g_fragments[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
-                0, 1, 2, 3,  // 16, 4
-                0, 1, 4,  // 20, 3
-                0, 1, 2, 4,  // 23, 4
-                2, 3, 7,  // 27, 3
-                1, 2, 3, 7,  // 30, 4
-                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
-                0, 1, 4, 8,  // 42, 4
-                0, 1, 2, 4, 5, 8,  // 46, 6
-                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
-                1, 4, 5, 6, 9,  // 60, 5
-                2, 5, 6, 7, 10,  // 65, 5
-                5, 6, 9, 10,  // 70, 4
-                2, 3, 7, 11,  // 74, 4
-                1, 2, 3, 6, 7, 11,  // 78, 6
-                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
-                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
-                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
-                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
-                0, 4, 8, 12,  // 128, 4
-                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
-                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
-                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
-                3, 6, 7, 8, 9, 12,  // 158, 6
-                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
-                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
-                5, 8, 9, 10, 13,  // 192, 5
-                8, 12, 13,  // 197, 3
-                4, 8, 12, 13,  // 200, 4
-                2, 3, 6, 9, 12, 13,  // 204, 6
-                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
-                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
-                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
-                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
-                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
-                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
-                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
-                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
-                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
-                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
-                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
-                6, 9, 10, 11, 14,  // 317, 5
-                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
-                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
-                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
-                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
-                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
-                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
-                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
-                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
-                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
-                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
-                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
-                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
-                8, 12, 13, 14,  // 418, 4
-                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
-                4, 8, 9, 12, 13, 14,  // 430, 6
-                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
-                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
-                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
-                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
-                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
-                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
-                3, 7, 11, 15,  // 490, 4
-                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
-                0, 4, 5, 10, 11, 15,  // 502, 6
-                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
-                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
-                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
-                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
-                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
-                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
-                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
-                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
-                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
-                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
-                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
-                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
-                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
-                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
-                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
-                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
-                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
-                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
-                0, 1, 5, 10, 14, 15,  // 669, 6
-                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
-                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
-                11, 14, 15,  // 691, 3
-                7, 11, 14, 15,  // 694, 4
-                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
-                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
-                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
-                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
-                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
-                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
-                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
-                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
-                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
-                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
-                11, 13, 14, 15,  // 792, 4
-                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
-                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
-                7, 10, 11, 13, 14, 15,  // 814, 6
-                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
-                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
-                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
-                12, 13, 14, 15,  // 848, 4
-                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
-                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
-                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
-                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
-                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
-                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
-                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
-                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
-                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
-                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
-                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
-                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
-                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
-                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
-                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
-                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
-                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
-                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
-                0, 2,  // 1040, 2
-                1, 3,  // 1042, 2
-                0, 1, 4, 5,  // 1044, 4
-                0, 1, 2, 4, 5,  // 1048, 5
-                2, 3, 6,  // 1053, 3
-                0, 2, 4, 6,  // 1056, 4
-                1, 2, 5, 6,  // 1060, 4
-                0, 1, 2, 3, 5, 6,  // 1064, 6
-                0, 1, 2, 4, 5, 6,  // 1070, 6
-                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
-                0, 3, 4, 7,  // 1083, 4
-                0, 1, 2, 3, 4, 7,  // 1087, 6
-                1, 3, 5, 7,  // 1093, 4
-                2, 3, 6, 7,  // 1097, 4
-                1, 2, 3, 6, 7,  // 1101, 5
-                1, 2, 3, 5, 6, 7,  // 1106, 6
-                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
-                4, 5, 6, 7,  // 1119, 4
-                0, 8,  // 1123, 2
-                0, 1, 4, 5, 8,  // 1125, 5
-                0, 1, 8, 9,  // 1130, 4
-                4, 5, 8, 9,  // 1134, 4
-                0, 1, 4, 5, 8, 9,  // 1138, 6
-                2, 6, 8, 9,  // 1144, 4
-                6, 7, 8, 9,  // 1148, 4
-                0, 2, 4, 6, 8, 10,  // 1152, 6
-                1, 2, 5, 6, 9, 10,  // 1158, 6
-                0, 3, 4, 7, 9, 10,  // 1164, 6
-                0, 1, 2, 8, 9, 10,  // 1170, 6
-                4, 5, 6, 8, 9, 10,  // 1176, 6
-                3, 11,  // 1182, 2
-                2, 3, 6, 7, 11,  // 1184, 5
-                0, 3, 8, 11,  // 1189, 4
-                0, 3, 4, 7, 8, 11,  // 1193, 6
-                1, 3, 5, 7, 9, 11,  // 1199, 6
-                2, 3, 10, 11,  // 1205, 4
-                1, 5, 10, 11,  // 1209, 4
-                4, 5, 10, 11,  // 1213, 4
-                6, 7, 10, 11,  // 1217, 4
-                2, 3, 6, 7, 10, 11,  // 1221, 6
-                1, 2, 3, 9, 10, 11,  // 1227, 6
-                5, 6, 7, 9, 10, 11,  // 1233, 6
-                8, 9, 10, 11,  // 1239, 4
-                4, 12,  // 1243, 2
-                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
-                8, 9, 12,  // 1253, 3
-                0, 4, 5, 8, 9, 12,  // 1256, 6
-                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
-                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
-                1, 5, 9, 13,  // 1276, 4
-                6, 7, 9, 13,  // 1280, 4
-                1, 4, 7, 10, 13,  // 1284, 5
-                1, 6, 8, 11, 13,  // 1289, 5
-                0, 1, 12, 13,  // 1294, 4
-                4, 5, 12, 13,  // 1298, 4
-                0, 1, 6, 7, 12, 13,  // 1302, 6
-                0, 1, 4, 8, 12, 13,  // 1308, 6
-                8, 9, 12, 13,  // 1314, 4
-                4, 8, 9, 12, 13,  // 1318, 5
-                4, 5, 8, 9, 12, 13,  // 1323, 6
-                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
-                0, 1, 6, 10, 12, 13,  // 1336, 6
-                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
-                0, 1, 10, 11, 12, 13,  // 1349, 6
-                2, 4, 7, 9, 14,  // 1355, 5
-                4, 5, 10, 14,  // 1360, 4
-                2, 6, 10, 14,  // 1364, 4
-                2, 5, 8, 11, 14,  // 1368, 5
-                0, 2, 12, 14,  // 1373, 4
-                8, 10, 12, 14,  // 1377, 4
-                4, 6, 8, 10, 12, 14,  // 1381, 6
-                13, 14,  // 1387, 2
-                9, 10, 13, 14,  // 1389, 4
-                5, 6, 9, 10, 13, 14,  // 1393, 6
-                0, 1, 2, 12, 13, 14,  // 1399, 6
-                4, 5, 6, 12, 13, 14,  // 1405, 6
-                8, 9, 12, 13, 14,  // 1411, 5
-                8, 9, 10, 12, 13, 14,  // 1416, 6
-                7, 15,  // 1422, 2
-                0, 5, 10, 15,  // 1424, 4
-                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
-                10, 11, 15,  // 1436, 3
-                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
-                3, 6, 7, 10, 11, 15,  // 1446, 6
-                12, 15,  // 1452, 2
-                0, 3, 12, 15,  // 1454, 4
-                4, 7, 12, 15,  // 1458, 4
-                0, 3, 6, 9, 12, 15,  // 1462, 6
-                0, 3, 5, 10, 12, 15,  // 1468, 6
-                8, 11, 12, 15,  // 1474, 4
-                5, 6, 8, 11, 12, 15,  // 1478, 6
-                4, 7, 8, 11, 12, 15,  // 1484, 6
-                1, 3, 13, 15,  // 1490, 4
-                9, 11, 13, 15,  // 1494, 4
-                5, 7, 9, 11, 13, 15,  // 1498, 6
-                2, 3, 14, 15,  // 1504, 4
-                2, 3, 4, 5, 14, 15,  // 1508, 6
-                6, 7, 14, 15,  // 1514, 4
-                2, 3, 5, 9, 14, 15,  // 1518, 6
-                2, 3, 8, 9, 14, 15,  // 1524, 6
-                10, 14, 15,  // 1530, 3
-                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
-                2, 3, 7, 11, 14, 15,  // 1540, 6
-                10, 11, 14, 15,  // 1546, 4
-                7, 10, 11, 14, 15,  // 1550, 5
-                6, 7, 10, 11, 14, 15,  // 1555, 6
-                1, 2, 3, 13, 14, 15,  // 1561, 6
-                5, 6, 7, 13, 14, 15,  // 1567, 6
-                10, 11, 13, 14, 15,  // 1573, 5
-                9, 10, 11, 13, 14, 15,  // 1578, 6
-                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
-                9, 10, 12, 13, 14, 15,  // 1592, 6
-                8, 11, 12, 13, 14, 15,  // 1598, 6
-                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
-            };
-            static const int g_shapeRanges[][2] =
-            {
-                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
-                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
-                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
-                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
-                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
-                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
-                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
-                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
-                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
-                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
-                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
-                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
-                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
-                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
-                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
-                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
-                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
-                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
-                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
-                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
-                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
-                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
-                { 1604, 8 },
-            };
-            static const int g_shapes1[][2] =
-            {
-                { 0, 16 }
-            };
-            static const int g_shapes2[64][2] =
-            {
-                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
-                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
-                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
-                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
-                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
-                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
-                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
-                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
-            };
-            static const int g_shapes3[64][3] =
-            {
-                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
-                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
-                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
-                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
-                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
-                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
-                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
-                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
-            };
-
-            static const int g_shapeList1[] =
-            {
-                0,
-            };
-
-            static const int g_shapeList1Collapse[] =
-            {
-                0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-            static const int g_shapeList2[] =
-            {
-                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
-                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
-                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
-                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
-                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
-                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
-                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
-                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
-                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
-                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
-                122, 123, 124, 125, 126, 127, 128,
-            };
-            static const int g_shapeList2Collapse[] =
-            {
-                -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
-                10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-                21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
-                43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
-                54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
-                65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
-                76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
-                87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
-                98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
-                109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
-                120, 121, 122, 123, 124, 125, 126, 127, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-
-            static const int g_shapeList12[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128,
-            };
-
-            static const int g_shapeList12Collapse[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-
-            static const int g_shapeList3[] =
-            {
-                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
-                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
-                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
-                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
-                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
-                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
-                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
-                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
-                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
-                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
-                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
-                235, 236, 237, 238, 239, 240, 241, 242,
-            };
-
-            static const int g_shapeList3Collapse[] =
-            {
-                -1, 0, 1, -1, 2, -1, 3, -1, 4, -1, -1,
-                -1, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1,
-                -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, -1,
-                11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 12, -1, -1, 13,
-                -1, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1,
-                16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, 17, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, 18, -1, -1, -1, -1, 19, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 20, -1, -1, 21,
-                22, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, 24, -1, -1, -1, -1, 25, 26, 27, 28,
-                29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-                40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
-                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
-                62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
-                73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
-                84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
-                95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
-                106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
-                117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-                128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
-                139,
-            };
-
-            static const int g_shapeList3Short[] =
-            {
-                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
-                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
-                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
-                233, 237, 240,
-            };
-
-            static const int g_shapeList3ShortCollapse[] =
-            {
-                -1, 0, 1, -1, 2, -1, 3, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1,
-                -1, -1, -1, -1, 8, -1, -1, -1, -1, -1, -1,
-                9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, 10, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 11, -1, -1, -1,
-                12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, 14,
-                15, -1, -1, -1, 16, -1, -1, -1, -1, -1, 17,
-                18, -1, -1, 19, -1, 20, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, 21, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, 23,
-                -1, 24, 25, -1, -1, -1, -1, -1, -1, -1, 26,
-                27, -1, -1, -1, -1, -1, -1, -1, 28, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 29, -1, -1, -1,
-                -1, -1, 30, 31, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, 32, 33, -1, -1, -1, 34, -1, -1, 35, -1,
-                -1,
-            };
-
-            static const int g_shapeListAll[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
-                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
-                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
-                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
-                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
-                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
-                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
-                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
-                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
-                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
-                242,
-            };
-
-            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
-            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
-            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
-            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
-            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
-            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
-            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);
-
-            static const int g_maxFragmentsPerMode = (g_numShapes2 > g_numShapes3) ? g_numShapes2 : g_numShapes3;
-        }
-
-        namespace BC6HData
-        {
-            enum EField
-            {
-                NA, // N/A
-                M,  // Mode
-                D,  // Shape
-                RW,
-                RX,
-                RY,
-                RZ,
-                GW,
-                GX,
-                GY,
-                GZ,
-                BW,
-                BX,
-                BY,
-                BZ,
-            };
-
-            struct ModeDescriptor
-            {
-                EField m_eField;
-                uint8_t   m_uBit;
-            };
-
-            const ModeDescriptor g_modeDescriptors[14][82] =
-            {
-                {   // Mode 1 (0x00) - 10 5 5 5
-                    { M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 2 (0x01) - 7 6 6 6
-                    { M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 3 (0x02) - 11 5 4 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 4 (0x06) - 11 4 5 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 5 (0x0a) - 11 4 4 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
-                    { BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 6 (0x0e) - 9 5 5 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 7 (0x12) - 8 6 5 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 8 (0x16) - 8 5 6 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 9 (0x1a) - 8 5 5 6
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 10 (0x1e) - 6 6 6 6
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 11 (0x03) - 10 10
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 12 (0x07) - 11 9
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 13 (0x0b) - 12 8
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 14 (0x0f) - 16 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 },
-                    { RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 },
-                    { GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 },
-                    { BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-            };
-        }
-
-        struct PackingVector
-        {
-            uint32_t m_vector[4];
-            int m_offset;
-
-            void Init()
-            {
-                for (int i = 0; i < 4; i++)
-                    m_vector[i] = 0;
-
-                m_offset = 0;
-            }
-
-            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
-            {
-                int vOffset = m_offset >> 5;
-                int bitOffset = m_offset & 0x1f;
-
-                m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);
-
-                int overflowBits = bitOffset + bits - 32;
-                if (overflowBits > 0)
-                    m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
-
-                m_offset += bits;
-            }
-
-            inline void Flush(uint8_t* output)
-            {
-                assert(m_offset == 128);
-
-                for (int v = 0; v < 4; v++)
-                {
-                    uint32_t chunk = m_vector[v];
-                    for (int b = 0; b < 4; b++)
-                        output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
-                }
-            }
-        };
-
-
-		struct UnpackingVector
-		{
-			uint32_t m_vector[4];
-
-			void Init(const uint8_t *bytes)
-			{
-				for (int i = 0; i < 4; i++)
-					m_vector[i] = 0;
-
-				for (int b = 0; b < 16; b++)
-					m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
-			}
-
-			inline ParallelMath::ScalarUInt16 Unpack(int bits)
-			{
-				uint32_t bitMask = (1 << bits) - 1;
-
-				ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
-
-				for (int i = 0; i < 4; i++)
-				{
-					m_vector[i] >>= bits;
-					if (i != 3)
-						m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
-				}
-
-				return result;
-			}
-		};
-
-        void ComputeTweakFactors(int tweak, int range, float *outFactors)
-        {
-            int totalUnits = range - 1;
-            int minOutsideUnits = ((tweak >> 1) & 1);
-            int maxOutsideUnits = (tweak & 1);
-            int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits;
-
-            outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits);
-            outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f;
-        }
-
-        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
-        {
-            if (isSigned)
-            {
-                ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
-                return (v * 32.0f + offset) / 31.0f;
-            }
-            else
-                return (v * 64.0f + 30.0f) / 31.0f;
-        }
-
-        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
-        {
-#ifdef CVTT_ENABLE_ASSERTS
-            for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                assert(ParallelMath::Extract(v, i) != -32768)
-#endif
-
-            ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
-            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
-
-            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
-            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
-            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
-            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
-
-            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
-        }
-
-        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
-        {
-            return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
-        }
-
-        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
-        {
-            for (int epi = 0; epi < 2; epi++)
-            {
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    if (isSigned)
-                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
-                    else
-                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
-                }
-            }
-        }
-
-        template<int TVectorSize>
-        class UnfinishedEndpoints
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            UnfinishedEndpoints()
-            {
-            }
-
-            UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_base[ch] = base[ch];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_offset[ch] = offset[ch];
-            }
-
-            UnfinishedEndpoints(const UnfinishedEndpoints& other)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_base[ch] = other.m_base[ch];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_offset[ch] = other.m_offset[ch];
-            }
-
-            void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MUInt15 channelEPs[2];
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f);
-                        channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode);
-                    }
-
-                    outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]);
-                    outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]);
-                }
-            }
-
-            void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
-            {
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MSInt16 channelEPs[2];
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f);
-                        channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode);
-                    }
-
-                    outEP0[ch] = channelEPs[0];
-                    outEP1[ch] = channelEPs[1];
-                }
-            }
-
-            void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
-            {
-                ParallelMath::RoundTowardNearestForScope roundingMode;
-
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f);
-                    MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f);
-                    outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode);
-                    outEP1[ch] = ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode);
-                }
-            }
-
-            template<int TNewVectorSize>
-            UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
-            {
-                MFloat newBase[TNewVectorSize];
-                MFloat newOffset[TNewVectorSize];
-
-                for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++)
-                {
-                    newBase[ch] = m_base[ch];
-                    newOffset[ch] = m_offset[ch];
-                }
-
-                MFloat fillerV = ParallelMath::MakeFloat(filler);
-
-                for (int ch = TVectorSize; ch < TNewVectorSize; ch++)
-                {
-                    newBase[ch] = fillerV;
-                    newOffset[ch] = ParallelMath::MakeFloatZero();
-                }
-
-                return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset);
-            }
-
-        private:
-            MFloat m_base[TVectorSize];
-            MFloat m_offset[TVectorSize];
-        };
-
-        template<int TMatrixSize>
-        class PackedCovarianceMatrix
-        {
-        public:
-            // 0: xx,
-            // 1: xy, yy
-            // 3: xz, yz, zz 
-            // 6: xw, yw, zw, ww
-            // ... etc.
-            static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;
-
-            typedef ParallelMath::Float MFloat;
-
-            PackedCovarianceMatrix()
-            {
-                for (int i = 0; i < PyramidSize; i++)
-                    m_values[i] = ParallelMath::MakeFloatZero();
-            }
-
-            void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight)
-            {
-                int index = 0;
-                for (int row = 0; row < TMatrixSize; row++)
-                {
-                    for (int col = 0; col <= row; col++)
-                    {
-                        m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
-                        index++;
-                    }
-                }
-            }
-
-            void Product(MFloat *outVec, const MFloat *inVec)
-            {
-                for (int row = 0; row < TMatrixSize; row++)
-                {
-                    MFloat sum = ParallelMath::MakeFloatZero();
-
-                    int index = (row * (row + 1)) >> 1;
-                    for (int col = 0; col < TMatrixSize; col++)
-                    {
-                        sum = sum + inVec[col] * m_values[index];
-                        if (col >= row)
-                            index += col + 1;
-                        else
-                            index++;
-                    }
-
-                    outVec[row] = sum;
-                }
-            }
-
-        private:
-            ParallelMath::Float m_values[PyramidSize];
-        };
-
-        static const int NumEndpointSelectorPasses = 3;
-
-        template<int TVectorSize, int TIterationCount>
-        class EndpointSelector
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-
-            EndpointSelector()
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_centroid[ch] = ParallelMath::MakeFloatZero();
-                    m_direction[ch] = ParallelMath::MakeFloatZero();
-                }
-                m_weightTotal = ParallelMath::MakeFloatZero();
-                m_minDist = ParallelMath::MakeFloat(FLT_MAX);
-                m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);
-            }
-
-            void ContributePass(const MFloat *value, int pass, const MFloat &weight)
-            {
-                if (pass == 0)
-                    ContributeCentroid(value, weight);
-                else if (pass == 1)
-                    ContributeDirection(value, weight);
-                else if (pass == 2)
-                    ContributeMinMax(value);
-            }
-
-            void FinishPass(int pass)
-            {
-                if (pass == 0)
-                    FinishCentroid();
-                else if (pass == 1)
-                    FinishDirection();
-            }
-
-            UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
-            {
-                MFloat unweightedBase[TVectorSize];
-                MFloat unweightedOffset[TVectorSize];
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
-                    MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
-
-                    float safeWeight = channelWeights[ch];
-                    if (safeWeight == 0.f)
-                        safeWeight = 1.0f;
-
-                    unweightedBase[ch] = min / channelWeights[ch];
-                    unweightedOffset[ch] = (max - min) / channelWeights[ch];
-                }
-
-                return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
-            }
-
-        private:
-            void ContributeCentroid(const MFloat *value, const MFloat &weight)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_centroid[ch] = m_centroid[ch] + value[ch] * weight;
-                m_weightTotal = m_weightTotal + weight;
-            }
-
-            void FinishCentroid()
-            {
-                MFloat denom = m_weightTotal;
-                ParallelMath::MakeSafeDenominator(denom);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_centroid[ch] = m_centroid[ch] / denom;
-            }
-
-            void ContributeDirection(const MFloat *value, const MFloat &weight)
-            {
-                MFloat diff[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    diff[ch] = value[ch] - m_centroid[ch];
-
-                m_covarianceMatrix.Add(diff, weight);
-            }
-
-            void FinishDirection()
-            {
-                MFloat approx[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    approx[ch] = ParallelMath::MakeFloat(1.0f);
-
-                for (int i = 0; i < TIterationCount; i++)
-                {
-                    MFloat product[TVectorSize];
-                    m_covarianceMatrix.Product(product, approx);
-
-                    MFloat largestComponent = product[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        largestComponent = ParallelMath::Max(largestComponent, product[ch]);
-
-                    // product = largestComponent*newApprox
-                    ParallelMath::MakeSafeDenominator(largestComponent);
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        approx[ch] = product[ch] / largestComponent;
-                }
-
-                // Normalize
-                MFloat approxLen = ParallelMath::MakeFloatZero();
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    approxLen = approxLen + approx[ch] * approx[ch];
-
-                approxLen = ParallelMath::Sqrt(approxLen);
-
-                ParallelMath::MakeSafeDenominator(approxLen);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_direction[ch] = approx[ch] / approxLen;
-            }
-
-            void ContributeMinMax(const MFloat *value)
-            {
-                MFloat dist = ParallelMath::MakeFloatZero();
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]);
-
-                m_minDist = ParallelMath::Min(m_minDist, dist);
-                m_maxDist = ParallelMath::Max(m_maxDist, dist);
-            }
-
-            ParallelMath::Float m_centroid[TVectorSize];
-            ParallelMath::Float m_direction[TVectorSize];
-            PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix;
-            ParallelMath::Float m_weightTotal;
-
-            ParallelMath::Float m_minDist;
-            ParallelMath::Float m_maxDist;
-        };
-
-        static const ParallelMath::UInt16 g_weightReciprocals[] =
-        {
-            ParallelMath::MakeUInt16(0),        // -1 
-            ParallelMath::MakeUInt16(0),        // 0
-            ParallelMath::MakeUInt16(32768),    // 1
-            ParallelMath::MakeUInt16(16384),    // 2
-            ParallelMath::MakeUInt16(10923),    // 3
-            ParallelMath::MakeUInt16(8192),     // 4
-            ParallelMath::MakeUInt16(6554),     // 5
-            ParallelMath::MakeUInt16(5461),     // 6
-            ParallelMath::MakeUInt16(4681),     // 7
-            ParallelMath::MakeUInt16(4096),     // 8
-            ParallelMath::MakeUInt16(3641),     // 9
-            ParallelMath::MakeUInt16(3277),     // 10
-            ParallelMath::MakeUInt16(2979),     // 11
-            ParallelMath::MakeUInt16(2731),     // 12
-            ParallelMath::MakeUInt16(2521),     // 13
-            ParallelMath::MakeUInt16(2341),     // 14
-            ParallelMath::MakeUInt16(2185),     // 15
-        };
-
-        template<int TVectorSize>
-        class IndexSelector
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::UInt31 MUInt31;
-
-            template<class TInterpolationEPType, class TColorEPType>
-            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
-            {
-                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
-                // We need to select indexes using the color-space endpoints.
-
-                m_isUniform = true;
-                for (int ch = 1; ch < TVectorSize; ch++)
-                {
-                    if (channelWeights[ch] != channelWeights[0])
-                        m_isUniform = false;
-                }
-
-                // To work with channel weights, we need something where:
-                // pxDiff = px - ep[0]
-                // epDiff = ep[1] - ep[0]
-                //
-                // weightedEPDiff = epDiff * channelWeights
-                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
-                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
-                // index = normalizedIndex * maxValue
-                //
-                // Equivalent to:
-                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
-                // index = dot(axis, pxDiff)
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
-
-                m_range = range;
-                m_maxValue = static_cast<float>(range - 1);
-
-                MFloat epDiffWeighted[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
-                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
-                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
-                }
-
-                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
-
-                ParallelMath::MakeSafeDenominator(lenSquared);
-
-                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
-            }
-
-            template<bool TSigned>
-            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
-            {
-                MAInt16 converted[2][TVectorSize];
-                for (int epi = 0; epi < 2; epi++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]);
-
-                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
-            }
-
-            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
-                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
-                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
-                }
-            }
-
-            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
-                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
-                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
-                }
-            }
-
-            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
-            {
-                ReconstructLDR_BC7(index, pixel, TVectorSize);
-            }
-
-            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
-            {
-                ReconstructLDRPrecise(index, pixel, TVectorSize);
-            }
-
-            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
-            {
-                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
-
-                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
-            }
-
-        protected:
-            MAInt16 m_endPoint[2][TVectorSize];
-
-        private:
-            MFloat m_origin[TVectorSize];
-            MFloat m_axis[TVectorSize];
-            int m_range;
-            float m_maxValue;
-            bool m_isUniform;
-        };
-
-
-        template<int TVectorSize>
-        class IndexSelectorHDR : public IndexSelector<TVectorSize>
-        {
-        public:
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt31 MUInt31;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::Float MFloat;
-
-        private:
-
-            MUInt15 InvertSingle(const MUInt15& anIndex) const
-            {
-                MUInt15 inverted = m_maxValueMinusOne - anIndex;
-                return ParallelMath::Select(m_isInverted, inverted, anIndex);
-            }
-
-            void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);
-                    MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
-
-                    MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
-
-                    pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6);
-
-                    pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));
-                }
-            }
-
-            void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);
-                    MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
-
-                    MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
-
-                    pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6);
-
-                    pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));
-                }
-            }
-
-            MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const
-            {
-                MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];
-                return diff * diff;
-            }
-
-            MFloat ErrorForInterpolator(int index, const MFloat *pixel) const
-            {
-                MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    error = error + ErrorForInterpolatorComponent(index, ch, pixel);
-                return error;
-            }
-
-        public:
-
-            void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights)
-            {
-                assert(range <= 16);
-
-                m_range = range;
-
-                m_isInverted = ParallelMath::MakeBoolInt16(false);
-                m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1));
-
-                if (!fastIndexing)
-                {
-                    for (int i = 0; i < range; i++)
-                    {
-                        MSInt16 recon2CL[TVectorSize];
-
-                        if (isSigned)
-                            ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
-                        else
-                            ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
-
-                        for (int ch = 0; ch < TVectorSize; ch++)
-                            m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch];
-                    }
-                }
-            }
-
-            void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const
-            {
-                ReconstructHDRSignedUninverted(InvertSingle(index), pixel);
-            }
-
-            void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const
-            {
-                ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);
-            }
-
-            void ConditionalInvert(const ParallelMath::Int16CompFlag &invert)
-            {
-                m_isInverted = invert;
-            }
-
-            MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const
-            {
-                MUInt15 index = ParallelMath::MakeUInt15(0);
-
-                MFloat bestError = ErrorForInterpolator(0, pixel);
-                for (int i = 1; i < m_range; i++)
-                {
-                    MFloat error = ErrorForInterpolator(i, pixel);
-                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                    ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
-                    bestError = ParallelMath::Min(bestError, error);
-                }
-
-                return InvertSingle(index);
-            }
-
-            MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
-            {
-                return InvertSingle(this->SelectIndexLDR(pixel, rtn));
-            }
-
-        private:
-            MFloat m_reconstructedInterpolators[16][TVectorSize];
-            ParallelMath::Int16CompFlag m_isInverted;
-            MUInt15 m_maxValueMinusOne;
-            int m_range;
-        };
-
-        // Solve for a, b where v = a*t + b
-        // This allows endpoints to be mapped to where T=0 and T=1
-        // Least squares from totals:
-        // a = (tv - t*v/w)/(tt - t*t/w)
-        // b = (v - a*t)/w
-        template<int TVectorSize>
-        class EndpointRefiner
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            MFloat m_tv[TVectorSize];
-            MFloat m_v[TVectorSize];
-            MFloat m_tt;
-            MFloat m_t;
-            MFloat m_w;
-            int m_wu;
-
-            float m_rcpMaxIndex;
-            float m_channelWeights[TVectorSize];
-            float m_rcpChannelWeights[TVectorSize];
-
-            void Init(int indexRange, const float channelWeights[TVectorSize])
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_tv[ch] = ParallelMath::MakeFloatZero();
-                    m_v[ch] = ParallelMath::MakeFloatZero();
-                }
-                m_tt = ParallelMath::MakeFloatZero();
-                m_t = ParallelMath::MakeFloatZero();
-                m_w = ParallelMath::MakeFloatZero();
-
-                m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_channelWeights[ch] = channelWeights[ch];
-                    m_rcpChannelWeights[ch] = 1.0f;
-                    if (m_channelWeights[ch] != 0.0f)
-                        m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch];
-                }
-
-                m_wu = 0;
-            }
-
-            void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
-            {
-                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat v = pwFloatPixel[ch] * weight;
-
-                    m_tv[ch] = m_tv[ch] + t * v;
-                    m_v[ch] = m_v[ch] + v;
-                }
-                m_tt = m_tt + weight * t * t;
-                m_t = m_t + weight * t;
-                m_w = m_w + weight;
-            }
-
-            void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
-            {
-                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MFloat v = pwFloatPixel[ch];
-
-                    m_tv[ch] = m_tv[ch] + t * v;
-                    m_v[ch] = m_v[ch] + v;
-                }
-                m_tt = m_tt + t * t;
-                m_t = m_t + t;
-                m_wu++;
-            }
-
-            void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
-            {
-                ContributeUnweightedPW(floatPixel, index, TVectorSize);
-            }
-
-            void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
-            {
-                // a = (tv - t*v/w)/(tt - t*t/w)
-                // b = (v - a*t)/w
-                MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu));
-
-                ParallelMath::MakeSafeDenominator(w);
-                MFloat wRcp = ParallelMath::Reciprocal(w);
-
-                MFloat adenom = (m_tt * w - m_t * m_t) * wRcp;
-
-                ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero());
-                ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    /*
-                    if (adenom == 0.0)
-                        p1 = p2 = er.v / er.w;
-                    else
-                    {
-                        float4 a = (er.tv - er.t*er.v / er.w) / adenom;
-                        float4 b = (er.v - a * er.t) / er.w;
-                        p1 = b;
-                        p2 = a + b;
-                    }
-                    */
-
-                    MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom;
-                    MFloat b = (m_v[ch] - a * m_t) * wRcp;
-
-                    MFloat p1 = b;
-                    MFloat p2 = a + b;
-
-                    ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp));
-                    ParallelMath::ConditionalSet(p2, adenomZero, p1);
-
-                    // Unweight
-                    float inverseWeight = m_rcpChannelWeights[ch];
-
-                    endPoint[0][ch] = p1 * inverseWeight;
-                    endPoint[1][ch] = p2 * inverseWeight;
-                }
-            }
-
-            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                MFloat floatEndPoint[2][TVectorSize];
-                GetRefinedEndpoints(floatEndPoint);
-
-                for (int epi = 0; epi < 2; epi++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode);
-            }
-
-            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode);
-            }
-
-            void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                MFloat floatEndPoint[2][TVectorSize];
-                GetRefinedEndpoints(floatEndPoint);
-
-                for (int epi = 0; epi < 2; epi++)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                    {
-                        MFloat f = floatEndPoint[epi][ch];
-                        if (isSigned)
-                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode));
-                        else
-                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode));
-                    }
-                }
-            }
-        };
-
-        template<int TVectorSize>
-        class AggregatedError
-        {
-        public:
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt31 MUInt31;
-            typedef ParallelMath::Float MFloat;
-
-            AggregatedError()
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
-            }
-
-            void Add(const MUInt16 &channelErrorUnweighted, int ch)
-            {
-                m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted);
-            }
-
-            MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const
-            {
-                if (flags & cvtt::Flags::Uniform)
-                {
-                    MUInt31 total = m_errorUnweighted[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        total = total + m_errorUnweighted[ch];
-                    return ParallelMath::ToFloat(total);
-                }
-                else
-                {
-                    MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
-                    return total;
-                }
-            }
-
-        private:
-            MUInt31 m_errorUnweighted[TVectorSize];
-        };
-
-        class BCCommon
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            static int TweakRoundsForRange(int range)
-            {
-                if (range == 3)
-                    return 3;
-                return 4;
-            }
-
-            template<int TVectorSize>
-            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
-            {
-                for (int ch = 0; ch < numRealChannels; ch++)
-                    aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch);
-            }
-
-            template<int TVectorSize>
-            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
-            {
-                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
-            {
-                AggregatedError<TVectorSize> aggError;
-                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError);
-                return aggError.Finalize(flags, channelWeightsSq);
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
-            {
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & Flags::Uniform)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]);
-                }
-                else
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
-                }
-
-                return error;
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
-            {
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & Flags::Uniform)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]);
-                }
-                else
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
-                }
-
-                return error;
-            }
-
-            template<int TChannelCount>
-            static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
-            {
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < TChannelCount; ch++)
-                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
-                }
-            }
-
-            template<int TChannelCount>
-            static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
-            {
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < TChannelCount; ch++)
-                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
-                }
-            }
-        };
-
-        class BC7Computer
-        {
-        public:
-            static const int MaxTweakRounds = 4;
-
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::Float MFloat;
-
-            struct WorkInfo
-            {
-                MUInt15 m_mode;
-                MFloat m_error;
-                MUInt15 m_ep[3][2][4];
-                MUInt15 m_indexes[16];
-                MUInt15 m_indexes2[16];
-
-                union
-                {
-                    MUInt15 m_partition;
-                    struct IndexSelectorAndRotation
-                    {
-                        MUInt15 m_indexSelector;
-                        MUInt15 m_rotation;
-                    } m_isr;
-                } m_u;
-            };
-
-            static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
-            {
-                ParallelMath::RoundTowardNearestForScope roundingMode;
-
-                float tf[2];
-                ComputeTweakFactors(tweak, range, tf);
-
-                MFloat base = ParallelMath::ToFloat(original[0]);
-                MFloat offs = ParallelMath::ToFloat(original[1]) - base;
-
-                result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
-                result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
-            }
-
-            static void Quantize(MUInt15* color, int bits, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                float maxColor = static_cast<float>((1 << bits) - 1);
-
-                for (int i = 0; i < channels; i++)
-                    color[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(ParallelMath::ToFloat(color[i]) * ParallelMath::MakeFloat(1.0f / 255.0f) * maxColor, 0.f, 255.f), roundingMode);
-            }
-
-            static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                uint16_t pShift = static_cast<uint16_t>(1 << (7 - bits));
-                MUInt15 pShiftV = ParallelMath::MakeUInt15(pShift);
-
-                float maxColorF = static_cast<float>(255 - (1 << (7 - bits)));
-
-                float maxQuantized = static_cast<float>((1 << bits) - 1);
-
-                for (int ch = 0; ch < channels; ch++)
-                {
-                    MUInt15 clr = color[ch];
-                    if (p)
-                        clr = ParallelMath::Max(clr, pShiftV) - pShiftV;
-
-                    MFloat rerangedColor = ParallelMath::ToFloat(clr) * maxQuantized / maxColorF;
-
-                    clr = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(rerangedColor, 0.0f, maxQuantized), roundingMode) << 1;
-                    if (p)
-                        clr = clr | ParallelMath::MakeUInt15(1);
-
-                    color[ch] = clr;
-                }
-            }
-
-            static void Unquantize(MUInt15* color, int bits, int channels)
-            {
-                for (int ch = 0; ch < channels; ch++)
-                {
-                    MUInt15 clr = color[ch];
-                    clr = clr << (8 - bits);
-                    color[ch] = clr | ParallelMath::RightShift(clr, bits);
-                }
-            }
-
-            static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 4, p[j], 3, roundingMode);
-                    Unquantize(ep[j], 5, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 6, p, 3, roundingMode);
-                    Unquantize(ep[j], 7, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints2(MUInt15 ep[2][4], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(ep[j], 5, 3, roundingMode);
-                    Unquantize(ep[j], 5, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 7, p[j], 3, roundingMode);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(epRGB[j], 5, 3, roundingMode);
-                    Unquantize(epRGB[j], 5, 3);
-
-                    Quantize(epA + j, 6, 1, roundingMode);
-                    Unquantize(epA + j, 6, 1);
-                }
-            }
-
-            static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(epRGB[j], 7, 3, roundingMode);
-                    Unquantize(epRGB[j], 7, 3);
-                }
-
-                // Alpha is full precision
-                (void)epA;
-            }
-
-            static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                    QuantizeP(ep[j], 7, p[j], 4, roundingMode);
-            }
-
-            static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 5, p[j], 4, roundingMode);
-                    Unquantize(ep[j], 6, 4);
-                }
-            }
-
-            struct SinglePlaneTemporaries
-            {
-                UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
-                UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];
-
-                MUInt15 fragmentBestIndexes[BC7Data::g_numFragments];
-                MUInt15 shapeBestEP[BC7Data::g_maxFragmentsPerMode][2][4];
-                MFloat shapeBestError[BC7Data::g_maxFragmentsPerMode];
-            };
-
-            static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
-
-                MUInt15 intAverage[4];
-                for (int ch = 0; ch < 4; ch++)
-                    intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
-
-                MUInt15 eps[2][4];
-                MUInt15 reconstructed[4];
-                MUInt15 index = ParallelMath::MakeUInt15(0);
-
-                for (int epi = 0; epi < 2; epi++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                        eps[epi][ch] = ParallelMath::MakeUInt15(0);
-                    eps[epi][3] = ParallelMath::MakeUInt15(255);
-                }
-
-                for (int ch = 0; ch < 3; ch++)
-                    reconstructed[ch] = ParallelMath::MakeUInt15(0);
-                reconstructed[3] = ParallelMath::MakeUInt15(255);
-
-                // Depending on the target index and parity bits, there are multiple valid solid colors.
-                // We want to find the one closest to the actual average.
-                MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX);
-                for (int t = 0; t < numTables; t++)
-                {
-                    const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
-
-                    ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
-
-                    MUInt15 candidateReconstructed[4];
-                    MUInt15 candidateEPs[2][4];
-
-                    for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                    {
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                        {
-                            ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
-                            assert(avgValue >= 0 && avgValue <= 255);
-
-                            const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
-
-                            ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
-                            ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
-                            ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
-                        }
-                    }
-
-                    MFloat avgError = ParallelMath::MakeFloatZero();
-                    for (int ch = 0; ch < numRealChannels; ch++)
-                    {
-                        MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
-                        avgError = avgError + delta * delta * channelWeightsSq[ch];
-                    }
-
-                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
-                    better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
-
-                    if (ParallelMath::AnySet(better))
-                    {
-                        ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
-
-                        MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
-
-                        ParallelMath::ConditionalSet(index, better, candidateIndex);
-
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                            ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
-
-                        for (int epi = 0; epi < 2; epi++)
-                            for (int ch = 0; ch < numRealChannels; ch++)
-                                ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
-                    }
-                }
-
-                AggregatedError<4> aggError;
-                for (int pxi = 0; pxi < shapeLength; pxi++)
-                {
-                    int px = fragmentStart[pxi];
-
-                    BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
-                }
-
-                MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
-
-                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
-                if (ParallelMath::AnySet(better))
-                {
-                    shapeBestError = ParallelMath::Min(shapeBestError, error);
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                            ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
-                    }
-
-                    for (int pxi = 0; pxi < shapeLength; pxi++)
-                        ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
-                }
-            }
-
-
-            static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                float channelWeightsSq[4];
-
-                for (int ch = 0; ch < 4; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                SinglePlaneTemporaries temps;
-
-                MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
-                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
-                ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
-                for (int px = 0; px < 16; px++)
-                {
-                    MUInt15 a = pixels[px][3];
-                    maxAlpha = ParallelMath::Max(maxAlpha, a);
-                    minAlpha = ParallelMath::Min(minAlpha, a);
-
-                    isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
-                }
-
-                ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
-                ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
-
-                bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
-
-                // Try RGB modes if any block has a min alpha 251 or higher
-                bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
-
-                // Try mode 7 if any block has alpha.
-                // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
-                // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
-                // situations, and only by at most 1 unit of error per pixel.
-                bool allowMode7 = anyBlockHasAlpha;
-
-                MFloat preWeightedPixels[16][4];
-
-                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
-
-                const int *rgbInitialEPCollapseList = NULL;
-
-                // Get initial RGB endpoints
-                if (allowRGBModes)
-                {
-                    const int *shapeList;
-                    int numShapesToEvaluate;
-
-                    if (flags & Flags::BC7_EnablePartitioning)
-                    {
-                        if (flags & Flags::BC7_Enable3Subsets)
-                        {
-                            shapeList = BC7Data::g_shapeListAll;
-                            rgbInitialEPCollapseList = BC7Data::g_shapeListAll;
-                            numShapesToEvaluate = BC7Data::g_numShapesAll;
-                        }
-                        else
-                        {
-                            shapeList = BC7Data::g_shapeList12;
-                            rgbInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
-                            numShapesToEvaluate = BC7Data::g_numShapes12;
-                        }
-                    }
-                    else
-                    {
-                        shapeList = BC7Data::g_shapeList1;
-                        rgbInitialEPCollapseList = BC7Data::g_shapeList1Collapse;
-                        numShapesToEvaluate = BC7Data::g_numShapes1;
-                    }
-
-                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-
-                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                        int shapeSize = BC7Data::g_shapeRanges[shape][1];
-
-                        EndpointSelector<3, 8> epSelector;
-
-                        for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                        {
-                            for (int spx = 0; spx < shapeSize; spx++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + spx];
-                                epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
-                            }
-                            epSelector.FinishPass(epPass);
-                        }
-                        temps.unfinishedRGB[shapeIter] = epSelector.GetEndpoints(channelWeights);
-                    }
-                }
-
-                const int *rgbaInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
-
-                // Get initial RGBA endpoints
-                {
-                    const int *shapeList = BC7Data::g_shapeList12;
-                    int numShapesToEvaluate = BC7Data::g_numShapes12;
-
-                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-
-                        if (anyBlockHasAlpha || !allowRGBModes)
-                        {
-                            int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                            int shapeSize = BC7Data::g_shapeRanges[shape][1];
-
-                            EndpointSelector<4, 8> epSelector;
-
-                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                            {
-                                for (int spx = 0; spx < shapeSize; spx++)
-                                {
-                                    int px = BC7Data::g_fragments[shapeStart + spx];
-                                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
-                                }
-                                epSelector.FinishPass(epPass);
-                            }
-                            temps.unfinishedRGBA[shapeIter] = epSelector.GetEndpoints(channelWeights);
-                        }
-                        else
-                        {
-                            temps.unfinishedRGBA[shapeIter] = temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].ExpandTo<4>(255);
-                        }
-                    }
-                }
-
-                for (uint16_t mode = 0; mode <= 7; mode++)
-                {
-                    if (!(flags & Flags::BC7_EnablePartitioning) && BC7Data::g_modes[mode].m_numSubsets != 1)
-                        continue;
-
-                    if (!(flags & Flags::BC7_Enable3Subsets) && BC7Data::g_modes[mode].m_numSubsets == 3)
-                        continue;
-
-                    if (mode == 4 || mode == 5)
-                        continue;
-
-                    if (mode < 4 && !allowRGBModes)
-                        continue;
-
-                    if (mode == 7 && !allowMode7)
-                        continue;
-
-                    bool isRGB = (mode < 4);
-
-                    unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
-                    int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
-                    int indexPrec = BC7Data::g_modes[mode].m_indexBits;
-
-                    int parityBitMax = 1;
-                    if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                        parityBitMax = 4;
-                    else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
-                        parityBitMax = 2;
-
-                    int numRealChannels = isRGB ? 3 : 4;
-
-                    int numShapes;
-                    const int *shapeList;
-                    const int *shapeCollapseList;
-
-                    if (numSubsets == 1)
-                    {
-                        numShapes = BC7Data::g_numShapes1;
-                        shapeList = BC7Data::g_shapeList1;
-                        shapeCollapseList = BC7Data::g_shapeList1Collapse;
-                    }
-                    else if (numSubsets == 2)
-                    {
-                        numShapes = BC7Data::g_numShapes2;
-                        shapeList = BC7Data::g_shapeList2;
-                        shapeCollapseList = BC7Data::g_shapeList2Collapse;
-                    }
-                    else
-                    {
-                        assert(numSubsets == 3);
-                        if (numPartitions == 16)
-                        {
-                            numShapes = BC7Data::g_numShapes3Short;
-                            shapeList = BC7Data::g_shapeList3Short;
-                            shapeCollapseList = BC7Data::g_shapeList3ShortCollapse;
-                        }
-                        else
-                        {
-                            assert(numPartitions == 64);
-                            numShapes = BC7Data::g_numShapes3;
-                            shapeList = BC7Data::g_shapeList3;
-                            shapeCollapseList = BC7Data::g_shapeList3Collapse;
-                        }
-                    }
-
-                    for (int slot = 0; slot < BC7Data::g_maxFragmentsPerMode; slot++)
-                        temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
-
-                    for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                        int shapeLength = BC7Data::g_shapeRanges[shape][1];
-                        int shapeCollapsedEvalIndex = shapeCollapseList[shape];
-
-                        AggregatedError<1> alphaAggError;
-                        if (isRGB && anyBlockHasAlpha)
-                        {
-                            MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
-
-                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + pxi];
-                                MUInt15 original[1] = { pixels[px][3] };
-                                BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
-                            }
-                        }
-
-                        float alphaWeightsSq[1] = { channelWeightsSq[3] };
-                        MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
-
-                        assert(shapeCollapsedEvalIndex >= 0);
-
-                        MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
-
-                        for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                        {
-                            if (isRGB)
-                            {
-                                temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
-                                tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
-                            }
-                            else
-                            {
-                                temps.unfinishedRGBA[rgbaInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
-                            }
-                        }
-
-                        ParallelMath::Int16CompFlag punchThroughInvalid[4];
-                        for (int pIter = 0; pIter < parityBitMax; pIter++)
-                        {
-                            punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
-
-                            if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
-                            {
-                                // Modes 6 and 7 have parity bits that affect alpha
-                                if (pIter == 0)
-                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
-                                else if (pIter == parityBitMax - 1)
-                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
-                                else
-                                    punchThroughInvalid[pIter] = isPunchThrough;
-                            }
-                        }
-
-                        for (int pIter = 0; pIter < parityBitMax; pIter++)
-                        {
-                            if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
-                                continue;
-
-                            bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                uint16_t p[2];
-                                p[0] = (pIter & 1);
-                                p[1] = ((pIter >> 1) & 1);
-
-                                MUInt15 ep[2][4];
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    for (int ch = 0; ch < 4; ch++)
-                                        ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
-
-                                for (int refine = 0; refine < numRefineRounds; refine++)
-                                {
-                                    switch (mode)
-                                    {
-                                    case 0:
-                                        CompressEndpoints0(ep, p, rtn);
-                                        break;
-                                    case 1:
-                                        CompressEndpoints1(ep, p[0], rtn);
-                                        break;
-                                    case 2:
-                                        CompressEndpoints2(ep, rtn);
-                                        break;
-                                    case 3:
-                                        CompressEndpoints3(ep, p, rtn);
-                                        break;
-                                    case 6:
-                                        CompressEndpoints6(ep, p, rtn);
-                                        break;
-                                    case 7:
-                                        CompressEndpoints7(ep, p, rtn);
-                                        break;
-                                    default:
-                                        assert(false);
-                                        break;
-                                    };
-
-                                    MFloat shapeError = ParallelMath::MakeFloatZero();
-
-                                    IndexSelector<4> indexSelector;
-                                    indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
-
-                                    EndpointRefiner<4> epRefiner;
-                                    epRefiner.Init(1 << indexPrec, channelWeights);
-
-                                    MUInt15 indexes[16];
-
-                                    AggregatedError<4> aggError;
-                                    for (int pxi = 0; pxi < shapeLength; pxi++)
-                                    {
-                                        int px = BC7Data::g_fragments[shapeStart + pxi];
-
-                                        MUInt15 index;
-                                        MUInt15 reconstructed[4];
-
-                                        index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
-                                        indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
-
-                                        if (flags & cvtt::Flags::BC7_FastIndexing)
-                                            BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
-                                        else
-                                        {
-                                            MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
-
-                                            MUInt15 altIndexes[2];
-                                            altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
-
-                                            for (int ii = 0; ii < 2; ii++)
-                                            {
-                                                indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
-
-                                                MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
-                                                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
-                                                error = ParallelMath::Min(error, altError);
-                                                ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
-                                            }
-
-                                            shapeError = shapeError + error;
-                                        }
-
-                                        if (refine != numRefineRounds - 1)
-                                            epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
-
-                                        indexes[pxi] = index;
-                                    }
-
-                                    if (flags & cvtt::Flags::BC7_FastIndexing)
-                                        shapeError = aggError.Finalize(flags, channelWeightsSq);
-
-                                    if (isRGB)
-                                        shapeError = shapeError + staticAlphaError;
-
-                                    ParallelMath::FloatCompFlag shapeErrorBetter;
-                                    ParallelMath::Int16CompFlag shapeErrorBetter16;
-
-                                    shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shapeCollapsedEvalIndex]);
-                                    shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
-
-                                    if (ParallelMath::AnySet(shapeErrorBetter16))
-                                    {
-                                        bool punchThroughOK = true;
-                                        if (needPunchThroughCheck)
-                                        {
-                                            shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
-                                            shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
-
-                                            if (!ParallelMath::AnySet(shapeErrorBetter16))
-                                                punchThroughOK = false;
-                                        }
-
-                                        if (punchThroughOK)
-                                        {
-                                            ParallelMath::ConditionalSet(temps.shapeBestError[shapeCollapsedEvalIndex], shapeErrorBetter, shapeError);
-                                            for (int epi = 0; epi < 2; epi++)
-                                                for (int ch = 0; ch < numRealChannels; ch++)
-                                                    ParallelMath::ConditionalSet(temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch], shapeErrorBetter16, ep[epi][ch]);
-
-                                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                                                ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
-                                        }
-                                    }
-
-                                    if (refine != numRefineRounds - 1)
-                                        epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
-                                } // refine
-                            } // tweak
-                        } // p
-
-                        if (flags & cvtt::Flags::BC7_TrySingleColor)
-                        {
-                            MUInt15 total[4];
-                            for (int ch = 0; ch < 4; ch++)
-                                total[ch] = ParallelMath::MakeUInt15(0);
-
-                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + pxi];
-                                for (int ch = 0; ch < 4; ch++)
-                                    total[ch] = total[ch] + pixels[pxi][ch];
-                            }
-
-                            MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
-                            MFloat average[4];
-                            for (int ch = 0; ch < 4; ch++)
-                                average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
-
-                            const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
-                            MFloat &shapeBestError = temps.shapeBestError[shapeCollapsedEvalIndex];
-                            MUInt15(&shapeBestEP)[2][4] = temps.shapeBestEP[shapeCollapsedEvalIndex];
-                            MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
-
-                            const cvtt::Tables::BC7SC::Table **scTables = NULL;
-                            int numSCTables = 0;
-
-                            switch (mode)
-                            {
-                            case 0:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i3,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 1:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i1,
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i2,
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i3,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i1,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i2,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i3,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 2:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode2,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 3:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode3_p0,
-                                        &cvtt::Tables::BC7SC::g_mode3_p1,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 6:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i1,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i2,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i3,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i4,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i5,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i6,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i7,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i1,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i2,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i3,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i4,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i5,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i6,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i7,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 7:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode7_p00,
-                                        &cvtt::Tables::BC7SC::g_mode7_p01,
-                                        &cvtt::Tables::BC7SC::g_mode7_p10,
-                                        &cvtt::Tables::BC7SC::g_mode7_p11,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            default:
-                                assert(false);
-                                break;
-                            }
-
-                            TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
-                        }
-                    } // shapeIter
-
-                    for (uint16_t partition = 0; partition < numPartitions; partition++)
-                    {
-                        const int *partitionShapes;
-                        if (numSubsets == 1)
-                            partitionShapes = BC7Data::g_shapes1[partition];
-                        else if (numSubsets == 2)
-                            partitionShapes = BC7Data::g_shapes2[partition];
-                        else
-                        {
-                            assert(numSubsets == 3);
-                            partitionShapes = BC7Data::g_shapes3[partition];
-                        }
-
-                        MFloat totalError = ParallelMath::MakeFloatZero();
-                        for (int subset = 0; subset < numSubsets; subset++)
-                            totalError = totalError + temps.shapeBestError[shapeCollapseList[partitionShapes[subset]]];
-
-                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
-                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                        if (ParallelMath::AnySet(errorBetter16))
-                        {
-                            for (int subset = 0; subset < numSubsets; subset++)
-                            {
-                                int shape = partitionShapes[subset];
-                                int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                                int shapeLength = BC7Data::g_shapeRanges[shape][1];
-                                int shapeCollapsedEvalIndex = shapeCollapseList[shape];
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    for (int ch = 0; ch < 4; ch++)
-                                        ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch]);
-
-                                for (int pxi = 0; pxi < shapeLength; pxi++)
-                                {
-                                    int px = BC7Data::g_fragments[shapeStart + pxi];
-                                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
-                                }
-                            }
-
-                            work.m_error = ParallelMath::Min(totalError, work.m_error);
-                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
-                            ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
-                        }
-                    }
-                }
-            }
-
-            static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
-                // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
-                // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
-                // - Separate alpha channel, then weighted RGB
-                // - Alpha+2 other channels, then the independent channel
-
-                if (!(flags & Flags::BC7_EnableDualPlane))
-                    return;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                float channelWeightsSq[4];
-                for (int ch = 0; ch < 4; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                for (uint16_t mode = 4; mode <= 5; mode++)
-                {
-                    for (uint16_t rotation = 0; rotation < 4; rotation++)
-                    {
-                        int alphaChannel = (rotation + 3) & 3;
-                        int redChannel = (rotation == 1) ? 3 : 0;
-                        int greenChannel = (rotation == 2) ? 3 : 1;
-                        int blueChannel = (rotation == 3) ? 3 : 2;
-
-                        MUInt15 rotatedRGB[16][3];
-                        MFloat floatRotatedRGB[16][3];
-
-                        for (int px = 0; px < 16; px++)
-                        {
-                            rotatedRGB[px][0] = pixels[px][redChannel];
-                            rotatedRGB[px][1] = pixels[px][greenChannel];
-                            rotatedRGB[px][2] = pixels[px][blueChannel];
-
-                            for (int ch = 0; ch < 3; ch++)
-                                floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
-                        }
-
-                        uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
-
-                        float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
-                        float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
-                        float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
-                        float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
-
-                        float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
-
-                        MFloat preWeightedRotatedRGB[16][3];
-                        BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
-
-                        for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
-                        {
-                            EndpointSelector<3, 8> rgbSelector;
-
-                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                            {
-                                for (int px = 0; px < 16; px++)
-                                    rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
-
-                                rgbSelector.FinishPass(epPass);
-                            }
-
-                            MUInt15 alphaRange[2];
-
-                            alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
-                            for (int px = 1; px < 16; px++)
-                            {
-                                alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
-                                alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
-                            }
-
-                            int rgbPrec = 0;
-                            int alphaPrec = 0;
-
-                            if (mode == 4)
-                            {
-                                rgbPrec = indexSelector ? 3 : 2;
-                                alphaPrec = indexSelector ? 2 : 3;
-                            }
-                            else
-                                rgbPrec = alphaPrec = 2;
-
-                            UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
-
-                            MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
-                            MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
-
-                            MUInt15 bestRGBIndexes[16];
-                            MUInt15 bestAlphaIndexes[16];
-                            MUInt15 bestEP[2][4];
-
-                            for (int px = 0; px < 16; px++)
-                                bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                MUInt15 rgbEP[2][3];
-                                MUInt15 alphaEP[2];
-
-                                unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
-
-                                TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
-
-                                for (int refine = 0; refine < numRefineRounds; refine++)
-                                {
-                                    if (mode == 4)
-                                        CompressEndpoints4(rgbEP, alphaEP, rtn);
-                                    else
-                                        CompressEndpoints5(rgbEP, alphaEP, rtn);
-
-
-                                    IndexSelector<1> alphaIndexSelector;
-                                    IndexSelector<3> rgbIndexSelector;
-
-                                    {
-                                        MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
-                                        alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
-                                    }
-                                    rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
-
-                                    EndpointRefiner<3> rgbRefiner;
-                                    EndpointRefiner<1> alphaRefiner;
-
-                                    rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
-                                    alphaRefiner.Init(1 << alphaPrec, uniformWeight);
-
-                                    MFloat errorRGB = ParallelMath::MakeFloatZero();
-                                    MFloat errorA = ParallelMath::MakeFloatZero();
-
-                                    MUInt15 rgbIndexes[16];
-                                    MUInt15 alphaIndexes[16];
-
-                                    AggregatedError<3> rgbAggError;
-                                    AggregatedError<1> alphaAggError;
-
-                                    for (int px = 0; px < 16; px++)
-                                    {
-                                        MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
-                                        MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
-
-                                        MUInt15 reconstructedRGB[3];
-                                        MUInt15 reconstructedAlpha[1];
-
-                                        rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
-                                        alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
-
-                                        if (flags & cvtt::Flags::BC7_FastIndexing)
-                                        {
-                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
-                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
-                                        }
-                                        else
-                                        {
-                                            AggregatedError<3> baseRGBAggError;
-                                            AggregatedError<1> baseAlphaAggError;
-
-                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
-                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
-
-                                            MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                            MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-
-                                            MUInt15 altRGBIndexes[2];
-                                            MUInt15 altAlphaIndexes[2];
-
-                                            altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
-
-                                            altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
-
-                                            for (int ii = 0; ii < 2; ii++)
-                                            {
-                                                rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
-                                                alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
-
-                                                AggregatedError<3> altRGBAggError;
-                                                AggregatedError<1> altAlphaAggError;
-
-                                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
-                                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
-
-                                                MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                                MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-
-                                                ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
-                                                ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
-
-                                                rgbError = ParallelMath::Min(altRGBError, rgbError);
-                                                alphaError = ParallelMath::Min(altAlphaError, alphaError);
-
-                                                ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
-                                                ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
-                                            }
-
-                                            errorRGB = errorRGB + rgbError;
-                                            errorA = errorA + alphaError;
-                                        }
-
-                                        if (refine != numRefineRounds - 1)
-                                        {
-                                            rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
-                                            alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
-                                        }
-
-                                        if (flags & Flags::BC7_FastIndexing)
-                                        {
-                                            errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                            errorA = rgbAggError.Finalize(flags, rotatedAlphaWeightSq);
-                                        }
-
-                                        rgbIndexes[px] = rgbIndex;
-                                        alphaIndexes[px] = alphaIndex;
-                                    }
-
-                                    ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
-                                    ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
-
-                                    ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
-                                    ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
-
-                                    if (ParallelMath::AnySet(rgbBetterInt16))
-                                    {
-                                        bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
-
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
-
-                                        for (int ep = 0; ep < 2; ep++)
-                                        {
-                                            for (int ch = 0; ch < 3; ch++)
-                                                ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
-                                        }
-                                    }
-
-                                    if (ParallelMath::AnySet(alphaBetterInt16))
-                                    {
-                                        bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
-
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
-
-                                        for (int ep = 0; ep < 2; ep++)
-                                            ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
-                                    }
-
-                                    if (refine != numRefineRounds - 1)
-                                    {
-                                        rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
-
-                                        MUInt15 alphaEPTemp[2][1];
-                                        alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
-
-                                        for (int i = 0; i < 2; i++)
-                                            alphaEP[i] = alphaEPTemp[i][0];
-                                    }
-                                }	// refine
-                            } // tweak
-
-                            MFloat combinedError = bestRGBError + bestAlphaError;
-
-                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
-                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                            work.m_error = ParallelMath::Min(combinedError, work.m_error);
-
-                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
-                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
-                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
-
-                            for (int px = 0; px < 16; px++)
-                            {
-                                ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
-                                ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
-                            }
-
-                            for (int ep = 0; ep < 2; ep++)
-                                for (int ch = 0; ch < 4; ch++)
-                                    ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
-                        }
-                    }
-                }
-            }
-
-            template<class T>
-            static void Swap(T& a, T& b)
-            {
-                T temp = a;
-                a = b;
-                b = temp;
-            }
-
-            static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds)
-            {
-                MUInt15 pixels[16][4];
-                MFloat floatPixels[16][4];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
-                }
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
-                }
-
-                WorkInfo work;
-                memset(&work, 0, sizeof(work));
-
-                work.m_error = ParallelMath::MakeFloat(FLT_MAX);
-
-                {
-                    ParallelMath::RoundTowardNearestForScope rtn;
-                    TrySinglePlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
-                    TryDualPlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    PackingVector pv;
-                    pv.Init();
-
-                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
-                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
-                    ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
-
-                    const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
-
-                    ParallelMath::ScalarUInt16 indexes[16];
-                    ParallelMath::ScalarUInt16 indexes2[16];
-                    ParallelMath::ScalarUInt16 endPoints[3][2][4];
-
-                    for (int i = 0; i < 16; i++)
-                    {
-                        indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
-                        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                            indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
-                    }
-
-                    for (int subset = 0; subset < 3; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            for (int ch = 0; ch < 4; ch++)
-                                endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
-                        }
-                    }
-
-                    int fixups[3] = { 0, 0, 0 };
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
-                        bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
-
-                        if (flipRGB)
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                                indexes[px] = highIndex - indexes[px];
-                        }
-
-                        if (flipAlpha)
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                                indexes2[px] = highIndex - indexes2[px];
-                        }
-
-                        if (indexSelector)
-                            Swap(flipRGB, flipAlpha);
-
-                        if (flipRGB)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
-                        }
-                        if (flipAlpha)
-                            Swap(endPoints[0][0][3], endPoints[0][1][3]);
-
-                    }
-                    else
-                    {
-                        if (modeInfo.m_numSubsets == 2)
-                            fixups[1] = BC7Data::g_fixupIndexes2[partition];
-                        else if (modeInfo.m_numSubsets == 3)
-                        {
-                            fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
-                            fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
-                        }
-
-                        bool flip[3] = { false, false, false };
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                            flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
-
-                        if (flip[0] || flip[1] || flip[2])
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                            {
-                                int subset = 0;
-                                if (modeInfo.m_numSubsets == 2)
-                                    subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-                                else if (modeInfo.m_numSubsets == 3)
-                                    subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
-
-                                if (flip[subset])
-                                    indexes[px] = highIndex - indexes[px];
-                            }
-
-                            int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
-                            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                            {
-                                if (flip[subset])
-                                    for (int ch = 0; ch < maxCH; ch++)
-                                        Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
-                            }
-                        }
-                    }
-
-                    pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
-
-                    if (modeInfo.m_partitionBits)
-                        pv.Pack(partition, modeInfo.m_partitionBits);
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
-                        pv.Pack(rotation, 2);
-                    }
-
-                    if (modeInfo.m_hasIndexSelector)
-                        pv.Pack(indexSelector, 1);
-
-                    // Encode RGB
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
-                                epPart >>= (8 - modeInfo.m_rgbBits);
-
-                                pv.Pack(epPart, modeInfo.m_rgbBits);
-                            }
-                        }
-                    }
-
-                    // Encode alpha
-                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
-                                epPart >>= (8 - modeInfo.m_alphaBits);
-
-                                pv.Pack(epPart, modeInfo.m_alphaBits);
-                            }
-                        }
-                    }
-
-                    // Encode parity bits
-                    if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
-                            epPart >>= (7 - modeInfo.m_rgbBits);
-                            epPart &= 1;
-
-                            pv.Pack(epPart, 1);
-                        }
-                    }
-                    else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
-                                epPart >>= (7 - modeInfo.m_rgbBits);
-                                epPart &= 1;
-
-                                pv.Pack(epPart, 1);
-                            }
-                        }
-                    }
-
-                    // Encode indexes
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int bits = modeInfo.m_indexBits;
-                        if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
-                            bits--;
-
-                        pv.Pack(indexes[px], bits);
-                    }
-
-                    // Encode secondary indexes
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        for (int px = 0; px < 16; px++)
-                        {
-                            int bits = modeInfo.m_alphaIndexBits;
-                            if (px == 0)
-                                bits--;
-
-                            pv.Pack(indexes2[px], bits);
-                        }
-                    }
-
-                    pv.Flush(packedBlocks);
-
-                    packedBlocks += 16;
-                }
-            }
-
-            static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
-            {
-                UnpackingVector pv;
-                pv.Init(packedBlock);
-
-                int mode = 8;
-                for (int i = 0; i < 8; i++)
-                {
-                    if (pv.Unpack(1) == 1)
-                    {
-                        mode = i;
-                        break;
-                    }
-                }
-
-                if (mode > 7)
-                {
-                    for (int px = 0; px < 16; px++)
-                        for (int ch = 0; ch < 4; ch++)
-                            output.m_pixels[px][ch] = 0;
-
-                    return;
-                }
-
-                const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
-
-                int partition = 0;
-                if (modeInfo.m_partitionBits)
-                    partition = pv.Unpack(modeInfo.m_partitionBits);
-
-                int rotation = 0;
-                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    rotation = pv.Unpack(2);
-
-                int indexSelector = 0;
-                if (modeInfo.m_hasIndexSelector)
-                    indexSelector = pv.Unpack(1);
-
-                // Resolve fixups
-                int fixups[3] = { 0, 0, 0 };
-
-                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
-                {
-                    if (modeInfo.m_numSubsets == 2)
-                        fixups[1] = BC7Data::g_fixupIndexes2[partition];
-                    else if (modeInfo.m_numSubsets == 3)
-                    {
-                        fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
-                        fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
-                    }
-                }
-
-                int endPoints[3][2][4];
-
-                // Decode RGB
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
-                    }
-                }
-
-                // Decode alpha
-                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
-                    }
-                }
-                else
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][3] = 255;
-                    }
-                }
-
-                int parityBits = 0;
-
-                // Decode parity bits
-                if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        int p = pv.Unpack(1);
-
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
-
-                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
-                        }
-                    }
-
-                    parityBits = 1;
-                }
-                else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            int p = pv.Unpack(1);
-
-                            for (int ch = 0; ch < 3; ch++)
-                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
-
-                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
-                        }
-                    }
-
-                    parityBits = 1;
-                }
-
-                // Fill endpoint bits
-                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                {
-                    for (int ep = 0; ep < 2; ep++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
-
-                        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                            endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
-                    }
-                }
-
-                int indexes[16];
-                int indexes2[16];
-
-                // Decode indexes
-                for (int px = 0; px < 16; px++)
-                {
-                    int bits = modeInfo.m_indexBits;
-                    if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
-                        bits--;
-
-                    indexes[px] = pv.Unpack(bits);
-                }
-
-                // Decode secondary indexes
-                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int bits = modeInfo.m_alphaIndexBits;
-                        if (px == 0)
-                            bits--;
-
-                        indexes2[px] = pv.Unpack(bits);
-                    }
-                }
-                else
-                {
-                    for (int px = 0; px < 16; px++)
-                        indexes2[px] = 0;
-                }
-
-                const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
-                const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
-
-                // Decode each pixel
-                for (int px = 0; px < 16; px++)
-                {
-                    int rgbWeight = 0;
-                    int alphaWeight = 0;
-
-                    int rgbIndex = indexes[px];
-
-                    rgbWeight = rgbWeights[indexes[px]];
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
-                        alphaWeight = rgbWeight;
-                    else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                        alphaWeight = alphaWeights[indexes2[px]];
-
-                    if (indexSelector == 1)
-                    {
-                        int temp = rgbWeight;
-                        rgbWeight = alphaWeight;
-                        alphaWeight = temp;
-                    }
-
-                    int pixel[4] = { 0, 0, 0, 255 };
-
-                    int subset = 0;
-
-                    if (modeInfo.m_numSubsets == 2)
-                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-                    else if (modeInfo.m_numSubsets == 3)
-                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
-
-                    for (int ch = 0; ch < 3; ch++)
-                        pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
-
-                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                        pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
-
-                    if (rotation != 0)
-                    {
-                        int ch = rotation - 1;
-                        int temp = pixel[ch];
-                        pixel[ch] = pixel[3];
-                        pixel[3] = temp;
-                    }
-
-                    for (int ch = 0; ch < 4; ch++)
-                        output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
-                }
-            }
-        };
-
-        class BC6HComputer
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::UInt31 MUInt31;
-
-            static const int MaxTweakRounds = 4;
-            static const int MaxRefineRounds = 3;
-
-            static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
-            {
-                assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
-                assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
-
-                // Expand to full range
-                ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
-                MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
-
-                absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);
-
-                MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);
-
-                return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
-            }
-
-            static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
-            {
-                MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
-                return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
-            }
-
-            static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
-            {
-                MSInt16 zero = ParallelMath::MakeSInt16(0);
-
-                ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
-                MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
-
-                MSInt16 unq;
-                MUInt15 absUnq;
-
-                if (precision >= 16)
-                {
-                    unq = comp;
-                    absUnq = absComp;
-                }
-                else
-                {
-                    MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
-                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
-                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
-
-                    absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
-                    ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
-                    ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
-
-                    unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
-                }
-
-                outUnquantized = unq;
-
-                MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
-
-                outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
-            }
-
-            static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
-            {
-                MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
-                if (precision < 15)
-                {
-                    MUInt15 zero = ParallelMath::MakeUInt15(0);
-                    MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
-
-                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
-                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
-
-                    unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
-
-                    ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
-                    ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
-                }
-
-                outUnquantized = unq;
-                outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
-            }
-
-            static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MSInt16 unquantizedEP[2][3];
-                MSInt16 finishedUnquantizedEP[2][3];
-
-                {
-                    ParallelMath::RoundUpForScope ru;
-
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
-                            UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
-                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
-                        }
-                    }
-                }
-
-                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
-                indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
-
-                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
-
-                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
-
-                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
-
-                if (ParallelMath::AnySet(invert))
-                {
-                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
-
-                    indexSelector.ConditionalInvert(invert);
-
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MAInt16 firstEP = quantizedEndPoints[0][ch];
-                        MAInt16 secondEP = quantizedEndPoints[1][ch];
-
-                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
-                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
-                    }
-                }
-
-                indexes[fixupIndex] = index;
-            }
-
-            static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MUInt16 unquantizedEP[2][3];
-                MUInt16 finishedUnquantizedEP[2][3];
-
-                {
-                    ParallelMath::RoundUpForScope ru;
-
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
-                            UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
-                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
-                        }
-                    }
-                }
-
-                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
-                indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
-
-                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
-
-                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
-
-                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
-
-                if (ParallelMath::AnySet(invert))
-                {
-                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
-
-                    indexSelector.ConditionalInvert(invert);
-
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MAInt16 firstEP = quantizedEndPoints[0][ch];
-                        MAInt16 secondEP = quantizedEndPoints[1][ch];
-
-                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
-                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
-                    }
-                }
-
-                indexes[fixupIndex] = index;
-            }
-
-            static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
-            {
-                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
-
-                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
-
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    outEncodedEPs[0][0][ch] = ep0[0][ch];
-                    outEncodedEPs[0][1][ch] = ep0[1][ch];
-                    outEncodedEPs[1][0][ch] = ep1[0][ch];
-                    outEncodedEPs[1][1][ch] = ep1[1][ch];
-
-                    if (isTransformed)
-                    {
-                        for (int subset = 0; subset < 2; subset++)
-                        {
-                            for (int epi = 0; epi < 2; epi++)
-                            {
-                                if (epi == 0 && subset == 0)
-                                    continue;
-
-                                MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);
-
-                                MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);
-
-                                outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
-
-                                MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
-                                allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
-                            }
-                        }
-                    }
-
-                    if (!ParallelMath::AnySet(allLegal))
-                        break;
-                }
-
-                outIsLegal = allLegal;
-            }
-
-            static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
-            {
-                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
-
-                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
-
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    outEncodedEPs[0][ch] = ep[0][ch];
-                    outEncodedEPs[1][ch] = ep[1][ch];
-
-                    if (isTransformed)
-                    {
-                        MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);
-
-                        MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);
-
-                        outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
-
-                        MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
-                        allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
-                    }
-                }
-
-                outIsLegal = allLegal;
-            }
-
-            static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
-            {
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-                else if (numRefineRounds > MaxRefineRounds)
-                    numRefineRounds = MaxRefineRounds;
-
-                bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
-                float channelWeightsSq[3];
-
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                MSInt16 pixels[16][3];
-                MFloat floatPixels2CL[16][3];
-                MFloat floatPixelsLinearWeighted[16][3];
-
-                MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MSInt16 pixelValue;
-                        ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
-
-                        // Convert from sign+magnitude to 2CL
-                        if (isSigned)
-                        {
-                            ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
-                            MSInt16 magnitude = (pixelValue & low15Bits);
-                            ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
-                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
-                        }
-                        else
-                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
-
-                        pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
-
-                        pixels[px][ch] = pixelValue;
-                        floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
-                        floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
-                    }
-                }
-
-                MFloat preWeightedPixels[16][3];
-
-                BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
-
-                MAInt16 bestEndPoints[2][2][3];
-                MUInt15 bestIndexes[16];
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-                MUInt15 bestMode = ParallelMath::MakeUInt15(0);
-                MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
-
-                for (int px = 0; px < 16; px++)
-                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                for (int subset = 0; subset < 2; subset++)
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
-
-                UnfinishedEndpoints<3> partitionedUFEP[32][2];
-                UnfinishedEndpoints<3> singleUFEP;
-
-                // Generate UFEP for partitions
-                for (int p = 0; p < 32; p++)
-                {
-                    int partitionMask = BC7Data::g_partitionMap[p];
-
-                    EndpointSelector<3, 8> epSelectors[2];
-
-                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                    {
-                        for (int px = 0; px < 16; px++)
-                        {
-                            int subset = (partitionMask >> px) & 1;
-                            epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
-                        }
-
-                        for (int subset = 0; subset < 2; subset++)
-                            epSelectors[subset].FinishPass(pass);
-                    }
-
-                    for (int subset = 0; subset < 2; subset++)
-                        partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
-                }
-
-                // Generate UFEP for single
-                {
-                    EndpointSelector<3, 8> epSelector;
-
-                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                    {
-                        for (int px = 0; px < 16; px++)
-                            epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
-
-                        epSelector.FinishPass(pass);
-                    }
-
-                    singleUFEP = epSelector.GetEndpoints(channelWeights);
-                }
-
-                for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
-                {
-                    bool partitioned = (partitionedInt == 1);
-
-                    for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
-                    {
-                        if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
-                            continue;
-
-                        int numPartitions = partitioned ? 32 : 1;
-                        int numSubsets = partitioned ? 2 : 1;
-                        int indexBits = partitioned ? 3 : 4;
-                        int indexRange = (1 << indexBits);
-
-                        for (int p = 0; p < numPartitions; p++)
-                        {
-                            int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
-
-                            const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
-
-                            MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
-                            MUInt15 metaIndexes[MaxMetaRounds][16];
-                            MFloat metaError[MaxMetaRounds][2];
-
-                            bool roundValid[MaxMetaRounds][2];
-
-                            for (int r = 0; r < MaxMetaRounds; r++)
-                                for (int subset = 0; subset < 2; subset++)
-                                    roundValid[r][subset] = true;
-
-                            for (int subset = 0; subset < numSubsets; subset++)
-                            {
-                                for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
-                                {
-                                    EndpointRefiner<3> refiners[2];
-
-                                    bool abortRemainingRefines = false;
-                                    for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
-                                    {
-                                        int metaRound = tweak * MaxRefineRounds + refinePass;
-
-                                        if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
-                                            abortRemainingRefines = true;
-
-                                        if (abortRemainingRefines)
-                                        {
-                                            roundValid[metaRound][subset] = false;
-                                            continue;
-                                        }
-
-                                        MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
-                                        MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
-
-                                        MSInt16 endPointsColorSpace[2][3];
-
-                                        if (refinePass == 0)
-                                        {
-                                            UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
-
-                                            if (isSigned)
-                                                ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
-                                            else
-                                                ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
-                                        }
-                                        else
-                                            refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
-
-                                        refiners[subset].Init(indexRange, channelWeights);
-
-                                        int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
-
-                                        IndexSelectorHDR<3> indexSelector;
-                                        if (isSigned)
-                                            QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
-                                        else
-                                            QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
-
-                                        if (metaRound > 0)
-                                        {
-                                            ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
-
-                                            for (int prevRound = 0; prevRound < metaRound; prevRound++)
-                                            {
-                                                MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
-
-                                                ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
-
-                                                for (int epi = 0; epi < 2; epi++)
-                                                    for (int ch = 0; ch < 3; ch++)
-                                                        same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
-
-                                                anySame = (anySame | same);
-                                                if (ParallelMath::AllSet(anySame))
-                                                    break;
-                                            }
-
-                                            if (ParallelMath::AllSet(anySame))
-                                            {
-                                                roundValid[metaRound][subset] = false;
-                                                continue;
-                                            }
-                                        }
-
-                                        MFloat subsetError = ParallelMath::MakeFloatZero();
-
-                                        {
-                                            for (int px = 0; px < 16; px++)
-                                            {
-                                                if (subset != ((partitionMask >> px) & 1))
-                                                    continue;
-
-                                                MUInt15 index;
-                                                if (px == fixupIndex)
-                                                    index = mrIndexes[px];
-                                                else
-                                                {
-                                                    index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
-                                                    mrIndexes[px] = index;
-                                                }
-
-                                                MSInt16 reconstructed[3];
-                                                if (isSigned)
-                                                    indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
-                                                else
-                                                    indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
-
-                                                subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
-
-                                                if (refinePass != numRefineRounds - 1)
-                                                    refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
-                                            }
-                                        }
-
-                                        metaError[metaRound][subset] = subsetError;
-                                    }
-                                }
-                            }
-
-                            // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
-                            int numMeta1 = partitioned ? MaxMetaRounds : 1;
-                            for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
-                            {
-                                if (!roundValid[meta0][0])
-                                    continue;
-
-                                for (int meta1 = 0; meta1 < numMeta1; meta1++)
-                                {
-                                    MFloat combinedError = metaError[meta0][0];
-                                    if (partitioned)
-                                    {
-                                        if (!roundValid[meta1][1])
-                                            continue;
-
-                                        combinedError = combinedError + metaError[meta1][1];
-                                    }
-
-                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
-                                    if (!ParallelMath::AnySet(errorBetter))
-                                        continue;
-
-                                    ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                                    // Figure out if this is encodable
-                                    for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
-                                    {
-                                        const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
-
-                                        if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
-                                            continue;
-
-                                        MAInt16 encodedEPs[2][2][3];
-                                        ParallelMath::Int16CompFlag isLegal;
-                                        if (partitioned)
-                                            EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
-                                        else
-                                            EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
-
-                                        ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
-                                        if (!ParallelMath::AnySet(isLegalAndBetter))
-                                            continue;
-
-                                        ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
-
-                                        ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
-                                        ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
-                                        ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
-
-                                        for (int subset = 0; subset < numSubsets; subset++)
-                                        {
-                                            for (int epi = 0; epi < 2; epi++)
-                                            {
-                                                for (int ch = 0; ch < 3; ch++)
-                                                    ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
-                                            }
-                                        }
-
-                                        for (int px = 0; px < 16; px++)
-                                        {
-                                            int subset = ((partitionMask >> px) & 1);
-                                            if (subset == 0)
-                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
-                                            else
-                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
-                                        }
-
-                                        needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
-                                        if (!ParallelMath::AnySet(needsCommit))
-                                            break;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-
-                // At this point, everything should be set
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
-                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
-                    int32_t eps[2][2][3];
-                    ParallelMath::ScalarUInt16 indexes[16];
-
-                    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
-
-                    const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
-
-                    const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
-
-                    for (int subset = 0; subset < 2; subset++)
-                    {
-                        for (int epi = 0; epi < 2; epi++)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
-                        }
-                    }
-
-                    for (int px = 0; px < 16; px++)
-                        indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
-
-                    uint16_t modeID = modeInfo.m_modeID;
-
-                    PackingVector pv;
-                    pv.Init();
-
-                    for (size_t i = 0; i < headerBits; i++)
-                    {
-                        int32_t codedValue = 0;
-                        switch (desc[i].m_eField)
-                        {
-                        case BC6HData::M:  codedValue = modeID; break;
-                        case BC6HData::D:  codedValue = partition; break;
-                        case BC6HData::RW: codedValue = eps[0][0][0]; break;
-                        case BC6HData::RX: codedValue = eps[0][1][0]; break;
-                        case BC6HData::RY: codedValue = eps[1][0][0]; break;
-                        case BC6HData::RZ: codedValue = eps[1][1][0]; break;
-                        case BC6HData::GW: codedValue = eps[0][0][1]; break;
-                        case BC6HData::GX: codedValue = eps[0][1][1]; break;
-                        case BC6HData::GY: codedValue = eps[1][0][1]; break;
-                        case BC6HData::GZ: codedValue = eps[1][1][1]; break;
-                        case BC6HData::BW: codedValue = eps[0][0][2]; break;
-                        case BC6HData::BX: codedValue = eps[0][1][2]; break;
-                        case BC6HData::BY: codedValue = eps[1][0][2]; break;
-                        case BC6HData::BZ: codedValue = eps[1][1][2]; break;
-                        default: assert(false); break;
-                        }
-
-                        pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1);
-                    }
-
-                    int fixupIndex1 = 0;
-                    int indexBits = 4;
-                    if (modeInfo.m_partitioned)
-                    {
-                        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
-                        indexBits = 3;
-                    }
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
-                        if (px == 0 || px == fixupIndex1)
-                            pv.Pack(index, indexBits - 1);
-                        else
-                            pv.Pack(index, indexBits);
-                    }
-
-                    pv.Flush(packedBlocks + 16 * block);
-                }
-            }
-
-            static void SignExtendSingle(int &v, int bits)
-            {
-                if (v & (1 << (bits - 1)))
-                    v |= -(1 << bits);
-            }
-
-            static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
-            {
-                UnpackingVector pv;
-                pv.Init(pBC);
-
-                int numModeBits = 2;
-                int modeBits = pv.Unpack(2);
-                if (modeBits != 0 && modeBits != 1)
-                {
-                    modeBits |= pv.Unpack(3) << 2;
-                    numModeBits += 3;
-                }
-
-                int mode = -1;
-                for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
-                {
-                    if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
-                    {
-                        mode = possibleMode;
-                        break;
-                    }
-                }
-
-                if (mode < 0)
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            output.m_pixels[px][ch] = 0;
-                        output.m_pixels[px][3] = 0x3c00;	// 1.0
-                    }
-                    return;
-                }
-
-                const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
-                const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
-                const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
-
-                int32_t partition = 0;
-                int32_t eps[2][2][3];
-
-                for (int subset = 0; subset < 2; subset++)
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            eps[subset][epi][ch] = 0;
-
-                for (size_t i = numModeBits; i < headerBits; i++)
-                {
-                    int32_t *pCodedValue = NULL;
-
-                    switch (desc[i].m_eField)
-                    {
-                    case BC6HData::D:  pCodedValue = &partition; break;
-                    case BC6HData::RW: pCodedValue = &eps[0][0][0]; break;
-                    case BC6HData::RX: pCodedValue = &eps[0][1][0]; break;
-                    case BC6HData::RY: pCodedValue = &eps[1][0][0]; break;
-                    case BC6HData::RZ: pCodedValue = &eps[1][1][0]; break;
-                    case BC6HData::GW: pCodedValue = &eps[0][0][1]; break;
-                    case BC6HData::GX: pCodedValue = &eps[0][1][1]; break;
-                    case BC6HData::GY: pCodedValue = &eps[1][0][1]; break;
-                    case BC6HData::GZ: pCodedValue = &eps[1][1][1]; break;
-                    case BC6HData::BW: pCodedValue = &eps[0][0][2]; break;
-                    case BC6HData::BX: pCodedValue = &eps[0][1][2]; break;
-                    case BC6HData::BY: pCodedValue = &eps[1][0][2]; break;
-                    case BC6HData::BZ: pCodedValue = &eps[1][1][2]; break;
-                    default: assert(false); break;
-                    }
-
-                    (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit;
-                }
-
-
-                uint16_t modeID = modeInfo.m_modeID;
-
-                int fixupIndex1 = 0;
-                int indexBits = 4;
-                int numSubsets = 1;
-                if (modeInfo.m_partitioned)
-                {
-                    fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
-                    indexBits = 3;
-                    numSubsets = 2;
-                }
-
-                int indexes[16];
-                for (int px = 0; px < 16; px++)
-                {
-                    if (px == 0 || px == fixupIndex1)
-                        indexes[px] = pv.Unpack(indexBits - 1);
-                    else
-                        indexes[px] = pv.Unpack(indexBits);
-                }
-
-                if (modeInfo.m_partitioned)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        if (isSigned)
-                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
-                        if (modeInfo.m_transformed || isSigned)
-                        {
-                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
-                            SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
-                            SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
-                        }
-                    }
-                }
-                else
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        if (isSigned)
-                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
-                        if (modeInfo.m_transformed || isSigned)
-                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
-                    }
-                }
-
-                int aPrec = modeInfo.m_aPrec;
-
-                if (modeInfo.m_transformed)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        int wrapMask = (1 << aPrec) - 1;
-
-                        eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
-                        if (isSigned)
-                            SignExtendSingle(eps[0][1][ch], aPrec);
-
-                        if (modeInfo.m_partitioned)
-                        {
-                            eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
-                            eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
-
-                            if (isSigned)
-                            {
-                                SignExtendSingle(eps[1][0][ch], aPrec);
-                                SignExtendSingle(eps[1][1][ch], aPrec);
-                            }
-                        }
-                    }
-                }
-
-                // Unquantize endpoints
-                for (int subset = 0; subset < numSubsets; subset++)
-                {
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            int &v = eps[subset][epi][ch];
-
-                            if (isSigned)
-                            {
-                                if (aPrec >= 16)
-                                {
-                                    // Nothing
-                                }
-                                else
-                                {
-                                    bool s = false;
-                                    int comp = v;
-                                    if (v < 0)
-                                    {
-                                        s = true;
-                                        comp = -comp;
-                                    }
-
-                                    int unq = 0;
-                                    if (comp == 0)
-                                        unq = 0;
-                                    else if (comp >= ((1 << (aPrec - 1)) - 1))
-                                        unq = 0x7fff;
-                                    else
-                                        unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
-
-                                    if (s)
-                                        unq = -unq;
-
-                                    v = unq;
-                                }
-                            }
-                            else
-                            {
-                                if (aPrec >= 15)
-                                {
-                                    // Nothing
-                                }
-                                else if (v == 0)
-                                {
-                                    // Nothing
-                                }
-                                else if (v == ((1 << aPrec) - 1))
-                                    v = 0xffff;
-                                else
-                                    v = ((v << 16) + 0x8000) >> aPrec;
-                            }
-                        }
-                    }
-                }
-
-                const int *weights = BC7Data::g_weightTables[indexBits];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    int subset = 0;
-                    if (modeInfo.m_partitioned)
-                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-
-                    int w = weights[indexes[px]];
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
-
-                        if (isSigned)
-                        {
-                            if (comp < 0)
-                                comp = -(((-comp) * 31) >> 5);
-                            else
-                                comp = (comp * 31) >> 5;
-
-                            int s = 0;
-                            if (comp < 0)
-                            {
-                                s = 0x8000;
-                                comp = -comp;
-                            }
-
-                            output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
-                        }
-                        else
-                        {
-                            comp = (comp * 31) >> 6;
-                            output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
-                        }
-                    }
-                    output.m_pixels[px][3] = 0x3c00;	// 1.0
-                }
-            }
-        };
-
-        namespace S3TCSingleColorTables
-        {
-            struct SingleColorTableEntry
-            {
-                uint8_t m_min;
-                uint8_t m_max;
-                uint8_t m_actualColor;
-                uint8_t m_span;
-            };
-
-            SingleColorTableEntry g_singleColor5_3[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
-                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
-                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
-                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
-                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
-                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
-                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
-                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
-                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
-                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
-                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
-                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
-                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
-                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
-                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
-                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
-                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
-                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
-                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
-                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_3[256] =
-            {
-                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 69, 0, 23, 69 },
-                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 65, 8, 27, 57 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 69, 12, 31, 57 },
-                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 65, 20, 35, 45 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 69, 24, 39, 45 },
-                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
-                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
-                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
-                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
-                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
-                { 56, 93, 80, 37 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 60, 97, 84, 37 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
-                { 56, 105, 88, 49 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 60, 109, 92, 49 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
-                { 134, 77, 96, 57 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 130, 85, 100, 45 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
-                { 134, 89, 104, 45 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
-                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
-                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
-                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
-                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
-                { 142, 146, 144, 4 }, { 121, 158, 145, 37 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 125, 162, 149, 37 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
-                { 150, 154, 152, 4 }, { 121, 170, 153, 49 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 125, 174, 157, 49 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
-                { 158, 162, 160, 4 }, { 199, 142, 161, 57 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 195, 150, 165, 45 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
-                { 166, 170, 168, 4 }, { 199, 154, 169, 45 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
-                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
-                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
-                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
-                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
-                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 186, 223, 210, 37 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 190, 227, 214, 37 }, { 215, 215, 215, 0 },
-                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 186, 235, 218, 49 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 190, 239, 222, 49 }, { 223, 223, 223, 0 },
-                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 186, 247, 226, 61 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 190, 251, 230, 61 }, { 231, 231, 231, 0 },
-                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_2[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
-                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
-                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
-                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
-                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
-                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
-                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
-                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
-                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
-                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
-                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
-                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
-                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
-                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
-                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
-                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
-                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
-                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
-                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
-                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_2[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
-                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
-                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
-                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
-                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
-                { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
-                { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
-                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
-                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
-                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
-                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
-                { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
-                { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
-                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
-                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
-                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
-                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
-                { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
-                { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
-                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_3_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
-                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
-                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
-                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
-                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
-                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
-                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
-                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
-                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
-                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
-                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
-                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
-                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
-                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
-                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
-                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
-                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
-                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
-                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
-                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_3_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
-                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
-                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
-                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
-                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
-                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
-                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
-                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
-                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
-                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
-                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
-                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
-                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
-                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
-                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
-                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
-                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
-                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
-                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
-                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_2_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
-                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
-                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
-                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
-                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
-                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
-                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
-                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
-                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
-                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
-                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
-                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
-                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
-                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
-                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
-                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
-                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
-                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
-                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
-                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_2_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
-                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
-                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
-                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
-                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
-                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
-                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
-                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
-                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
-                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
-                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
-                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
-                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
-                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
-                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
-                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
-                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
-                { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
-                { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
-                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-        }
-
-        class S3TCComputer
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            static void Init(MFloat& error)
-            {
-                error = ParallelMath::MakeFloat(FLT_MAX);
-            }
-
-            static void QuantizeTo6Bits(MUInt15& v)
-            {
-                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10));
-                v = (reduced << 2) | ParallelMath::RightShift(reduced, 4);
-            }
-
-            static void QuantizeTo5Bits(MUInt15& v)
-            {
-                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11));
-                v = (reduced << 3) | ParallelMath::RightShift(reduced, 2);
-            }
-
-            static void QuantizeTo565(MUInt15 endPoint[3])
-            {
-                QuantizeTo5Bits(endPoint[0]);
-                QuantizeTo6Bits(endPoint[1]);
-                QuantizeTo5Bits(endPoint[2]);
-            }
-
-            static MFloat ParanoidFactorForSpan(const MSInt16& span)
-            {
-                return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f;
-            }
-
-            static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
-            {
-                MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
-                absDiff = absDiff + d;
-                return absDiff * absDiff;
-            }
-
-            static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
-                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                float channelWeightsSq[3];
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                        totals[ch] = totals[ch] + pixels[px][ch];
-                }
-
-                MUInt15 average[3];
-                for (int ch = 0; ch < 3; ch++)
-                    average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
-
-                const S3TCSingleColorTables::SingleColorTableEntry* rbTable = NULL;
-                const S3TCSingleColorTables::SingleColorTableEntry* gTable = NULL;
-                if (flags & cvtt::Flags::S3TC_Paranoid)
-                {
-                    if (range == 4)
-                    {
-                        rbTable = S3TCSingleColorTables::g_singleColor5_3_p;
-                        gTable = S3TCSingleColorTables::g_singleColor6_3_p;
-                    }
-                    else
-                    {
-                        assert(range == 3);
-                        rbTable = S3TCSingleColorTables::g_singleColor5_2_p;
-                        gTable = S3TCSingleColorTables::g_singleColor6_2_p;
-                    }
-                }
-                else
-                {
-                    if (range == 4)
-                    {
-                        rbTable = S3TCSingleColorTables::g_singleColor5_3;
-                        gTable = S3TCSingleColorTables::g_singleColor6_3;
-                    }
-                    else
-                    {
-                        assert(range == 3);
-                        rbTable = S3TCSingleColorTables::g_singleColor5_2;
-                        gTable = S3TCSingleColorTables::g_singleColor6_2;
-                    }
-                }
-
-                MUInt15 interpolated[3];
-                MUInt15 eps[2][3];
-                MSInt16 spans[3];
-                for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        uint16_t avg = ParallelMath::Extract(average[ch], i);
-                        const S3TCSingleColorTables::SingleColorTableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
-                        ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
-                        ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
-                        ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
-                        ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
-                    }
-                }
-
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & cvtt::Flags::S3TC_Paranoid)
-                {
-                    MFloat spanParanoidFactors[3];
-                    for (int ch = 0; ch < 3; ch++)
-                        spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
-                    }
-                }
-                else
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
-                    }
-                }
-
-                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
-                ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
-
-                if (ParallelMath::AnySet(better16))
-                {
-                    bestError = ParallelMath::Min(bestError, error);
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
-
-                    MUInt15 vindexes = ParallelMath::MakeUInt15(1);
-                    for (int px = 0; px < 16; px++)
-                        ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
-
-                    ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
-                }
-            }
-
-            static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
-                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                float channelWeightsSq[3];
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                MUInt15 endPoints[2][3];
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < 3; ch++)
-                        endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
-
-                QuantizeTo565(endPoints[0]);
-                QuantizeTo565(endPoints[1]);
-
-                IndexSelector<3> selector;
-                selector.Init<false>(channelWeights, endPoints, range);
-
-                MUInt15 indexes[16];
-
-                MFloat paranoidFactors[3];
-                for (int ch = 0; ch < 3; ch++)
-                    paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
-
-                MFloat error = ParallelMath::MakeFloatZero();
-                AggregatedError<3> aggError;
-                for (int px = 0; px < 16; px++)
-                {
-                    MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
-                    indexes[px] = index;
-
-                    if (refiner)
-                        refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
-
-                    MUInt15 reconstructed[3];
-                    selector.ReconstructLDRPrecise(index, reconstructed);
-
-                    if (flags & Flags::S3TC_Paranoid)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
-                    }
-                    else
-                        BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
-                }
-
-                if (!(flags & Flags::S3TC_Paranoid))
-                    error = aggError.Finalize(flags, channelWeightsSq);
-
-                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
-
-                if (ParallelMath::AnySet(better))
-                {
-                    ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
-
-                    ParallelMath::ConditionalSet(bestError, better, error);
-
-                    for (int ep = 0; ep < 2; ep++)
-                        for (int ch = 0; ch < 3; ch++)
-                            ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
-
-                    for (int px = 0; px < 16; px++)
-                        ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
-
-                    ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
-                }
-            }
-
-            static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
-                const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
-                const ParallelMath::RoundTowardNearestForScope* rtn)
-            {
-                UNREFERENCED_PARAMETER(alphaTest);
-                UNREFERENCED_PARAMETER(flags);
-
-                EndpointRefiner<3> refiner;
-
-                refiner.Init(nCounts, channelWeights);
-
-                bool escape = false;
-                int e = 0;
-                for (int i = 0; i < nCounts; i++)
-                {
-                    for (int n = 0; n < counts[i]; n++)
-                    {
-                        ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
-                        if (!ParallelMath::AnySet(valid))
-                        {
-                            escape = true;
-                            break;
-                        }
-
-                        if (ParallelMath::AllSet(valid))
-                            refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
-                        else
-                        {
-                            MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
-                            refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
-                        }
-                    }
-
-                    if (escape)
-                        break;
-                }
-
-                MUInt15 endPoints[2][3];
-                refiner.GetRefinedEndpointsLDR(endPoints, rtn);
-
-                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
-            }
-
-            static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
-            {
-                UNREFERENCED_PARAMETER(flags);
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                float weights[1] = { 1.0f };
-
-                MUInt15 pixels[16];
-                MFloat floatPixels[16];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
-                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
-                }
-
-                MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
-
-                IndexSelector<1> selector;
-                selector.Init<false>(weights, ep, 16);
-
-                MUInt15 indexes[16];
-
-                for (int px = 0; px < 16; px++)
-                    indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    for (int px = 0; px < 16; px += 8)
-                    {
-                        int index0 = ParallelMath::Extract(indexes[px], block);
-                        int index1 = ParallelMath::Extract(indexes[px], block);
-
-                        packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
-                    }
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-
-            static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
-            {
-                if (maxTweakRounds < 1)
-                    maxTweakRounds = 1;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                float oneWeight[1] = { 1.0f };
-
-                MUInt15 pixels[16];
-                MFloat floatPixels[16];
-
-                MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
-                MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
-
-                for (int px = 0; px < 16; px++)
-                {
-                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
-
-                    if (isSigned)
-                        pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
-
-                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
-                }
-
-                MUInt15 sortedPixels[16];
-                for (int px = 0; px < 16; px++)
-                    sortedPixels[px] = pixels[px];
-
-                for (int sortEnd = 15; sortEnd > 0; sortEnd--)
-                {
-                    for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
-                    {
-                        MUInt15 a = sortedPixels[sortOffset];
-                        MUInt15 b = sortedPixels[sortOffset + 1];
-
-                        sortedPixels[sortOffset] = ParallelMath::Min(a, b);
-                        sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
-                    }
-                }
-
-                MUInt15 zero = ParallelMath::MakeUInt15(0);
-                MUInt15 one = ParallelMath::MakeUInt15(1);
-
-                MUInt15 bestIsFullRange = zero;
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-                MUInt15 bestEP[2] = { zero, zero };
-                MUInt15 bestIndexes[16] = {
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero
-                };
-
-                // Full-precision
-                {
-                    MUInt15 minEP = sortedPixels[0];
-                    MUInt15 maxEP = sortedPixels[15];
-
-                    MFloat base[1] = { ParallelMath::ToFloat(minEP) };
-                    MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
-
-                    UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
-
-                    int numTweakRounds = BCCommon::TweakRoundsForRange(8);
-                    if (numTweakRounds > maxTweakRounds)
-                        numTweakRounds = maxTweakRounds;
-
-                    for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                    {
-                        MUInt15 ep[2][1];
-
-                        ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
-
-                        for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
-                        {
-                            EndpointRefiner<1> refiner;
-                            refiner.Init(8, oneWeight);
-
-                            if (isSigned)
-                                for (int epi = 0; epi < 2; epi++)
-                                    ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
-
-                            IndexSelector<1> indexSelector;
-                            indexSelector.Init<false>(oneWeight, ep, 8);
-
-                            MUInt15 indexes[16];
-
-                            AggregatedError<1> aggError;
-                            for (int px = 0; px < 16; px++)
-                            {
-                                MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                                MUInt15 reconstructedPixel;
-
-                                indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
-                                BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
-
-                                if (refinePass != numRefineRounds - 1)
-                                    refiner.ContributeUnweightedPW(&floatPixels[px], index);
-
-                                indexes[px] = index;
-                            }
-                            MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
-
-                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                            if (ParallelMath::AnySet(errorBetter16))
-                            {
-                                bestError = ParallelMath::Min(error, bestError);
-                                ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
-                                for (int px = 0; px < 16; px++)
-                                    ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
-                            }
-
-                            if (refinePass != numRefineRounds - 1)
-                                refiner.GetRefinedEndpointsLDR(ep, &rtn);
-                        }
-                    }
-                }
-
-                // Reduced precision with special endpoints
-                {
-                    MUInt15 bestHeuristicMin = sortedPixels[0];
-                    MUInt15 bestHeuristicMax = sortedPixels[15];
-
-                    ParallelMath::Int16CompFlag canTryClipping;
-
-                    // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
-                    // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
-                    // This will usually not find anything, but it's cheap to check.
-
-                    {
-                        MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
-                        MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
-
-                        MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
-                        canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
-                    }
-
-                    if (ParallelMath::AnySet(canTryClipping))
-                    {
-                        MUInt15 lowClearances[16];
-                        MUInt15 highClearances[16];
-                        MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
-
-                        lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
-
-                        for (int px = 1; px < 16; px++)
-                        {
-                            lowClearances[px] = sortedPixels[px - 1];
-                            highClearances[px] = highTerminal - sortedPixels[16 - px];
-                        }
-
-                        for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
-                        {
-                            uint16_t numSkippedLow = firstIndex;
-
-                            MUInt15 lowClearance = lowClearances[firstIndex];
-
-                            for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
-                            {
-                                uint16_t numSkippedHigh = 15 - lastIndex;
-                                uint16_t numSkipped = numSkippedLow + numSkippedHigh;
-
-                                MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
-
-                                ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
-
-                                if (!ParallelMath::AnySet(areMoreSkipped))
-                                    continue;
-
-                                MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
-                                MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
-
-                                MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
-
-                                ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
-                                ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
-                                ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
-                            }
-                        }
-                    }
-
-                    MUInt15 bestSimpleMin = one;
-                    MUInt15 bestSimpleMax = highTerminalMinusOne;
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
-                        ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
-                    }
-
-                    MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
-                    MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
-
-                    int minEPRange = 2;
-                    if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
-                        minEPRange = 1;
-
-                    int maxEPRange = 2;
-                    if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
-                        maxEPRange = 1;
-
-                    for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
-                    {
-                        for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
-                        {
-                            MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
-                            MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
-
-                            UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
-
-                            int numTweakRounds = BCCommon::TweakRoundsForRange(6);
-                            if (numTweakRounds > maxTweakRounds)
-                                numTweakRounds = maxTweakRounds;
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                MUInt15 ep[2][1];
-
-                                ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
-
-                                for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
-                                {
-                                    EndpointRefiner<1> refiner;
-                                    refiner.Init(6, oneWeight);
-
-                                    if (isSigned)
-                                        for (int epi = 0; epi < 2; epi++)
-                                            ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
-
-                                    IndexSelector<1> indexSelector;
-                                    indexSelector.Init<false>(oneWeight, ep, 6);
-
-                                    MUInt15 indexes[16];
-                                    MFloat error = ParallelMath::MakeFloatZero();
-
-                                    for (int px = 0; px < 16; px++)
-                                    {
-                                        MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                                        MUInt15 reconstructedPixel;
-
-                                        indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
-
-                                        MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
-                                        MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
-                                        MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
-
-                                        MFloat bestPixelError = zeroError;
-                                        MUInt15 index = ParallelMath::MakeUInt15(6);
-
-                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
-                                        bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
-
-                                        ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
-
-                                        if (ParallelMath::AllSet(selectedIndexBetter))
-                                        {
-                                            if (refinePass != numRefineRounds - 1)
-                                                refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
-                                        }
-                                        else
-                                        {
-                                            MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
-
-                                            if (refinePass != numRefineRounds - 1)
-                                                refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
-                                        }
-
-                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
-                                        bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
-
-                                        error = error + bestPixelError;
-
-                                        indexes[px] = index;
-                                    }
-
-                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                                    ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                                    if (ParallelMath::AnySet(errorBetter16))
-                                    {
-                                        bestError = ParallelMath::Min(error, bestError);
-                                        ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
-
-                                        for (int epi = 0; epi < 2; epi++)
-                                            ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
-                                    }
-
-                                    if (refinePass != numRefineRounds - 1)
-                                        refiner.GetRefinedEndpointsLDR(ep, &rtn);
-                                }
-                            }
-                        }
-                    }
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    int ep0 = ParallelMath::Extract(bestEP[0], block);
-                    int ep1 = ParallelMath::Extract(bestEP[1], block);
-                    int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
-
-                    if (isSigned)
-                    {
-                        ep0 -= 127;
-                        ep1 -= 127;
-
-                        assert(ep0 >= -127 && ep0 <= 127);
-                        assert(ep1 >= -127 && ep1 <= 127);
-                    }
-
-
-                    bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
-
-                    if (swapEndpoints)
-                        std::swap(ep0, ep1);
-
-                    uint16_t dumpBits = 0;
-                    int dumpBitsOffset = 0;
-                    int dumpByteOffset = 2;
-                    packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
-                    packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
-
-                    int maxValue = (isFullRange != 0) ? 7 : 5;
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int index = ParallelMath::Extract(bestIndexes[px], block);
-
-                        if (swapEndpoints && index <= maxValue)
-                            index = maxValue - index;
-
-                        if (index != 0)
-                        {
-                            if (index == maxValue)
-                                index = 1;
-                            else if (index < maxValue)
-                                index++;
-                        }
-
-                        assert(index >= 0 && index < 8);
-
-                        dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
-                        dumpBitsOffset += 3;
-
-                        if (dumpBitsOffset >= 8)
-                        {
-                            assert(dumpByteOffset < 8);
-                            packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
-                            dumpBits >>= 8;
-                            dumpBitsOffset -= 8;
-                            dumpByteOffset++;
-                        }
-                    }
-
-                    assert(dumpBitsOffset == 0);
-                    assert(dumpByteOffset == 8);
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-
-            static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
-            {
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (maxTweakRounds < 1)
-                    maxTweakRounds = 1;
-
-                EndpointSelector<3, 8> endpointSelector;
-
-                MUInt15 pixels[16][4];
-                MFloat floatPixels[16][4];
-
-                MFloat preWeightedPixels[16][4];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
-                }
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
-                }
-
-                if (alphaTest)
-                {
-                    MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
-                        pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
-                    }
-                }
-
-                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
-
-                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
-
-                for (int px = 0; px < 16; px++)
-                    minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
-
-                MFloat pixelWeights[16];
-                for (int px = 0; px < 16; px++)
-                {
-                    pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
-                    if (alphaTest)
-                    {
-                        ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
-
-                        ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
-                    }
-                }
-
-                for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                {
-                    for (int px = 0; px < 16; px++)
-                        endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
-
-                    endpointSelector.FinishPass(pass);
-                }
-
-                UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
-
-                MUInt15 bestEndpoints[2][3];
-                MUInt15 bestIndexes[16];
-                MUInt15 bestRange = ParallelMath::MakeUInt15(0);
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-
-                for (int px = 0; px < 16; px++)
-                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < 3; ch++)
-                        bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
-
-                if (exhaustive)
-                {
-                    MSInt16 sortBins[16];
-
-                    {
-                        // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
-                        // and pack the original indexes into the low bits.
-
-                        MUInt15 sortEP[2][3];
-                        ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
-
-                        IndexSelector<3> sortSelector;
-                        sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
-
-                        for (int16_t px = 0; px < 16; px++)
-                        {
-                            MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
-
-                            if (alphaTest)
-                            {
-                                ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
-
-                                ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
-                            }
-
-                            sortBin = sortBin + ParallelMath::MakeSInt16(px);
-
-                            sortBins[px] = sortBin;
-                        }
-                    }
-
-                    // Sort bins
-                    for (int sortEnd = 1; sortEnd < 16; sortEnd++)
-                    {
-                        for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
-                        {
-                            MSInt16 a = sortBins[sortLoc];
-                            MSInt16 b = sortBins[sortLoc - 1];
-
-                            sortBins[sortLoc] = ParallelMath::Max(a, b);
-                            sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
-                        }
-                    }
-
-                    MUInt15 firstElement = ParallelMath::MakeUInt15(0);
-                    for (uint16_t e = 0; e < 16; e++)
-                    {
-                        ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
-                        ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
-                        if (!ParallelMath::AnySet(isInvalid))
-                            break;
-                    }
-
-                    MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
-
-                    MUInt15 sortedInputs[16][4];
-                    MFloat floatSortedInputs[16][4];
-                    MFloat pwFloatSortedInputs[16][4];
-
-                    for (int e = 0; e < 16; e++)
-                    {
-                        for (int ch = 0; ch < 4; ch++)
-                            sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
-                    }
-
-                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                    {
-                        for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
-                        {
-                            ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
-                            int originalIndex = (sortBin & 15);
-
-                            for (int ch = 0; ch < 4; ch++)
-                                ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
-                        }
-                    }
-
-                    for (int e = 0; e < 16; e++)
-                    {
-                        for (int ch = 0; ch < 4; ch++)
-                        {
-                            MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
-                            floatSortedInputs[e][ch] = f;
-                            pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
-                        }
-                    }
-
-                    for (int n0 = 0; n0 <= 15; n0++)
-                    {
-                        int remainingFor1 = 16 - n0;
-                        if (remainingFor1 == 16)
-                            remainingFor1 = 15;
-
-                        for (int n1 = 0; n1 <= remainingFor1; n1++)
-                        {
-                            int remainingFor2 = 16 - n1 - n0;
-                            if (remainingFor2 == 16)
-                                remainingFor2 = 15;
-
-                            for (int n2 = 0; n2 <= remainingFor2; n2++)
-                            {
-                                int n3 = 16 - n2 - n1 - n0;
-
-                                if (n3 == 16)
-                                    continue;
-
-                                int counts[4] = { n0, n1, n2, n3 };
-
-                                TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                            }
-                        }
-                    }
-
-                    TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-
-                    if (alphaTest)
-                    {
-                        for (int n0 = 0; n0 <= 15; n0++)
-                        {
-                            int remainingFor1 = 16 - n0;
-                            if (remainingFor1 == 16)
-                                remainingFor1 = 15;
-
-                            for (int n1 = 0; n1 <= remainingFor1; n1++)
-                            {
-                                int n2 = 16 - n1 - n0;
-
-                                if (n2 == 16)
-                                    continue;
-
-                                int counts[3] = { n0, n1, n2 };
-
-                                TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                            }
-                        }
-
-                        TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                    }
-                }
-                else
-                {
-                    int minRange = alphaTest ? 3 : 4;
-
-                    for (int range = minRange; range <= 4; range++)
-                    {
-                        int tweakRounds = BCCommon::TweakRoundsForRange(range);
-                        if (tweakRounds > maxTweakRounds)
-                            tweakRounds = maxTweakRounds;
-
-                        for (int tweak = 0; tweak < tweakRounds; tweak++)
-                        {
-                            MUInt15 endPoints[2][3];
-
-                            ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
-
-                            for (int refine = 0; refine < numRefineRounds; refine++)
-                            {
-                                EndpointRefiner<3> refiner;
-                                refiner.Init(range, channelWeights);
-
-                                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
-
-                                if (refine != numRefineRounds - 1)
-                                    refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
-                            }
-                        }
-                    }
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
-                    assert(range == 3 || range == 4);
-
-                    ParallelMath::ScalarUInt16 compressedEP[2];
-                    for (int ep = 0; ep < 2; ep++)
-                    {
-                        ParallelMath::ScalarUInt16 endPoint[3];
-                        for (int ch = 0; ch < 3; ch++)
-                            endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
-
-                        int compressed = (endPoint[0] & 0xf8) << 8;
-                        compressed |= (endPoint[1] & 0xfc) << 3;
-                        compressed |= (endPoint[2] & 0xf8) >> 3;
-
-                        compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
-                    }
-
-                    int indexOrder[4];
-
-                    if (range == 4)
-                    {
-                        if (compressedEP[0] == compressedEP[1])
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 0;
-                            indexOrder[2] = 0;
-                            indexOrder[3] = 0;
-                        }
-                        else if (compressedEP[0] < compressedEP[1])
-                        {
-                            std::swap(compressedEP[0], compressedEP[1]);
-                            indexOrder[0] = 1;
-                            indexOrder[1] = 3;
-                            indexOrder[2] = 2;
-                            indexOrder[3] = 0;
-                        }
-                        else
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 3;
-                            indexOrder[3] = 1;
-                        }
-                    }
-                    else
-                    {
-                        assert(range == 3);
-
-                        if (compressedEP[0] > compressedEP[1])
-                        {
-                            std::swap(compressedEP[0], compressedEP[1]);
-                            indexOrder[0] = 1;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 0;
-                        }
-                        else
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 1;
-                        }
-                        indexOrder[3] = 3;
-                    }
-
-                    packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
-                    packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
-                    packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
-                    packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
-
-                    for (int i = 0; i < 16; i += 4)
-                    {
-                        int packedIndexes = 0;
-                        for (int subi = 0; subi < 4; subi++)
-                        {
-                            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
-                            packedIndexes |= (indexOrder[index] << (subi * 2));
-                        }
-
-                        packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
-                    }
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-        };
-
-        // Signed input blocks are converted into unsigned space, with the maximum value being 254
-        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
-        {
-            for (size_t block = 0; block < ParallelMath::ParallelSize; block++)
-            {
-                const PixelBlockS8& inputSignedBlock = inputSigned[block];
-                PixelBlockU8& inputNormalizedBlock = inputNormalized[block];
-
-                for (size_t px = 0; px < 16; px++)
-                {
-                    for (size_t ch = 0; ch < 4; ch++)
-                        inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127);
-                }
-            }
-        }
-
-        void FillWeights(const Options &options, float channelWeights[4])
-        {
-            if (options.flags & Flags::Uniform)
-                channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f;
-            else
-            {
-                channelWeights[0] = options.redWeight;
-                channelWeights[1] = options.greenWeight;
-                channelWeights[2] = options.blueWeight;
-                channelWeights[3] = options.alphaWeight;
-            }
-        }
-    }
-
-    namespace Kernels
-    {
-        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, options.seedPoints, options.refineRoundsBC7);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
-                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
-
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
-                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
-
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC);
-                pBC += 16;
-            }
-        }
-
-        void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false);
-                pBC += 16;
-            }
-        }
-
-        void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true);
-                pBC += 16;
-            }
-        }
-    }
-}
diff --git a/thirdparty/cvtt/ConvectionKernels.h b/thirdparty/cvtt/ConvectionKernels.h
index fb5ca130f9..3da48405ff 100644
--- a/thirdparty/cvtt/ConvectionKernels.h
+++ b/thirdparty/cvtt/ConvectionKernels.h
@@ -25,21 +25,13 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __CVTT_CONVECTION_KERNELS__
 #define __CVTT_CONVECTION_KERNELS__
 
+#include <stddef.h>
 #include <stdint.h>
 
 namespace cvtt
 {
     namespace Flags
     {
-        // Enable partitioned modes in BC7 encoding (slower, better quality)
-        const uint32_t BC7_EnablePartitioning   = 0x001;
-
-        // Enable 3-partition modes in BC7 encoding (slower, better quality, requires BC7_EnablePartitioning)
-        const uint32_t BC7_Enable3Subsets       = 0x002;
-
-        // Enable dual-plane modes in BC7 encoding (slower, better quality)
-        const uint32_t BC7_EnableDualPlane      = 0x004;
-
         // Use fast indexing in BC7 encoding (about 2x faster, slightly worse quality)
         const uint32_t BC7_FastIndexing         = 0x008;
 
@@ -61,13 +53,19 @@ namespace cvtt
         // Uniform color channel importance
         const uint32_t Uniform                  = 0x200;
 
+        // Use fake BT.709 color space for etc2comp compatibility (slower)
+        const uint32_t ETC_UseFakeBT709         = 0x400;
+
+        // Use accurate quantization functions when quantizing fake BT.709 (much slower, marginal improvement on specific blocks)
+        const uint32_t ETC_FakeBT709Accurate    = 0x800;
+
         // Misc useful default flag combinations
-        const uint32_t Fastest = (BC6H_FastIndexing | S3TC_Paranoid);
-        const uint32_t Faster = (BC7_EnableDualPlane | BC6H_FastIndexing | S3TC_Paranoid);
-        const uint32_t Fast = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_FastIndexing | S3TC_Paranoid);
-        const uint32_t Default = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | BC7_FastIndexing | S3TC_Paranoid);
-        const uint32_t Better = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | S3TC_Paranoid | S3TC_Exhaustive);
-        const uint32_t Ultra = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | BC7_TrySingleColor | S3TC_Paranoid | S3TC_Exhaustive);
+        const uint32_t Fastest = (BC6H_FastIndexing | BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Faster = (BC6H_FastIndexing | BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Fast = (BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Default = (BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Better = (S3TC_Paranoid | S3TC_Exhaustive);
+        const uint32_t Ultra = (BC7_TrySingleColor | S3TC_Paranoid | S3TC_Exhaustive | ETC_FakeBT709Accurate);
     }
 
     const unsigned int NumParallelBlocks = 8;
@@ -81,7 +79,7 @@ namespace cvtt
         float blueWeight;       // Blue channel importance
         float alphaWeight;      // Alpha channel importance
 
-        int refineRoundsBC7;    // Number of refine rounds for BC7
+        int refineRoundsBC7;   // Number of refine rounds for BC7
         int refineRoundsBC6H;   // Number of refine rounds for BC6H (max 3)
         int refineRoundsIIC;    // Number of refine rounds for independent interpolated channels (BC3 alpha, BC4, BC5)
         int refineRoundsS3TC;   // Number of refine rounds for S3TC RGB
@@ -104,6 +102,102 @@ namespace cvtt
         }
     };
 
+    struct BC7FineTuningParams  // Input to Kernels::ConfigureBC7EncodingPlanFromFineTuningParams
+    {
+        // Seed point counts for each mode+configuration combination
+        uint8_t mode0SP[16];  // presumably one entry per mode 0 partition - TODO confirm
+        uint8_t mode1SP[64];  // presumably one entry per partition (same for modes 2, 3 and 7) - TODO confirm
+        uint8_t mode2SP[64];
+        uint8_t mode3SP[64];
+        uint8_t mode4SP[4][2];  // presumably [rotation][index selector] - TODO confirm
+        uint8_t mode5SP[4];  // presumably indexed by rotation - TODO confirm
+        uint8_t mode6SP;  // Single value: mode 6 has no per-configuration table here
+        uint8_t mode7SP[64];
+
+        BC7FineTuningParams()  // Default: every seed point count in every table is set to 4
+        {
+            for (int i = 0; i < 16; i++)
+                this->mode0SP[i] = 4;
+
+            for (int i = 0; i < 64; i++)  // The four 64-entry tables are filled in one pass
+            {
+                this->mode1SP[i] = 4;
+                this->mode2SP[i] = 4;
+                this->mode3SP[i] = 4;
+                this->mode7SP[i] = 4;
+            }
+
+            for (int i = 0; i < 4; i++)  // mode4SP and mode5SP share the same first dimension (4)
+            {
+                for (int j = 0; j < 2; j++)
+                    this->mode4SP[i][j] = 4;
+
+                this->mode5SP[i] = 4;
+            }
+
+            this->mode6SP = 4;
+        }
+    };
+
+    struct BC7EncodingPlan  // Search configuration passed to Kernels::EncodeBC7; filled by the ConfigureBC7EncodingPlanFrom* functions
+    {
+        static const int kNumRGBAShapes = 129;  // Length of rgbaShapeList / seedPointsForShapeRGBA
+        static const int kNumRGBShapes = 243;  // Length of rgbShapeList / seedPointsForShapeRGB
+
+        uint64_t mode1PartitionEnabled;  // Bitmask; presumably bit N enables partition N - TODO confirm
+        uint64_t mode2PartitionEnabled;
+        uint64_t mode3PartitionEnabled;
+        uint16_t mode0PartitionEnabled;  // Only 16 bits wide, unlike the 64-bit masks of the other modes
+        uint64_t mode7RGBAPartitionEnabled;
+        uint64_t mode7RGBPartitionEnabled;
+        uint8_t mode4SP[4][2];  // Seed point counts; same dimensions as BC7FineTuningParams::mode4SP
+        uint8_t mode5SP[4];  // Seed point counts; same dimensions as BC7FineTuningParams::mode5SP
+        bool mode6Enabled;
+
+        uint8_t seedPointsForShapeRGB[kNumRGBShapes];
+        uint8_t seedPointsForShapeRGBA[kNumRGBAShapes];
+
+        uint8_t rgbaShapeList[kNumRGBAShapes];
+        uint8_t rgbaNumShapesToEvaluate;  // presumably how many leading rgbaShapeList entries are used - TODO confirm
+
+        uint8_t rgbShapeList[kNumRGBShapes];
+        uint8_t rgbNumShapesToEvaluate;  // presumably how many leading rgbShapeList entries are used - TODO confirm
+
+        BC7EncodingPlan()  // Default plan: shapes listed in index order, 4 seed points everywhere, every mask bit set
+        {
+            for (int i = 0; i < kNumRGBShapes; i++)
+            {
+                this->rgbShapeList[i] = i;  // Identity ordering; i is at most 242, so it fits in uint8_t
+                this->seedPointsForShapeRGB[i] = 4;
+            }
+            this->rgbNumShapesToEvaluate = kNumRGBShapes;  // 243 fits in uint8_t
+
+            for (int i = 0; i < kNumRGBAShapes; i++)
+            {
+                this->rgbaShapeList[i] = i;  // Identity ordering; i is at most 128
+                this->seedPointsForShapeRGBA[i] = 4;
+            }
+            this->rgbaNumShapesToEvaluate = kNumRGBAShapes;
+
+
+            this->mode0PartitionEnabled = 0xffff;  // All 16 bits set
+            this->mode1PartitionEnabled = 0xffffffffffffffffULL;  // All 64 bits set (likewise for the masks below)
+            this->mode2PartitionEnabled = 0xffffffffffffffffULL;
+            this->mode3PartitionEnabled = 0xffffffffffffffffULL;
+            this->mode6Enabled = true;
+            this->mode7RGBPartitionEnabled = 0xffffffffffffffffULL;
+            this->mode7RGBAPartitionEnabled = 0xffffffffffffffffULL;
+
+            for (int i = 0; i < 4; i++)  // Same defaults as BC7FineTuningParams uses for modes 4 and 5
+            {
+                for (int j = 0; j < 2; j++)
+                    this->mode4SP[i][j] = 4;
+
+                this->mode5SP[i] = 4;
+            }
+        }
+    };
+
     // RGBA input block for unsigned 8-bit formats
     struct PixelBlockU8
     {
@@ -116,14 +210,34 @@ namespace cvtt
         int8_t m_pixels[16][4];
     };
 
+    struct PixelBlockScalarS16  // Single-value-per-pixel input block; the input type of Kernels::EncodeETC2Alpha11
+    {
+        int16_t m_pixels[16];  // 16 entries, one int16_t each (the other PixelBlock* types store 4 values per entry)
+    };
+
     // RGBA input block for half-precision float formats (bit-cast to int16_t)
     struct PixelBlockF16
     {
         int16_t m_pixels[16][4];
     };
 
+    class ETC2CompressionData  // Handle type returned by Kernels::AllocETC2Data and released with Kernels::ReleaseETC2Data
+    {
+    protected:
+        ETC2CompressionData() {}  // Protected: code outside the class hierarchy cannot construct one directly
+    };
+
+    class ETC1CompressionData  // Handle type returned by Kernels::AllocETC1Data and released with Kernels::ReleaseETC1Data
+    {
+    protected:
+        ETC1CompressionData() {}  // Protected: code outside the class hierarchy cannot construct one directly
+    };
+
     namespace Kernels
     {
+        typedef void* allocFunc_t(void *context, size_t size);
+        typedef void freeFunc_t(void *context, void* ptr, size_t size);
+
         // NOTE: All functions accept and output NumParallelBlocks blocks at once
         void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
         void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
@@ -134,7 +248,28 @@ namespace cvtt
         void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options);
         void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options);
         void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options);
-        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
+        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, const BC7EncodingPlan &encodingPlan);
+        void EncodeETC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, ETC1CompressionData *compressionData);
+        void EncodeETC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, ETC2CompressionData *compressionData);
+        void EncodeETC2RGBA(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData);
+        void EncodeETC2PunchthroughAlpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData);
+
+        void EncodeETC2Alpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options);
+        void EncodeETC2Alpha11(uint8_t *pBC, const PixelBlockScalarS16 *pBlocks, bool isSigned, const cvtt::Options &options);
+
+        // Generates a BC7 encoding plan from a quality parameter that ranges from 1 (fastest) to 100 (best)
+        void ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality);
+
+        // Generates a BC7 encoding plan from fine-tuning parameters.
+        bool ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params);
+
+        // ETC compression requires temporary storage that normally consumes a large amount of stack space.
+        // To allocate and release it, use one of these functions.
+        ETC2CompressionData *AllocETC2Data(allocFunc_t allocFunc, void *context, const cvtt::Options &options);
+        void ReleaseETC2Data(ETC2CompressionData *compressionData, freeFunc_t freeFunc);
+
+        ETC1CompressionData *AllocETC1Data(allocFunc_t allocFunc, void *context);
+        void ReleaseETC1Data(ETC1CompressionData *compressionData, freeFunc_t freeFunc);
 
         void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC);
         void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC);
diff --git a/thirdparty/cvtt/ConvectionKernels_API.cpp b/thirdparty/cvtt/ConvectionKernels_API.cpp
new file mode 100644
index 0000000000..707e71d474
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_API.cpp
@@ -0,0 +1,346 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include <stdint.h>
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_Util.h"
+#include "ConvectionKernels_BC67.h"
+#include "ConvectionKernels_ETC.h"
+#include "ConvectionKernels_S3TC.h"
+
+#include <assert.h>
+
+namespace cvtt
+{
+    namespace Kernels
+    {
+        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, const BC7EncodingPlan &encodingPlan)
+        {   // Encodes cvtt::NumParallelBlocks blocks from pBlocks into pBC, ParallelMath::ParallelSize blocks per Pack call.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);  // Weights come from options and are forwarded to Pack below
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, encodingPlan, options.refineRoundsBC7);
+                pBC += ParallelMath::ParallelSize * 16;  // Output advances 16 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
+        {   // Encodes cvtt::NumParallelBlocks F16 blocks; differs from EncodeBC6HS only in passing false to Pack.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);  // Weights come from options and are forwarded to Pack below
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H);  // false: presumably "not signed" - TODO confirm
+                pBC += ParallelMath::ParallelSize * 16;  // Output advances 16 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
+        {   // Encodes cvtt::NumParallelBlocks F16 blocks; differs from EncodeBC6HU only in passing true to Pack.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);  // Weights come from options and are forwarded to Pack below
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H);  // true: presumably "signed" - TODO confirm
+                pBC += ParallelMath::ParallelSize * 16;  // Output advances 16 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
+        {   // Encodes cvtt::NumParallelBlocks blocks with S3TCComputer::PackRGB, writing 8 bytes of output per block.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);  // Weights come from options and are forwarded to PackRGB below
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {   // Unlike BC2/BC3, PackRGB is called here with true and options.threshold (BC2/BC3 pass false and 1.0f)
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {   // Encodes NumParallelBlocks blocks, 16 bytes each: explicit alpha at offset 0, RGB data at offset 8.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);  // Weights come from options and are forwarded to PackRGB below
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {   // The 16 passed to both calls is presumably the output stride between blocks - TODO confirm
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
+                Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16);  // 3: presumably the alpha channel index - TODO confirm
+                pBC += ParallelMath::ParallelSize * 16;  // Output advances 16 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {   // Encodes NumParallelBlocks blocks, 16 bytes each: interpolated alpha at offset 0, RGB data at offset 8.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];
+            Util::FillWeights(options, channelWeights);  // Weights come from options and are forwarded to PackRGB below
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {   // The 16 passed to both calls is presumably the output stride between blocks - TODO confirm
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);  // 3: presumably the alpha channel index - TODO confirm
+                pBC += ParallelMath::ParallelSize * 16;  // Output advances 16 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {   // Encodes NumParallelBlocks blocks with one PackInterpolatedAlpha call per batch, 8 bytes of output per block.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];  // NOTE(review): filled below but never read in this function
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC);  // 0: presumably the source channel index - TODO confirm
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
+        {   // Like EncodeBC4U, but takes S8 input that is first converted with Util::BiasSignedInput, and passes true to the packer.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];  // NOTE(review): filled below but never read in this function
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];  // Per-batch temporary that receives the converted input
+                Util::BiasSignedInput(inputBlocks, pBlocks + blockBase);
+
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC);  // true: presumably "signed" - TODO confirm
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {   // Encodes NumParallelBlocks blocks, 16 bytes each: two PackInterpolatedAlpha outputs at offsets 0 and 8.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];  // NOTE(review): filled below but never read in this function
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {   // The third argument (0, then 1) is presumably the source channel index - TODO confirm
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC);
+                pBC += ParallelMath::ParallelSize * 16;  // Output advances 16 bytes per block in the batch
+            }
+        }
+
+        void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
+        {   // Like EncodeBC5U, but takes S8 input that is first converted with Util::BiasSignedInput, and passes true to the packer.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            float channelWeights[4];  // NOTE(review): filled below but never read in this function
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];  // Per-batch temporary that receives the converted input
+                Util::BiasSignedInput(inputBlocks, pBlocks + blockBase);
+
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC);  // Output at offset 0
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC);  // Output at offset 8
+                pBC += ParallelMath::ParallelSize * 16;  // Output advances 16 bytes per block in the batch
+            }
+        }
+
+        void EncodeETC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC1CompressionData *compressionData)
+        {   // Encodes cvtt::NumParallelBlocks blocks via ETCComputer::CompressETC1Block, 8 bytes of output per block.
+            assert(pBlocks);  // Null checks are debug-only (assert); compressionData is not checked here
+            assert(pBC);
+
+            float channelWeights[4];  // NOTE(review): filled below but never read in this function
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressETC1Block(pBC, pBlocks + blockBase, compressionData, options);
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeETC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        {   // Encodes cvtt::NumParallelBlocks blocks via ETCComputer::CompressETC2Block, 8 bytes of output per block.
+            assert(pBlocks);  // Null checks are debug-only (assert); compressionData is not checked here
+            assert(pBC);
+
+            float channelWeights[4];  // NOTE(review): filled below but never read in this function
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressETC2Block(pBC, pBlocks + blockBase, compressionData, options, false);  // false here; EncodeETC2PunchthroughAlpha passes true
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeETC2PunchthroughAlpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        {   // Identical to EncodeETC2 except that the last argument to CompressETC2Block is true.
+            assert(pBlocks);  // Null checks are debug-only (assert); compressionData is not checked here
+            assert(pBC);
+
+            float channelWeights[4];  // NOTE(review): filled below but never read in this function
+            Util::FillWeights(options, channelWeights);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressETC2Block(pBC, pBlocks + blockBase, compressionData, options, true);  // true: presumably selects punchthrough alpha - TODO confirm
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeETC2Alpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
+        {   // Encodes cvtt::NumParallelBlocks blocks via ETCComputer::CompressETC2AlphaBlock, 8 bytes of output per block.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressETC2AlphaBlock(pBC, pBlocks + blockBase, options);
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeETC2Alpha11(uint8_t *pBC, const PixelBlockScalarS16 *pBlocks, bool isSigned, const cvtt::Options &options)
+        {   // Encodes cvtt::NumParallelBlocks scalar S16 blocks via ETCComputer::CompressEACBlock, 8 bytes of output per block.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressEACBlock(pBC, pBlocks + blockBase, isSigned, options);  // isSigned is forwarded unchanged
+                pBC += ParallelMath::ParallelSize * 8;  // Output advances 8 bytes per block in the batch
+            }
+        }
+
+        void EncodeETC2RGBA(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        {   // Encodes cvtt::NumParallelBlocks blocks, 16 bytes each: 8 bytes from EncodeETC2Alpha, then 8 bytes from EncodeETC2.
+            assert(pBlocks);  // Same debug-only null checks as the other Encode* entry points; pBC is written directly below
+            assert(pBC);
+            uint8_t alphaBlockData[cvtt::NumParallelBlocks * 8];  // Temporary 8-byte-per-block outputs of the two encoders
+            uint8_t colorBlockData[cvtt::NumParallelBlocks * 8];
+            EncodeETC2(colorBlockData, pBlocks, options, compressionData);
+            EncodeETC2Alpha(alphaBlockData, pBlocks, options);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
+            {   // Interleave the two 8-byte streams into the 16-byte-per-block output
+                for (size_t blockData = 0; blockData < 8; blockData++)
+                    pBC[blockBase * 16 + blockData] = alphaBlockData[blockBase * 8 + blockData];  // Bytes 0-7: alpha
+                for (size_t blockData = 0; blockData < 8; blockData++)
+                    pBC[blockBase * 16 + 8 + blockData] = colorBlockData[blockBase * 8 + blockData];  // Bytes 8-15: color
+            }
+        }
+
+        void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
+        {   // Decodes cvtt::NumParallelBlocks consecutive 16-byte blocks from pBC into pBlocks, one UnpackOne call per block.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)  // Steps by 1, not ParallelSize, unlike the encoders
+            {
+                Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC);
+                pBC += 16;  // Input advances 16 bytes per block
+            }
+        }
+
+        void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
+        {   // Decodes cvtt::NumParallelBlocks consecutive 16-byte blocks; differs from DecodeBC6HS only in passing false.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)  // Steps by 1, not ParallelSize, unlike the encoders
+            {
+                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false);  // false: presumably "not signed" - TODO confirm
+                pBC += 16;  // Input advances 16 bytes per block
+            }
+        }
+
+        void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
+        {   // Decodes cvtt::NumParallelBlocks consecutive 16-byte blocks; differs from DecodeBC6HU only in passing true.
+            assert(pBlocks);  // Null checks are debug-only (assert)
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)  // Steps by 1, not ParallelSize, unlike the encoders
+            {
+                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true);  // true: presumably "signed" - TODO confirm
+                pBC += 16;  // Input advances 16 bytes per block
+            }
+        }
+
+        ETC1CompressionData *AllocETC1Data(allocFunc_t allocFunc, void *context)
+        {   // Public wrapper: forwards both arguments to Internal::ETCComputer::AllocETC1Data and returns its result.
+            return cvtt::Internal::ETCComputer::AllocETC1Data(allocFunc, context);
+        }
+
+        void ReleaseETC1Data(ETC1CompressionData *compressionData, freeFunc_t freeFunc)
+        {   // Public wrapper over Internal::ETCComputer::ReleaseETC1Data. No context is passed; presumably the one given at allocation is reused - TODO confirm
+            cvtt::Internal::ETCComputer::ReleaseETC1Data(compressionData, freeFunc);
+        }
+
+        ETC2CompressionData *AllocETC2Data(allocFunc_t allocFunc, void *context, const cvtt::Options &options)
+        {   // Public wrapper: forwards all arguments to Internal::ETCComputer::AllocETC2Data and returns its result.
+            return cvtt::Internal::ETCComputer::AllocETC2Data(allocFunc, context, options);
+        }
+
+        void ReleaseETC2Data(ETC2CompressionData *compressionData, freeFunc_t freeFunc)
+        {   // Public wrapper over Internal::ETCComputer::ReleaseETC2Data. No context is passed; presumably the one given at allocation is reused - TODO confirm
+            cvtt::Internal::ETCComputer::ReleaseETC2Data(compressionData, freeFunc);
+        }
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_AggregatedError.h b/thirdparty/cvtt/ConvectionKernels_AggregatedError.h
new file mode 100644
index 0000000000..9f9356a345
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_AggregatedError.h
@@ -0,0 +1,55 @@
+#pragma once
+#ifndef __CVTT_AGGREGATEDERROR_H__
+#define __CVTT_AGGREGATEDERROR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        template<int TVectorSize>
+        class AggregatedError  // Keeps one unweighted running error total per channel (TVectorSize channels); weights are applied only in Finalize
+        {
+        public:
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt31 MUInt31;
+            typedef ParallelMath::Float MFloat;
+
+            AggregatedError()  // Starts every channel total at zero
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
+            }
+
+            void Add(const MUInt16 &channelErrorUnweighted, int ch)  // Adds one error term to channel ch's total; ch is not range-checked
+            {
+                m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted);  // Converted to UInt31 before adding
+            }
+
+            MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const  // Collapses the per-channel totals into one float error value
+            {
+                if (flags & cvtt::Flags::Uniform)
+                {   // Uniform: totals are summed as UInt31 and converted once; channelWeightsSq is not read
+                    MUInt31 total = m_errorUnweighted[0];
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        total = total + m_errorUnweighted[ch];
+                    return ParallelMath::ToFloat(total);
+                }
+                else
+                {   // Weighted: each total is converted to float and multiplied by channelWeightsSq[ch] before summing
+                    MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0];
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
+                    return total;
+                }
+            }
+
+        private:
+            MUInt31 m_errorUnweighted[TVectorSize];  // Index = channel
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_BC67.cpp b/thirdparty/cvtt/ConvectionKernels_BC67.cpp
new file mode 100644
index 0000000000..791859b232
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC67.cpp
@@ -0,0 +1,3485 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC67.h"
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_BCCommon.h"
+#include "ConvectionKernels_BC7_Prio.h"
+#include "ConvectionKernels_BC7_SingleColor.h"
+#include "ConvectionKernels_BC6H_IO.h"
+#include "ConvectionKernels_EndpointRefiner.h"
+#include "ConvectionKernels_EndpointSelector.h"
+#include "ConvectionKernels_IndexSelectorHDR.h"
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        namespace BC67
+        {
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt15 MUInt15;
+
+            struct WorkInfo // Bundles a mode, its error, endpoints and index arrays; NOTE(review): consumers are outside this chunk
+            {
+                MUInt15 m_mode;
+                MFloat m_error;
+                MUInt15 m_ep[3][2][4]; // presumably [subset][endpoint][channel] - TODO confirm against the code that fills it
+                MUInt15 m_indexes[16];
+                MUInt15 m_indexes2[16]; // second 16-entry index array; presumably used by modes with a separate alpha index set - verify
+
+                union // m_partition and m_isr share storage, so only one of them is meaningful at a time
+                {
+                    MUInt15 m_partition;
+                    struct IndexSelectorAndRotation
+                    {
+                        MUInt15 m_indexSelector;
+                        MUInt15 m_rotation;
+                    } m_isr;
+                } m_u;
+            };
+        }
+
+        namespace BC7Data
+        {
+            enum AlphaMode // Alpha handling tag stored in BC7ModeInfo::m_alphaMode
+            {
+                AlphaMode_Combined, // used by g_modes rows 6 and 7 (m_alphaBits > 0, m_alphaIndexBits == 0)
+                AlphaMode_Separate, // used by g_modes rows 4 and 5 (m_alphaIndexBits > 0)
+                AlphaMode_None, // used by g_modes rows 0-3 (m_alphaBits == 0)
+            };
+
+            enum PBitMode // P-bit layout tag stored in BC7ModeInfo::m_pBitMode
+            {
+                PBitMode_PerEndpoint, // used by g_modes rows 0, 3, 6 and 7
+                PBitMode_PerSubset, // used by g_modes row 1 only
+                PBitMode_None // used by g_modes rows 2, 4 and 5
+            };
+
+            struct BC7ModeInfo // Static description of one mode; g_modes rows are brace-initialized in this exact field order
+            {
+                PBitMode m_pBitMode;
+                AlphaMode m_alphaMode;
+                int m_rgbBits; // presumably endpoint bits per color channel - TODO confirm
+                int m_alphaBits; // 0 in every g_modes row whose m_alphaMode is AlphaMode_None
+                int m_partitionBits; // 0 in every g_modes row with m_numSubsets == 1
+                int m_numSubsets;
+                int m_indexBits;
+                int m_alphaIndexBits; // non-zero only in the AlphaMode_Separate rows of g_modes
+                bool m_hasIndexSelector; // true only for g_modes row 4
+            };
+
+            BC7ModeInfo g_modes[] = // Eight rows; each row's trailing comment is its index, values follow BC7ModeInfo field order
+            {
+                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
+                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
+                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
+                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)
+
+                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
+                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
+                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
+                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
+            };
+
+            const int g_weight2[] = { 0, 21, 43, 64 }; // 4 ascending weights from 0 to 64
+            const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; // 8 ascending weights from 0 to 64
+            const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // 16 ascending weights from 0 to 64
+
+            const int *g_weightTables[] = // Slot n points at g_weightN; presumably indexed by index bit count - TODO confirm at use sites
+            {
+                NULL, // slot 0: no table
+                NULL, // slot 1: no table
+                g_weight2,
+                g_weight3,
+                g_weight4
+            };
+
+            struct BC6HModeInfo // Static description of one HDR mode; g_hdrModes rows are brace-initialized in this field order
+            {
+                uint16_t m_modeID;
+                bool m_partitioned; // selects the row of g_hdrModesExistForPrecision that covers this mode's m_aPrec
+                bool m_transformed;
+                int m_aPrec; // presumably the precision of the base endpoint - TODO confirm
+                int m_bPrec[3]; // presumably per-channel precision of the remaining endpoint values - TODO confirm
+            };
+
+            // [partitioned][precision]
+            bool g_hdrModesExistForPrecision[2][17] = // true entries match the m_aPrec values present in g_hdrModes for that m_partitioned value
+            {
+                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
+                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
+                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
+            };
+
+            BC6HModeInfo g_hdrModes[] = // 14 rows in BC6HModeInfo field order: the 10 partitioned modes first, then the 4 unpartitioned ones
+            {
+                { 0x00, true,  true,  10,{ 5, 5, 5 } },
+                { 0x01, true,  true,  7,{ 6, 6, 6 } },
+                { 0x02, true,  true,  11,{ 5, 4, 4 } },
+                { 0x06, true,  true,  11,{ 4, 5, 4 } },
+                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
+                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
+                { 0x12, true,  true,  8,{ 6, 5, 5 } },
+                { 0x16, true,  true,  8,{ 5, 6, 5 } },
+                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
+                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
+                { 0x03, false, false, 10,{ 10, 10, 10 } },
+                { 0x07, false, true,  11,{ 9, 9, 9 } },
+                { 0x0b, false, true,  12,{ 8, 8, 8 } },
+                { 0x0f, false, true,  16,{ 4, 4, 4 } },
+            };
+
+            const int g_maxHDRPrecision = 16; // equals the largest m_aPrec in g_hdrModes and the last column of g_hdrModesExistForPrecision
+
+            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]); // element count of g_hdrModes
+
+            static uint16_t g_partitionMap[64] = // 64 16-bit masks; presumably bit i flags pixel i as second subset (0xCCCC agrees with g_shapes2[0]) - TODO confirm
+            {
+                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
+                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
+                0xC800, 0xFFEC, 0xFE80, 0xE800,
+                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
+                0xF710, 0x008E, 0x7100, 0x08CE,
+                0x008C, 0x7310, 0x3100, 0x8CCE,
+                0x088C, 0x3110, 0x6666, 0x366C,
+                0x17E8, 0x0FF0, 0x718E, 0x399C,
+                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
+                0x3c3c, 0x55aa, 0x9696, 0xa55a,
+                0x73ce, 0x13c8, 0x324c, 0x3bdc,
+                0x6996, 0xc33c, 0x9966, 0x660,
+                0x272, 0x4e4, 0x4e40, 0x2720,
+                0xc936, 0x936c, 0x39c6, 0x639c,
+                0x9336, 0x9cc6, 0x817e, 0xe718,
+                0xccf0, 0xfcc, 0x7744, 0xee22,
+            };
+
+            static uint32_t g_partitionMap2[64] = // 64 32-bit words; presumably 2 bits per pixel naming one of three subsets - TODO confirm at use sites
+            {
+                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
+                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
+                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
+                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
+                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
+                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
+                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
+                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
+                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
+                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
+                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
+                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
+                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
+                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
+                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
+                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
+            };
+
+            static int g_fixupIndexes2[64] = // 64 values in 0..15; presumably the second subset's fix-up pixel per two-subset partition - TODO confirm
+            {
+                15,15,15,15,
+                15,15,15,15,
+                15,15,15,15,
+                15,15,15,15,
+                15, 2, 8, 2,
+                2, 8, 8,15,
+                2, 8, 2, 2,
+                8, 8, 2, 2,
+
+                15,15, 6, 8,
+                2, 8,15,15,
+                2, 8, 2, 2,
+                2,15,15, 6,
+                6, 2, 6, 8,
+                15,15, 2, 2,
+                15,15,15,15,
+                15, 2, 2,15,
+            };
+
+            static int g_fixupIndexes3[64][2] = // 64 pairs in 0..15; presumably fix-up pixels of subsets 2 and 3 per three-subset partition - TODO confirm
+            {
+                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
+                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
+                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
+                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
+                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
+                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
+                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
+                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
+
+                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
+                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
+                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
+                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
+                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
+                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
+                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
+                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
+            };
+
+            static const unsigned char g_fragments[] = // Concatenated runs of values 0..15; each row's trailing comment is "offset, count", as listed in g_shapeRanges
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
+                0, 1, 2, 3,  // 16, 4
+                0, 1, 4,  // 20, 3
+                0, 1, 2, 4,  // 23, 4
+                2, 3, 7,  // 27, 3
+                1, 2, 3, 7,  // 30, 4
+                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
+                0, 1, 4, 8,  // 42, 4
+                0, 1, 2, 4, 5, 8,  // 46, 6
+                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
+                1, 4, 5, 6, 9,  // 60, 5
+                2, 5, 6, 7, 10,  // 65, 5
+                5, 6, 9, 10,  // 70, 4
+                2, 3, 7, 11,  // 74, 4
+                1, 2, 3, 6, 7, 11,  // 78, 6
+                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
+                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
+                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
+                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
+                0, 4, 8, 12,  // 128, 4
+                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
+                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
+                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
+                3, 6, 7, 8, 9, 12,  // 158, 6
+                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
+                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
+                5, 8, 9, 10, 13,  // 192, 5
+                8, 12, 13,  // 197, 3
+                4, 8, 12, 13,  // 200, 4
+                2, 3, 6, 9, 12, 13,  // 204, 6
+                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
+                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
+                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
+                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
+                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
+                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
+                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
+                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
+                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
+                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
+                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
+                6, 9, 10, 11, 14,  // 317, 5
+                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
+                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
+                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
+                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
+                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
+                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
+                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
+                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
+                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
+                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
+                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
+                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
+                8, 12, 13, 14,  // 418, 4
+                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
+                4, 8, 9, 12, 13, 14,  // 430, 6
+                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
+                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
+                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
+                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
+                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
+                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
+                3, 7, 11, 15,  // 490, 4
+                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
+                0, 4, 5, 10, 11, 15,  // 502, 6
+                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
+                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
+                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
+                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
+                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
+                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
+                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
+                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
+                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
+                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
+                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
+                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
+                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
+                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
+                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
+                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
+                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
+                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
+                0, 1, 5, 10, 14, 15,  // 669, 6
+                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
+                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
+                11, 14, 15,  // 691, 3
+                7, 11, 14, 15,  // 694, 4
+                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
+                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
+                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
+                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
+                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
+                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
+                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
+                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
+                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
+                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
+                11, 13, 14, 15,  // 792, 4
+                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
+                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
+                7, 10, 11, 13, 14, 15,  // 814, 6
+                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
+                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
+                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
+                12, 13, 14, 15,  // 848, 4
+                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
+                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
+                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
+                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
+                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
+                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
+                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
+                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
+                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
+                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
+                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
+                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
+                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
+                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
+                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
+                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
+                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
+                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
+                0, 2,  // 1040, 2
+                1, 3,  // 1042, 2
+                0, 1, 4, 5,  // 1044, 4
+                0, 1, 2, 4, 5,  // 1048, 5
+                2, 3, 6,  // 1053, 3
+                0, 2, 4, 6,  // 1056, 4
+                1, 2, 5, 6,  // 1060, 4
+                0, 1, 2, 3, 5, 6,  // 1064, 6
+                0, 1, 2, 4, 5, 6,  // 1070, 6
+                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
+                0, 3, 4, 7,  // 1083, 4
+                0, 1, 2, 3, 4, 7,  // 1087, 6
+                1, 3, 5, 7,  // 1093, 4
+                2, 3, 6, 7,  // 1097, 4
+                1, 2, 3, 6, 7,  // 1101, 5
+                1, 2, 3, 5, 6, 7,  // 1106, 6
+                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
+                4, 5, 6, 7,  // 1119, 4
+                0, 8,  // 1123, 2
+                0, 1, 4, 5, 8,  // 1125, 5
+                0, 1, 8, 9,  // 1130, 4
+                4, 5, 8, 9,  // 1134, 4
+                0, 1, 4, 5, 8, 9,  // 1138, 6
+                2, 6, 8, 9,  // 1144, 4
+                6, 7, 8, 9,  // 1148, 4
+                0, 2, 4, 6, 8, 10,  // 1152, 6
+                1, 2, 5, 6, 9, 10,  // 1158, 6
+                0, 3, 4, 7, 9, 10,  // 1164, 6
+                0, 1, 2, 8, 9, 10,  // 1170, 6
+                4, 5, 6, 8, 9, 10,  // 1176, 6
+                3, 11,  // 1182, 2
+                2, 3, 6, 7, 11,  // 1184, 5
+                0, 3, 8, 11,  // 1189, 4
+                0, 3, 4, 7, 8, 11,  // 1193, 6
+                1, 3, 5, 7, 9, 11,  // 1199, 6
+                2, 3, 10, 11,  // 1205, 4
+                1, 5, 10, 11,  // 1209, 4
+                4, 5, 10, 11,  // 1213, 4
+                6, 7, 10, 11,  // 1217, 4
+                2, 3, 6, 7, 10, 11,  // 1221, 6
+                1, 2, 3, 9, 10, 11,  // 1227, 6
+                5, 6, 7, 9, 10, 11,  // 1233, 6
+                8, 9, 10, 11,  // 1239, 4
+                4, 12,  // 1243, 2
+                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
+                8, 9, 12,  // 1253, 3
+                0, 4, 5, 8, 9, 12,  // 1256, 6
+                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
+                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
+                1, 5, 9, 13,  // 1276, 4
+                6, 7, 9, 13,  // 1280, 4
+                1, 4, 7, 10, 13,  // 1284, 5
+                1, 6, 8, 11, 13,  // 1289, 5
+                0, 1, 12, 13,  // 1294, 4
+                4, 5, 12, 13,  // 1298, 4
+                0, 1, 6, 7, 12, 13,  // 1302, 6
+                0, 1, 4, 8, 12, 13,  // 1308, 6
+                8, 9, 12, 13,  // 1314, 4
+                4, 8, 9, 12, 13,  // 1318, 5
+                4, 5, 8, 9, 12, 13,  // 1323, 6
+                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
+                0, 1, 6, 10, 12, 13,  // 1336, 6
+                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
+                0, 1, 10, 11, 12, 13,  // 1349, 6
+                2, 4, 7, 9, 14,  // 1355, 5
+                4, 5, 10, 14,  // 1360, 4
+                2, 6, 10, 14,  // 1364, 4
+                2, 5, 8, 11, 14,  // 1368, 5
+                0, 2, 12, 14,  // 1373, 4
+                8, 10, 12, 14,  // 1377, 4
+                4, 6, 8, 10, 12, 14,  // 1381, 6
+                13, 14,  // 1387, 2
+                9, 10, 13, 14,  // 1389, 4
+                5, 6, 9, 10, 13, 14,  // 1393, 6
+                0, 1, 2, 12, 13, 14,  // 1399, 6
+                4, 5, 6, 12, 13, 14,  // 1405, 6
+                8, 9, 12, 13, 14,  // 1411, 5
+                8, 9, 10, 12, 13, 14,  // 1416, 6
+                7, 15,  // 1422, 2
+                0, 5, 10, 15,  // 1424, 4
+                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
+                10, 11, 15,  // 1436, 3
+                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
+                3, 6, 7, 10, 11, 15,  // 1446, 6
+                12, 15,  // 1452, 2
+                0, 3, 12, 15,  // 1454, 4
+                4, 7, 12, 15,  // 1458, 4
+                0, 3, 6, 9, 12, 15,  // 1462, 6
+                0, 3, 5, 10, 12, 15,  // 1468, 6
+                8, 11, 12, 15,  // 1474, 4
+                5, 6, 8, 11, 12, 15,  // 1478, 6
+                4, 7, 8, 11, 12, 15,  // 1484, 6
+                1, 3, 13, 15,  // 1490, 4
+                9, 11, 13, 15,  // 1494, 4
+                5, 7, 9, 11, 13, 15,  // 1498, 6
+                2, 3, 14, 15,  // 1504, 4
+                2, 3, 4, 5, 14, 15,  // 1508, 6
+                6, 7, 14, 15,  // 1514, 4
+                2, 3, 5, 9, 14, 15,  // 1518, 6
+                2, 3, 8, 9, 14, 15,  // 1524, 6
+                10, 14, 15,  // 1530, 3
+                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
+                2, 3, 7, 11, 14, 15,  // 1540, 6
+                10, 11, 14, 15,  // 1546, 4
+                7, 10, 11, 14, 15,  // 1550, 5
+                6, 7, 10, 11, 14, 15,  // 1555, 6
+                1, 2, 3, 13, 14, 15,  // 1561, 6
+                5, 6, 7, 13, 14, 15,  // 1567, 6
+                10, 11, 13, 14, 15,  // 1573, 5
+                9, 10, 11, 13, 14, 15,  // 1578, 6
+                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
+                9, 10, 12, 13, 14, 15,  // 1592, 6
+                8, 11, 12, 13, 14, 15,  // 1598, 6
+                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
+            };
+            static const int g_shapeRanges[][2] = // 243 { offset, count } pairs, one per row of g_fragments (values repeat that table's trailing comments)
+            {
+                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
+                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
+                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
+                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
+                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
+                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
+                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
+                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
+                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
+                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
+                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
+                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
+                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
+                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
+                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
+                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
+                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
+                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
+                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
+                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
+                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
+                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
+                { 1604, 8 },
+            };
+            static const int g_shapes1[][2] = // Single pair, identical to g_shapeRanges[0]; NOTE(review): unlike g_shapes2/3 this looks like a range, not shape indexes - confirm
+            {
+                { 0, 16 }
+            };
+            static const int g_shapes2[64][2] = // Per row, two g_shapeRanges indexes (all in 1..128); row 0 {33, 96} selects pixels {0,1,4,5,8,9,12,13} and {2,3,6,7,10,11,14,15}
+            {
+                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
+                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
+                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
+                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
+                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
+                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
+                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
+                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
+            };
+            static const int g_shapes3[64][3] = // Per row, three values in 1..242; presumably g_shapeRanges indexes for the three subsets of a partition - TODO confirm
+            {
+                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
+                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
+                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
+                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
+                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
+                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
+                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
+                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
+            };
+
+            static const int g_shapeList1[] = // Contains only value 0; its length is exposed as g_numShapes1
+            {
+                0,
+            };
+
+            static const int g_shapeList2[] = // Consecutive values 1..128, the same range the entries of g_shapes2 fall in; length is g_numShapes2
+            {
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
+                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
+                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
+                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+                122, 123, 124, 125, 126, 127, 128,
+            };
+
+            static const int g_shapeList12[] = // Consecutive values 0..128, i.e. g_shapeList1 followed by g_shapeList2; length is g_numShapes12
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+                121, 122, 123, 124, 125, 126, 127, 128,
+            };
+
+            static const int g_shapeList3[] = // Ascending values: a sparse set below 129, then 129..242; presumably the shapes referenced by g_shapes3 - TODO confirm
+            {
+                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
+                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
+                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
+                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
+                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
+                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
+                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
+                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
+                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
+                235, 236, 237, 238, 239, 240, 241, 242,
+            };
+
+            static const int g_shapeList3Short[] = // 36 ascending values, every one of which also appears in g_shapeList3; length is g_numShapes3Short
+            {
+                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
+                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
+                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
+                233, 237, 240,
+            };
+
+            static const int g_shapeListAll[] = // Consecutive values 0..242, one per entry of g_shapeRanges; length is g_numShapesAll
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
+                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
+                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
+                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
+                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
+                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
+                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+                242,
+            };
+
+            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]); // element counts of the tables, derived with the sizeof idiom
+            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
+            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
+            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
+            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
+            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]); // also sizes the per-shape arrays in SinglePlaneTemporaries
+            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]); // also sizes SinglePlaneTemporaries::fragmentBestIndexes
+        }
+
+        struct PackingVector
+        {
+            uint32_t m_vector[4];    // 128 bits of storage, filled from bit 0 of word 0 upward
+            int m_offset;            // Number of bits written so far
+
+            // Clears all storage and rewinds the write position.
+            void Init()
+            {
+                m_vector[0] = m_vector[1] = m_vector[2] = m_vector[3] = 0;
+                m_offset = 0;
+            }
+
+            // Seeds the vector with already-packed data: copies every whole word of v that covers `bits` bits.
+            void InitPacked(const uint32_t *v, int bits)
+            {
+                const int wordCount = (bits + 31) / 32;
+                for (int w = 0; w < wordCount; w++)
+                    m_vector[w] = v[w];
+
+                m_offset = bits;
+            }
+
+            // ORs value in at the current bit position and advances by `bits`; value is not masked here.
+            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
+            {
+                const int word = m_offset >> 5;
+                const int shift = m_offset & 0x1f;
+                const uint32_t wide = static_cast<uint32_t>(value);
+
+                m_vector[word] |= wide << shift;
+                // Bits that do not fit in the current word spill into the next one.
+                const int spill = shift + bits - 32;
+                if (spill > 0)
+                    m_vector[word + 1] |= wide >> (bits - spill);
+
+                m_offset += bits;
+            }
+
+            // Writes the 16 packed bytes to output, least significant byte of each word first.
+            inline void Flush(uint8_t* output)
+            {
+                assert(m_offset == 128);
+
+                for (int i = 0; i < 16; i++)
+                    output[i] = static_cast<uint8_t>((m_vector[i / 4] >> ((i % 4) * 8)) & 0xff);
+            }
+        };
+
+
+        struct UnpackingVector
+        {
+            uint32_t m_vector[4];    // Unread bits; the next bit to be read is bit 0 of word 0
+            // Loads 16 bytes, assembling each group of four into one word, lowest byte first.
+            void Init(const uint8_t *bytes)
+            {
+                for (int i = 0; i < 4; i++)
+                    m_vector[i] = 0;
+                // Widen before shifting so a byte >= 0x80 is never shifted into the sign bit of a promoted int.
+                for (int b = 0; b < 16; b++)
+                    m_vector[b / 4] |= (static_cast<uint32_t>(bytes[b]) << ((b % 4) * 8));
+            }
+            // Copies the whole words covering the first `bits` bits into v, then discards `bits` bits from the stream.
+            inline void UnpackStart(uint32_t *v, int bits)
+            {
+                for (int b = 0; b < bits; b += 32)
+                    v[b / 32] = m_vector[b / 32];
+
+                int entriesShifted = bits / 32;
+                int carry = bits % 32;
+
+                for (int i = entriesShifted; i < 4; i++)
+                    m_vector[i - entriesShifted] = m_vector[i];
+
+                // Whole words were moved above; shift the remaining sub-word amount across all four words.
+                if (carry)
+                {
+                    uint32_t bitMask = (static_cast<uint32_t>(1) << carry) - 1;
+                    for (int i = 0; i < 4; i++)
+                    {
+                        m_vector[i] >>= carry;
+                        if (i != 3)
+                            m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - carry);
+                    }
+                }
+            }
+            // Returns the next `bits` bits and removes them; assumes 0 < bits < 32 because of the (32 - bits) shift.
+            inline ParallelMath::ScalarUInt16 Unpack(int bits)
+            {
+                uint32_t bitMask = (static_cast<uint32_t>(1) << bits) - 1;
+
+                ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
+
+                for (int i = 0; i < 4; i++)
+                {
+                    m_vector[i] >>= bits;
+                    if (i != 3)
+                        m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
+                }
+
+                return result;
+            }
+        };
+
+        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
+        {
+            // Unsigned values: scale by 64 and add a constant bias of 30 before dividing by 31.
+            if (!isSigned)
+                return (v * 64.0f + 30.0f) / 31.0f;
+
+            // Signed values: scale by 32; the bias is -30 for negative lanes and +30 otherwise.
+            ParallelMath::Float bias = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
+            return (v * 32.0f + bias) / 31.0f;
+        }
+
+        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
+        {
+#ifdef CVTT_ENABLE_ASSERTS
+            for (int i = 0; i < ParallelMath::ParallelSize; i++)
+                assert(ParallelMath::Extract(v, i) != -32768);
+#endif
+            // Split each lane into a sign flag and a magnitude (0 - v for negative lanes).
+            ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
+            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
+            // Magnitude becomes (abs * 31) >> 5; negative lanes then get the top bit (-32768) OR'ed back in.
+            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
+            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
+            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
+            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
+
+            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
+        }
+
+        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
+        { // Per lane: widening multiply by 31, shift right by 6, then narrow to UInt15.
+            return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
+        }
+
+        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
+        {
+            // Visit the 2 endpoints x 3 channels through one flat counter, endpoint-major.
+            for (int i = 0; i < 6; i++)
+            {
+                const int e = i / 3;
+                const int c = i % 3;
+                if (isSigned)
+                    outEP[e][c] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[e][c])));
+                else
+                    outEP[e][c] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[e][c])));
+            }
+        }
+
+        struct SinglePlaneTemporaries
+        { // Scratch storage used by TrySinglePlane; the per-shape arrays are indexed by shape number.
+            UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll]; // initial RGB endpoints per shape
+            UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12]; // initial RGBA endpoints per shape
+
+            ParallelMath::UInt15 fragmentBestIndexes[BC7Data::g_numFragments]; // best index per fragment slot (shapeStart + pixel offset)
+            ParallelMath::UInt15 shapeBestEP[BC7Data::g_numShapesAll][2][4]; // best endpoints as [shape][endpoint][channel]
+            ParallelMath::Float shapeBestError[BC7Data::g_numShapesAll]; // lowest error found so far for each shape
+        };
+    }
+}
+
+void cvtt::Internal::BC7Computer::TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
+{
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    float factors[2];
+    Util::ComputeTweakFactors(tweak, range, factors);
+
+    // Both results lie on the line start + span * factor, clamped to [0, 255] and rounded.
+    MFloat start = ParallelMath::ToFloat(original[0]);
+    MFloat span = ParallelMath::ToFloat(original[1]) - start;
+    for (int i = 0; i < 2; i++)
+        result[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(start + span * factors[i], 0.0f, 255.0f), &rtn);
+}
+
+void cvtt::Internal::BC7Computer::Quantize(MUInt15* color, int bits, int channels)
+{ // In place, for the first `channels` entries: c = ((c << bits) - c + 127 + (1 << (7 - bits))) >> 8.
+    for (int ch = 0; ch < channels; ch++)
+        color[ch] = ParallelMath::RightShift(((color[ch] << bits) - color[ch]) + ParallelMath::MakeUInt15(127 + (1 << (7 - bits))), 8);
+}
+
+void cvtt::Internal::BC7Computer::QuantizeP(MUInt15* color, int bits, uint16_t p, int channels)
+{
+    // Rounding term added before the shift by 9; its value depends on the requested parity bit.
+    int16_t roundingBias = p ? ((1 << (8 - bits)) - 1) : 255;
+
+    for (int c = 0; c < channels; c++)
+    {
+        // The arithmetic is carried out on a MUInt16 view of the channel.
+        MUInt16 wide = ParallelMath::LosslessCast<MUInt16>::Cast(color[c]);
+        MUInt16 quantized = ParallelMath::RightShift((wide << (bits + 1)) - wide + roundingBias, 9);
+
+        // Append the parity bit as the least significant bit of the result.
+        quantized = (quantized << 1) | ParallelMath::MakeUInt16(p);
+        color[c] = ParallelMath::LosslessCast<MUInt15>::Cast(quantized);
+    }
+}
+
+void cvtt::Internal::BC7Computer::Unquantize(MUInt15* color, int bits, int channels)
+{
+    // Shift each value up by (8 - bits), then OR in a copy of itself shifted down by `bits`.
+    for (int i = 0; i < channels; i++)
+    {
+        MUInt15 aligned = color[i] << (8 - bits);
+        color[i] = aligned | ParallelMath::RightShift(aligned, bits);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2])
+{ // Rounds the RGB of both endpoints in place through QuantizeP/Unquantize and forces alpha to 255.
+    for (int j = 0; j < 2; j++)
+    {
+        QuantizeP(ep[j], 4, p[j], 3); // 4 bits per channel plus this endpoint's own parity bit
+        Unquantize(ep[j], 5, 3); // expand the 5-bit result (4 bits + parity)
+        ep[j][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints1(MUInt15 ep[2][4], uint16_t p)
+{ // Rounds the RGB of both endpoints in place through QuantizeP/Unquantize and forces alpha to 255.
+    for (int j = 0; j < 2; j++)
+    {
+        QuantizeP(ep[j], 6, p, 3); // 6 bits per channel; both endpoints share the single parity bit p
+        Unquantize(ep[j], 7, 3); // expand the 7-bit result (6 bits + parity)
+        ep[j][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints2(MUInt15 ep[2][4])
+{ // Rounds the RGB of both endpoints in place through Quantize/Unquantize and forces alpha to 255.
+    for (int j = 0; j < 2; j++)
+    {
+        Quantize(ep[j], 5, 3); // 5 bits per channel, no parity bit
+        Unquantize(ep[j], 5, 3);
+        ep[j][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2])
+{ // Rounds the RGB of both endpoints in place through QuantizeP and forces alpha to 255.
+    for (int j = 0; j < 2; j++)
+    {
+        QuantizeP(ep[j], 7, p[j], 3); // 7 bits plus parity per channel; no Unquantize step follows
+        ep[j][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2])
+{ // Rounds separately stored RGB and alpha endpoints in place through Quantize/Unquantize.
+    for (int j = 0; j < 2; j++)
+    {
+        Quantize(epRGB[j], 5, 3); // color: 5 bits per channel
+        Unquantize(epRGB[j], 5, 3);
+
+        Quantize(epA + j, 6, 1); // alpha: 6 bits, handled as a one-channel array
+        Unquantize(epA + j, 6, 1);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2])
+{ // Rounds the RGB endpoints in place through Quantize/Unquantize; epA is left untouched.
+    for (int j = 0; j < 2; j++)
+    {
+        Quantize(epRGB[j], 7, 3); // color: 7 bits per channel
+        Unquantize(epRGB[j], 7, 3);
+    }
+
+    // Alpha is full precision
+    (void)epA; // marks the parameter as deliberately unused
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2])
+{ // Rounds all four channels of both endpoints in place: 7 bits plus a per-endpoint parity bit.
+    for (int j = 0; j < 2; j++)
+        QuantizeP(ep[j], 7, p[j], 4);
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2])
+{ // Rounds all four channels of both endpoints in place through QuantizeP/Unquantize.
+    for (int j = 0; j < 2; j++)
+    {
+        QuantizeP(ep[j], 5, p[j], 4); // 5 bits per channel plus this endpoint's own parity bit
+        Unquantize(ep[j], 6, 4); // expand the 6-bit result (5 bits + parity)
+    }
+}
+
+void cvtt::Internal::BC7Computer::TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
+{ // Tries to encode the shape as one solid color taken from the supplied tables; updates the shape's best result where that wins.
+    MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
+    // Round the per-lane average to integers so it can index the table entries.
+    MUInt15 intAverage[4];
+    for (int ch = 0; ch < 4; ch++)
+        intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
+
+    MUInt15 eps[2][4];
+    MUInt15 reconstructed[4];
+    MUInt15 index = ParallelMath::MakeUInt15(0);
+    // Defaults (RGB 0, alpha 255) stay in channels >= numRealChannels and in lanes where no table is accepted.
+    for (int epi = 0; epi < 2; epi++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+            eps[epi][ch] = ParallelMath::MakeUInt15(0);
+        eps[epi][3] = ParallelMath::MakeUInt15(255);
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        reconstructed[ch] = ParallelMath::MakeUInt15(0);
+    reconstructed[3] = ParallelMath::MakeUInt15(255);
+
+    // Depending on the target index and parity bits, there are multiple valid solid colors.
+    // We want to find the one closest to the actual average.
+    // Lanes flagged in punchThroughInvalid[table.m_pBits] never adopt that table.
+    for (int t = 0; t < numTables; t++)
+    {
+        const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
+
+        ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
+
+        MUInt15 candidateReconstructed[4];
+        MUInt15 candidateEPs[2][4];
+
+        for (int i = 0; i < ParallelMath::ParallelSize; i++)
+        {
+            for (int ch = 0; ch < numRealChannels; ch++)
+            {
+                ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
+                assert(avgValue <= 255);
+
+                const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
+
+                ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
+                ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
+                ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
+            }
+        }
+        // Weighted squared distance between this table's reconstruction and the true average.
+        MFloat avgError = ParallelMath::MakeFloatZero();
+        for (int ch = 0; ch < numRealChannels; ch++)
+        {
+            MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
+            avgError = avgError + delta * delta * channelWeightsSq[ch];
+        }
+
+        ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
+        better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
+
+        if (ParallelMath::AnySet(better))
+        {
+            ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
+
+            MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
+
+            ParallelMath::ConditionalSet(index, better, candidateIndex);
+
+            for (int ch = 0; ch < numRealChannels; ch++)
+                ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
+
+            for (int epi = 0; epi < 2; epi++)
+                for (int ch = 0; ch < numRealChannels; ch++)
+                    ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
+        }
+    }
+    // Score the chosen solid color against every pixel of the shape.
+    AggregatedError<4> aggError;
+    for (int pxi = 0; pxi < shapeLength; pxi++)
+    {
+        int px = fragmentStart[pxi];
+
+        BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
+    }
+
+    MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
+    // Commit only in lanes where the solid color beats the best result recorded so far.
+    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
+    if (ParallelMath::AnySet(better))
+    {
+        shapeBestError = ParallelMath::Min(shapeBestError, error);
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < numRealChannels; ch++)
+                ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
+        }
+
+        for (int pxi = 0; pxi < shapeLength; pxi++)
+            ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
+    }
+}
+
+void cvtt::Internal::BC7Computer::TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    float channelWeightsSq[4];
+
+    for (int ch = 0; ch < 4; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    SinglePlaneTemporaries temps;
+
+    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
+    ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
+    for (int px = 0; px < 16; px++)
+    {
+        MUInt15 a = pixels[px][3];
+        maxAlpha = ParallelMath::Max(maxAlpha, a);
+        minAlpha = ParallelMath::Min(minAlpha, a);
+
+        isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
+    }
+
+    ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
+    ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
+
+    bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
+
+    // Try RGB modes if any block has a min alpha 251 or higher
+    bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
+
+    // Try mode 7 if any block has alpha.
+    // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
+    // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
+    // situations, and only by at most 1 unit of error per pixel.
+    bool allowMode7 = anyBlockHasAlpha || (encodingPlan.mode7RGBPartitionEnabled != 0);
+
+    MFloat preWeightedPixels[16][4];
+
+    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
+
+    // Get initial RGB endpoints
+    if (allowRGBModes)
+    {
+        const uint8_t *shapeList = encodingPlan.rgbShapeList;
+        int numShapesToEvaluate = encodingPlan.rgbNumShapesToEvaluate;
+
+        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            int shapeStart = BC7Data::g_shapeRanges[shape][0];
+            int shapeSize = BC7Data::g_shapeRanges[shape][1];
+
+            EndpointSelector<3, 8> epSelector;
+
+            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+            {
+                for (int spx = 0; spx < shapeSize; spx++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + spx];
+                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
+                }
+                epSelector.FinishPass(epPass);
+            }
+            temps.unfinishedRGB[shape] = epSelector.GetEndpoints(channelWeights);
+        }
+    }
+
+    // Get initial RGBA endpoints
+    {
+        const uint8_t *shapeList = encodingPlan.rgbaShapeList;
+        int numShapesToEvaluate = encodingPlan.rgbaNumShapesToEvaluate;
+
+        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            if (anyBlockHasAlpha || !allowRGBModes)
+            {
+                int shapeStart = BC7Data::g_shapeRanges[shape][0];
+                int shapeSize = BC7Data::g_shapeRanges[shape][1];
+
+                EndpointSelector<4, 8> epSelector;
+
+                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+                {
+                    for (int spx = 0; spx < shapeSize; spx++)
+                    {
+                        int px = BC7Data::g_fragments[shapeStart + spx];
+                        epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
+                    }
+                    epSelector.FinishPass(epPass);
+                }
+                temps.unfinishedRGBA[shape] = epSelector.GetEndpoints(channelWeights);
+            }
+            else
+            {
+                temps.unfinishedRGBA[shape] = temps.unfinishedRGB[shape].ExpandTo<4>(255);
+            }
+        }
+    }
+
+    for (uint16_t mode = 0; mode <= 7; mode++)
+    {
+        if (mode == 4 || mode == 5)
+            continue;
+
+        if (mode < 4 && !allowRGBModes)
+            continue;
+
+        if (mode == 7 && !allowMode7)
+            continue;
+
+        uint64_t partitionEnabledBits = 0;
+        switch (mode)
+        {
+        case 0:
+            partitionEnabledBits = encodingPlan.mode0PartitionEnabled;
+            break;
+        case 1:
+            partitionEnabledBits = encodingPlan.mode1PartitionEnabled;
+            break;
+        case 2:
+            partitionEnabledBits = encodingPlan.mode2PartitionEnabled;
+            break;
+        case 3:
+            partitionEnabledBits = encodingPlan.mode3PartitionEnabled;
+            break;
+        case 6:
+            partitionEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
+            break;
+        case 7:
+            if (anyBlockHasAlpha)
+                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
+            else
+                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
+            break;
+        default:
+            break;
+        }
+
+        bool isRGB = (mode < 4);
+
+        unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
+        int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
+        int indexPrec = BC7Data::g_modes[mode].m_indexBits;
+
+        int parityBitMax = 1;
+        if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
+            parityBitMax = 4;
+        else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
+            parityBitMax = 2;
+
+        int numRealChannels = isRGB ? 3 : 4;
+
+        int numShapes;
+        const int *shapeList;
+
+        if (numSubsets == 1)
+        {
+            numShapes = BC7Data::g_numShapes1;
+            shapeList = BC7Data::g_shapeList1;
+        }
+        else if (numSubsets == 2)
+        {
+            numShapes = BC7Data::g_numShapes2;
+            shapeList = BC7Data::g_shapeList2;
+        }
+        else
+        {
+            assert(numSubsets == 3);
+            if (numPartitions == 16)
+            {
+                numShapes = BC7Data::g_numShapes3Short;
+                shapeList = BC7Data::g_shapeList3Short;
+            }
+            else
+            {
+                assert(numPartitions == 64);
+                numShapes = BC7Data::g_numShapes3;
+                shapeList = BC7Data::g_shapeList3;
+            }
+        }
+
+        for (int slot = 0; slot < BC7Data::g_numShapesAll; slot++)
+            temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
+
+        for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            int numTweakRounds = 0;
+            if (isRGB)
+                numTweakRounds = encodingPlan.seedPointsForShapeRGB[shape];
+            else
+                numTweakRounds = encodingPlan.seedPointsForShapeRGBA[shape];
+
+            if (numTweakRounds == 0)
+                continue;
+
+            if (numTweakRounds > MaxTweakRounds)
+                numTweakRounds = MaxTweakRounds;
+
+            int shapeStart = BC7Data::g_shapeRanges[shape][0];
+            int shapeLength = BC7Data::g_shapeRanges[shape][1];
+
+            AggregatedError<1> alphaAggError;
+            if (isRGB && anyBlockHasAlpha)
+            {
+                MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
+
+                for (int pxi = 0; pxi < shapeLength; pxi++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + pxi];
+                    MUInt15 original[1] = { pixels[px][3] };
+                    BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
+                }
+            }
+
+            float alphaWeightsSq[1] = { channelWeightsSq[3] };
+            MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
+
+            MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
+
+            for (int tweak = 0; tweak < numTweakRounds; tweak++)
+            {
+                if (isRGB)
+                {
+                    temps.unfinishedRGB[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
+                    tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
+                }
+                else
+                {
+                    temps.unfinishedRGBA[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
+                }
+            }
+
+            ParallelMath::Int16CompFlag punchThroughInvalid[4];
+            for (int pIter = 0; pIter < parityBitMax; pIter++)
+            {
+                punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
+
+                if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
+                {
+                    // Modes 6 and 7 have parity bits that affect alpha
+                    if (pIter == 0)
+                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
+                    else if (pIter == parityBitMax - 1)
+                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
+                    else
+                        punchThroughInvalid[pIter] = isPunchThrough;
+                }
+            }
+
+            for (int pIter = 0; pIter < parityBitMax; pIter++)
+            {
+                if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
+                    continue;
+
+                bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    uint16_t p[2];
+                    p[0] = (pIter & 1);
+                    p[1] = ((pIter >> 1) & 1);
+
+                    MUInt15 ep[2][4];
+
+                    for (int epi = 0; epi < 2; epi++)
+                        for (int ch = 0; ch < 4; ch++)
+                            ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
+
+                    for (int refine = 0; refine < numRefineRounds; refine++)
+                    {
+                        switch (mode)
+                        {
+                        case 0:
+                            CompressEndpoints0(ep, p);
+                            break;
+                        case 1:
+                            CompressEndpoints1(ep, p[0]);
+                            break;
+                        case 2:
+                            CompressEndpoints2(ep);
+                            break;
+                        case 3:
+                            CompressEndpoints3(ep, p);
+                            break;
+                        case 6:
+                            CompressEndpoints6(ep, p);
+                            break;
+                        case 7:
+                            CompressEndpoints7(ep, p);
+                            break;
+                        default:
+                            assert(false);
+                            break;
+                        };
+
+                        MFloat shapeError = ParallelMath::MakeFloatZero();
+
+                        IndexSelector<4> indexSelector;
+                        indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
+
+                        EndpointRefiner<4> epRefiner;
+                        epRefiner.Init(1 << indexPrec, channelWeights);
+
+                        MUInt15 indexes[16];
+
+                        AggregatedError<4> aggError;
+                        for (int pxi = 0; pxi < shapeLength; pxi++)
+                        {
+                            int px = BC7Data::g_fragments[shapeStart + pxi];
+
+                            MUInt15 index;
+                            MUInt15 reconstructed[4];
+
+                            index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
+                            indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
+
+                            if (flags & cvtt::Flags::BC7_FastIndexing)
+                                BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
+                            else
+                            {
+                                MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
+
+                                MUInt15 altIndexes[2];
+                                altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
+
+                                for (int ii = 0; ii < 2; ii++)
+                                {
+                                    indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
+
+                                    MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
+                                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
+                                    error = ParallelMath::Min(error, altError);
+                                    ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
+                                }
+
+                                shapeError = shapeError + error;
+                            }
+
+                            if (refine != numRefineRounds - 1)
+                                epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
+
+                            indexes[pxi] = index;
+                        }
+
+                        if (flags & cvtt::Flags::BC7_FastIndexing)
+                            shapeError = aggError.Finalize(flags, channelWeightsSq);
+
+                        if (isRGB)
+                            shapeError = shapeError + staticAlphaError;
+
+                        ParallelMath::FloatCompFlag shapeErrorBetter;
+                        ParallelMath::Int16CompFlag shapeErrorBetter16;
+
+                        shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shape]);
+                        shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
+
+                        if (ParallelMath::AnySet(shapeErrorBetter16))
+                        {
+                            bool punchThroughOK = true;
+                            if (needPunchThroughCheck)
+                            {
+                                shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
+                                shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
+
+                                if (!ParallelMath::AnySet(shapeErrorBetter16))
+                                    punchThroughOK = false;
+                            }
+
+                            if (punchThroughOK)
+                            {
+                                ParallelMath::ConditionalSet(temps.shapeBestError[shape], shapeErrorBetter, shapeError);
+                                for (int epi = 0; epi < 2; epi++)
+                                    for (int ch = 0; ch < numRealChannels; ch++)
+                                        ParallelMath::ConditionalSet(temps.shapeBestEP[shape][epi][ch], shapeErrorBetter16, ep[epi][ch]);
+
+                                for (int pxi = 0; pxi < shapeLength; pxi++)
+                                    ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
+                            }
+                        }
+
+                        if (refine != numRefineRounds - 1)
+                            epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
+                    } // refine
+                } // tweak
+            } // p
+
+            if (flags & cvtt::Flags::BC7_TrySingleColor)
+            {
+                MUInt15 total[4];
+                for (int ch = 0; ch < 4; ch++)
+                    total[ch] = ParallelMath::MakeUInt15(0);
+
+                for (int pxi = 0; pxi < shapeLength; pxi++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + pxi];
+                    for (int ch = 0; ch < 4; ch++)
+                        total[ch] = total[ch] + pixels[pxi][ch];
+                }
+
+                MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
+                MFloat average[4];
+                for (int ch = 0; ch < 4; ch++)
+                    average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
+
+                const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
+                MFloat &shapeBestError = temps.shapeBestError[shape];
+                MUInt15 (&shapeBestEP)[2][4] = temps.shapeBestEP[shape];
+                MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
+
+                const cvtt::Tables::BC7SC::Table **scTables = NULL;
+                int numSCTables = 0;
+
+                const cvtt::Tables::BC7SC::Table *tables0[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i3,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables1[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i1,
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i2,
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i3,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i1,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i2,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i3,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables2[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode2,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables3[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode3_p0,
+                    &cvtt::Tables::BC7SC::g_mode3_p1,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables6[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i1,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i2,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i3,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i4,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i5,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i6,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i7,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i1,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i2,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i3,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i4,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i5,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i6,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i7,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables7[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode7_p00,
+                    &cvtt::Tables::BC7SC::g_mode7_p01,
+                    &cvtt::Tables::BC7SC::g_mode7_p10,
+                    &cvtt::Tables::BC7SC::g_mode7_p11,
+                };
+
+                switch (mode)
+                {
+                case 0:
+                {
+                    scTables = tables0;
+                    numSCTables = sizeof(tables0) / sizeof(tables0[0]);
+                }
+                break;
+                case 1:
+                {
+                    scTables = tables1;
+                    numSCTables = sizeof(tables1) / sizeof(tables1[0]);
+                }
+                break;
+                case 2:
+                {
+
+                    scTables = tables2;
+                    numSCTables = sizeof(tables2) / sizeof(tables2[0]);
+                }
+                break;
+                case 3:
+                {
+                    scTables = tables3;
+                    numSCTables = sizeof(tables3) / sizeof(tables3[0]);
+                }
+                break;
+                case 6:
+                {
+                    scTables = tables6;
+                    numSCTables = sizeof(tables6) / sizeof(tables6[0]);
+                }
+                break;
+                case 7:
+                {
+                    scTables = tables7;
+                    numSCTables = sizeof(tables7) / sizeof(tables7[0]);
+                }
+                break;
+                default:
+                    assert(false);
+                    break;
+                }
+
+                TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
+            }
+        } // shapeIter
+
+        uint64_t partitionsEnabledBits = 0xffffffffffffffffULL;
+
+        switch (mode)
+        {
+        case 0:
+            partitionsEnabledBits = encodingPlan.mode0PartitionEnabled;
+            break;
+        case 1:
+            partitionsEnabledBits = encodingPlan.mode1PartitionEnabled;
+            break;
+        case 2:
+            partitionsEnabledBits = encodingPlan.mode2PartitionEnabled;
+            break;
+        case 3:
+            partitionsEnabledBits = encodingPlan.mode3PartitionEnabled;
+            break;
+        case 6:
+            partitionsEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
+            break;
+        case 7:
+            if (anyBlockHasAlpha)
+                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
+            else
+                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
+            break;
+        default:
+            break;
+        };
+
+        for (uint16_t partition = 0; partition < numPartitions; partition++)
+        {
+            if (((partitionsEnabledBits >> partition) & 1) == 0)
+                continue;
+
+            const int *partitionShapes;
+            if (numSubsets == 1)
+                partitionShapes = BC7Data::g_shapes1[partition];
+            else if (numSubsets == 2)
+                partitionShapes = BC7Data::g_shapes2[partition];
+            else
+            {
+                assert(numSubsets == 3);
+                partitionShapes = BC7Data::g_shapes3[partition];
+            }
+
+            MFloat totalError = ParallelMath::MakeFloatZero();
+            for (int subset = 0; subset < numSubsets; subset++)
+                totalError = totalError + temps.shapeBestError[partitionShapes[subset]];
+
+            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
+            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+            if (mode == 7 && anyBlockHasAlpha)
+            {
+                // Some lanes could be better, but we filter them out to ensure consistency with scalar
+                bool isRGBAllowedForThisPartition = (((encodingPlan.mode7RGBPartitionEnabled >> partition) & 1) != 0);
+
+                if (!isRGBAllowedForThisPartition)
+                {
+                    errorBetter16 = (errorBetter16 & blockHasNonMaxAlpha);
+                    errorBetter = ParallelMath::Int16FlagToFloat(errorBetter16);
+                }
+            }
+
+            if (ParallelMath::AnySet(errorBetter16))
+            {
+                for (int subset = 0; subset < numSubsets; subset++)
+                {
+                    int shape = partitionShapes[subset];
+                    int shapeStart = BC7Data::g_shapeRanges[shape][0];
+                    int shapeLength = BC7Data::g_shapeRanges[shape][1];
+
+                    for (int epi = 0; epi < 2; epi++)
+                        for (int ch = 0; ch < 4; ch++)
+                            ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shape][epi][ch]);
+
+                    for (int pxi = 0; pxi < shapeLength; pxi++)
+                    {
+                        int px = BC7Data::g_fragments[shapeStart + pxi];
+                        ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
+                    }
+                }
+
+                ParallelMath::ConditionalSet(work.m_error, errorBetter, totalError);
+                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
+                ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
+            }
+        }
+    }
+}
+
+void cvtt::Internal::BC7Computer::TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    // Evaluates modes 4 and 5 for every channel rotation (and, in mode 4, both index selector values), solving the rotated RGB triple and the remaining channel independently.
+    // Lanes whose combined RGB + alpha error is lower than work.m_error get their mode, rotation, index selector, indexes and endpoints written to work; numRefineRounds is clamped to at least 1.
+    // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
+    // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to solve the alpha channel first,
+    // then solve the RGB channels, which in turn breaks down into two cases: (1) separate alpha channel, then weighted RGB; (2) alpha+2 other channels, then the independent channel.
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+    // The squared weights are what the error Finalize() calls below are given.
+    float channelWeightsSq[4];
+    for (int ch = 0; ch < 4; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    for (uint16_t mode = 4; mode <= 5; mode++)
+    {
+        int numSP[2] = { 0, 0 }; // tweak-round budget from the encoding plan, indexed by index selector value
+
+        for (uint16_t rotation = 0; rotation < 4; rotation++)
+        {
+            if (mode == 4)
+            {
+                numSP[0] = encodingPlan.mode4SP[rotation][0];
+                numSP[1] = encodingPlan.mode4SP[rotation][1];
+            }
+            else
+                numSP[0] = numSP[1] = encodingPlan.mode5SP[rotation];
+            // Skip rotations for which the encoding plan grants no rounds at all.
+            if (numSP[0] == 0 && numSP[1] == 0)
+                continue;
+            // Rotation 0 keeps channel 3 as the separate channel; rotation r > 0 makes channel (r - 1) the separate one and puts channel 3 in its slot of the RGB triple.
+            int alphaChannel = (rotation + 3) & 3;
+            int redChannel = (rotation == 1) ? 3 : 0;
+            int greenChannel = (rotation == 2) ? 3 : 1;
+            int blueChannel = (rotation == 3) ? 3 : 2;
+
+            MUInt15 rotatedRGB[16][3];
+            MFloat floatRotatedRGB[16][3];
+
+            for (int px = 0; px < 16; px++)
+            {
+                rotatedRGB[px][0] = pixels[px][redChannel];
+                rotatedRGB[px][1] = pixels[px][greenChannel];
+                rotatedRGB[px][2] = pixels[px][blueChannel];
+
+                for (int ch = 0; ch < 3; ch++)
+                    floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
+            }
+            // Only mode 4 iterates over two index selector values; mode 5 runs the loop below once.
+            uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
+
+            float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
+            float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
+            float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
+            float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
+
+            float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
+
+            MFloat preWeightedRotatedRGB[16][3];
+            BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
+
+            for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
+            {
+                int numTweakRounds = numSP[indexSelector];
+
+                if (numTweakRounds <= 0)
+                    continue;
+
+                if (numTweakRounds > MaxTweakRounds)
+                    numTweakRounds = MaxTweakRounds;
+                // Feed all 16 pre-weighted RGB pixels through every endpoint selector pass, each with a constant 1.0f as the last argument.
+                EndpointSelector<3, 8> rgbSelector;
+
+                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+                {
+                    for (int px = 0; px < 16; px++)
+                        rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
+
+                    rgbSelector.FinishPass(epPass);
+                }
+                // Per-lane minimum [0] and maximum [1] of the separately coded channel over the 16 pixels.
+                MUInt15 alphaRange[2];
+
+                alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
+                for (int px = 1; px < 16; px++)
+                {
+                    alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
+                    alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
+                }
+                // Index bit counts per plane: mode 4 splits 2 and 3 bits between RGB and alpha according to indexSelector, mode 5 uses 2 bits for both.
+                int rgbPrec = 0;
+                int alphaPrec = 0;
+
+                if (mode == 4)
+                {
+                    rgbPrec = indexSelector ? 3 : 2;
+                    alphaPrec = indexSelector ? 2 : 3;
+                }
+                else
+                    rgbPrec = alphaPrec = 2;
+
+                UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
+
+                MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
+                MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
+
+                MUInt15 bestRGBIndexes[16];
+                MUInt15 bestAlphaIndexes[16];
+                MUInt15 bestEP[2][4]; // [endpoint][channel]; channels 0-2 come from the RGB solution, channel 3 from the alpha solution
+
+                for (int px = 0; px < 16; px++)
+                    bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    MUInt15 rgbEP[2][3];
+                    MUInt15 alphaEP[2];
+
+                    unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
+
+                    TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
+                    // Each refine round compresses the endpoints, selects indexes and measures error; all but the last round then refit the endpoints from the chosen indexes.
+                    for (int refine = 0; refine < numRefineRounds; refine++)
+                    {
+                        if (mode == 4)
+                            CompressEndpoints4(rgbEP, alphaEP);
+                        else
+                            CompressEndpoints5(rgbEP, alphaEP);
+
+
+                        IndexSelector<1> alphaIndexSelector;
+                        IndexSelector<3> rgbIndexSelector;
+
+                        {
+                            MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
+                            alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
+                        }
+                        rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
+
+                        EndpointRefiner<3> rgbRefiner;
+                        EndpointRefiner<1> alphaRefiner;
+
+                        rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
+                        alphaRefiner.Init(1 << alphaPrec, uniformWeight);
+
+                        MFloat errorRGB = ParallelMath::MakeFloatZero();
+                        MFloat errorA = ParallelMath::MakeFloatZero();
+
+                        MUInt15 rgbIndexes[16];
+                        MUInt15 alphaIndexes[16];
+
+                        AggregatedError<3> rgbAggError;
+                        AggregatedError<1> alphaAggError;
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
+                            MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
+
+                            MUInt15 reconstructedRGB[3];
+                            MUInt15 reconstructedAlpha[1];
+
+                            rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
+                            alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
+                            // Fast indexing accumulates block-wide error for the selected index; otherwise the selected index and its two neighbours are compared per pixel.
+                            if (flags & cvtt::Flags::BC7_FastIndexing)
+                            {
+                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
+                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
+                            }
+                            else
+                            {
+                                AggregatedError<3> baseRGBAggError;
+                                AggregatedError<1> baseAlphaAggError;
+
+                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
+                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
+
+                                MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
+                                MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+
+                                MUInt15 altRGBIndexes[2];
+                                MUInt15 altAlphaIndexes[2];
+                                // Neighbouring indexes (index - 1 and index + 1), clamped to [0, (1 << prec) - 1].
+                                altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
+
+                                altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
+
+                                for (int ii = 0; ii < 2; ii++)
+                                {
+                                    rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
+                                    alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
+
+                                    AggregatedError<3> altRGBAggError;
+                                    AggregatedError<1> altAlphaAggError;
+
+                                    BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
+                                    BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
+
+                                    MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
+                                    MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+
+                                    ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
+                                    ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
+
+                                    rgbError = ParallelMath::Min(altRGBError, rgbError);
+                                    alphaError = ParallelMath::Min(altAlphaError, alphaError);
+
+                                    ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
+                                    ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
+                                }
+
+                                errorRGB = errorRGB + rgbError;
+                                errorA = errorA + alphaError;
+                            }
+
+                            if (refine != numRefineRounds - 1)
+                            {
+                                rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
+                                alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
+                            }
+
+                            rgbIndexes[px] = rgbIndex;
+                            alphaIndexes[px] = alphaIndex;
+                        }
+                        // With fast indexing the block error lives in the aggregates; finalize them once, after all 16 pixels have been contributed.
+                        if (flags & cvtt::Flags::BC7_FastIndexing)
+                        {
+                            errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
+                            errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+                        }
+
+                        ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
+                        ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
+
+                        ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
+                        ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
+
+                        if (ParallelMath::AnySet(rgbBetterInt16))
+                        {
+                            bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
+
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
+
+                            for (int ep = 0; ep < 2; ep++)
+                            {
+                                for (int ch = 0; ch < 3; ch++)
+                                    ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
+                            }
+                        }
+
+                        if (ParallelMath::AnySet(alphaBetterInt16))
+                        {
+                            bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
+
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
+
+                            for (int ep = 0; ep < 2; ep++)
+                                ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
+                        }
+
+                        if (refine != numRefineRounds - 1)
+                        {
+                            rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
+
+                            MUInt15 alphaEPTemp[2][1];
+                            alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
+
+                            for (int i = 0; i < 2; i++)
+                                alphaEP[i] = alphaEPTemp[i][0];
+                        }
+                    } // refine
+                } // tweak
+                // The best RGB and best alpha solutions were tracked independently, so they may come from different tweak/refine rounds.
+                MFloat combinedError = bestRGBError + bestAlphaError;
+
+                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
+                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                work.m_error = ParallelMath::Min(combinedError, work.m_error);
+
+                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
+                ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
+                ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
+                // With indexSelector set, the alpha indexes are stored in m_indexes and the RGB indexes in m_indexes2; otherwise the other way round.
+                for (int px = 0; px < 16; px++)
+                {
+                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
+                    ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
+                }
+
+                for (int ep = 0; ep < 2; ep++)
+                    for (int ch = 0; ch < 4; ch++)
+                        ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
+            }
+        }
+    }
+}
+
+template<class T>
+void cvtt::Internal::BC7Computer::Swap(T& a, T& b) // Exchanges the values of a and b through one temporary copy.
+{
+    T previousB = b; // hold b's value, it is overwritten next
+    b = a;
+    a = previousB;
+}
+
+void cvtt::Internal::BC7Computer::Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds)
+{
+    MUInt15 pixels[16][4];
+    MFloat floatPixels[16][4];
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
+    }
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
+    }
+
+    BC67::WorkInfo work;
+    memset(&work, 0, sizeof(work));
+
+    work.m_error = ParallelMath::MakeFloat(FLT_MAX);
+
+    {
+        ParallelMath::RoundTowardNearestForScope rtn;
+        TrySinglePlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
+        TryDualPlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        PackingVector pv;
+        pv.Init();
+
+        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
+        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
+        ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
+
+        const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
+
+        ParallelMath::ScalarUInt16 indexes[16];
+        ParallelMath::ScalarUInt16 indexes2[16];
+        ParallelMath::ScalarUInt16 endPoints[3][2][4];
+
+        for (int i = 0; i < 16; i++)
+        {
+            indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
+            if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+                indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
+        }
+
+        for (int subset = 0; subset < 3; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+            {
+                for (int ch = 0; ch < 4; ch++)
+                    endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
+            }
+        }
+
+        int fixups[3] = { 0, 0, 0 };
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
+            bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
+
+            if (flipRGB)
+            {
+                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                    indexes[px] = highIndex - indexes[px];
+            }
+
+            if (flipAlpha)
+            {
+                uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                    indexes2[px] = highIndex - indexes2[px];
+            }
+
+            if (indexSelector)
+                Swap(flipRGB, flipAlpha);
+
+            if (flipRGB)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                    Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
+            }
+            if (flipAlpha)
+                Swap(endPoints[0][0][3], endPoints[0][1][3]);
+
+        }
+        else
+        {
+            if (modeInfo.m_numSubsets == 2)
+                fixups[1] = BC7Data::g_fixupIndexes2[partition];
+            else if (modeInfo.m_numSubsets == 3)
+            {
+                fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
+                fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
+            }
+
+            bool flip[3] = { false, false, false };
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+                flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
+
+            if (flip[0] || flip[1] || flip[2])
+            {
+                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                {
+                    int subset = 0;
+                    if (modeInfo.m_numSubsets == 2)
+                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+                    else if (modeInfo.m_numSubsets == 3)
+                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
+
+                    if (flip[subset])
+                        indexes[px] = highIndex - indexes[px];
+                }
+
+                int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
+                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+                {
+                    if (flip[subset])
+                        for (int ch = 0; ch < maxCH; ch++)
+                            Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
+                }
+            }
+        }
+
+        pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
+
+        if (modeInfo.m_partitionBits)
+            pv.Pack(partition, modeInfo.m_partitionBits);
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
+            pv.Pack(rotation, 2);
+        }
+
+        if (modeInfo.m_hasIndexSelector)
+            pv.Pack(indexSelector, 1);
+
+        // Encode RGB
+        for (int ch = 0; ch < 3; ch++)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
+                    epPart >>= (8 - modeInfo.m_rgbBits);
+
+                    pv.Pack(epPart, modeInfo.m_rgbBits);
+                }
+            }
+        }
+
+        // Encode alpha
+        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
+                    epPart >>= (8 - modeInfo.m_alphaBits);
+
+                    pv.Pack(epPart, modeInfo.m_alphaBits);
+                }
+            }
+        }
+
+        // Encode parity bits
+        if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
+                epPart >>= (7 - modeInfo.m_rgbBits);
+                epPart &= 1;
+
+                pv.Pack(epPart, 1);
+            }
+        }
+        else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
+                    epPart >>= (7 - modeInfo.m_rgbBits);
+                    epPart &= 1;
+
+                    pv.Pack(epPart, 1);
+                }
+            }
+        }
+
+        // Encode indexes
+        for (int px = 0; px < 16; px++)
+        {
+            int bits = modeInfo.m_indexBits;
+            if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
+                bits--;
+
+            pv.Pack(indexes[px], bits);
+        }
+
+        // Encode secondary indexes
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                int bits = modeInfo.m_alphaIndexBits;
+                if (px == 0)
+                    bits--;
+
+                pv.Pack(indexes2[px], bits);
+            }
+        }
+
+        pv.Flush(packedBlocks);
+
+        packedBlocks += 16;
+    }
+}
+
+// Decodes one packed BC7 block into a 4x4 block of 8-bit pixels.
+//
+//   output      - receives the 16 decoded pixels in output.m_pixels[px][ch]
+//                 (channels 0..2 are decoded as RGB, channel 3 as alpha, before any rotation).
+//   packedBlock - the packed block, read sequentially through an UnpackingVector.
+//
+// The mode is the position of the first set bit among the first 8 bits read. If none of them
+// is set, every channel of every pixel is written as 0 and the function returns early.
+void cvtt::Internal::BC7Computer::UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
+{
+    UnpackingVector pv;
+    pv.Init(packedBlock);
+
+    // Scan for the first set bit; its position selects the mode. 8 means "no mode found".
+    int mode = 8;
+    for (int i = 0; i < 8; i++)
+    {
+        if (pv.Unpack(1) == 1)
+        {
+            mode = i;
+            break;
+        }
+    }
+
+    if (mode > 7)
+    {
+        for (int px = 0; px < 16; px++)
+            for (int ch = 0; ch < 4; ch++)
+                output.m_pixels[px][ch] = 0;
+
+        return;
+    }
+
+    const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
+
+    // Optional header fields; each one is only present when the mode declares it.
+    int partition = 0;
+    if (modeInfo.m_partitionBits)
+        partition = pv.Unpack(modeInfo.m_partitionBits);
+
+    int rotation = 0;
+    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        rotation = pv.Unpack(2);
+
+    int indexSelector = 0;
+    if (modeInfo.m_hasIndexSelector)
+        indexSelector = pv.Unpack(1);
+
+    // Resolve fixups: pixel 0 is always a fixup; subsets 1 and 2 take theirs from the partition tables.
+    int fixups[3] = { 0, 0, 0 };
+
+    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
+    {
+        if (modeInfo.m_numSubsets == 2)
+            fixups[1] = BC7Data::g_fixupIndexes2[partition];
+        else if (modeInfo.m_numSubsets == 3)
+        {
+            fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
+            fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
+        }
+    }
+
+    // Only the first m_numSubsets entries are written and read.
+    int endPoints[3][2][4];
+
+    // Decode RGB: stored channel-major, then subset, then endpoint. Values are left-aligned to 8 bits.
+    for (int ch = 0; ch < 3; ch++)
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+                endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
+        }
+    }
+
+    // Decode alpha, or default it to 255 when the mode carries none.
+    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+                endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
+        }
+    }
+    else
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+                endPoints[subset][ep][3] = 255;
+        }
+    }
+
+    // Number of extra low bits contributed by a parity bit (0 or 1).
+    int parityBits = 0;
+
+    // Decode parity bits: each one is placed in the bit directly below the unpacked endpoint bits.
+    if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            // One bit shared by both endpoints of the subset.
+            int p = pv.Unpack(1);
+
+            for (int ep = 0; ep < 2; ep++)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                    endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
+
+                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+                    endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
+            }
+        }
+
+        parityBits = 1;
+    }
+    else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+            {
+                // One bit per endpoint.
+                int p = pv.Unpack(1);
+
+                for (int ch = 0; ch < 3; ch++)
+                    endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
+
+                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+                    endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
+            }
+        }
+
+        parityBits = 1;
+    }
+
+    // Fill endpoint bits: replicate the high bits into the low bits that are still empty.
+    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+    {
+        for (int ep = 0; ep < 2; ep++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
+
+            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+                endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
+        }
+    }
+
+    int indexes[16];
+    int indexes2[16];
+
+    // Decode indexes: fixup pixels are stored with one bit fewer than the others.
+    for (int px = 0; px < 16; px++)
+    {
+        int bits = modeInfo.m_indexBits;
+        if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
+            bits--;
+
+        indexes[px] = pv.Unpack(bits);
+    }
+
+    // Decode secondary indexes (only present in separate-alpha modes; pixel 0 is the only fixup).
+    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            int bits = modeInfo.m_alphaIndexBits;
+            if (px == 0)
+                bits--;
+
+            indexes2[px] = pv.Unpack(bits);
+        }
+    }
+    else
+    {
+        for (int px = 0; px < 16; px++)
+            indexes2[px] = 0;
+    }
+
+    const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
+    const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
+
+    // Decode each pixel
+    for (int px = 0; px < 16; px++)
+    {
+        int rgbWeight = rgbWeights[indexes[px]];
+        int alphaWeight = 0;
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
+            alphaWeight = rgbWeight;
+        else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+            alphaWeight = alphaWeights[indexes2[px]];
+
+        // The index selector exchanges which set of weights drives color and which drives alpha.
+        if (indexSelector == 1)
+        {
+            int temp = rgbWeight;
+            rgbWeight = alphaWeight;
+            alphaWeight = temp;
+        }
+
+        int pixel[4] = { 0, 0, 0, 255 };
+
+        // Look up which subset this pixel belongs to (1 bit per pixel for 2 subsets, 2 bits for 3).
+        int subset = 0;
+
+        if (modeInfo.m_numSubsets == 2)
+            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+        else if (modeInfo.m_numSubsets == 3)
+            subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
+
+        // Interpolate between the two endpoints with 6-bit weights, rounding to nearest.
+        for (int ch = 0; ch < 3; ch++)
+            pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
+
+        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+            pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
+
+        // A non-zero rotation swaps alpha with color channel (rotation - 1).
+        if (rotation != 0)
+        {
+            int ch = rotation - 1;
+            int temp = pixel[ch];
+            pixel[ch] = pixel[3];
+            pixel[3] = temp;
+        }
+
+        for (int ch = 0; ch < 4; ch++)
+            output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
+    }
+}
+
+// Quantizes one signed endpoint component.
+//
+//   elem2CL   - input value per lane; asserted to lie strictly between -31744 and 31744.
+//   precision - target precision; the rescaled magnitude is shifted right by (16 - precision).
+//   ru        - rounding scope handed to RoundAndConvertToU15.
+//
+// Works on the magnitude and re-applies the sign at the end, so the result is symmetric around zero.
+cvtt::ParallelMath::SInt16 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
+{
+    assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
+    assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
+
+    // Split into sign flag and magnitude
+    ParallelMath::Int16CompFlag negativeLanes = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
+    MUInt15 magnitude = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negativeLanes, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
+
+    // Expand to full range (scale by 32/31), then keep only the top bits
+    MUInt15 expanded = ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(magnitude) * 32.0f / 31.0f, ru);
+    magnitude = ParallelMath::RightShift(expanded, 16 - precision);
+
+    // Put the sign back
+    MSInt16 signedMagnitude = ParallelMath::LosslessCast<MSInt16>::Cast(magnitude);
+    MSInt16 negated = ParallelMath::MakeSInt16(0) - signedMagnitude;
+
+    return ParallelMath::Select(negativeLanes, negated, signedMagnitude);
+}
+
+// Quantizes one unsigned endpoint component.
+//
+// The input is scaled by 64/31, clamped to 65535, converted to a 16-bit integer using the
+// supplied rounding scope, and then shifted right by (16 - precision).
+cvtt::ParallelMath::UInt15 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
+{
+    // Expand to full range, clamping so the conversion cannot exceed 16 bits
+    MFloat scaled = ParallelMath::ToFloat(elem) * 64.0f / 31.0f;
+    MFloat clamped = ParallelMath::Min(scaled, ParallelMath::MakeFloat(65535.0f));
+    MUInt16 fullRange = ParallelMath::RoundAndConvertToU16(clamped, ru);
+
+    // Keep only the top bits
+    MUInt16 quantized = ParallelMath::RightShift(fullRange, 16 - precision);
+    return ParallelMath::LosslessCast<MUInt15>::Cast(quantized);
+}
+
+// Unquantizes one signed endpoint component.
+//
+//   comp                      - quantized component per lane.
+//   precision                 - number of bits the component was quantized to.
+//   outUnquantized            - receives the unquantized value.
+//   outUnquantizedFinished2CL - receives the unquantized magnitude scaled by 31/32 (multiply by 31,
+//                               shift right by 5), with the sign re-applied.
+//
+// For precision >= 16 the component is passed through unchanged. Otherwise the magnitude is
+// shifted up to 16-bit range and a half-step is added, with two special cases: zero stays zero
+// and the maximum magnitude saturates to 0x7fff. The sign is re-applied afterwards.
+void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
+{
+    MSInt16 zero = ParallelMath::MakeSInt16(0);
+
+    ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
+    MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
+
+    MSInt16 unq;
+    MUInt15 absUnq;
+
+    if (precision >= 16)
+    {
+        unq = comp;
+        absUnq = absComp;
+    }
+    else
+    {
+        MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << (precision - 1)) - 2));
+        ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
+
+        // The saturation test must look at the magnitude: testing the signed value would never
+        // saturate a negative component, so the most negative code would not map to -0x7fff.
+        ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, absComp);
+
+        absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
+        ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
+        ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
+
+        unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
+    }
+
+    outUnquantized = unq;
+
+    // Final scaling is done on the magnitude, then the sign is restored.
+    MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
+
+    outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
+}
+
+// Unquantizes one unsigned endpoint component.
+//
+//   comp                   - quantized component per lane.
+//   precision              - number of bits the component was quantized to.
+//   outUnquantized         - receives the unquantized 16-bit value.
+//   outUnquantizedFinished - receives the unquantized value scaled by 31/64 (multiply by 31,
+//                            shift right by 6).
+//
+// For precision >= 15 the component is passed through unchanged. Otherwise it is shifted up to
+// 16-bit range and a half-step is added, except that zero stays zero and the maximum code
+// saturates to 0xffff.
+void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
+{
+    MUInt16 widened = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
+    MUInt16 result;
+
+    if (precision >= 15)
+    {
+        result = widened;
+    }
+    else
+    {
+        MUInt15 zeroCode = ParallelMath::MakeUInt15(0);
+        MUInt15 justBelowMaxCode = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
+
+        ParallelMath::Int16CompFlag compIsZero = ParallelMath::Equal(comp, zeroCode);
+        ParallelMath::Int16CompFlag compIsMax = ParallelMath::Less(justBelowMaxCode, comp);
+
+        MUInt16 halfStep = ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
+        result = (widened << (16 - precision)) + halfStep;
+
+        // Special cases override the general formula
+        ParallelMath::ConditionalSet(result, compIsZero, ParallelMath::MakeUInt16(0));
+        ParallelMath::ConditionalSet(result, compIsMax, ParallelMath::MakeUInt16(0xffff));
+    }
+
+    outUnquantized = result;
+    outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(result, ParallelMath::MakeUInt15(31)), 6));
+}
+
+// Quantizes a pair of signed endpoints, prepares the index selector for them, and picks the
+// index of the fixup pixel.
+//
+// Steps:
+//   1. Each channel of each endpoint is quantized to `precision` and unquantized again, under a
+//      RoundUpForScope. The quantized values go to quantizedEndPoints.
+//   2. indexSelector is initialized from the unquantized endpoints.
+//   3. The index for pixel `fixupIndex` is selected (fast or slow path per `fastIndexing`).
+//   4. In lanes where that index is greater than (indexRange / 2 - 1), the index is replaced by
+//      (indexRange - 1 - index), the selector is inverted, and the two quantized endpoints
+//      are swapped.
+//
+// Only indexes[fixupIndex] is written here.
+void cvtt::Internal::BC6HComputer::QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    MSInt16 decodedEP[2][3];
+    MSInt16 decodedFinishedEP[2][3];
+
+    {
+        // Scope object stays alive only for the quantization round trip
+        ParallelMath::RoundUpForScope roundUp;
+
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MSInt16 quantized = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &roundUp);
+                UnquantizeSingleEndpointElementSigned(quantized, precision, decodedEP[epi][ch], decodedFinishedEP[epi][ch]);
+                quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(quantized);
+            }
+        }
+    }
+
+    indexSelector.Init(channelWeights, decodedEP, decodedFinishedEP, indexRange);
+    indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
+
+    MUInt15 fixupPixelIndex;
+    if (fastIndexing)
+        fixupPixelIndex = indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn);
+    else
+        fixupPixelIndex = indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
+
+    // Lanes whose fixup index landed in the upper half of the range need to be flipped
+    MUInt15 lastLowHalfIndex = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
+    ParallelMath::Int16CompFlag needsFlip = ParallelMath::Less(lastLowHalfIndex, fixupPixelIndex);
+
+    if (ParallelMath::AnySet(needsFlip))
+    {
+        ParallelMath::ConditionalSet(fixupPixelIndex, needsFlip, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - fixupPixelIndex));
+
+        indexSelector.ConditionalInvert(needsFlip);
+
+        // Swap the two endpoints in the flipped lanes
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MAInt16 oldFirst = quantizedEndPoints[0][ch];
+            MAInt16 oldSecond = quantizedEndPoints[1][ch];
+
+            quantizedEndPoints[0][ch] = ParallelMath::Select(needsFlip, oldSecond, oldFirst);
+            quantizedEndPoints[1][ch] = ParallelMath::Select(needsFlip, oldFirst, oldSecond);
+        }
+    }
+
+    indexes[fixupIndex] = fixupPixelIndex;
+}
+
+// Quantizes a pair of unsigned endpoints, prepares the index selector for them, and picks the
+// index of the fixup pixel.
+//
+// Steps:
+//   1. Each channel of each endpoint is cast to unsigned, quantized to `precision` and
+//      unquantized again, under a RoundUpForScope. The quantized values go to quantizedEndPoints.
+//   2. indexSelector is initialized from the unquantized endpoints.
+//   3. The index for pixel `fixupIndex` is selected (fast or slow path per `fastIndexing`).
+//   4. In lanes where that index is greater than (indexRange / 2 - 1), the index is replaced by
+//      (indexRange - 1 - index), the selector is inverted, and the two quantized endpoints
+//      are swapped.
+//
+// Only indexes[fixupIndex] is written here.
+void cvtt::Internal::BC6HComputer::QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    MUInt16 decodedEP[2][3];
+    MUInt16 decodedFinishedEP[2][3];
+
+    {
+        // Scope object stays alive only for the quantization round trip
+        ParallelMath::RoundUpForScope roundUp;
+
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantized = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &roundUp);
+                UnquantizeSingleEndpointElementUnsigned(quantized, precision, decodedEP[epi][ch], decodedFinishedEP[epi][ch]);
+                quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(quantized);
+            }
+        }
+    }
+
+    indexSelector.Init(channelWeights, decodedEP, decodedFinishedEP, indexRange);
+    indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
+
+    MUInt15 fixupPixelIndex;
+    if (fastIndexing)
+        fixupPixelIndex = indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn);
+    else
+        fixupPixelIndex = indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
+
+    // Lanes whose fixup index landed in the upper half of the range need to be flipped
+    MUInt15 lastLowHalfIndex = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
+    ParallelMath::Int16CompFlag needsFlip = ParallelMath::Less(lastLowHalfIndex, fixupPixelIndex);
+
+    if (ParallelMath::AnySet(needsFlip))
+    {
+        ParallelMath::ConditionalSet(fixupPixelIndex, needsFlip, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - fixupPixelIndex));
+
+        indexSelector.ConditionalInvert(needsFlip);
+
+        // Swap the two endpoints in the flipped lanes
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MAInt16 oldFirst = quantizedEndPoints[0][ch];
+            MAInt16 oldSecond = quantizedEndPoints[1][ch];
+
+            quantizedEndPoints[0][ch] = ParallelMath::Select(needsFlip, oldSecond, oldFirst);
+            quantizedEndPoints[1][ch] = ParallelMath::Select(needsFlip, oldFirst, oldSecond);
+        }
+    }
+
+    indexes[fixupIndex] = fixupPixelIndex;
+}
+
+// Builds the encoded form of two subsets' endpoints and reports, per lane, whether it is legal.
+//
+//   ep0, ep1       - quantized endpoints of subset 0 and subset 1.
+//   aPrec          - number of low bits that must survive the round trip.
+//   bPrec          - per-channel precision passed to TruncateToPrecisionSigned for the deltas.
+//   isTransformed  - when false, endpoints are copied through and every lane is legal.
+//   outEncodedEPs  - receives the encoded endpoints, indexed [subset][endpoint][channel].
+//   outIsLegal     - receives the per-lane legality flags.
+//
+// In transformed mode every endpoint except [0][0] is replaced by its truncated difference from
+// [0][0]. A lane stays legal only if adding that difference back to [0][0] matches the original
+// endpoint in the low aPrec bits.
+//
+// The channel loop stops as soon as no lane is legal, so later channels of outEncodedEPs may be
+// left unwritten in that case.
+void cvtt::Internal::BC6HComputer::EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
+{
+    ParallelMath::Int16CompFlag legalLanes = ParallelMath::MakeBoolInt16(true);
+
+    MAInt16 lowBitsMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        for (int epi = 0; epi < 2; epi++)
+        {
+            outEncodedEPs[0][epi][ch] = ep0[epi][ch];
+            outEncodedEPs[1][epi][ch] = ep1[epi][ch];
+        }
+
+        if (isTransformed)
+        {
+            // Slots 1..3 visit (subset 0, ep 1), (subset 1, ep 0), (subset 1, ep 1); slot 0 is the base.
+            for (int slot = 1; slot < 4; slot++)
+            {
+                MAInt16 &base = outEncodedEPs[0][0][ch];
+                MAInt16 &encoded = outEncodedEPs[slot >> 1][slot & 1][ch];
+
+                MAInt16 expectedLowBits = (encoded & lowBitsMask);
+
+                MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(encoded, base)), bPrec[ch]);
+
+                encoded = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
+
+                MAInt16 roundTripLowBits = (ParallelMath::AbstractAdd(encoded, base) & lowBitsMask);
+                legalLanes = legalLanes & ParallelMath::Equal(roundTripLowBits, expectedLowBits);
+            }
+        }
+
+        // Nothing left that could be legal: no point in processing further channels
+        if (!ParallelMath::AnySet(legalLanes))
+            break;
+    }
+
+    outIsLegal = legalLanes;
+}
+
+// Builds the encoded form of a single subset's endpoints and reports, per lane, whether it is legal.
+//
+//   ep             - quantized endpoints, indexed [endpoint][channel].
+//   aPrec          - number of low bits that must survive the round trip.
+//   bPrec          - per-channel precision passed to TruncateToPrecisionSigned for the delta.
+//   isTransformed  - when false, endpoints are copied through and every lane is legal.
+//   outEncodedEPs  - receives the encoded endpoints, indexed [endpoint][channel].
+//   outIsLegal     - receives the per-lane legality flags.
+//
+// In transformed mode the second endpoint is replaced by its truncated difference from the first.
+// A lane stays legal only if adding that difference back to the first endpoint matches the
+// original second endpoint in the low aPrec bits.
+void cvtt::Internal::BC6HComputer::EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
+{
+    ParallelMath::Int16CompFlag legalLanes = ParallelMath::MakeBoolInt16(true);
+
+    MAInt16 lowBitsMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        for (int epi = 0; epi < 2; epi++)
+            outEncodedEPs[epi][ch] = ep[epi][ch];
+
+        // Untransformed endpoints are stored as-is
+        if (!isTransformed)
+            continue;
+
+        MAInt16 &base = outEncodedEPs[0][ch];
+        MAInt16 &encoded = outEncodedEPs[1][ch];
+
+        MAInt16 expectedLowBits = (encoded & lowBitsMask);
+
+        MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(encoded, base)), bPrec[ch]);
+
+        encoded = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
+
+        MAInt16 roundTripLowBits = (ParallelMath::AbstractAdd(encoded, base) & lowBitsMask);
+        legalLanes = legalLanes & ParallelMath::Equal(roundTripLowBits, expectedLowBits);
+    }
+
+    outIsLegal = legalLanes;
+}
+
+void cvtt::Internal::BC6HComputer::Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
+{
+    if (numTweakRounds < 1)
+        numTweakRounds = 1;
+    else if (numTweakRounds > MaxTweakRounds)
+        numTweakRounds = MaxTweakRounds;
+
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+    else if (numRefineRounds > MaxRefineRounds)
+        numRefineRounds = MaxRefineRounds;
+
+    bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
+    float channelWeightsSq[3];
+
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    MSInt16 pixels[16][3];
+    MFloat floatPixels2CL[16][3];
+    MFloat floatPixelsLinearWeighted[16][3];
+
+    MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
+
+    for (int ch = 0; ch < 3; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MSInt16 pixelValue;
+            ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
+
+            // Convert from sign+magnitude to 2CL
+            if (isSigned)
+            {
+                ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
+                MSInt16 magnitude = (pixelValue & low15Bits);
+                ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
+                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
+            }
+            else
+                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
+
+            pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
+
+            pixels[px][ch] = pixelValue;
+            floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
+            floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
+        }
+    }
+
+    MFloat preWeightedPixels[16][3];
+
+    BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
+
+    MAInt16 bestEndPoints[2][2][3];
+    MUInt15 bestIndexes[16];
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestMode = ParallelMath::MakeUInt15(0);
+    MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    for (int subset = 0; subset < 2; subset++)
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
+
+    UnfinishedEndpoints<3> partitionedUFEP[32][2];
+    UnfinishedEndpoints<3> singleUFEP;
+
+    // Generate UFEP for partitions
+    for (int p = 0; p < 32; p++)
+    {
+        int partitionMask = BC7Data::g_partitionMap[p];
+
+        EndpointSelector<3, 8> epSelectors[2];
+
+        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                int subset = (partitionMask >> px) & 1;
+                epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
+            }
+
+            for (int subset = 0; subset < 2; subset++)
+                epSelectors[subset].FinishPass(pass);
+        }
+
+        for (int subset = 0; subset < 2; subset++)
+            partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
+    }
+
+    // Generate UFEP for single
+    {
+        EndpointSelector<3, 8> epSelector;
+
+        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+        {
+            for (int px = 0; px < 16; px++)
+                epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
+
+            epSelector.FinishPass(pass);
+        }
+
+        singleUFEP = epSelector.GetEndpoints(channelWeights);
+    }
+
+    for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
+    {
+        bool partitioned = (partitionedInt == 1);
+
+        for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
+        {
+            if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
+                continue;
+
+            int numPartitions = partitioned ? 32 : 1;
+            int numSubsets = partitioned ? 2 : 1;
+            int indexBits = partitioned ? 3 : 4;
+            int indexRange = (1 << indexBits);
+
+            for (int p = 0; p < numPartitions; p++)
+            {
+                int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
+
+                const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
+
+                MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
+                MUInt15 metaIndexes[MaxMetaRounds][16];
+                MFloat metaError[MaxMetaRounds][2];
+
+                bool roundValid[MaxMetaRounds][2];
+
+                for (int r = 0; r < MaxMetaRounds; r++)
+                    for (int subset = 0; subset < 2; subset++)
+                        roundValid[r][subset] = true;
+
+                for (int subset = 0; subset < numSubsets; subset++)
+                {
+                    for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
+                    {
+                        EndpointRefiner<3> refiners[2];
+
+                        bool abortRemainingRefines = false;
+                        for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
+                        {
+                            int metaRound = tweak * MaxRefineRounds + refinePass;
+
+                            if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
+                                abortRemainingRefines = true;
+
+                            if (abortRemainingRefines)
+                            {
+                                roundValid[metaRound][subset] = false;
+                                continue;
+                            }
+
+                            MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
+                            MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
+
+                            MSInt16 endPointsColorSpace[2][3];
+
+                            if (refinePass == 0)
+                            {
+                                UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
+
+                                if (isSigned)
+                                    ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
+                                else
+                                    ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
+                            }
+                            else
+                                refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
+
+                            refiners[subset].Init(indexRange, channelWeights);
+
+                            int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
+
+                            IndexSelectorHDR<3> indexSelector;
+                            if (isSigned)
+                                QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
+                            else
+                                QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
+
+                            if (metaRound > 0)
+                            {
+                                ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
+
+                                for (int prevRound = 0; prevRound < metaRound; prevRound++)
+                                {
+                                    MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
+
+                                    ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
+
+                                    for (int epi = 0; epi < 2; epi++)
+                                        for (int ch = 0; ch < 3; ch++)
+                                            same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
+
+                                    anySame = (anySame | same);
+                                    if (ParallelMath::AllSet(anySame))
+                                        break;
+                                }
+
+                                if (ParallelMath::AllSet(anySame))
+                                {
+                                    roundValid[metaRound][subset] = false;
+                                    continue;
+                                }
+                            }
+
+                            MFloat subsetError = ParallelMath::MakeFloatZero();
+
+                            {
+                                for (int px = 0; px < 16; px++)
+                                {
+                                    if (subset != ((partitionMask >> px) & 1))
+                                        continue;
+
+                                    MUInt15 index;
+                                    if (px == fixupIndex)
+                                        index = mrIndexes[px];
+                                    else
+                                    {
+                                        index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
+                                        mrIndexes[px] = index;
+                                    }
+
+                                    MSInt16 reconstructed[3];
+                                    if (isSigned)
+                                        indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
+                                    else
+                                        indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
+
+                                    subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
+
+                                    if (refinePass != numRefineRounds - 1)
+                                        refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
+                                }
+                            }
+
+                            metaError[metaRound][subset] = subsetError;
+                        }
+                    }
+                }
+
+                // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
+                int numMeta1 = partitioned ? MaxMetaRounds : 1;
+                for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
+                {
+                    if (!roundValid[meta0][0])
+                        continue;
+
+                    for (int meta1 = 0; meta1 < numMeta1; meta1++)
+                    {
+                        MFloat combinedError = metaError[meta0][0];
+                        if (partitioned)
+                        {
+                            if (!roundValid[meta1][1])
+                                continue;
+
+                            combinedError = combinedError + metaError[meta1][1];
+                        }
+
+                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
+                        if (!ParallelMath::AnySet(errorBetter))
+                            continue;
+
+                        ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                        // Figure out if this is encodable
+                        for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
+                        {
+                            const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
+
+                            if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
+                                continue;
+
+                            MAInt16 encodedEPs[2][2][3];
+                            ParallelMath::Int16CompFlag isLegal;
+                            if (partitioned)
+                                EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
+                            else
+                                EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
+
+                            ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
+                            if (!ParallelMath::AnySet(isLegalAndBetter))
+                                continue;
+
+                            ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
+
+                            ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
+                            ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
+                            ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
+
+                            for (int subset = 0; subset < numSubsets; subset++)
+                            {
+                                for (int epi = 0; epi < 2; epi++)
+                                {
+                                    for (int ch = 0; ch < 3; ch++)
+                                        ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
+                                }
+                            }
+
+                            for (int px = 0; px < 16; px++)
+                            {
+                                int subset = ((partitionMask >> px) & 1);
+                                if (subset == 0)
+                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
+                                else
+                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
+                            }
+
+                            needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
+                            if (!ParallelMath::AnySet(needsCommit))
+                                break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // At this point, everything should be set
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
+        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
+        int32_t eps[2][2][3];
+        ParallelMath::ScalarUInt16 indexes[16];
+
+        const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
+
+        BC6H_IO::WriteFunc_t writeFunc = BC6H_IO::g_writeFuncs[mode];
+
+        const int headerBits = modeInfo.m_partitioned ? 82 : 65;
+
+        for (int subset = 0; subset < 2; subset++)
+        {
+            for (int epi = 0; epi < 2; epi++)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                    eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
+            }
+        }
+
+        for (int px = 0; px < 16; px++)
+            indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
+
+        uint16_t modeID = modeInfo.m_modeID;
+
+        PackingVector pv;
+
+        {
+            uint32_t header[3];
+            writeFunc(header, modeID, partition,
+                eps[0][0][0], eps[0][1][0], eps[1][0][0], eps[1][1][0],
+                eps[0][0][1], eps[0][1][1], eps[1][0][1], eps[1][1][1],
+                eps[0][0][2], eps[0][1][2], eps[1][0][2], eps[1][1][2]
+            );
+
+            pv.InitPacked(header, headerBits);
+        }
+
+        int fixupIndex1 = 0;
+        int indexBits = 4;
+        if (modeInfo.m_partitioned)
+        {
+            fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
+            indexBits = 3;
+        }
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
+            if (px == 0 || px == fixupIndex1)
+                pv.Pack(index, indexBits - 1);
+            else
+                pv.Pack(index, indexBits);
+        }
+
+        pv.Flush(packedBlocks + 16 * block);
+    }
+}
+
+void cvtt::Internal::BC6HComputer::SignExtendSingle(int &v, int bits)
+{
+    // Two's complement sign extension: if bit (bits - 1) of v is set, turn on every bit from 'bits' upward.
+    v |= (v & (1 << (bits - 1))) ? -(1 << bits) : 0;
+}
+// Decodes the BC6H block at pBC into the 16 pixels of 'output'; isSigned selects signed endpoint and output handling.
+void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
+{
+    UnpackingVector pv;
+    pv.Init(pBC);
+
+    // The mode field is 2 bits wide when it reads 0 or 1 and 5 bits wide otherwise.
+    int modeBits = pv.Unpack(2);
+    if (modeBits != 0 && modeBits != 1)
+    {
+        // Fetch the 3 high bits of the 5-bit mode field.
+        modeBits |= pv.Unpack(3) << 2;
+    }
+
+    int mode = -1;
+    for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
+    {
+        if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
+        {
+            mode = possibleMode;
+            break;
+        }
+    }
+    // No table entry carries this mode ID: emit zero RGB with channel 3 set to 1.0 and stop.
+    if (mode < 0)
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                output.m_pixels[px][ch] = 0;
+            output.m_pixels[px][3] = 0x3c00;	// 1.0
+        }
+        return;
+    }
+
+    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
+    const int headerBits = modeInfo.m_partitioned ? 82 : 65;
+    const BC6H_IO::ReadFunc_t readFunc = BC6H_IO::g_readFuncs[mode];
+
+    uint16_t partition = 0;
+    int32_t eps[2][2][3];
+
+    for (int subset = 0; subset < 2; subset++)
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                eps[subset][epi][ch] = 0;
+    // Decode the header into the partition index and the raw (not yet sign-extended) endpoint fields.
+    {
+        uint32_t header[3];
+        uint16_t codedEPs[2][2][3];
+        pv.UnpackStart(header, headerBits);
+
+        readFunc(header, partition,
+            codedEPs[0][0][0], codedEPs[0][1][0], codedEPs[1][0][0], codedEPs[1][1][0],
+            codedEPs[0][0][1], codedEPs[0][1][1], codedEPs[1][0][1], codedEPs[1][1][1],
+            codedEPs[0][0][2], codedEPs[0][1][2], codedEPs[1][0][2], codedEPs[1][1][2]
+        );
+
+        for (int subset = 0; subset < 2; subset++)
+            for (int epi = 0; epi < 2; epi++)
+                for (int ch = 0; ch < 3; ch++)
+                    eps[subset][epi][ch] = codedEPs[subset][epi][ch];
+    }
+
+    // Single-subset modes use 4-bit indexes, partitioned modes 3-bit indexes.
+    // Pixel 0 and the pixel at g_fixupIndexes2[partition] (partitioned modes only) are stored with one bit less.
+    int fixupIndex1 = 0;
+    int indexBits = 4;
+    int numSubsets = 1;
+    if (modeInfo.m_partitioned)
+    {
+        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
+        indexBits = 3;
+        numSubsets = 2;
+    }
+
+    int indexes[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (px == 0 || px == fixupIndex1)
+            indexes[px] = pv.Unpack(indexBits - 1);
+        else
+            indexes[px] = pv.Unpack(indexBits);
+    }
+    // eps[0][0] is sign-extended only for signed data; the other endpoints also when the mode is transformed.
+    if (modeInfo.m_partitioned)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            if (isSigned)
+                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
+            if (modeInfo.m_transformed || isSigned)
+            {
+                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
+                SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
+                SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
+            }
+        }
+    }
+    else
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            if (isSigned)
+                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
+            if (modeInfo.m_transformed || isSigned)
+                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
+        }
+    }
+
+    int aPrec = modeInfo.m_aPrec;
+    // Transformed modes store the other endpoints as offsets from eps[0][0]; each sum wraps to aPrec bits.
+    if (modeInfo.m_transformed)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            int wrapMask = (1 << aPrec) - 1;
+
+            eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
+            if (isSigned)
+                SignExtendSingle(eps[0][1][ch], aPrec);
+
+            if (modeInfo.m_partitioned)
+            {
+                eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
+                eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
+
+                if (isSigned)
+                {
+                    SignExtendSingle(eps[1][0][ch], aPrec);
+                    SignExtendSingle(eps[1][1][ch], aPrec);
+                }
+            }
+        }
+    }
+
+    // Unquantize endpoints: full scale maps to 0xffff (unsigned) or +/-0x7fff (signed)
+    for (int subset = 0; subset < numSubsets; subset++)
+    {
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+            {
+                int &v = eps[subset][epi][ch];
+
+                if (isSigned)
+                {
+                    if (aPrec >= 16)
+                    {
+                        // Full precision: the value is used as stored
+                    }
+                    else
+                    {
+                        bool s = false;
+                        int comp = v;
+                        if (v < 0)
+                        {
+                            s = true;
+                            comp = -comp;
+                        }
+
+                        int unq = 0;
+                        if (comp == 0)
+                            unq = 0;
+                        else if (comp >= ((1 << (aPrec - 1)) - 1))
+                            unq = 0x7fff;
+                        else
+                            unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
+
+                        if (s)
+                            unq = -unq;
+
+                        v = unq;
+                    }
+                }
+                else
+                {
+                    if (aPrec >= 15)
+                    {
+                        // Full precision: the value is used as stored
+                    }
+                    else if (v == 0)
+                    {
+                        // Zero stays zero
+                    }
+                    else if (v == ((1 << aPrec) - 1))
+                        v = 0xffff;
+                    else
+                        v = ((v << 16) + 0x8000) >> aPrec;
+                }
+            }
+        }
+    }
+
+    const int *weights = BC7Data::g_weightTables[indexBits];
+    // Blend each pixel between its subset's two endpoints (weights are out of 64), then apply the final scale.
+    for (int px = 0; px < 16; px++)
+    {
+        int subset = 0;
+        if (modeInfo.m_partitioned)
+            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+
+        int w = weights[indexes[px]];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
+
+            if (isSigned)
+            {
+                if (comp < 0)
+                    comp = -(((-comp) * 31) >> 5);
+                else
+                    comp = (comp * 31) >> 5;
+                // Store as sign bit (0x8000) plus magnitude.
+                int s = 0;
+                if (comp < 0)
+                {
+                    s = 0x8000;
+                    comp = -comp;
+                }
+
+                output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
+            }
+            else
+            {
+                comp = (comp * 31) >> 6;
+                output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
+            }
+        }
+        output.m_pixels[px][3] = 0x3c00;	// 1.0
+    }
+}
+// Fills 'encodingPlan' from a quality level (clamped to 1..100) by applying a quality-proportional prefix of the BC7 priority code lists.
+void cvtt::Kernels::ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality)
+{
+    namespace Prio = cvtt::Tables::BC7Prio;
+    static const int kMaxQuality = 100;
+
+    // Clamp the requested quality into [1, kMaxQuality].
+    quality = (quality < 1) ? 1 : ((quality > kMaxQuality) ? kMaxQuality : quality);
+
+    // Only the first (g_bc7NumPrioCodes* * quality / kMaxQuality) codes of each list are applied.
+    const int rgbCodeCount = Prio::g_bc7NumPrioCodesRGB * quality / kMaxQuality;
+    const int rgbaCodeCount = Prio::g_bc7NumPrioCodesRGBA * quality / kMaxQuality;
+
+    const uint16_t *codeLists[] = { Prio::g_bc7PrioCodesRGB, Prio::g_bc7PrioCodesRGBA };
+    const int codeCounts[] = { rgbCodeCount, rgbaCodeCount };
+
+    // Every seed point count starts at zero; only entries named by an applied code are filled in.
+    BC7FineTuningParams tuning;
+    memset(&tuning, 0, sizeof(tuning));
+
+    for (int list = 0; list < 2; list++)
+    {
+        const uint16_t *codes = codeLists[list];
+        const int codeCount = codeCounts[list];
+
+        for (int i = 0; i < codeCount; i++)
+        {
+            const uint16_t code = codes[i];
+            const uint8_t seedPoints = static_cast<uint8_t>(Prio::UnpackSeedPointCount(code));
+            const int mode = Prio::UnpackMode(code);
+            // The mode packed into the code decides which table receives the seed point count.
+            switch (mode)
+            {
+            case 0:
+                tuning.mode0SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 1:
+                tuning.mode1SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 2:
+                tuning.mode2SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 3:
+                tuning.mode3SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 4:
+                tuning.mode4SP[Prio::UnpackRotation(code)][Prio::UnpackIndexSelector(code)] = seedPoints;
+                break;
+            case 5:
+                tuning.mode5SP[Prio::UnpackRotation(code)] = seedPoints;
+                break;
+            case 6:
+                tuning.mode6SP = seedPoints;
+                break;
+            case 7:
+                tuning.mode7SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            }
+        }
+    }
+
+    ConfigureBC7EncodingPlanFromFineTuningParams(encodingPlan, tuning);
+}
+
+// Rebuilds 'encodingPlan' from scratch out of per-mode seed point counts; partitions with a zero count are skipped. Always returns true.
+bool cvtt::Kernels::ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params)
+{
+    memset(&encodingPlan, 0, sizeof(encodingPlan));
+
+    // Mode 0: 16 partitions, 3 subsets each (RGB shapes)
+    for (int part = 0; part < 16; part++)
+    {
+        const uint8_t seedPoints = params.mode0SP[part];
+        if (seedPoints != 0)
+        {
+            encodingPlan.mode0PartitionEnabled |= static_cast<uint16_t>(1) << part;
+            // Every shape used by the partition keeps the largest seed point count requested for it.
+            for (int subset = 0; subset < 3; subset++)
+            {
+                const int shapeID = cvtt::Internal::BC7Data::g_shapes3[part][subset];
+                encodingPlan.seedPointsForShapeRGB[shapeID] = std::max(encodingPlan.seedPointsForShapeRGB[shapeID], seedPoints);
+            }
+        }
+    }
+
+    // Mode 1: 64 partitions, 2 subsets each (RGB shapes)
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seedPoints = params.mode1SP[part];
+        if (seedPoints != 0)
+        {
+            encodingPlan.mode1PartitionEnabled |= static_cast<uint64_t>(1) << part;
+            // Every shape used by the partition keeps the largest seed point count requested for it.
+            for (int subset = 0; subset < 2; subset++)
+            {
+                const int shapeID = cvtt::Internal::BC7Data::g_shapes2[part][subset];
+                encodingPlan.seedPointsForShapeRGB[shapeID] = std::max(encodingPlan.seedPointsForShapeRGB[shapeID], seedPoints);
+            }
+        }
+    }
+
+    // Mode 2: 64 partitions, 3 subsets each (RGB shapes)
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seedPoints = params.mode2SP[part];
+        if (seedPoints != 0)
+        {
+            encodingPlan.mode2PartitionEnabled |= static_cast<uint64_t>(1) << part;
+            // Every shape used by the partition keeps the largest seed point count requested for it.
+            for (int subset = 0; subset < 3; subset++)
+            {
+                const int shapeID = cvtt::Internal::BC7Data::g_shapes3[part][subset];
+                encodingPlan.seedPointsForShapeRGB[shapeID] = std::max(encodingPlan.seedPointsForShapeRGB[shapeID], seedPoints);
+            }
+        }
+    }
+
+    // Mode 3: 64 partitions, 2 subsets each (RGB shapes)
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seedPoints = params.mode3SP[part];
+        if (seedPoints != 0)
+        {
+            encodingPlan.mode3PartitionEnabled |= static_cast<uint64_t>(1) << part;
+            // Every shape used by the partition keeps the largest seed point count requested for it.
+            for (int subset = 0; subset < 2; subset++)
+            {
+                const int shapeID = cvtt::Internal::BC7Data::g_shapes2[part][subset];
+                encodingPlan.seedPointsForShapeRGB[shapeID] = std::max(encodingPlan.seedPointsForShapeRGB[shapeID], seedPoints);
+            }
+        }
+    }
+
+    // Mode 4: seed point counts are copied per rotation and index mode
+    for (int rot = 0; rot < 4; rot++)
+    {
+        encodingPlan.mode4SP[rot][0] = params.mode4SP[rot][0];
+        encodingPlan.mode4SP[rot][1] = params.mode4SP[rot][1];
+    }
+
+    // Mode 5: seed point counts are copied per rotation
+    for (int rot = 0; rot < 4; rot++)
+        encodingPlan.mode5SP[rot] = params.mode5SP[rot];
+
+    // Mode 6: enabled only by a non-zero seed point count
+    if (params.mode6SP != 0)
+    {
+        // The mode's one shape comes from g_shapes1[0][0] and is tracked in the RGBA seed point table.
+        const uint8_t seedPoints = params.mode6SP;
+        const int shapeID = cvtt::Internal::BC7Data::g_shapes1[0][0];
+
+        encodingPlan.mode6Enabled = true;
+
+        encodingPlan.seedPointsForShapeRGBA[shapeID] = std::max(encodingPlan.seedPointsForShapeRGBA[shapeID], seedPoints);
+    }
+
+    // Mode 7: 64 partitions, 2 subsets each (RGBA shapes)
+    for (int part = 0; part < 64; part++)
+    {
+        const uint8_t seedPoints = params.mode7SP[part];
+        if (seedPoints != 0)
+        {
+            encodingPlan.mode7RGBAPartitionEnabled |= static_cast<uint64_t>(1) << part;
+            // Every shape used by the partition keeps the largest seed point count requested for it.
+            for (int subset = 0; subset < 2; subset++)
+            {
+                const int shapeID = cvtt::Internal::BC7Data::g_shapes2[part][subset];
+                encodingPlan.seedPointsForShapeRGBA[shapeID] = std::max(encodingPlan.seedPointsForShapeRGBA[shapeID], seedPoints);
+            }
+        }
+    }
+
+    // List the RGB shapes that received seed points, in ascending shape order.
+    for (int shapeID = 0; shapeID < BC7EncodingPlan::kNumRGBShapes; shapeID++)
+    {
+        if (encodingPlan.seedPointsForShapeRGB[shapeID] == 0)
+            continue;
+
+        encodingPlan.rgbShapeList[encodingPlan.rgbNumShapesToEvaluate++] = shapeID;
+    }
+
+    // List the RGBA shapes that received seed points, in ascending shape order.
+    for (int shapeID = 0; shapeID < BC7EncodingPlan::kNumRGBAShapes; shapeID++)
+    {
+        if (encodingPlan.seedPointsForShapeRGBA[shapeID] == 0)
+            continue;
+
+        encodingPlan.rgbaShapeList[encodingPlan.rgbaNumShapesToEvaluate++] = shapeID;
+    }
+    // The RGB variant of mode 7 keeps only the partitions that are not also enabled for mode 3.
+    encodingPlan.mode7RGBPartitionEnabled = encodingPlan.mode7RGBAPartitionEnabled & ~encodingPlan.mode3PartitionEnabled;
+
+    return true;
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC67.h b/thirdparty/cvtt/ConvectionKernels_BC67.h
new file mode 100644
index 0000000000..b929711187
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC67.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "ConvectionKernels_ParallelMath.h"
+
+
+namespace cvtt // Forward declarations: this header names these types only in pointer or reference parameters.
+{
+    namespace Tables
+    {
+        namespace BC7SC
+        {
+            struct Table;
+        }
+    }
+
+    namespace Internal
+    {
+        namespace BC67
+        {
+            struct WorkInfo;
+        }
+
+        template<int TVectorSize>
+        class IndexSelectorHDR;
+    }
+
+    struct PixelBlockU8;
+}
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        class BC7Computer // Groups the BC7 routines; every member is static.
+        {
+        public:
+            static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds);
+            static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock);
+
+        private:
+            static const int MaxTweakRounds = 4;
+            // Shorthand for the ParallelMath types used in the signatures below.
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::Float MFloat;
+            // NOTE(review): CompressEndpoints0..7 presumably correspond to BC7 modes 0-7 - confirm against the definitions.
+            static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2]);
+            static void Quantize(MUInt15* color, int bits, int channels);
+            static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels);
+            static void Unquantize(MUInt15* color, int bits, int channels);
+            static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p);
+            static void CompressEndpoints2(MUInt15 ep[2][4]);
+            static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2]);
+            static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2]);
+            static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2]);
+            static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn);
+            // Generic helper; presumably exchanges a and b (definition is not in this header - TODO confirm).
+            template<class T>
+            static void Swap(T& a, T& b);
+        };
+
+        // Groups the BC6H pack/unpack entry points and their private helpers; every member is static.
+        class BC6HComputer
+        {
+        public:
+            static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds); // writes ParallelMath::ParallelSize blocks, 16 bytes apart, to packedBlocks
+            static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned); // decodes the block at pBC into 'output'
+
+        private:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+            static const int MaxTweakRounds = 4;
+            static const int MaxRefineRounds = 3;
+            // NOTE(review): the Signed/Unsigned pairs below presumably serve the signed and unsigned BC6H variants - confirm.
+            static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru);
+            static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru);
+            static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL);
+            static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished);
+            static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal);
+            static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal);
+            static void SignExtendSingle(int &v, int bits); // sign-extends the low 'bits' bits of v in place
+        };
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp
new file mode 100644
index 0000000000..753b6f9000
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp
@@ -0,0 +1,881 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC6H_IO.h"
+
+namespace cvtt
+{
+    namespace BC6H_IO
+    {
+        void WriteMode0(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x3u) | ((gy >> 2) & 0x4u) | ((by >> 1) & 0x8u) | (bz & 0x10u) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode1(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x3u) | ((gy >> 3) & 0x4u) | ((gz >> 1) & 0x18u) | ((uint32_t(rw) << 5) & 0xfe0u) | ((uint32_t(bz) << 12) & 0x3000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x3f8000u) | ((uint32_t(by) << 17) & 0x400000u) | ((uint32_t(bz) << 21) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bz >> 3) & 0x1u) | ((bz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0x1f8u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x7e000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x1f800000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x7eu) | ((uint32_t(rz) << 7) & 0x1f80u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode2(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0xf8u) | ((rw >> 2) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x1e000u) | ((uint32_t(gw) << 7) & 0x20000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x7800000u) | ((uint32_t(bw) << 17) & 0x8000000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode3(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x78u) | ((rw >> 3) & 0x80u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(gw) << 8) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x7800000u) | ((uint32_t(bw) << 17) & 0x8000000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x1eu) | ((uint32_t(bz) << 5) & 0x20u) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0x780u) | ((uint32_t(gy) << 7) & 0x800u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode4(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x78u) | ((rw >> 3) & 0x80u) | ((uint32_t(by) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x1e000u) | ((uint32_t(gw) << 7) & 0x20000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bw) << 18) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x1eu) | ((uint32_t(bz) << 4) & 0x60u) | ((uint32_t(rz) << 7) & 0x780u) | ((uint32_t(bz) << 7) & 0x800u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode5(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x3fe0u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0xff8000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x3u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode6(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x1fe0u) | ((uint32_t(gz) << 9) & 0x2000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x7f8000u) | ((uint32_t(bz) << 21) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x1u) | ((bz >> 2) & 0x6u) | ((uint32_t(rx) << 3) & 0x1f8u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x7eu) | ((uint32_t(rz) << 7) & 0x1f80u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode7(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x1fe0u) | ((uint32_t(bz) << 13) & 0x2000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x7f8000u) | ((uint32_t(gy) << 18) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x1u) | ((gz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x7e000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0xf800000u) | ((uint32_t(bz) << 27) & 0x10000000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode8(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x1fe0u) | ((uint32_t(bz) << 12) & 0x2000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x7f8000u) | ((uint32_t(by) << 18) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x1u) | ((bz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0xf8u) | ((uint32_t(gz) << 4) & 0x100u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x3e000u) | ((uint32_t(bz) << 18) & 0x40000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x1f800000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x3eu) | ((uint32_t(bz) << 4) & 0x40u) | ((uint32_t(rz) << 7) & 0xf80u) | ((uint32_t(bz) << 9) & 0x1000u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode9(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs the fields into encoded[0..2]. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7e0u) | ((uint32_t(gz) << 7) & 0x800u) | ((uint32_t(bz) << 12) & 0x3000u) | ((uint32_t(by) << 10) & 0x4000u) | ((uint32_t(gw) << 15) & 0x1f8000u) | ((uint32_t(gy) << 16) & 0x200000u) | ((uint32_t(by) << 17) & 0x400000u) | ((uint32_t(bz) << 21) & 0x800000u) | ((uint32_t(gy) << 20) & 0x1000000u) | ((uint32_t(bw) << 25) & 0x7e000000u) | ((uint32_t(gz) << 26) & 0x80000000u);
+            encoded[1] = ((bz >> 3) & 0x1u) | ((bz >> 4) & 0x2u) | ((bz >> 2) & 0x4u) | ((uint32_t(rx) << 3) & 0x1f8u) | ((uint32_t(gy) << 9) & 0x1e00u) | ((uint32_t(gx) << 13) & 0x7e000u) | ((uint32_t(gz) << 19) & 0x780000u) | ((uint32_t(bx) << 23) & 0x1f800000u) | ((uint32_t(by) << 29) & 0xe0000000u);
+            encoded[2] = ((by >> 3) & 0x1u) | ((uint32_t(ry) << 1) & 0x7eu) | ((uint32_t(rz) << 7) & 0x1f80u) | ((uint32_t(d) << 13) & 0x3e000u);
+        }
+
+        void WriteMode10(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw, gw, bw, rx, gx and bx into encoded[0..2]; the other parameters are unused here. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x1ff8u) | ((uint32_t(gx) << 13) & 0x7fe000u) | ((uint32_t(bx) << 23) & 0xff800000u);
+            encoded[2] = ((bx >> 9) & 0x1u);
+        }
+
+        void WriteMode11(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw, gw, bw, rx, gx and bx into encoded[0..2]; the other parameters are unused here. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0xff8u) | ((uint32_t(rw) << 2) & 0x1000u) | ((uint32_t(gx) << 13) & 0x3fe000u) | ((uint32_t(gw) << 12) & 0x400000u) | ((uint32_t(bx) << 23) & 0xff800000u);
+            encoded[2] = ((bw >> 10) & 0x1u);
+        }
+
+        void WriteMode12(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw, gw, bw, rx, gx and bx into encoded[0..2]; the other parameters are unused here. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x7f8u) | (rw & 0x800u) | ((uint32_t(rw) << 2) & 0x1000u) | ((uint32_t(gx) << 13) & 0x1fe000u) | ((uint32_t(gw) << 10) & 0x200000u) | ((uint32_t(gw) << 12) & 0x400000u) | ((uint32_t(bx) << 23) & 0x7f800000u) | ((uint32_t(bw) << 20) & 0x80000000u);
+            encoded[2] = ((bw >> 10) & 0x1u);
+        }
+
+        void WriteMode13(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        { // Packs m, rw, gw, bw, rx, gx and bx into encoded[0..2]; the other parameters are unused here. Left-shifted operands are widened to uint32_t first: shifting the int-promoted uint16_t (e.g. bw << 25) overflows a 32-bit int, which is undefined before C++20.
+            encoded[0] = (m & 0x1fu) | ((uint32_t(rw) << 5) & 0x7fe0u) | ((uint32_t(gw) << 15) & 0x1ff8000u) | ((uint32_t(bw) << 25) & 0xfe000000u);
+            encoded[1] = ((bw >> 7) & 0x7u) | ((uint32_t(rx) << 3) & 0x78u) | ((rw >> 8) & 0x80u) | ((rw >> 6) & 0x100u) | ((rw >> 4) & 0x200u) | ((rw >> 2) & 0x400u) | (rw & 0x800u) | ((uint32_t(rw) << 2) & 0x1000u) | ((uint32_t(gx) << 13) & 0x1e000u) | ((uint32_t(gw) << 2) & 0x20000u) | ((uint32_t(gw) << 4) & 0x40000u) | ((uint32_t(gw) << 6) & 0x80000u) | ((uint32_t(gw) << 8) & 0x100000u) | ((uint32_t(gw) << 10) & 0x200000u) | ((uint32_t(gw) << 12) & 0x400000u) | ((uint32_t(bx) << 23) & 0x7800000u) | ((uint32_t(bw) << 12) & 0x8000000u) | ((uint32_t(bw) << 14) & 0x10000000u) | ((uint32_t(bw) << 16) & 0x20000000u) | ((uint32_t(bw) << 18) & 0x40000000u) | ((uint32_t(bw) << 20) & 0x80000000u);
+            encoded[2] = ((bw >> 10) & 0x1u);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode0, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode0(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0x3ffu);
+            const uint32_t vRX = ((w1 >> 3) & 0x1fu);
+            const uint32_t vRY = ((w2 >> 1) & 0x1fu);
+            const uint32_t vRZ = ((w2 >> 7) & 0x1fu);
+            const uint32_t vGW = ((w0 >> 15) & 0x3ffu);
+            const uint32_t vGX = ((w1 >> 13) & 0x1fu);
+            const uint32_t vGY = ((w0 << 2) & 0x10u)
+                | ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w1 >> 4) & 0x10u)
+                | ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x380u);
+            const uint32_t vBX = ((w1 >> 23) & 0x1fu);
+            const uint32_t vBY = ((w0 << 1) & 0x10u)
+                | ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = (w0 & 0x10u)
+                | ((w1 >> 18) & 0x1u)
+                | ((w1 >> 27) & 0x2u)
+                | ((w2 >> 4) & 0x4u)
+                | ((w2 >> 9) & 0x8u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode1, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode1(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0x7fu);
+            const uint32_t vRX = ((w1 >> 3) & 0x3fu);
+            const uint32_t vRY = ((w2 >> 1) & 0x3fu);
+            const uint32_t vRZ = ((w2 >> 7) & 0x3fu);
+            const uint32_t vGW = ((w0 >> 15) & 0x7fu);
+            const uint32_t vGX = ((w1 >> 13) & 0x3fu);
+            const uint32_t vGY = ((w0 << 3) & 0x20u)
+                | ((w0 >> 20) & 0x10u)
+                | ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w0 << 1) & 0x30u)
+                | ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu);
+            const uint32_t vBX = ((w1 >> 23) & 0x3fu);
+            const uint32_t vBY = ((w0 >> 10) & 0x10u)
+                | ((w0 >> 17) & 0x20u)
+                | ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w0 >> 12) & 0x3u)
+                | ((w0 >> 21) & 0x4u)
+                | ((w1 << 3) & 0x8u)
+                | ((w1 << 4) & 0x20u)
+                | ((w1 << 2) & 0x10u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode2, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode2(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0x3ffu)
+                | ((w1 << 2) & 0x400u);
+            const uint32_t vRX = ((w1 >> 3) & 0x1fu);
+            const uint32_t vRY = ((w2 >> 1) & 0x1fu);
+            const uint32_t vRZ = ((w2 >> 7) & 0x1fu);
+            const uint32_t vGW = ((w0 >> 15) & 0x3ffu)
+                | ((w1 >> 7) & 0x400u);
+            const uint32_t vGX = ((w1 >> 13) & 0xfu);
+            const uint32_t vGY = ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x380u)
+                | ((w1 >> 17) & 0x400u);
+            const uint32_t vBX = ((w1 >> 23) & 0xfu);
+            const uint32_t vBY = ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w1 >> 18) & 0x1u)
+                | ((w1 >> 27) & 0x2u)
+                | ((w2 >> 4) & 0x4u)
+                | ((w2 >> 9) & 0x8u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode3, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode3(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0x3ffu)
+                | ((w1 << 3) & 0x400u);
+            const uint32_t vRX = ((w1 >> 3) & 0xfu);
+            const uint32_t vRY = ((w2 >> 1) & 0xfu);
+            const uint32_t vRZ = ((w2 >> 7) & 0xfu);
+            const uint32_t vGW = ((w0 >> 15) & 0x3ffu)
+                | ((w1 >> 8) & 0x400u);
+            const uint32_t vGX = ((w1 >> 13) & 0x1fu);
+            const uint32_t vGY = ((w1 >> 9) & 0xfu)
+                | ((w2 >> 7) & 0x10u);
+            const uint32_t vGZ = ((w1 >> 4) & 0x10u)
+                | ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x380u)
+                | ((w1 >> 17) & 0x400u);
+            const uint32_t vBX = ((w1 >> 23) & 0xfu);
+            const uint32_t vBY = ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w1 >> 27) & 0x2u)
+                | ((w2 >> 5) & 0x1u)
+                | ((w2 >> 4) & 0x4u)
+                | ((w2 >> 9) & 0x8u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode4, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode4(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0x3ffu)
+                | ((w1 << 3) & 0x400u);
+            const uint32_t vRX = ((w1 >> 3) & 0xfu);
+            const uint32_t vRY = ((w2 >> 1) & 0xfu);
+            const uint32_t vRZ = ((w2 >> 7) & 0xfu);
+            const uint32_t vGW = ((w0 >> 15) & 0x3ffu)
+                | ((w1 >> 7) & 0x400u);
+            const uint32_t vGX = ((w1 >> 13) & 0xfu);
+            const uint32_t vGY = ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x380u)
+                | ((w1 >> 18) & 0x400u);
+            const uint32_t vBX = ((w1 >> 23) & 0x1fu);
+            const uint32_t vBY = ((w1 >> 4) & 0x10u)
+                | ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w1 >> 18) & 0x1u)
+                | ((w2 >> 4) & 0x6u)
+                | ((w2 >> 7) & 0x10u)
+                | ((w2 >> 9) & 0x8u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode5, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode5(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0x1ffu);
+            const uint32_t vRX = ((w1 >> 3) & 0x1fu);
+            const uint32_t vRY = ((w2 >> 1) & 0x1fu);
+            const uint32_t vRZ = ((w2 >> 7) & 0x1fu);
+            const uint32_t vGW = ((w0 >> 15) & 0x1ffu);
+            const uint32_t vGX = ((w1 >> 13) & 0x1fu);
+            const uint32_t vGY = ((w0 >> 20) & 0x10u)
+                | ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w1 >> 4) & 0x10u)
+                | ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x180u);
+            const uint32_t vBX = ((w1 >> 23) & 0x1fu);
+            const uint32_t vBY = ((w0 >> 10) & 0x10u)
+                | ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w1 << 2) & 0x10u)
+                | ((w1 >> 18) & 0x1u)
+                | ((w1 >> 27) & 0x2u)
+                | ((w2 >> 4) & 0x4u)
+                | ((w2 >> 9) & 0x8u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode6, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode6(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0xffu);
+            const uint32_t vRX = ((w1 >> 3) & 0x3fu);
+            const uint32_t vRY = ((w2 >> 1) & 0x3fu);
+            const uint32_t vRZ = ((w2 >> 7) & 0x3fu);
+            const uint32_t vGW = ((w0 >> 15) & 0xffu);
+            const uint32_t vGX = ((w1 >> 13) & 0x1fu);
+            const uint32_t vGY = ((w0 >> 20) & 0x10u)
+                | ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w0 >> 9) & 0x10u)
+                | ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x80u);
+            const uint32_t vBX = ((w1 >> 23) & 0x1fu);
+            const uint32_t vBY = ((w0 >> 10) & 0x10u)
+                | ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w0 >> 21) & 0x4u)
+                | ((w1 << 2) & 0x18u)
+                | ((w1 >> 18) & 0x1u)
+                | ((w1 >> 27) & 0x2u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode7, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode7(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0xffu);
+            const uint32_t vRX = ((w1 >> 3) & 0x1fu);
+            const uint32_t vRY = ((w2 >> 1) & 0x1fu);
+            const uint32_t vRZ = ((w2 >> 7) & 0x1fu);
+            const uint32_t vGW = ((w0 >> 15) & 0xffu);
+            const uint32_t vGX = ((w1 >> 13) & 0x3fu);
+            const uint32_t vGY = ((w0 >> 18) & 0x20u)
+                | ((w0 >> 20) & 0x10u)
+                | ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w1 << 4) & 0x20u)
+                | ((w1 >> 4) & 0x10u)
+                | ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x80u);
+            const uint32_t vBX = ((w1 >> 23) & 0x1fu);
+            const uint32_t vBY = ((w0 >> 10) & 0x10u)
+                | ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w0 >> 13) & 0x1u)
+                | ((w1 << 2) & 0x10u)
+                | ((w1 >> 27) & 0x2u)
+                | ((w2 >> 4) & 0x4u)
+                | ((w2 >> 9) & 0x8u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        // Unpacks the fields stored in encoded[0..2] into the thirteen out-parameters.
+        // The shifts and masks mirror the bit positions written by WriteMode8, so a
+        // field whose bits are scattered is reassembled from several masked pieces.
+        // Only encoded[0], encoded[1] and encoded[2] are read, and every output is
+        // assigned unconditionally (no field is left untouched).
+        void ReadMode8(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Load the three source words once, before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // One masked piece per line; pieces of the same field are OR-ed together.
+            const uint32_t vD = ((w2 >> 13) & 0x1fu);
+            const uint32_t vRW = ((w0 >> 5) & 0xffu);
+            const uint32_t vRX = ((w1 >> 3) & 0x1fu);
+            const uint32_t vRY = ((w2 >> 1) & 0x1fu);
+            const uint32_t vRZ = ((w2 >> 7) & 0x1fu);
+            const uint32_t vGW = ((w0 >> 15) & 0xffu);
+            const uint32_t vGX = ((w1 >> 13) & 0x1fu);
+            const uint32_t vGY = ((w0 >> 20) & 0x10u)
+                | ((w1 >> 9) & 0xfu);
+            const uint32_t vGZ = ((w1 >> 4) & 0x10u)
+                | ((w1 >> 19) & 0xfu);
+            const uint32_t vBW = ((w0 >> 25) & 0x7fu)
+                | ((w1 << 7) & 0x80u);
+            const uint32_t vBX = ((w1 >> 23) & 0x3fu);
+            const uint32_t vBY = ((w0 >> 10) & 0x10u)
+                | ((w0 >> 18) & 0x20u)
+                | ((w1 >> 29) & 0x7u)
+                | ((w2 << 3) & 0x8u);
+            const uint32_t vBZ = ((w0 >> 12) & 0x2u)
+                | ((w1 << 4) & 0x20u)
+                | ((w1 << 2) & 0x10u)
+                | ((w1 >> 18) & 0x1u)
+                | ((w2 >> 4) & 0x4u)
+                | ((w2 >> 9) & 0x8u);
+
+            // Every mask above fits in 16 bits, so the narrowing casts drop nothing.
+            outD = static_cast<uint16_t>(vD);
+            outRW = static_cast<uint16_t>(vRW);
+            outRX = static_cast<uint16_t>(vRX);
+            outRY = static_cast<uint16_t>(vRY);
+            outRZ = static_cast<uint16_t>(vRZ);
+            outGW = static_cast<uint16_t>(vGW);
+            outGX = static_cast<uint16_t>(vGX);
+            outGY = static_cast<uint16_t>(vGY);
+            outGZ = static_cast<uint16_t>(vGZ);
+            outBW = static_cast<uint16_t>(vBW);
+            outBX = static_cast<uint16_t>(vBX);
+            outBY = static_cast<uint16_t>(vBY);
+            outBZ = static_cast<uint16_t>(vBZ);
+        }
+
+        void ReadMode9(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Unpacks the mode-9 bit layout from encoded[0..2]; all three words are read before any output is written.
+            // Field widths (from the masks): d 5 bits, every colour field 6 bits.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // d and the red fields are each stored as one contiguous bit run.
+            const uint32_t dVal = (w2 >> 13) & 0x1fu;
+            const uint32_t redW = (w0 >> 5) & 0x3fu;
+            const uint32_t redX = (w1 >> 3) & 0x3fu;
+            const uint32_t redY = (w2 >> 1) & 0x3fu;
+            const uint32_t redZ = (w2 >> 7) & 0x3fu;
+
+            // Green: gy and gz keep their two top bits apart from the low nibble.
+            const uint32_t grnW = (w0 >> 15) & 0x3fu;
+            const uint32_t grnX = (w1 >> 13) & 0x3fu;
+            const uint32_t grnY = ((w1 >> 9) & 0xfu)
+                                | ((w0 >> 20) & 0x10u)
+                                | ((w0 >> 16) & 0x20u);
+            const uint32_t grnZ = ((w1 >> 19) & 0xfu)
+                                | ((w0 >> 7) & 0x10u)
+                                | ((w0 >> 26) & 0x20u);
+
+            // Blue: by and bz are gathered from scattered bits (listed low to high).
+            const uint32_t bluW = (w0 >> 25) & 0x3fu;
+            const uint32_t bluX = (w1 >> 23) & 0x3fu;
+            const uint32_t bluY = ((w1 >> 29) & 0x7u)
+                                | ((w2 << 3) & 0x8u)
+                                | ((w0 >> 10) & 0x10u)
+                                | ((w0 >> 17) & 0x20u);
+            const uint32_t bluZ = ((w0 >> 12) & 0x3u)
+                                | ((w0 >> 21) & 0x4u)
+                                | ((w1 << 3) & 0x8u)
+                                | ((w1 << 2) & 0x10u)
+                                | ((w1 << 4) & 0x20u);
+
+            // Publish the results in the same order as the parameter list.
+            outD = static_cast<uint16_t>(dVal);
+            outRW = static_cast<uint16_t>(redW);
+            outRX = static_cast<uint16_t>(redX);
+            outRY = static_cast<uint16_t>(redY);
+            outRZ = static_cast<uint16_t>(redZ);
+            outGW = static_cast<uint16_t>(grnW);
+            outGX = static_cast<uint16_t>(grnX);
+            outGY = static_cast<uint16_t>(grnY);
+            outGZ = static_cast<uint16_t>(grnZ);
+            outBW = static_cast<uint16_t>(bluW);
+            outBX = static_cast<uint16_t>(bluX);
+            outBY = static_cast<uint16_t>(bluY);
+            outBZ = static_cast<uint16_t>(bluZ);
+        }
+
+        void ReadMode10(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Unpacks the mode-10 bit layout from encoded[0..2]; all three words are read before any output is written.
+            // Field widths (from the masks): rw/gw/bw and rx/gx/bx 10 bits; d and every *Y / *Z output are always 0.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // Red
+            const uint32_t redW = (w0 >> 5) & 0x3ffu;
+            const uint32_t redX = (w1 >> 3) & 0x3ffu;
+
+            // Green
+            const uint32_t grnW = (w0 >> 15) & 0x3ffu;
+            const uint32_t grnX = (w1 >> 13) & 0x3ffu;
+
+            // Blue: both fields straddle a word boundary.
+            const uint32_t bluW = ((w0 >> 25) & 0x7fu)
+                                | ((w1 << 7) & 0x380u);
+            const uint32_t bluX = ((w1 >> 23) & 0x1ffu)
+                                | ((w2 << 9) & 0x200u);
+
+            // Publish the results in the same order as the parameter list.
+            outD = 0;
+            outRW = static_cast<uint16_t>(redW);
+            outRX = static_cast<uint16_t>(redX);
+            outRY = 0;
+            outRZ = 0;
+            outGW = static_cast<uint16_t>(grnW);
+            outGX = static_cast<uint16_t>(grnX);
+            outGY = 0;
+            outGZ = 0;
+            outBW = static_cast<uint16_t>(bluW);
+            outBX = static_cast<uint16_t>(bluX);
+            outBY = 0;
+            outBZ = 0;
+        }
+
+        void ReadMode11(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Unpacks the mode-11 bit layout from encoded[0..2]; all three words are read before any output is written.
+            // Field widths (from the masks): rw/gw/bw 11 bits, rx/gx/bx 9 bits; d and every *Y / *Z output are always 0.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // Red: bit 10 of rw is stored just above rx.
+            const uint32_t redW = ((w0 >> 5) & 0x3ffu)
+                                | ((w1 >> 2) & 0x400u);
+            const uint32_t redX = (w1 >> 3) & 0x1ffu;
+
+            // Green: bit 10 of gw is stored just above gx.
+            const uint32_t grnW = ((w0 >> 15) & 0x3ffu)
+                                | ((w1 >> 12) & 0x400u);
+            const uint32_t grnX = (w1 >> 13) & 0x1ffu;
+
+            // Blue: bw straddles words 0 and 1, with bit 10 taken from the lowest bit of word 2.
+            const uint32_t bluW = ((w0 >> 25) & 0x7fu)
+                                | ((w1 << 7) & 0x380u)
+                                | ((w2 << 10) & 0x400u);
+            const uint32_t bluX = (w1 >> 23) & 0x1ffu;
+
+            // Publish the results in the same order as the parameter list.
+            outD = 0;
+            outRW = static_cast<uint16_t>(redW);
+            outRX = static_cast<uint16_t>(redX);
+            outRY = 0;
+            outRZ = 0;
+            outGW = static_cast<uint16_t>(grnW);
+            outGX = static_cast<uint16_t>(grnX);
+            outGY = 0;
+            outGZ = 0;
+            outBW = static_cast<uint16_t>(bluW);
+            outBX = static_cast<uint16_t>(bluX);
+            outBY = 0;
+            outBZ = 0;
+        }
+
+        void ReadMode12(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Unpacks the mode-12 bit layout from encoded[0..2]; all three words are read before any output is written.
+            // Field widths (from the masks): rw/gw/bw 12 bits, rx/gx/bx 8 bits; d and every *Y / *Z output are always 0.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // Red: bits 10 and 11 of rw sit above rx in reversed order.
+            const uint32_t redW = ((w0 >> 5) & 0x3ffu)
+                                | ((w1 >> 2) & 0x400u)
+                                | (w1 & 0x800u);
+            const uint32_t redX = (w1 >> 3) & 0xffu;
+
+            // Green: bits 10 and 11 of gw sit above gx in reversed order.
+            const uint32_t grnW = ((w0 >> 15) & 0x3ffu)
+                                | ((w1 >> 12) & 0x400u)
+                                | ((w1 >> 10) & 0x800u);
+            const uint32_t grnX = (w1 >> 13) & 0xffu;
+
+            // Blue: bw straddles words 0 and 1; bit 10 comes from word 2 and bit 11 from the top of word 1.
+            const uint32_t bluW = ((w0 >> 25) & 0x7fu)
+                                | ((w1 << 7) & 0x380u)
+                                | ((w2 << 10) & 0x400u)
+                                | ((w1 >> 20) & 0x800u);
+            const uint32_t bluX = (w1 >> 23) & 0xffu;
+
+            // Publish the results in the same order as the parameter list.
+            outD = 0;
+            outRW = static_cast<uint16_t>(redW);
+            outRX = static_cast<uint16_t>(redX);
+            outRY = 0;
+            outRZ = 0;
+            outGW = static_cast<uint16_t>(grnW);
+            outGX = static_cast<uint16_t>(grnX);
+            outGY = 0;
+            outGZ = 0;
+            outBW = static_cast<uint16_t>(bluW);
+            outBX = static_cast<uint16_t>(bluX);
+            outBY = 0;
+            outBZ = 0;
+        }
+
+        void ReadMode13(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Unpacks the mode-13 bit layout from encoded[0..2]; all three words are read before any output is written.
+            // Field widths (from the masks): rw/gw/bw 16 bits, rx/gx/bx 4 bits; d and every *Y / *Z output are always 0.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            // Red: bits 10..15 of rw are stored in reversed order above rx.
+            const uint32_t redW = ((w0 >> 5) & 0x3ffu)
+                                | ((w1 >> 2) & 0x400u)
+                                | (w1 & 0x800u)
+                                | ((w1 << 2) & 0x1000u)
+                                | ((w1 << 4) & 0x2000u)
+                                | ((w1 << 6) & 0x4000u)
+                                | ((w1 << 8) & 0x8000u);
+            const uint32_t redX = (w1 >> 3) & 0xfu;
+
+            // Green: bits 10..15 of gw are stored in reversed order above gx.
+            const uint32_t grnW = ((w0 >> 15) & 0x3ffu)
+                                | ((w1 >> 12) & 0x400u)
+                                | ((w1 >> 10) & 0x800u)
+                                | ((w1 >> 8) & 0x1000u)
+                                | ((w1 >> 6) & 0x2000u)
+                                | ((w1 >> 4) & 0x4000u)
+                                | ((w1 >> 2) & 0x8000u);
+            const uint32_t grnX = (w1 >> 13) & 0xfu;
+
+            // Blue: bw straddles words 0 and 1; bit 10 comes from word 2, bits 11..15 are reversed above bx.
+            const uint32_t bluW = ((w0 >> 25) & 0x7fu)
+                                | ((w1 << 7) & 0x380u)
+                                | ((w2 << 10) & 0x400u)
+                                | ((w1 >> 20) & 0x800u)
+                                | ((w1 >> 18) & 0x1000u)
+                                | ((w1 >> 16) & 0x2000u)
+                                | ((w1 >> 14) & 0x4000u)
+                                | ((w1 >> 12) & 0x8000u);
+            const uint32_t bluX = (w1 >> 23) & 0xfu;
+
+            // Publish the results in the same order as the parameter list.
+            outD = 0;
+            outRW = static_cast<uint16_t>(redW);
+            outRX = static_cast<uint16_t>(redX);
+            outRY = 0;
+            outRZ = 0;
+            outGW = static_cast<uint16_t>(grnW);
+            outGX = static_cast<uint16_t>(grnX);
+            outGY = 0;
+            outGZ = 0;
+            outBW = static_cast<uint16_t>(bluW);
+            outBX = static_cast<uint16_t>(bluX);
+            outBY = 0;
+            outBZ = 0;
+        }
+
+        // Mode-indexed decoder table: slot N dispatches to ReadModeN.
+        const ReadFunc_t g_readFuncs[14] = {
+            &ReadMode0,   // [0]
+            &ReadMode1,   // [1]
+            &ReadMode2,   // [2]
+            &ReadMode3,   // [3]
+            &ReadMode4,   // [4]
+            &ReadMode5,   // [5]
+            &ReadMode6,   // [6]
+            &ReadMode7,   // [7]
+            &ReadMode8,   // [8]
+            &ReadMode9,   // [9]
+            &ReadMode10,  // [10]
+            &ReadMode11,  // [11]
+            &ReadMode12,  // [12]
+            &ReadMode13,  // [13]
+        };
+
+        // Mode-indexed encoder table: slot N dispatches to WriteModeN.
+        const WriteFunc_t g_writeFuncs[14] = {
+            &WriteMode0,   // [0]
+            &WriteMode1,   // [1]
+            &WriteMode2,   // [2]
+            &WriteMode3,   // [3]
+            &WriteMode4,   // [4]
+            &WriteMode5,   // [5]
+            &WriteMode6,   // [6]
+            &WriteMode7,   // [7]
+            &WriteMode8,   // [8]
+            &WriteMode9,   // [9]
+            &WriteMode10,  // [10]
+            &WriteMode11,  // [11]
+            &WriteMode12,  // [12]
+            &WriteMode13,  // [13]
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h
new file mode 100644
index 0000000000..a7bb517b54
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <stdint.h>
+
+// ReadFunc_t fills d/rw..bz from `encoded`; WriteFunc_t takes `m` plus the same fields by value and a writable `encoded`.
+namespace cvtt
+{
+    namespace BC6H_IO
+    {
+        typedef void (*ReadFunc_t)(const uint32_t *encoded, uint16_t &d, uint16_t &rw, uint16_t &rx, uint16_t &ry, uint16_t &rz, uint16_t &gw, uint16_t &gx, uint16_t &gy, uint16_t &gz, uint16_t &bw, uint16_t &bx, uint16_t &by, uint16_t &bz);
+        typedef void (*WriteFunc_t)(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz);
+        // Dispatch tables with 14 slots each (ReadMode0..13 / WriteMode0..13), defined next to those functions.
+        extern const ReadFunc_t g_readFuncs[14];
+        extern const WriteFunc_t g_writeFuncs[14];
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h b/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h
new file mode 100644
index 0000000000..1880e22d0f
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <stdint.h>
+
+namespace cvtt { namespace Tables { namespace BC7Prio {
+    extern const uint16_t *g_bc7PrioCodesRGB; // 16-bit packed codes for RGB; declared here, defined in another translation unit
+    extern const int g_bc7NumPrioCodesRGB; // presumably the entry count of g_bc7PrioCodesRGB - confirm at the definition
+    // RGBA counterparts of the two declarations above.
+    extern const uint16_t *g_bc7PrioCodesRGBA;
+    extern const int g_bc7NumPrioCodesRGBA;
+    // Accessors for one packed code; presumably the inverse of BC7_MODE_PRIO_CODE (definitions are not in this header - confirm).
+    int UnpackMode(uint16_t packed);
+    int UnpackSeedPointCount(uint16_t packed);
+    int UnpackPartition(uint16_t packed);
+    int UnpackRotation(uint16_t packed);
+    int UnpackIndexSelector(uint16_t packed);
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp b/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp
new file mode 100644
index 0000000000..5b3134f860
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp
@@ -0,0 +1,1301 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC7_Prio.h"
+
+#define BC7_PARTITION_BITS  6
+#define BC7_PARTITION_OFFSET_BITS  0
+// Rotation starts at the same offset as the partition, so the two share the low bits of a code.
+#define BC7_ROTATION_BITS   2
+#define BC7_ROTATION_OFFSET_BITS    0
+// The index mode bit sits directly above the rotation bits.
+#define BC7_INDEX_MODE_BITS 1
+#define BC7_INDEX_MODE_OFFSET_BITS (BC7_ROTATION_OFFSET_BITS + BC7_ROTATION_BITS)
+// The mode sits above the partition bits, and the seed point count above the mode.
+#define BC7_MODE_BITS 3
+#define BC7_MODE_OFFSET_BITS (BC7_PARTITION_OFFSET_BITS + BC7_PARTITION_BITS)
+#define BC7_SEED_POINT_COUNT_BITS  2
+#define BC7_SEED_POINT_COUNT_OFFSET_BITS  (BC7_MODE_BITS + BC7_MODE_OFFSET_BITS)
+
+// Resulting layout: bits 0-5 partition (or bits 0-1 rotation + bit 2 index mode), bits 6-8 mode, bits 9-10 seedPointCount - 1.
+// Dual-plane subData is read as two decimal digits: tens -> rotation, units -> index mode. Macro arguments are parenthesized.
+#define BC7_MODE_PRIO_DUAL_PLANE(subData)   \
+    ( \
+        (((subData) / 10) << BC7_ROTATION_OFFSET_BITS) | \
+        (((subData) % 10) << BC7_INDEX_MODE_OFFSET_BITS) \
+    )
+// Packs one code: modes 4 and 5 use the dual-plane encoding of subData, every other mode stores subData as the partition.
+#define BC7_MODE_PRIO_CODE(seedPointCount, mode, subData)   \
+    (\
+        (((seedPointCount) - 1) << BC7_SEED_POINT_COUNT_OFFSET_BITS) |  \
+        ((mode) << BC7_MODE_OFFSET_BITS) |   \
+        (((mode) == 4 || (mode) == 5) ? BC7_MODE_PRIO_DUAL_PLANE(subData) : ((subData) << BC7_PARTITION_OFFSET_BITS)) \
+    )
+
+namespace cvtt { namespace Tables { namespace BC7Prio {
+    const uint16_t g_bc7PrioCodesRGBData[] =
+    {
+        BC7_MODE_PRIO_CODE(1, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 0, 1),
+        BC7_MODE_PRIO_CODE(1, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 9),
+        BC7_MODE_PRIO_CODE(1, 1, 6),
+        BC7_MODE_PRIO_CODE(1, 1, 1),
+        BC7_MODE_PRIO_CODE(1, 1, 2),
+        BC7_MODE_PRIO_CODE(1, 0, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 7),
+        BC7_MODE_PRIO_CODE(1, 1, 16),
+        BC7_MODE_PRIO_CODE(1, 1, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 14),
+        BC7_MODE_PRIO_CODE(1, 0, 13),
+        BC7_MODE_PRIO_CODE(1, 0, 14),
+        BC7_MODE_PRIO_CODE(1, 0, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 22),
+        BC7_MODE_PRIO_CODE(1, 0, 8),
+        BC7_MODE_PRIO_CODE(1, 0, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 8),
+        BC7_MODE_PRIO_CODE(1, 3, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 19),
+        BC7_MODE_PRIO_CODE(1, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 23),
+        BC7_MODE_PRIO_CODE(1, 1, 3),
+        BC7_MODE_PRIO_CODE(2, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 1, 21),
+        BC7_MODE_PRIO_CODE(1, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 29),
+        BC7_MODE_PRIO_CODE(1, 1, 26),
+        BC7_MODE_PRIO_CODE(1, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 0, 4),
+        BC7_MODE_PRIO_CODE(2, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 0),
+        BC7_MODE_PRIO_CODE(2, 0, 10),
+        BC7_MODE_PRIO_CODE(3, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 11),
+        BC7_MODE_PRIO_CODE(1, 4, 10),
+        BC7_MODE_PRIO_CODE(2, 0, 8),
+        BC7_MODE_PRIO_CODE(2, 0, 11),
+        BC7_MODE_PRIO_CODE(2, 0, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 4),
+        BC7_MODE_PRIO_CODE(3, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 12),
+        BC7_MODE_PRIO_CODE(1, 1, 18),
+        BC7_MODE_PRIO_CODE(1, 3, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 5),
+        BC7_MODE_PRIO_CODE(1, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 1, 25),
+        BC7_MODE_PRIO_CODE(1, 0, 7),
+        BC7_MODE_PRIO_CODE(3, 0, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 24),
+        BC7_MODE_PRIO_CODE(3, 0, 8),
+        BC7_MODE_PRIO_CODE(3, 1, 0),
+        BC7_MODE_PRIO_CODE(2, 1, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 14),
+        BC7_MODE_PRIO_CODE(3, 0, 13),
+        BC7_MODE_PRIO_CODE(3, 0, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 16),
+        BC7_MODE_PRIO_CODE(2, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 3),
+        BC7_MODE_PRIO_CODE(4, 0, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 1),
+        BC7_MODE_PRIO_CODE(1, 0, 2),
+        BC7_MODE_PRIO_CODE(2, 1, 2),
+        BC7_MODE_PRIO_CODE(4, 0, 8),
+        BC7_MODE_PRIO_CODE(1, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 0, 15),
+        BC7_MODE_PRIO_CODE(1, 0, 6),
+        BC7_MODE_PRIO_CODE(1, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 1, 23),
+        BC7_MODE_PRIO_CODE(4, 0, 13),
+        BC7_MODE_PRIO_CODE(4, 0, 11),
+        BC7_MODE_PRIO_CODE(1, 2, 17),
+        BC7_MODE_PRIO_CODE(2, 1, 6),
+        BC7_MODE_PRIO_CODE(2, 1, 7),
+        BC7_MODE_PRIO_CODE(4, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 1, 19),
+        BC7_MODE_PRIO_CODE(1, 1, 30),
+        BC7_MODE_PRIO_CODE(2, 3, 13),
+        BC7_MODE_PRIO_CODE(3, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 21),
+        BC7_MODE_PRIO_CODE(4, 1, 0),
+        BC7_MODE_PRIO_CODE(3, 0, 15),
+        BC7_MODE_PRIO_CODE(2, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 1, 28),
+        BC7_MODE_PRIO_CODE(1, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 0, 4),
+        BC7_MODE_PRIO_CODE(1, 2, 63),
+        BC7_MODE_PRIO_CODE(4, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 26),
+        BC7_MODE_PRIO_CODE(2, 0, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 1, 61),
+        BC7_MODE_PRIO_CODE(2, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 0, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 10),
+        BC7_MODE_PRIO_CODE(2, 4, 31),
+        BC7_MODE_PRIO_CODE(2, 0, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 11),
+        BC7_MODE_PRIO_CODE(4, 0, 15),
+        BC7_MODE_PRIO_CODE(3, 1, 14),
+        BC7_MODE_PRIO_CODE(2, 0, 0),
+        BC7_MODE_PRIO_CODE(3, 1, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 0),
+        BC7_MODE_PRIO_CODE(3, 0, 1),
+        BC7_MODE_PRIO_CODE(1, 1, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 5),
+        BC7_MODE_PRIO_CODE(1, 1, 27),
+        BC7_MODE_PRIO_CODE(2, 1, 18),
+        BC7_MODE_PRIO_CODE(3, 0, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 3),
+        BC7_MODE_PRIO_CODE(2, 0, 2),
+        BC7_MODE_PRIO_CODE(3, 1, 16),
+        BC7_MODE_PRIO_CODE(3, 1, 2),
+        BC7_MODE_PRIO_CODE(1, 1, 31),
+        BC7_MODE_PRIO_CODE(3, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 1, 4),
+        BC7_MODE_PRIO_CODE(1, 1, 62),
+        BC7_MODE_PRIO_CODE(2, 0, 12),
+        BC7_MODE_PRIO_CODE(3, 0, 4),
+        BC7_MODE_PRIO_CODE(4, 0, 4),
+        BC7_MODE_PRIO_CODE(1, 1, 33),
+        BC7_MODE_PRIO_CODE(3, 1, 23),
+        BC7_MODE_PRIO_CODE(2, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 0, 6),
+        BC7_MODE_PRIO_CODE(2, 1, 24),
+        BC7_MODE_PRIO_CODE(1, 1, 59),
+        BC7_MODE_PRIO_CODE(1, 1, 63),
+        BC7_MODE_PRIO_CODE(3, 0, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 52),
+        BC7_MODE_PRIO_CODE(4, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 1, 22),
+        BC7_MODE_PRIO_CODE(4, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 2, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 7),
+        BC7_MODE_PRIO_CODE(4, 0, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 8),
+        BC7_MODE_PRIO_CODE(4, 0, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 0, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 6),
+        BC7_MODE_PRIO_CODE(4, 1, 14),
+        BC7_MODE_PRIO_CODE(1, 3, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 0, 6),
+        BC7_MODE_PRIO_CODE(3, 0, 2),
+        BC7_MODE_PRIO_CODE(1, 1, 32),
+        BC7_MODE_PRIO_CODE(4, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 8),
+        BC7_MODE_PRIO_CODE(2, 1, 9),
+        BC7_MODE_PRIO_CODE(1, 2, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 15),
+        BC7_MODE_PRIO_CODE(4, 0, 6),
+        BC7_MODE_PRIO_CODE(3, 1, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 31),
+        BC7_MODE_PRIO_CODE(3, 3, 13),
+        BC7_MODE_PRIO_CODE(4, 0, 0),
+        BC7_MODE_PRIO_CODE(3, 1, 19),
+        BC7_MODE_PRIO_CODE(4, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 1),
+        BC7_MODE_PRIO_CODE(4, 0, 2),
+        BC7_MODE_PRIO_CODE(1, 3, 2),
+        BC7_MODE_PRIO_CODE(1, 2, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 58),
+        BC7_MODE_PRIO_CODE(1, 3, 14),
+        BC7_MODE_PRIO_CODE(4, 1, 3),
+        BC7_MODE_PRIO_CODE(3, 1, 21),
+        BC7_MODE_PRIO_CODE(2, 2, 8),
+        BC7_MODE_PRIO_CODE(1, 2, 19),
+        BC7_MODE_PRIO_CODE(4, 1, 16),
+        BC7_MODE_PRIO_CODE(4, 1, 2),
+        BC7_MODE_PRIO_CODE(2, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 2, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 1, 47),
+        BC7_MODE_PRIO_CODE(1, 3, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 21),
+        BC7_MODE_PRIO_CODE(1, 2, 62),
+        BC7_MODE_PRIO_CODE(2, 2, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 26),
+        BC7_MODE_PRIO_CODE(1, 1, 53),
+        BC7_MODE_PRIO_CODE(2, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 2, 13),
+        BC7_MODE_PRIO_CODE(4, 1, 23),
+        BC7_MODE_PRIO_CODE(4, 1, 6),
+        BC7_MODE_PRIO_CODE(4, 1, 7),
+        BC7_MODE_PRIO_CODE(1, 2, 25),
+        BC7_MODE_PRIO_CODE(1, 1, 57),
+        BC7_MODE_PRIO_CODE(2, 1, 60),
+        BC7_MODE_PRIO_CODE(1, 2, 20),
+        BC7_MODE_PRIO_CODE(3, 1, 8),
+        BC7_MODE_PRIO_CODE(4, 1, 29),
+        BC7_MODE_PRIO_CODE(4, 1, 19),
+        BC7_MODE_PRIO_CODE(3, 2, 8),
+        BC7_MODE_PRIO_CODE(2, 4, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 61),
+        BC7_MODE_PRIO_CODE(2, 1, 30),
+        BC7_MODE_PRIO_CODE(3, 1, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 63),
+        BC7_MODE_PRIO_CODE(2, 3, 1),
+        BC7_MODE_PRIO_CODE(2, 1, 28),
+        BC7_MODE_PRIO_CODE(2, 1, 62),
+        BC7_MODE_PRIO_CODE(3, 2, 13),
+        BC7_MODE_PRIO_CODE(2, 2, 63),
+        BC7_MODE_PRIO_CODE(2, 1, 33),
+        BC7_MODE_PRIO_CODE(2, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 18),
+        BC7_MODE_PRIO_CODE(2, 5, 30),
+        BC7_MODE_PRIO_CODE(3, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 1, 55),
+        BC7_MODE_PRIO_CODE(3, 1, 17),
+        BC7_MODE_PRIO_CODE(2, 3, 2),
+        BC7_MODE_PRIO_CODE(1, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 27),
+        BC7_MODE_PRIO_CODE(1, 2, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 26),
+        BC7_MODE_PRIO_CODE(3, 1, 9),
+        BC7_MODE_PRIO_CODE(2, 3, 14),
+        BC7_MODE_PRIO_CODE(3, 1, 4),
+        BC7_MODE_PRIO_CODE(3, 1, 24),
+        BC7_MODE_PRIO_CODE(3, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 3, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 12),
+        BC7_MODE_PRIO_CODE(2, 1, 32),
+        BC7_MODE_PRIO_CODE(2, 3, 15),
+        BC7_MODE_PRIO_CODE(4, 2, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 60),
+        BC7_MODE_PRIO_CODE(1, 2, 32),
+        BC7_MODE_PRIO_CODE(1, 1, 40),
+        BC7_MODE_PRIO_CODE(4, 1, 18),
+        BC7_MODE_PRIO_CODE(2, 1, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 22),
+        BC7_MODE_PRIO_CODE(3, 2, 16),
+        BC7_MODE_PRIO_CODE(3, 1, 20),
+        BC7_MODE_PRIO_CODE(4, 1, 4),
+        BC7_MODE_PRIO_CODE(2, 1, 31),
+        BC7_MODE_PRIO_CODE(4, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 2, 24),
+        BC7_MODE_PRIO_CODE(4, 1, 24),
+        BC7_MODE_PRIO_CODE(2, 1, 58),
+        BC7_MODE_PRIO_CODE(4, 2, 8),
+        BC7_MODE_PRIO_CODE(1, 2, 22),
+        BC7_MODE_PRIO_CODE(1, 2, 23),
+        BC7_MODE_PRIO_CODE(1, 3, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 1, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 29),
+        BC7_MODE_PRIO_CODE(1, 2, 57),
+        BC7_MODE_PRIO_CODE(2, 2, 19),
+        BC7_MODE_PRIO_CODE(1, 2, 53),
+        BC7_MODE_PRIO_CODE(1, 2, 55),
+        BC7_MODE_PRIO_CODE(3, 2, 63),
+        BC7_MODE_PRIO_CODE(3, 1, 60),
+        BC7_MODE_PRIO_CODE(4, 1, 8),
+        BC7_MODE_PRIO_CODE(2, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 1, 35),
+        BC7_MODE_PRIO_CODE(4, 4, 31),
+        BC7_MODE_PRIO_CODE(4, 1, 9),
+        BC7_MODE_PRIO_CODE(1, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 58),
+        BC7_MODE_PRIO_CODE(2, 3, 29),
+        BC7_MODE_PRIO_CODE(1, 1, 45),
+        BC7_MODE_PRIO_CODE(4, 2, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 42),
+        BC7_MODE_PRIO_CODE(1, 3, 3),
+        BC7_MODE_PRIO_CODE(4, 2, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 63),
+        BC7_MODE_PRIO_CODE(3, 1, 30),
+        BC7_MODE_PRIO_CODE(1, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 1, 62),
+        BC7_MODE_PRIO_CODE(1, 1, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 2, 14),
+        BC7_MODE_PRIO_CODE(1, 1, 48),
+        BC7_MODE_PRIO_CODE(2, 1, 57),
+        BC7_MODE_PRIO_CODE(2, 1, 52),
+        BC7_MODE_PRIO_CODE(1, 2, 61),
+        BC7_MODE_PRIO_CODE(3, 1, 33),
+        BC7_MODE_PRIO_CODE(1, 1, 51),
+        BC7_MODE_PRIO_CODE(4, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 1, 22),
+        BC7_MODE_PRIO_CODE(1, 3, 19),
+        BC7_MODE_PRIO_CODE(1, 2, 36),
+        BC7_MODE_PRIO_CODE(2, 5, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 28),
+        BC7_MODE_PRIO_CODE(2, 2, 14),
+        BC7_MODE_PRIO_CODE(1, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 26),
+        BC7_MODE_PRIO_CODE(2, 1, 53),
+        BC7_MODE_PRIO_CODE(4, 3, 13),
+        BC7_MODE_PRIO_CODE(2, 2, 21),
+        BC7_MODE_PRIO_CODE(3, 4, 10),
+        BC7_MODE_PRIO_CODE(4, 1, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 2, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 47),
+        BC7_MODE_PRIO_CODE(1, 2, 52),
+        BC7_MODE_PRIO_CODE(3, 1, 32),
+        BC7_MODE_PRIO_CODE(1, 2, 40),
+        BC7_MODE_PRIO_CODE(1, 2, 31),
+        BC7_MODE_PRIO_CODE(3, 1, 27),
+        BC7_MODE_PRIO_CODE(3, 2, 18),
+        BC7_MODE_PRIO_CODE(2, 3, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 55),
+        BC7_MODE_PRIO_CODE(4, 1, 61),
+        BC7_MODE_PRIO_CODE(3, 2, 14),
+        BC7_MODE_PRIO_CODE(3, 1, 31),
+        BC7_MODE_PRIO_CODE(1, 2, 34),
+        BC7_MODE_PRIO_CODE(3, 2, 19),
+        BC7_MODE_PRIO_CODE(2, 3, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 30),
+        BC7_MODE_PRIO_CODE(1, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 2, 28),
+        BC7_MODE_PRIO_CODE(4, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 40),
+        BC7_MODE_PRIO_CODE(2, 2, 22),
+        BC7_MODE_PRIO_CODE(4, 1, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 7),
+        BC7_MODE_PRIO_CODE(1, 1, 50),
+        BC7_MODE_PRIO_CODE(2, 1, 41),
+        BC7_MODE_PRIO_CODE(1, 2, 9),
+        BC7_MODE_PRIO_CODE(1, 2, 39),
+        BC7_MODE_PRIO_CODE(2, 2, 25),
+        BC7_MODE_PRIO_CODE(1, 3, 6),
+        BC7_MODE_PRIO_CODE(3, 2, 21),
+        BC7_MODE_PRIO_CODE(1, 1, 37),
+        BC7_MODE_PRIO_CODE(2, 2, 58),
+        BC7_MODE_PRIO_CODE(3, 3, 29),
+        BC7_MODE_PRIO_CODE(4, 1, 62),
+        BC7_MODE_PRIO_CODE(1, 2, 35),
+        BC7_MODE_PRIO_CODE(3, 1, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 28),
+        BC7_MODE_PRIO_CODE(1, 3, 23),
+        BC7_MODE_PRIO_CODE(4, 1, 30),
+        BC7_MODE_PRIO_CODE(2, 1, 45),
+        BC7_MODE_PRIO_CODE(1, 3, 16),
+        BC7_MODE_PRIO_CODE(4, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 38),
+        BC7_MODE_PRIO_CODE(4, 1, 63),
+        BC7_MODE_PRIO_CODE(1, 3, 22),
+        BC7_MODE_PRIO_CODE(1, 2, 30),
+        BC7_MODE_PRIO_CODE(2, 2, 31),
+        BC7_MODE_PRIO_CODE(1, 3, 20),
+        BC7_MODE_PRIO_CODE(2, 2, 9),
+        BC7_MODE_PRIO_CODE(2, 3, 3),
+        BC7_MODE_PRIO_CODE(3, 2, 22),
+        BC7_MODE_PRIO_CODE(2, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 2, 62),
+        BC7_MODE_PRIO_CODE(3, 2, 20),
+        BC7_MODE_PRIO_CODE(4, 1, 32),
+        BC7_MODE_PRIO_CODE(2, 1, 43),
+        BC7_MODE_PRIO_CODE(3, 1, 58),
+        BC7_MODE_PRIO_CODE(2, 3, 19),
+        BC7_MODE_PRIO_CODE(2, 2, 32),
+        BC7_MODE_PRIO_CODE(2, 2, 57),
+        BC7_MODE_PRIO_CODE(4, 1, 27),
+        BC7_MODE_PRIO_CODE(2, 2, 34),
+        BC7_MODE_PRIO_CODE(4, 1, 58),
+        BC7_MODE_PRIO_CODE(1, 2, 12),
+        BC7_MODE_PRIO_CODE(2, 2, 12),
+        BC7_MODE_PRIO_CODE(1, 4, 20),
+        BC7_MODE_PRIO_CODE(1, 2, 56),
+        BC7_MODE_PRIO_CODE(2, 1, 48),
+        BC7_MODE_PRIO_CODE(2, 1, 36),
+        BC7_MODE_PRIO_CODE(4, 3, 0),
+        BC7_MODE_PRIO_CODE(2, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 1, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 7),
+        BC7_MODE_PRIO_CODE(1, 2, 37),
+        BC7_MODE_PRIO_CODE(2, 2, 35),
+        BC7_MODE_PRIO_CODE(3, 1, 52),
+        BC7_MODE_PRIO_CODE(2, 3, 6),
+        BC7_MODE_PRIO_CODE(3, 1, 57),
+        BC7_MODE_PRIO_CODE(4, 1, 31),
+        BC7_MODE_PRIO_CODE(4, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 44),
+        BC7_MODE_PRIO_CODE(3, 3, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 54),
+        BC7_MODE_PRIO_CODE(2, 1, 50),
+        BC7_MODE_PRIO_CODE(3, 3, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 27),
+        BC7_MODE_PRIO_CODE(3, 4, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 14),
+        BC7_MODE_PRIO_CODE(3, 2, 25),
+        BC7_MODE_PRIO_CODE(2, 3, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 2, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 23),
+        BC7_MODE_PRIO_CODE(3, 2, 12),
+        BC7_MODE_PRIO_CODE(3, 3, 2),
+        BC7_MODE_PRIO_CODE(4, 2, 14),
+        BC7_MODE_PRIO_CODE(2, 3, 16),
+        BC7_MODE_PRIO_CODE(1, 2, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 11),
+        BC7_MODE_PRIO_CODE(1, 2, 4),
+        BC7_MODE_PRIO_CODE(4, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 3, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 43),
+        BC7_MODE_PRIO_CODE(2, 4, 21),
+        BC7_MODE_PRIO_CODE(4, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 1, 53),
+        BC7_MODE_PRIO_CODE(3, 1, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 61),
+        BC7_MODE_PRIO_CODE(2, 2, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 23),
+        BC7_MODE_PRIO_CODE(3, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 3, 8),
+        BC7_MODE_PRIO_CODE(3, 1, 55),
+        BC7_MODE_PRIO_CODE(4, 1, 59),
+        BC7_MODE_PRIO_CODE(3, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 3, 20),
+        BC7_MODE_PRIO_CODE(3, 2, 57),
+        BC7_MODE_PRIO_CODE(3, 1, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 35),
+        BC7_MODE_PRIO_CODE(1, 1, 38),
+        BC7_MODE_PRIO_CODE(1, 2, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 6),
+        BC7_MODE_PRIO_CODE(3, 2, 23),
+        BC7_MODE_PRIO_CODE(2, 2, 59),
+        BC7_MODE_PRIO_CODE(3, 2, 5),
+        BC7_MODE_PRIO_CODE(4, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 1, 37),
+        BC7_MODE_PRIO_CODE(3, 2, 59),
+        BC7_MODE_PRIO_CODE(4, 2, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 4),
+        BC7_MODE_PRIO_CODE(2, 2, 56),
+        BC7_MODE_PRIO_CODE(1, 3, 33),
+        BC7_MODE_PRIO_CODE(2, 3, 33),
+        BC7_MODE_PRIO_CODE(2, 3, 22),
+        BC7_MODE_PRIO_CODE(2, 3, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 34),
+        BC7_MODE_PRIO_CODE(3, 2, 56),
+        BC7_MODE_PRIO_CODE(3, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 2, 7),
+        BC7_MODE_PRIO_CODE(2, 2, 7),
+        BC7_MODE_PRIO_CODE(3, 2, 7),
+        BC7_MODE_PRIO_CODE(2, 2, 36),
+        BC7_MODE_PRIO_CODE(3, 2, 36),
+        BC7_MODE_PRIO_CODE(4, 1, 52),
+        BC7_MODE_PRIO_CODE(2, 2, 33),
+        BC7_MODE_PRIO_CODE(3, 1, 45),
+        BC7_MODE_PRIO_CODE(1, 3, 4),
+        BC7_MODE_PRIO_CODE(4, 2, 15),
+        BC7_MODE_PRIO_CODE(3, 1, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 4),
+        BC7_MODE_PRIO_CODE(2, 5, 20),
+        BC7_MODE_PRIO_CODE(3, 2, 62),
+        BC7_MODE_PRIO_CODE(1, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 1, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 52),
+        BC7_MODE_PRIO_CODE(3, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 1, 39),
+        BC7_MODE_PRIO_CODE(3, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 48),
+        BC7_MODE_PRIO_CODE(3, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 2, 32),
+        BC7_MODE_PRIO_CODE(3, 3, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 17),
+        BC7_MODE_PRIO_CODE(4, 1, 57),
+        BC7_MODE_PRIO_CODE(1, 3, 25),
+        BC7_MODE_PRIO_CODE(2, 3, 11),
+        BC7_MODE_PRIO_CODE(1, 3, 61),
+        BC7_MODE_PRIO_CODE(4, 1, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 60),
+        BC7_MODE_PRIO_CODE(2, 3, 60),
+        BC7_MODE_PRIO_CODE(2, 2, 28),
+        BC7_MODE_PRIO_CODE(3, 2, 28),
+        BC7_MODE_PRIO_CODE(4, 1, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 51),
+        BC7_MODE_PRIO_CODE(4, 1, 53),
+        BC7_MODE_PRIO_CODE(4, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 3, 32),
+        BC7_MODE_PRIO_CODE(1, 3, 24),
+        BC7_MODE_PRIO_CODE(4, 1, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 51),
+        BC7_MODE_PRIO_CODE(4, 2, 12),
+        BC7_MODE_PRIO_CODE(2, 3, 61),
+        BC7_MODE_PRIO_CODE(3, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 3, 32),
+        BC7_MODE_PRIO_CODE(3, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 3, 29),
+        BC7_MODE_PRIO_CODE(4, 2, 63),
+        BC7_MODE_PRIO_CODE(2, 2, 27),
+        BC7_MODE_PRIO_CODE(2, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 1, 50),
+        BC7_MODE_PRIO_CODE(3, 2, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 63),
+        BC7_MODE_PRIO_CODE(2, 3, 63),
+        BC7_MODE_PRIO_CODE(3, 2, 27),
+        BC7_MODE_PRIO_CODE(4, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 26),
+        BC7_MODE_PRIO_CODE(2, 3, 4),
+        BC7_MODE_PRIO_CODE(2, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 45),
+        BC7_MODE_PRIO_CODE(4, 1, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 1),
+        BC7_MODE_PRIO_CODE(4, 2, 6),
+        BC7_MODE_PRIO_CODE(1, 3, 62),
+        BC7_MODE_PRIO_CODE(2, 3, 62),
+        BC7_MODE_PRIO_CODE(2, 1, 44),
+        BC7_MODE_PRIO_CODE(4, 1, 49),
+        BC7_MODE_PRIO_CODE(3, 5, 30),
+        BC7_MODE_PRIO_CODE(2, 3, 25),
+        BC7_MODE_PRIO_CODE(1, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 1, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 3),
+        BC7_MODE_PRIO_CODE(3, 1, 37),
+        BC7_MODE_PRIO_CODE(1, 2, 0),
+        BC7_MODE_PRIO_CODE(2, 2, 0),
+        BC7_MODE_PRIO_CODE(2, 3, 35),
+        BC7_MODE_PRIO_CODE(2, 3, 24),
+        BC7_MODE_PRIO_CODE(2, 2, 53),
+        BC7_MODE_PRIO_CODE(3, 2, 53),
+        BC7_MODE_PRIO_CODE(4, 2, 59),
+        BC7_MODE_PRIO_CODE(3, 3, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 3),
+        BC7_MODE_PRIO_CODE(2, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 3, 32),
+        BC7_MODE_PRIO_CODE(1, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 2, 62),
+        BC7_MODE_PRIO_CODE(4, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 2, 30),
+        BC7_MODE_PRIO_CODE(1, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 2, 36),
+        BC7_MODE_PRIO_CODE(2, 2, 1),
+        BC7_MODE_PRIO_CODE(3, 2, 1),
+        BC7_MODE_PRIO_CODE(3, 2, 58),
+        BC7_MODE_PRIO_CODE(4, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 3, 16),
+        BC7_MODE_PRIO_CODE(2, 3, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 39),
+        BC7_MODE_PRIO_CODE(4, 1, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 21),
+        BC7_MODE_PRIO_CODE(2, 1, 38),
+        BC7_MODE_PRIO_CODE(4, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 3, 23),
+        BC7_MODE_PRIO_CODE(1, 2, 43),
+        BC7_MODE_PRIO_CODE(1, 2, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 41),
+        BC7_MODE_PRIO_CODE(1, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 2, 35),
+        BC7_MODE_PRIO_CODE(4, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 1, 34),
+        BC7_MODE_PRIO_CODE(2, 2, 29),
+        BC7_MODE_PRIO_CODE(3, 2, 29),
+        BC7_MODE_PRIO_CODE(3, 2, 52),
+        BC7_MODE_PRIO_CODE(1, 3, 58),
+        BC7_MODE_PRIO_CODE(4, 5, 30),
+        BC7_MODE_PRIO_CODE(4, 3, 33),
+        BC7_MODE_PRIO_CODE(3, 2, 30),
+        BC7_MODE_PRIO_CODE(1, 2, 44),
+        BC7_MODE_PRIO_CODE(1, 2, 2),
+        BC7_MODE_PRIO_CODE(2, 2, 2),
+        BC7_MODE_PRIO_CODE(3, 2, 2),
+        BC7_MODE_PRIO_CODE(1, 2, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 47),
+        BC7_MODE_PRIO_CODE(3, 3, 7),
+        BC7_MODE_PRIO_CODE(2, 3, 58),
+        BC7_MODE_PRIO_CODE(3, 2, 55),
+        BC7_MODE_PRIO_CODE(4, 2, 4),
+        BC7_MODE_PRIO_CODE(3, 2, 0),
+        BC7_MODE_PRIO_CODE(1, 3, 31),
+        BC7_MODE_PRIO_CODE(3, 2, 31),
+        BC7_MODE_PRIO_CODE(3, 3, 12),
+        BC7_MODE_PRIO_CODE(3, 2, 51),
+        BC7_MODE_PRIO_CODE(2, 1, 39),
+        BC7_MODE_PRIO_CODE(1, 3, 48),
+        BC7_MODE_PRIO_CODE(1, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 2, 25),
+        BC7_MODE_PRIO_CODE(4, 2, 22),
+        BC7_MODE_PRIO_CODE(4, 2, 18),
+        BC7_MODE_PRIO_CODE(2, 2, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 28),
+        BC7_MODE_PRIO_CODE(3, 1, 44),
+        BC7_MODE_PRIO_CODE(2, 1, 34),
+        BC7_MODE_PRIO_CODE(3, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 2, 54),
+        BC7_MODE_PRIO_CODE(4, 2, 7),
+        BC7_MODE_PRIO_CODE(4, 2, 20),
+        BC7_MODE_PRIO_CODE(2, 2, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 43),
+        BC7_MODE_PRIO_CODE(2, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 3, 30),
+        BC7_MODE_PRIO_CODE(4, 2, 5),
+        BC7_MODE_PRIO_CODE(4, 2, 61),
+        BC7_MODE_PRIO_CODE(4, 2, 19),
+        BC7_MODE_PRIO_CODE(4, 2, 23),
+        BC7_MODE_PRIO_CODE(3, 2, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 27),
+        BC7_MODE_PRIO_CODE(1, 3, 57),
+        BC7_MODE_PRIO_CODE(2, 3, 57),
+        BC7_MODE_PRIO_CODE(3, 3, 21),
+        BC7_MODE_PRIO_CODE(3, 3, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 48),
+        BC7_MODE_PRIO_CODE(4, 1, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 19),
+        BC7_MODE_PRIO_CODE(3, 1, 38),
+        BC7_MODE_PRIO_CODE(2, 2, 38),
+        BC7_MODE_PRIO_CODE(2, 3, 31),
+        BC7_MODE_PRIO_CODE(2, 2, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 40),
+        BC7_MODE_PRIO_CODE(1, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 1, 38),
+        BC7_MODE_PRIO_CODE(1, 3, 41),
+        BC7_MODE_PRIO_CODE(1, 3, 50),
+        BC7_MODE_PRIO_CODE(2, 3, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 3, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 34),
+        BC7_MODE_PRIO_CODE(4, 1, 34),
+        BC7_MODE_PRIO_CODE(2, 3, 50),
+        BC7_MODE_PRIO_CODE(1, 3, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 40),
+        BC7_MODE_PRIO_CODE(1, 3, 51),
+        BC7_MODE_PRIO_CODE(2, 3, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 45),
+        BC7_MODE_PRIO_CODE(2, 3, 45),
+        BC7_MODE_PRIO_CODE(2, 3, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 20),
+        BC7_MODE_PRIO_CODE(2, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 2, 57),
+        BC7_MODE_PRIO_CODE(2, 4, 20),
+        BC7_MODE_PRIO_CODE(3, 3, 4),
+        BC7_MODE_PRIO_CODE(3, 3, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 46),
+        BC7_MODE_PRIO_CODE(2, 3, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 1),
+        BC7_MODE_PRIO_CODE(3, 3, 22),
+        BC7_MODE_PRIO_CODE(1, 3, 49),
+        BC7_MODE_PRIO_CODE(2, 3, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 15),
+        BC7_MODE_PRIO_CODE(3, 3, 5),
+        BC7_MODE_PRIO_CODE(4, 1, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 14),
+        BC7_MODE_PRIO_CODE(4, 3, 2),
+        BC7_MODE_PRIO_CODE(3, 3, 60),
+        BC7_MODE_PRIO_CODE(1, 3, 53),
+        BC7_MODE_PRIO_CODE(2, 3, 53),
+        BC7_MODE_PRIO_CODE(4, 3, 32),
+        BC7_MODE_PRIO_CODE(3, 3, 24),
+        BC7_MODE_PRIO_CODE(3, 3, 63),
+        BC7_MODE_PRIO_CODE(3, 2, 37),
+        BC7_MODE_PRIO_CODE(1, 3, 52),
+        BC7_MODE_PRIO_CODE(2, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 4, 30),
+        BC7_MODE_PRIO_CODE(4, 2, 34),
+        BC7_MODE_PRIO_CODE(1, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 3, 62),
+        BC7_MODE_PRIO_CODE(3, 3, 18),
+        BC7_MODE_PRIO_CODE(3, 2, 41),
+        BC7_MODE_PRIO_CODE(4, 2, 58),
+        BC7_MODE_PRIO_CODE(1, 3, 42),
+        BC7_MODE_PRIO_CODE(2, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 2, 0),
+        BC7_MODE_PRIO_CODE(4, 2, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 47),
+        BC7_MODE_PRIO_CODE(4, 2, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 55),
+        BC7_MODE_PRIO_CODE(4, 2, 32),
+        BC7_MODE_PRIO_CODE(3, 2, 43),
+        BC7_MODE_PRIO_CODE(3, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 5, 20),
+        BC7_MODE_PRIO_CODE(4, 5, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 36),
+        BC7_MODE_PRIO_CODE(2, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 2, 54),
+        BC7_MODE_PRIO_CODE(2, 2, 49),
+        BC7_MODE_PRIO_CODE(3, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 1, 39),
+        BC7_MODE_PRIO_CODE(4, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 2, 52),
+        BC7_MODE_PRIO_CODE(4, 2, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 50),
+        BC7_MODE_PRIO_CODE(3, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 31),
+        BC7_MODE_PRIO_CODE(4, 3, 3),
+        BC7_MODE_PRIO_CODE(1, 2, 48),
+        BC7_MODE_PRIO_CODE(2, 2, 48),
+        BC7_MODE_PRIO_CODE(3, 2, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 3, 9),
+        BC7_MODE_PRIO_CODE(1, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 10),
+        BC7_MODE_PRIO_CODE(3, 3, 31),
+        BC7_MODE_PRIO_CODE(4, 2, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 37),
+        BC7_MODE_PRIO_CODE(2, 3, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 50),
+        BC7_MODE_PRIO_CODE(2, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 20),
+        BC7_MODE_PRIO_CODE(3, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 3, 6),
+        BC7_MODE_PRIO_CODE(4, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 2, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 58),
+        BC7_MODE_PRIO_CODE(3, 3, 59),
+        BC7_MODE_PRIO_CODE(4, 2, 56),
+        BC7_MODE_PRIO_CODE(1, 3, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 2, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 7),
+        BC7_MODE_PRIO_CODE(3, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 3, 23),
+        BC7_MODE_PRIO_CODE(3, 3, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 22),
+        BC7_MODE_PRIO_CODE(3, 3, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 42),
+        BC7_MODE_PRIO_CODE(2, 2, 42),
+        BC7_MODE_PRIO_CODE(3, 2, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 19),
+        BC7_MODE_PRIO_CODE(4, 3, 21),
+        BC7_MODE_PRIO_CODE(2, 2, 46),
+        BC7_MODE_PRIO_CODE(3, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 2, 28),
+        BC7_MODE_PRIO_CODE(3, 3, 49),
+        BC7_MODE_PRIO_CODE(3, 3, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 55),
+        BC7_MODE_PRIO_CODE(2, 2, 26),
+        BC7_MODE_PRIO_CODE(3, 2, 26),
+        BC7_MODE_PRIO_CODE(4, 2, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 2, 41),
+        BC7_MODE_PRIO_CODE(4, 2, 29),
+        BC7_MODE_PRIO_CODE(1, 3, 34),
+        BC7_MODE_PRIO_CODE(2, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 2, 44),
+        BC7_MODE_PRIO_CODE(3, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 2, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 11),
+        BC7_MODE_PRIO_CODE(3, 3, 57),
+        BC7_MODE_PRIO_CODE(3, 2, 38),
+        BC7_MODE_PRIO_CODE(3, 3, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 25),
+        BC7_MODE_PRIO_CODE(4, 3, 4),
+        BC7_MODE_PRIO_CODE(3, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 61),
+        BC7_MODE_PRIO_CODE(4, 2, 48),
+        BC7_MODE_PRIO_CODE(4, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 3, 54),
+        BC7_MODE_PRIO_CODE(4, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 3, 24),
+        BC7_MODE_PRIO_CODE(4, 3, 12),
+        BC7_MODE_PRIO_CODE(4, 2, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 63),
+        BC7_MODE_PRIO_CODE(4, 3, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 3, 60),
+        BC7_MODE_PRIO_CODE(4, 2, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 62),
+        BC7_MODE_PRIO_CODE(4, 3, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 58),
+        BC7_MODE_PRIO_CODE(4, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 2, 26),
+        BC7_MODE_PRIO_CODE(4, 2, 27),
+        BC7_MODE_PRIO_CODE(3, 3, 37),
+        BC7_MODE_PRIO_CODE(4, 3, 57),
+        BC7_MODE_PRIO_CODE(4, 3, 48),
+        BC7_MODE_PRIO_CODE(4, 3, 31),
+        BC7_MODE_PRIO_CODE(4, 3, 51),
+        BC7_MODE_PRIO_CODE(4, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 3, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 40),
+        BC7_MODE_PRIO_CODE(4, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 2, 2),
+        BC7_MODE_PRIO_CODE(3, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 2, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 3, 30),
+        BC7_MODE_PRIO_CODE(4, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 2, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 45),
+        BC7_MODE_PRIO_CODE(4, 2, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 46),
+        BC7_MODE_PRIO_CODE(2, 2, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 3, 37),
+        BC7_MODE_PRIO_CODE(4, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 55),
+        BC7_MODE_PRIO_CODE(4, 3, 44),
+        BC7_MODE_PRIO_CODE(3, 2, 45),
+        BC7_MODE_PRIO_CODE(1, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 5, 0),
+        BC7_MODE_PRIO_CODE(4, 2, 45),
+        BC7_MODE_PRIO_CODE(2, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 4, 1),
+        BC7_MODE_PRIO_CODE(2, 5, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 1),
+        BC7_MODE_PRIO_CODE(3, 5, 0),
+        BC7_MODE_PRIO_CODE(4, 4, 0),
+        BC7_MODE_PRIO_CODE(4, 4, 1),
+        BC7_MODE_PRIO_CODE(4, 5, 0),
+    };
+
+    const uint16_t *g_bc7PrioCodesRGB = g_bc7PrioCodesRGBData; // Pointer to the first entry of the RGB priority-code array defined above.
+    const int g_bc7NumPrioCodesRGB = static_cast<int>(sizeof(g_bc7PrioCodesRGBData) / sizeof(g_bc7PrioCodesRGBData[0])); // Entry count: total array bytes divided by bytes per entry.
+
+    const uint16_t g_bc7PrioCodesRGBAData[] = // Packed 16-bit BC7 priority codes (RGBA table). NOTE(review): presumably listed in priority order, with BC7_MODE_PRIO_CODE arguments (seed point count, mode, partition or rotation/index-selector code) - confirm against the macro, which is defined outside this chunk.
+    {
+        BC7_MODE_PRIO_CODE(1, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 0),
+        BC7_MODE_PRIO_CODE(2, 4, 1),
+        BC7_MODE_PRIO_CODE(3, 4, 1),
+        BC7_MODE_PRIO_CODE(2, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 6),
+        BC7_MODE_PRIO_CODE(1, 4, 10),
+        BC7_MODE_PRIO_CODE(1, 7, 15),
+        BC7_MODE_PRIO_CODE(1, 7, 14),
+        BC7_MODE_PRIO_CODE(1, 4, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 7),
+        BC7_MODE_PRIO_CODE(3, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 19),
+        BC7_MODE_PRIO_CODE(3, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 7, 1),
+        BC7_MODE_PRIO_CODE(1, 7, 21),
+        BC7_MODE_PRIO_CODE(4, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 7, 10),
+        BC7_MODE_PRIO_CODE(1, 7, 3),
+        BC7_MODE_PRIO_CODE(4, 6, 0),
+        BC7_MODE_PRIO_CODE(3, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 7, 16),
+        BC7_MODE_PRIO_CODE(1, 7, 8),
+        BC7_MODE_PRIO_CODE(2, 5, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 9),
+        BC7_MODE_PRIO_CODE(2, 4, 11),
+        BC7_MODE_PRIO_CODE(3, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 7, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 22),
+        BC7_MODE_PRIO_CODE(4, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 13),
+        BC7_MODE_PRIO_CODE(3, 7, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 12),
+        BC7_MODE_PRIO_CODE(1, 7, 29),
+        BC7_MODE_PRIO_CODE(3, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 7, 11),
+        BC7_MODE_PRIO_CODE(1, 7, 18),
+        BC7_MODE_PRIO_CODE(1, 7, 4),
+        BC7_MODE_PRIO_CODE(2, 7, 15),
+        BC7_MODE_PRIO_CODE(2, 7, 14),
+        BC7_MODE_PRIO_CODE(1, 7, 5),
+        BC7_MODE_PRIO_CODE(1, 7, 25),
+        BC7_MODE_PRIO_CODE(1, 7, 17),
+        BC7_MODE_PRIO_CODE(1, 7, 24),
+        BC7_MODE_PRIO_CODE(1, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 5, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 1),
+        BC7_MODE_PRIO_CODE(2, 7, 29),
+        BC7_MODE_PRIO_CODE(2, 4, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 6),
+        BC7_MODE_PRIO_CODE(2, 7, 7),
+        BC7_MODE_PRIO_CODE(3, 7, 14),
+        BC7_MODE_PRIO_CODE(3, 7, 15),
+        BC7_MODE_PRIO_CODE(4, 4, 31),
+        BC7_MODE_PRIO_CODE(2, 7, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 7, 29),
+        BC7_MODE_PRIO_CODE(2, 7, 19),
+        BC7_MODE_PRIO_CODE(2, 7, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 1),
+        BC7_MODE_PRIO_CODE(4, 7, 29),
+        BC7_MODE_PRIO_CODE(3, 7, 7),
+        BC7_MODE_PRIO_CODE(1, 4, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 2),
+        BC7_MODE_PRIO_CODE(2, 7, 16),
+        BC7_MODE_PRIO_CODE(2, 7, 3),
+        BC7_MODE_PRIO_CODE(2, 5, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 23),
+        BC7_MODE_PRIO_CODE(3, 7, 6),
+        BC7_MODE_PRIO_CODE(2, 7, 12),
+        BC7_MODE_PRIO_CODE(1, 7, 61),
+        BC7_MODE_PRIO_CODE(4, 4, 11),
+        BC7_MODE_PRIO_CODE(3, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 10),
+        BC7_MODE_PRIO_CODE(2, 7, 8),
+        BC7_MODE_PRIO_CODE(2, 7, 22),
+        BC7_MODE_PRIO_CODE(2, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 9),
+        BC7_MODE_PRIO_CODE(3, 7, 19),
+        BC7_MODE_PRIO_CODE(2, 7, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 7, 24),
+        BC7_MODE_PRIO_CODE(1, 7, 60),
+        BC7_MODE_PRIO_CODE(2, 7, 11),
+        BC7_MODE_PRIO_CODE(2, 7, 18),
+        BC7_MODE_PRIO_CODE(2, 7, 17),
+        BC7_MODE_PRIO_CODE(2, 7, 4),
+        BC7_MODE_PRIO_CODE(2, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 3),
+        BC7_MODE_PRIO_CODE(3, 7, 16),
+        BC7_MODE_PRIO_CODE(3, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 7, 21),
+        BC7_MODE_PRIO_CODE(1, 7, 62),
+        BC7_MODE_PRIO_CODE(2, 7, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 33),
+        BC7_MODE_PRIO_CODE(2, 7, 33),
+        BC7_MODE_PRIO_CODE(3, 7, 33),
+        BC7_MODE_PRIO_CODE(4, 7, 33),
+        BC7_MODE_PRIO_CODE(3, 7, 11),
+        BC7_MODE_PRIO_CODE(3, 7, 12),
+        BC7_MODE_PRIO_CODE(4, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 7, 25),
+        BC7_MODE_PRIO_CODE(1, 7, 63),
+        BC7_MODE_PRIO_CODE(2, 5, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 8),
+        BC7_MODE_PRIO_CODE(4, 5, 0),
+        BC7_MODE_PRIO_CODE(3, 7, 24),
+        BC7_MODE_PRIO_CODE(3, 7, 22),
+        BC7_MODE_PRIO_CODE(3, 7, 9),
+        BC7_MODE_PRIO_CODE(1, 7, 32),
+        BC7_MODE_PRIO_CODE(2, 7, 61),
+        BC7_MODE_PRIO_CODE(3, 7, 4),
+        BC7_MODE_PRIO_CODE(3, 5, 30),
+        BC7_MODE_PRIO_CODE(3, 7, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 14),
+        BC7_MODE_PRIO_CODE(3, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 18),
+        BC7_MODE_PRIO_CODE(1, 7, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 4, 21),
+        BC7_MODE_PRIO_CODE(4, 7, 15),
+        BC7_MODE_PRIO_CODE(3, 7, 17),
+        BC7_MODE_PRIO_CODE(2, 7, 32),
+        BC7_MODE_PRIO_CODE(3, 7, 32),
+        BC7_MODE_PRIO_CODE(2, 5, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 1),
+        BC7_MODE_PRIO_CODE(4, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 7, 28),
+        BC7_MODE_PRIO_CODE(1, 7, 54),
+        BC7_MODE_PRIO_CODE(4, 7, 32),
+        BC7_MODE_PRIO_CODE(1, 7, 27),
+        BC7_MODE_PRIO_CODE(4, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 7, 60),
+        BC7_MODE_PRIO_CODE(2, 4, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 63),
+        BC7_MODE_PRIO_CODE(4, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 62),
+        BC7_MODE_PRIO_CODE(1, 7, 41),
+        BC7_MODE_PRIO_CODE(1, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 60),
+        BC7_MODE_PRIO_CODE(1, 7, 40),
+        BC7_MODE_PRIO_CODE(1, 7, 55),
+        BC7_MODE_PRIO_CODE(2, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 8),
+        BC7_MODE_PRIO_CODE(4, 7, 6),
+        BC7_MODE_PRIO_CODE(1, 7, 53),
+        BC7_MODE_PRIO_CODE(4, 7, 9),
+        BC7_MODE_PRIO_CODE(3, 7, 61),
+        BC7_MODE_PRIO_CODE(3, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 22),
+        BC7_MODE_PRIO_CODE(4, 7, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 62),
+        BC7_MODE_PRIO_CODE(4, 7, 7),
+        BC7_MODE_PRIO_CODE(1, 7, 42),
+        BC7_MODE_PRIO_CODE(1, 7, 52),
+        BC7_MODE_PRIO_CODE(4, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 56),
+        BC7_MODE_PRIO_CODE(1, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 5, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 48),
+        BC7_MODE_PRIO_CODE(2, 7, 28),
+        BC7_MODE_PRIO_CODE(3, 7, 28),
+        BC7_MODE_PRIO_CODE(4, 7, 19),
+        BC7_MODE_PRIO_CODE(3, 7, 35),
+        BC7_MODE_PRIO_CODE(1, 7, 59),
+        BC7_MODE_PRIO_CODE(2, 7, 30),
+        BC7_MODE_PRIO_CODE(3, 7, 63),
+        BC7_MODE_PRIO_CODE(4, 7, 21),
+        BC7_MODE_PRIO_CODE(4, 7, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 3),
+        BC7_MODE_PRIO_CODE(1, 7, 47),
+        BC7_MODE_PRIO_CODE(1, 7, 37),
+        BC7_MODE_PRIO_CODE(4, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 57),
+        BC7_MODE_PRIO_CODE(4, 7, 17),
+        BC7_MODE_PRIO_CODE(1, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 24),
+        BC7_MODE_PRIO_CODE(4, 7, 60),
+        BC7_MODE_PRIO_CODE(1, 7, 50),
+        BC7_MODE_PRIO_CODE(2, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 7, 25),
+        BC7_MODE_PRIO_CODE(3, 7, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 59),
+        BC7_MODE_PRIO_CODE(2, 7, 55),
+        BC7_MODE_PRIO_CODE(4, 7, 18),
+        BC7_MODE_PRIO_CODE(4, 7, 12),
+        BC7_MODE_PRIO_CODE(4, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 59),
+        BC7_MODE_PRIO_CODE(1, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 16),
+        BC7_MODE_PRIO_CODE(4, 7, 11),
+        BC7_MODE_PRIO_CODE(2, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 4),
+        BC7_MODE_PRIO_CODE(1, 7, 49),
+        BC7_MODE_PRIO_CODE(2, 7, 27),
+        BC7_MODE_PRIO_CODE(3, 7, 27),
+        BC7_MODE_PRIO_CODE(4, 7, 62),
+        BC7_MODE_PRIO_CODE(3, 7, 58),
+        BC7_MODE_PRIO_CODE(4, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 53),
+        BC7_MODE_PRIO_CODE(3, 7, 53),
+        BC7_MODE_PRIO_CODE(2, 7, 40),
+        BC7_MODE_PRIO_CODE(3, 7, 40),
+        BC7_MODE_PRIO_CODE(2, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 7, 31),
+        BC7_MODE_PRIO_CODE(4, 7, 61),
+        BC7_MODE_PRIO_CODE(1, 7, 36),
+        BC7_MODE_PRIO_CODE(4, 7, 63),
+        BC7_MODE_PRIO_CODE(1, 7, 46),
+        BC7_MODE_PRIO_CODE(3, 7, 55),
+        BC7_MODE_PRIO_CODE(2, 7, 52),
+        BC7_MODE_PRIO_CODE(2, 7, 56),
+        BC7_MODE_PRIO_CODE(2, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 37),
+        BC7_MODE_PRIO_CODE(2, 7, 57),
+        BC7_MODE_PRIO_CODE(3, 7, 57),
+        BC7_MODE_PRIO_CODE(2, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 57),
+        BC7_MODE_PRIO_CODE(2, 7, 49),
+        BC7_MODE_PRIO_CODE(3, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 43),
+        BC7_MODE_PRIO_CODE(3, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 7, 28),
+        BC7_MODE_PRIO_CODE(2, 7, 48),
+        BC7_MODE_PRIO_CODE(3, 7, 52),
+        BC7_MODE_PRIO_CODE(3, 7, 49),
+        BC7_MODE_PRIO_CODE(4, 7, 59),
+        BC7_MODE_PRIO_CODE(4, 7, 40),
+        BC7_MODE_PRIO_CODE(4, 7, 27),
+        BC7_MODE_PRIO_CODE(3, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 55),
+        BC7_MODE_PRIO_CODE(3, 7, 56),
+        BC7_MODE_PRIO_CODE(4, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 54),
+        BC7_MODE_PRIO_CODE(3, 7, 54),
+        BC7_MODE_PRIO_CODE(4, 7, 54),
+        BC7_MODE_PRIO_CODE(2, 7, 47),
+        BC7_MODE_PRIO_CODE(3, 7, 47),
+        BC7_MODE_PRIO_CODE(4, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 7, 37),
+        BC7_MODE_PRIO_CODE(3, 7, 48),
+        BC7_MODE_PRIO_CODE(4, 7, 48),
+        BC7_MODE_PRIO_CODE(4, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 47),
+        BC7_MODE_PRIO_CODE(2, 7, 36),
+        BC7_MODE_PRIO_CODE(1, 7, 44),
+        BC7_MODE_PRIO_CODE(4, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 36),
+        BC7_MODE_PRIO_CODE(2, 7, 50),
+        BC7_MODE_PRIO_CODE(3, 7, 50),
+        BC7_MODE_PRIO_CODE(4, 7, 50),
+        BC7_MODE_PRIO_CODE(4, 7, 52),
+        BC7_MODE_PRIO_CODE(1, 7, 39),
+        BC7_MODE_PRIO_CODE(1, 7, 34),
+        BC7_MODE_PRIO_CODE(1, 7, 38),
+        BC7_MODE_PRIO_CODE(2, 7, 38),
+        BC7_MODE_PRIO_CODE(3, 7, 38),
+        BC7_MODE_PRIO_CODE(4, 7, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 7, 53),
+        BC7_MODE_PRIO_CODE(2, 7, 46),
+        BC7_MODE_PRIO_CODE(3, 7, 46),
+        BC7_MODE_PRIO_CODE(4, 7, 49),
+        BC7_MODE_PRIO_CODE(4, 7, 56),
+        BC7_MODE_PRIO_CODE(4, 7, 37),
+        BC7_MODE_PRIO_CODE(2, 7, 44),
+        BC7_MODE_PRIO_CODE(3, 7, 44),
+        BC7_MODE_PRIO_CODE(4, 7, 36),
+        BC7_MODE_PRIO_CODE(2, 7, 39),
+        BC7_MODE_PRIO_CODE(2, 7, 34),
+        BC7_MODE_PRIO_CODE(4, 7, 38),
+        BC7_MODE_PRIO_CODE(3, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 46),
+        BC7_MODE_PRIO_CODE(4, 7, 44),
+        BC7_MODE_PRIO_CODE(3, 7, 39),
+        BC7_MODE_PRIO_CODE(3, 7, 34),
+        BC7_MODE_PRIO_CODE(4, 7, 39),
+        BC7_MODE_PRIO_CODE(4, 7, 34),
+    };
+
+    const uint16_t *g_bc7PrioCodesRGBA = g_bc7PrioCodesRGBAData; // Pointer to the first entry of the RGBA priority-code array defined above.
+    const int g_bc7NumPrioCodesRGBA = static_cast<int>(sizeof(g_bc7PrioCodesRGBAData) / sizeof(g_bc7PrioCodesRGBAData[0])); // Entry count; the divisor is the array's own element (not the pointer alias) so the count stays correct if the alias type ever changes.
+
+    int UnpackMode(uint16_t packed) // Returns the BC7_MODE_BITS-wide bit field of `packed` that starts at bit BC7_MODE_OFFSET_BITS.
+    {
+        return static_cast<int>(packed >> BC7_MODE_OFFSET_BITS) & ((1 << BC7_MODE_BITS) - 1);
+    }
+
+    int UnpackSeedPointCount(uint16_t packed) // Returns one more than the BC7_SEED_POINT_COUNT_BITS-wide field at bit BC7_SEED_POINT_COUNT_OFFSET_BITS.
+    {
+        return 1 + (static_cast<int>(packed >> BC7_SEED_POINT_COUNT_OFFSET_BITS) & ((1 << BC7_SEED_POINT_COUNT_BITS) - 1));
+    }
+
+    int UnpackPartition(uint16_t packed) // Returns the BC7_PARTITION_BITS-wide bit field of `packed` that starts at bit BC7_PARTITION_OFFSET_BITS.
+    {
+        return static_cast<int>(packed >> BC7_PARTITION_OFFSET_BITS) & ((1 << BC7_PARTITION_BITS) - 1);
+    }
+
+    int UnpackRotation(uint16_t packed) // Returns the BC7_ROTATION_BITS-wide bit field of `packed` that starts at bit BC7_ROTATION_OFFSET_BITS.
+    {
+        return static_cast<int>(packed >> BC7_ROTATION_OFFSET_BITS) & ((1 << BC7_ROTATION_BITS) - 1);
+    }
+
+    int UnpackIndexSelector(uint16_t packed) // Returns the BC7_INDEX_MODE_BITS-wide bit field of `packed` that starts at bit BC7_INDEX_MODE_OFFSET_BITS.
+    {
+        return static_cast<int>(packed >> BC7_INDEX_MODE_OFFSET_BITS) & ((1 << BC7_INDEX_MODE_BITS) - 1);
+    }
+}}}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h b/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
index b5564c0dab..b45ba5eca8 100644
--- a/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
@@ -1,6 +1,8 @@
 #pragma once
 #include <stdint.h>
 
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
 namespace cvtt { namespace Tables { namespace BC7SC {
 
 struct TableEntry
diff --git a/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp b/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp
new file mode 100644
index 0000000000..be16d1db06
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp
@@ -0,0 +1,46 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BCCommon.h"
+
+// Returns 3 when `range` is exactly 3, and 4 for every other value.
+int cvtt::Internal::BCCommon::TweakRoundsForRange(int range)
+{
+    // Single special case; all remaining ranges share the same result.
+    return (range == 3) ? 3 : 4;
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BCCommon.h b/thirdparty/cvtt/ConvectionKernels_BCCommon.h
new file mode 100644
index 0000000000..3e13151acd
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BCCommon.h
@@ -0,0 +1,104 @@
+#pragma once
+#ifndef __CVTT_BCCOMMON_H__
+#define __CVTT_BCCOMMON_H__
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        class BCCommon // Shared static helpers (error metrics, pixel pre-weighting) that operate on ParallelMath vector types.
+        {
+        public:
+            typedef ParallelMath::Float MFloat; // Short aliases for the ParallelMath types used by the helpers below.
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            // Maps a range value to a number of tweak rounds; defined out of line in ConvectionKernels_BCCommon.cpp.
+            static int TweakRoundsForRange(int range);
+            // Adds SqDiffUInt8(reconstructed[ch], original[ch]) to aggError for channels [0, numRealChannels); flags is not read here.
+            template<int TVectorSize>
+            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
+            {
+                for (int ch = 0; ch < numRealChannels; ch++)
+                    aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch);
+            }
+            // Convenience overload: treats all TVectorSize channels as real channels.
+            template<int TVectorSize>
+            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
+            {
+                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
+            }
+            // Accumulates the per-channel error into a fresh AggregatedError and returns its Finalize(flags, channelWeightsSq) result.
+            template<int TVectorSize>
+            static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
+            {
+                AggregatedError<TVectorSize> aggError;
+                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError);
+                return aggError.Finalize(flags, channelWeightsSq);
+            }
+            // Sums SqDiffSInt16 over all channels; each term is scaled by channelWeightsSq[ch] unless Flags::Uniform is set.
+            template<int TVectorSize>
+            static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
+            {
+                MFloat error = ParallelMath::MakeFloatZero();
+                if (flags & Flags::Uniform)
+                {
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]);
+                }
+                else
+                {
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
+                }
+
+                return error;
+            }
+            // Same structure as ComputeErrorHDRFast, but the per-channel term comes from ParallelMath::SqDiff2CL.
+            template<int TVectorSize>
+            static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
+            {
+                MFloat error = ParallelMath::MakeFloatZero();
+                if (flags & Flags::Uniform)
+                {
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]);
+                }
+                else
+                {
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
+                }
+
+                return error;
+            }
+            // Converts each channel of the 16 pixels to float and multiplies it by channelWeights[ch], writing into preWeightedPixels.
+            template<int TChannelCount>
+            static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
+            {
+                for (int px = 0; px < 16; px++)
+                {
+                    for (int ch = 0; ch < TChannelCount; ch++)
+                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
+                }
+            }
+            // Same as PreWeightPixelsLDR, but for MSInt16 input pixels.
+            template<int TChannelCount>
+            static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
+            {
+                for (int px = 0; px < 16; px++)
+                {
+                    for (int ch = 0; ch < TChannelCount; ch++)
+                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
+                }
+            }
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_Config.h b/thirdparty/cvtt/ConvectionKernels_Config.h
new file mode 100644
index 0000000000..e79d32b1da
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Config.h
@@ -0,0 +1,12 @@
+#pragma once
+#ifndef __CVTT_CONFIG_H__
+#define __CVTT_CONFIG_H__
+
+#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__) // MSVC x86 with SSE2 codegen, MSVC x64, or any compiler defining __SSE2__
+#define CVTT_USE_SSE2 // Defined only when one of the compiler-provided macros above reports SSE2 support
+#endif
+
+// Define this to compile everything as a single source file
+//#define CVTT_SINGLE_FILE
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC.cpp b/thirdparty/cvtt/ConvectionKernels_ETC.cpp
new file mode 100644
index 0000000000..cb202a6e9c
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC.cpp
@@ -0,0 +1,3147 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ETC.h"
+#include "ConvectionKernels_ETC1.h"
+#include "ConvectionKernels_ETC2.h"
+#include "ConvectionKernels_ETC2_Rounding.h"
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_FakeBT709_Rounding.h"
+
+#include <cmath>
+
+const int cvtt::Internal::ETCComputer::g_flipTables[2][2][8] = // Two partitions of indices 0..15 into halves of 8; presumably [flip][sector][pixel] - confirm against callers.
+{
+    {
+        { 0, 1, 4, 5, 8, 9, 12, 13 }, // indices with (index % 4) < 2
+        { 2, 3, 6, 7, 10, 11, 14, 15 } // indices with (index % 4) >= 2
+    },
+    {
+        { 0, 1, 2, 3, 4, 5, 6, 7 }, // indices 0..7
+        { 8, 9, 10, 11, 12, 13, 14, 15 } // indices 8..15
+    },
+};
+
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3])
+{
+    // Unweighted squared distance: widen each channel to signed 16-bit, subtract, then square in float.
+    MFloat channelDelta[3];
+    for (int channel = 0; channel < 3; channel++)
+    {
+        MSInt16 signedDiff = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[channel]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[channel]);
+        channelDelta[channel] = ParallelMath::ToFloat(signedDiff);
+    }
+
+    // Accumulate the squares in channel order 0, 1, 2.
+    return channelDelta[0] * channelDelta[0] + channelDelta[1] * channelDelta[1] + channelDelta[2] * channelDelta[2];
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3], const Options options)
+{
+    MFloat redDelta = ParallelMath::ToFloat(reconstructed[0]) * options.redWeight - preWeightedPixel[0]; // weight the reconstructed channel, then subtract the pre-weighted source
+    MFloat greenDelta = ParallelMath::ToFloat(reconstructed[1]) * options.greenWeight - preWeightedPixel[1];
+    MFloat blueDelta = ParallelMath::ToFloat(reconstructed[2]) * options.blueWeight - preWeightedPixel[2];
+    MFloat redGreenSq = redDelta * redDelta + greenDelta * greenDelta; // squares are summed in red, green, blue order
+    return redGreenSq + blueDelta * blueDelta;
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3])
+{
+    // Convert the reconstructed color with ConvertToFakeBT709; the result is compared directly against preWeightedPixel.
+    MFloat converted[3];
+    ConvertToFakeBT709(converted, reconstructed);
+    MFloat delta[3];
+    for (int axis = 0; axis < 3; axis++)
+        delta[axis] = converted[axis] - preWeightedPixel[axis];
+    // Sum of squared component differences, accumulated in component order 0, 1, 2.
+    return delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+}
+
+void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options)
+{ // Scores one packed base color against 8 pixels using 4 modifiers; outputs the summed best error and the packed 2-bit selectors.
+    MUInt15 quantized[3];
+    MUInt15 unquantized[3];
+    // Split the packed color into three fields at bit offsets 0, 5 and 10 (mask 31) and expand each by bit replication.
+    for (int ch = 0; ch < 3; ch++)
+    {
+        quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
+
+        if (isDifferential)
+            unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2); // 5-bit value -> 8 bits
+        else
+            unquantized[ch] = (quantized[ch] << 4) | quantized[ch]; // presumably a 4-bit value -> 8 bits; confirm against callers
+    }
+
+    MUInt16 selectors = ParallelMath::MakeUInt16(0);
+    MFloat totalError = ParallelMath::MakeFloatZero();
+
+    MUInt15 u15_255 = ParallelMath::MakeUInt15(255);
+    MSInt16 s16_zero = ParallelMath::MakeSInt16(0);
+    // Precompute the base color plus each of the four modifiers, clamped to [0, 255].
+    MUInt15 unquantizedModified[4][3];
+    for (unsigned int s = 0; s < 4; s++)
+        for (int ch = 0; ch < 3; ch++)
+            unquantizedModified[s][ch] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(unquantized[ch]) + modifiers[s], s16_zero)), u15_255);
+    // Metric selection: ETC_UseFakeBT709 takes priority over Uniform; with neither flag the weighted metric is used.
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    for (int px = 0; px < 8; px++)
+    {
+        MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+        MUInt16 bestSelector = ParallelMath::MakeUInt16(0);
+
+        for (unsigned int s = 0; s < 4; s++)
+        {
+            MFloat error;
+            if (isFakeBT709)
+                error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
+            else if (isUniform)
+                error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
+            else
+                error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
+            // Track the minimum error; the selector changes only where error is strictly less (assuming Select takes its second argument where the flag is set).
+            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+            bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt16(s), bestSelector);
+            bestError = ParallelMath::Min(error, bestError);
+        }
+
+        totalError = totalError + bestError;
+        selectors = selectors | (bestSelector << (px * 2)); // pixel px's selector occupies bits 2*px and 2*px+1
+    }
+
+    outError = totalError;
+    outSelectors = selectors;
+}
+
+void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options)
+{ // Scores one packed base color against 8 pixels using three candidate colors; pixels flagged transparent get selector 1 and zero error.
+    MUInt15 quantized[3];
+    MUInt15 unquantized[3];
+    // Split the packed color into three fields at bit offsets 0, 5 and 10 (mask 31) and expand 5 bits to 8 by bit replication.
+    for (int ch = 0; ch < 3; ch++)
+    {
+        quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
+        unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
+    }
+
+    MUInt16 selectors = ParallelMath::MakeUInt16(0);
+    MFloat totalError = ParallelMath::MakeFloatZero();
+
+    MUInt15 u15_255 = ParallelMath::MakeUInt15(255);
+    MSInt16 s16_zero = ParallelMath::MakeSInt16(0); // NOTE(review): s16_zero is not referenced anywhere else in this function.
+    // Candidate colors: base - modifier (floored at 0), base, and base + modifier (capped at 255).
+    MUInt15 unquantizedModified[3][3];
+    for (int ch = 0; ch < 3; ch++)
+    {
+        unquantizedModified[0][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;
+        unquantizedModified[1][ch] = unquantized[ch];
+        unquantizedModified[2][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);
+    }
+    // Metric selection: ETC_UseFakeBT709 takes priority over Uniform; with neither flag the weighted metric is used.
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    for (int px = 0; px < 8; px++)
+    {
+        ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);
+
+        MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+        MUInt15 bestSelector = ParallelMath::MakeUInt15(0);
+
+        for (unsigned int s = 0; s < 3; s++)
+        {
+            MFloat error;
+            if (isFakeBT709)
+                error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
+            else if (isUniform)
+                error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
+            else
+                error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
+
+            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+            bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);
+            bestError = ParallelMath::Min(error, bestError);
+        }
+
+        // Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't
+        // the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.
+
+        // Remap selector 1 to 2, and 2 to 3 (0 stays 0): doubling gives 0, 2, 4 and the Min caps 4 at 3.
+        bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(3), bestSelector << 1);
+
+        // Transparent pixels contribute zero error and are forced to selector 1, the transparent index.
+        ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());
+        ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(1));
+
+        totalError = totalError + bestError;
+        selectors = selectors | (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * 2)); // pixel px's selector occupies bits 2*px and 2*px+1
+    }
+
+    outError = totalError;
+    outSelectors = selectors;
+}
+
+void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs)
+{ // Per lane: searches drs for the lowest-error legal pair of sector attempts and writes it to the best* outputs if it beats bestTotalError.
+    // We do this part scalar because most of the cost benefit of parallelization is in error evaluation,
+    // and this code has a LOT of early-outs and disjointed index lookups that vary heavily between blocks
+    // and save a lot of time.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        bool canIgnore[2] = { ParallelMath::Extract(canIgnoreSector[0], block), ParallelMath::Extract(canIgnoreSector[1], block) };
+        bool canIgnoreEither = canIgnore[0] || canIgnore[1];
+        float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);
+        float bestDiffErrors[2] = { FLT_MAX, FLT_MAX }; // lowest attempt error found so far, per sector
+        uint16_t bestDiffSelectors[2] = { 0, 0 };
+        uint16_t bestDiffColors[2] = { 0, 0 };
+        uint16_t bestDiffTables[2] = { 0, 0 };
+        for (int sector = 0; sector < 2; sector++) // pick each sector's lowest-error attempt independently
+        {
+            unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
+            for (unsigned int i = 0; i < sectorNumAttempts; i++)
+            {
+                float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);
+                if (error < bestDiffErrors[sector])
+                {
+                    bestDiffErrors[sector] = error;
+                    bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);
+                    bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);
+                    bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);
+                }
+            }
+        }
+        // If a sector can be ignored, reuse the other sector's best color for it.
+        if (canIgnore[0])
+            bestDiffColors[0] = bestDiffColors[1];
+        else if (canIgnore[1])
+            bestDiffColors[1] = bestDiffColors[0];
+
+        // The best differential possibilities must be better than the best total error
+        if (bestDiffErrors[0] + bestDiffErrors[1] < blockBestTotalError)
+        {
+            // Fast path if the best possible case is legal
+            if (canIgnoreEither || ETCDifferentialIsLegalScalar(bestDiffColors[0], bestDiffColors[1]))
+            {
+                ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
+                ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[0] + bestDiffErrors[1]);
+                ParallelMath::PutUInt15(bestFlip, block, flip);
+                ParallelMath::PutUInt15(bestD, block, d);
+                for (int sector = 0; sector < 2; sector++)
+                {
+                    ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);
+                    ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);
+                    ParallelMath::PutUInt15(bestTables[sector], block, bestDiffTables[sector]);
+                }
+            }
+            else
+            {
+                // Slow path: Sort the possible cases by quality, and search valid combinations
+                // TODO: Pre-flatten the error lists so this is nicer to cache
+                unsigned int numSortIndexes[2] = { 0, 0 };
+                for (int sector = 0; sector < 2; sector++)
+                {
+                    unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
+
+                    for (unsigned int i = 0; i < sectorNumAttempts; i++)
+                    {
+                        if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError) // keep only attempts that could still beat the current best
+                            drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;
+                    }
+                    // Orders attempt indexes by ascending error for this lane, breaking ties by the lower index.
+                    struct SortPredicate
+                    {
+                        const MFloat *diffErrors;
+                        int block;
+
+                        bool operator()(uint16_t a, uint16_t b) const
+                        {
+                            float errorA = ParallelMath::Extract(diffErrors[a], block);
+                            float errorB = ParallelMath::Extract(diffErrors[b], block);
+
+                            if (errorA < errorB)
+                                return true;
+                            if (errorA > errorB)
+                                return false;
+
+                            return a < b;
+                        }
+                    };
+
+                    SortPredicate sp;
+                    sp.diffErrors = drs.diffErrors[sector];
+                    sp.block = block;
+
+                    std::sort<uint16_t*, const SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);
+                }
+                // Walk sector 0 candidates in ascending error; for each, look for the first legal sector 1 candidate within the remaining budget.
+                int scannedElements = 0; // NOTE(review): only incremented below, never read in this function.
+                for (unsigned int i = 0; i < numSortIndexes[0]; i++)
+                {
+                    unsigned int attemptIndex0 = drs.attemptSortIndexes[0][i];
+                    float error0 = ParallelMath::Extract(drs.diffErrors[0][attemptIndex0], block);
+
+                    scannedElements++;
+
+                    if (error0 >= blockBestTotalError)
+                        break;
+                    // Error budget left for sector 1 before the pair stops beating the current best.
+                    float maxError1 = ParallelMath::Extract(bestTotalError, block) - error0;
+                    uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[0][attemptIndex0], block);
+                    // Even the best sector 1 attempt exceeds the budget, and later sector 0 candidates only shrink it.
+                    if (maxError1 < bestDiffErrors[1])
+                        break;
+
+                    for (unsigned int j = 0; j < numSortIndexes[1]; j++)
+                    {
+                        unsigned int attemptIndex1 = drs.attemptSortIndexes[1][j];
+                        float error1 = ParallelMath::Extract(drs.diffErrors[1][attemptIndex1], block);
+
+                        scannedElements++;
+
+                        if (error1 >= maxError1)
+                            break;
+
+                        uint16_t diffColor1 = ParallelMath::Extract(drs.diffColors[1][attemptIndex1], block);
+
+                        if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))
+                        {
+                            blockBestTotalError = error0 + error1;
+
+                            ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
+                            ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);
+                            ParallelMath::PutUInt15(bestFlip, block, flip);
+                            ParallelMath::PutUInt15(bestD, block, d);
+                            ParallelMath::PutUInt15(bestColors[0], block, diffColor0);
+                            ParallelMath::PutUInt15(bestColors[1], block, diffColor1);
+                            ParallelMath::PutUInt16(bestSelectors[0], block, ParallelMath::Extract(drs.diffSelectors[0][attemptIndex0], block));
+                            ParallelMath::PutUInt16(bestSelectors[1], block, ParallelMath::Extract(drs.diffSelectors[1][attemptIndex1], block));
+                            ParallelMath::PutUInt15(bestTables[0], block, ParallelMath::Extract(drs.diffTables[0][attemptIndex0], block));
+                            ParallelMath::PutUInt15(bestTables[1], block, ParallelMath::Extract(drs.diffTables[1][attemptIndex1], block));
+                            break; // the sector 1 list is sorted, so the first legal match is the cheapest for this diffColor0
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)
+{
+    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a); // signed per-lane delta (b - a)
+    ParallelMath::Int16CompFlag aboveLowerBound = ParallelMath::Less(ParallelMath::MakeSInt16(-5), delta); // -5 < delta, i.e. delta >= -4
+    return aboveLowerBound & ParallelMath::Less(delta, ParallelMath::MakeSInt16(4)); // and delta < 4, i.e. delta <= 3
+}
+
+cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)
+{
+    // The packed colors are tested field by field (bit offsets 10, 5 and 0); all three field pairs must pass the channel test.
+    MUInt15 fieldMask = ParallelMath::MakeUInt15(31);
+    ParallelMath::Int16CompFlag highFieldOk = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 10), ParallelMath::RightShift(b, 10));
+    ParallelMath::Int16CompFlag midFieldOk = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 5) & fieldMask, ParallelMath::RightShift(b, 5) & fieldMask);
+    return highFieldOk & midFieldOk & ETCDifferentialIsLegalForChannel(a & fieldMask, b & fieldMask);
+}
+
+bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)
+{
+    // Signed channel delta (b - a); the delta is accepted only when it lies in [-4, 3].
+    const int16_t delta = static_cast<int16_t>(static_cast<int16_t>(b) - static_cast<int16_t>(a));
+    const bool outOfRange = (delta < -4) || (delta > 3);
+    return !outOfRange;
+}
+
+bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)
+{
+    // Scalar legality test for a pair of packed colors: each of the three fields (bit offsets 10, 5 and 0)
+    // must pass the per-channel delta test. The top field is taken unmasked; the lower two are masked with 31.
+    return ETCDifferentialIsLegalForChannelScalar((a >> 10), (b >> 10))
+        && ETCDifferentialIsLegalForChannelScalar((a >> 5) & 31, (b >> 5) & 31)
+        && ETCDifferentialIsLegalForChannelScalar(a & 31, b & 31);
+}
+
+void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
+
+    // To speed this up, we compute line total as the sum, then subtract out isolated
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
+            lineTotal[ch] = lineTotal[ch] + pixels[px][ch];
+        }
+        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];
+
+    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(16) - numPixelsIsolated;
+
+    MUInt15 isolatedAverageQuantized[3];
+    MUInt15 isolatedAverageTargets[3];
+    {
+        int divisors[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
+
+        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
+        for (int ch = 0; ch < 3; ch++)
+        {
+            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
+
+            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
+            if (!isFakeBT709)
+                numerator = numerator + addend;
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int divisor = divisors[block];
+                if (divisor == 0)
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
+                else
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
+            }
+
+            isolatedAverageTargets[ch] = numerator;
+        }
+    }
+
+    if (isFakeBT709)
+        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
+
+    MUInt15 isolatedColor[3];
+    for (int ch = 0; ch < 3; ch++)
+        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
+
+    MFloat isolatedError[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (isFakeBT709)
+            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
+        else if (isUniform)
+            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
+        else
+            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
+    }
+
+    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
+
+    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
+    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
+
+    int16_t clusterMaxLine = 0;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
+        if (blockMaxLine > clusterMaxLine)
+            clusterMaxLine = blockMaxLine;
+    }
+
+    int16_t clusterMinLine = -clusterMaxLine;
+
+    int lineDivisors[ParallelMath::ParallelSize];
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
+
+    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
+
+    for (int table = 0; table < 8; table++)
+    {
+        int numUniqueColors[ParallelMath::ParallelSize];
+        MUInt15 uniqueQuantizedColors[31];
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            numUniqueColors[block] = 0;
+
+        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
+        MUInt15 modifierOffset = (modifier + modifier);
+
+        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)
+        {
+            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
+            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
+
+            MUInt15 quantized[3];
+            if (isFakeBT709)
+            {
+                MUInt15 targets[3];
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                    targets[ch] = numerator;
+                }
+
+                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
+            }
+            else
+            {
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                }
+            }
+
+            MUInt15 packedColor = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
+                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
+                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
+            }
+        }
+
+        // Stripe unfilled unique colors
+        int maxUniqueColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (numUniqueColors[block] > maxUniqueColors)
+                maxUniqueColors = numUniqueColors[block];
+        }
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
+
+            int numUnique = numUniqueColors[block];
+            for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
+                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
+        }
+
+        for (int ci = 0; ci < maxUniqueColors; ci++)
+        {
+            MUInt15 lineColors[3][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * 5)) & ParallelMath::MakeUInt15(15));
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
+                lineColors[1][ch] = unquantizedColor;
+                lineColors[2][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+            }
+
+            MSInt32 selectors = ParallelMath::MakeSInt32(0);
+            MFloat error = ParallelMath::MakeFloatZero();
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat pixelError = isolatedError[px];
+
+                MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(0);
+                for (int i = 0; i < 3; i++)
+                {
+                    MFloat error = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
+                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, pixelError);
+                    pixelError = ParallelMath::Min(error, pixelError);
+                    pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(i + 1), pixelBestSelector);
+                }
+
+                error = error + pixelError;
+                selectors = selectors | (ParallelMath::ToInt32(pixelBestSelector) << (px * 2));
+            }
+
+            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+            bestError = ParallelMath::Min(error, bestError);
+
+            if (ParallelMath::AnySet(errorBetter))
+            {
+                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
+                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
+                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
+                bestIsThisMode = bestIsThisMode | errorBetter;
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (ParallelMath::Extract(bestIsThisMode, block))
+        {
+            uint32_t lowBits = 0;
+            uint32_t highBits = 0;
+
+            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
+            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
+
+            for (int ch = 0; ch < 3; ch++)
+                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
+
+            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
+            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+
+            ParallelMath::ScalarUInt16 lineColor[3];
+            for (int ch = 0; ch < 3; ch++)
+                lineColor[ch] = (blockBestLineColor >> (ch * 5)) & 15;
+
+            EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);
+        }
+    }
+}
+
+// Evaluates H mode for every block in the parallel group and emits it for the blocks where it beats bestError.
+//   outputBuffer      - Output base pointer; block i is emitted at outputBuffer + i * 8.
+//   bestError         - In/out: lowest error so far per block; lowered whenever an H-mode candidate beats it.
+//   groupings         - Per-pixel flags; flagged pixels form sector 1 and the remaining pixels form sector 0.
+//   pixels            - Pixel colors used for the sector sums and for the uniform error metric.
+//   he                - Scratch storage; its contents are overwritten for each modifier table.
+//   preWeightedPixels - Pixel colors passed to the weighted and fake BT.709 error metrics.
+//   options           - Supplies the Uniform / ETC_UseFakeBT709 flags and is forwarded to ComputeErrorWeighted.
+void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    MUInt15 zero15 = ParallelMath::MakeUInt15(0);
+    MUInt15 counts[2] = { zero15, zero15 };
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 totals[2][3] =
+    {
+        { zero15, zero15, zero15 },
+        { zero15, zero15, zero15 }
+    };
+    // totals[0] first accumulates the sum over all 16 pixels; totals[1] and counts[1] cover only the pixels flagged in groupings[].
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            totals[0][ch] = totals[0][ch] + pixels[px][ch];
+            totals[1][ch] = totals[1][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);
+        }
+        counts[1] = counts[1] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(1));
+    }
+    // Subtracting the flagged sums leaves the unflagged pixels (sector 0) in totals[0] and counts[0].
+    for (int ch = 0; ch < 3; ch++)
+        totals[0][ch] = totals[0][ch] - totals[1][ch];
+    counts[0] = ParallelMath::MakeUInt15(16) - counts[1];
+
+    MUInt16 bestSectorBits = ParallelMath::MakeUInt16(0);
+    MUInt16 bestSignBits = ParallelMath::MakeUInt16(0);
+    MUInt15 bestColors[2] = { zero15, zero15 };
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+
+    for (int table = 0; table < 8; table++)
+    {
+        int modifier = cvtt::Tables::ETC1::g_thModifierTable[table];
+        // Per block and per sector, sweep the offset multiplier and collect the distinct quantized colors it produces.
+        for (int sector = 0; sector < 2; sector++)
+        {
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int blockNumUniqueColors = 0;
+                uint16_t blockUniqueQuantizedColors[31];
+
+                int maxOffsetMultiplier = ParallelMath::Extract(counts[sector], block);
+                int minOffsetMultiplier = -maxOffsetMultiplier;
+                int modifierOffset = modifier * 2;
+
+                int blockSectorCounts = ParallelMath::Extract(counts[sector], block);
+                int blockSectorTotals[3];
+                for (int ch = 0; ch < 3; ch++)
+                    blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);
+
+                for (int offsetPremultiplier = minOffsetMultiplier; offsetPremultiplier <= maxOffsetMultiplier; offsetPremultiplier++)
+                {
+                    // TODO: This isn't ideal for FakeBT709
+                    int16_t quantized[3];
+                    for (int ch = 0; ch < 3; ch++)
+                    {
+                        if (blockSectorCounts == 0)
+                            quantized[ch] = 0;
+                        else
+                            quantized[ch] = std::min<int16_t>(15, std::max<int16_t>(0, (blockSectorTotals[ch] * 2 + blockSectorCounts * 17 + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * 34));
+                    }
+
+                    uint16_t packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
+                    if (blockNumUniqueColors == 0 || packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - 1])
+                    {
+                        assert(blockNumUniqueColors < 31); // Must stay in sync with the capacity of blockUniqueQuantizedColors.
+                        blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;
+                    }
+                }
+
+                ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);
+                // Sector 1's colors are appended after the block's sector 0 colors in he.uniqueQuantizedColors.
+                int baseIndex = 0;
+                if (sector == 1)
+                    baseIndex = ParallelMath::Extract(he.numUniqueColors[0], block);
+
+                for (int i = 0; i < blockNumUniqueColors; i++)
+                    ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);
+            }
+        }
+
+        MUInt15 totalColors = he.numUniqueColors[0] + he.numUniqueColors[1];
+        int maxErrorColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));
+        // Blocks with fewer colors than the widest block get their remaining slots filled with their first color.
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            int lastColor = ParallelMath::Extract(totalColors, block);
+            uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[0], block);
+            for (int i = lastColor; i < maxErrorColors; i++)
+                ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);
+        }
+        // For every candidate color, store per pixel the smaller of the +modifier / -modifier errors and a sign bit set when -modifier was smaller.
+        for (int ci = 0; ci < maxErrorColors; ci++)
+        {
+            MUInt15 fifteen = ParallelMath::MakeUInt15(15);
+            MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(255);
+            MSInt16 zeroS16 = ParallelMath::MakeSInt16(0);
+
+            MUInt15 colors[2][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((2 - ch) * 5)) & fifteen;
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                colors[0][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);
+                colors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));
+            }
+
+            MUInt16 signBits = ParallelMath::MakeUInt16(0);
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat errors[2];
+                for (int i = 0; i < 2; i++)
+                {
+                    if (isFakeBT709)
+                        errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);
+                    else if (isUniform)
+                        errors[i] = ComputeErrorUniform(colors[i], pixels[px]);
+                    else
+                        errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);
+                }
+
+                ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[1], errors[0]));
+                he.errors[ci][px] = ParallelMath::Min(errors[0], errors[1]);
+                signBits = signBits | ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(1 << px));
+            }
+            he.signBits[ci] = signBits;
+        }
+
+        int maxUniqueColorCombos = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[0], block) * ParallelMath::Extract(he.numUniqueColors[1], block);
+            if (numUniqueColorCombos > maxUniqueColorCombos)
+                maxUniqueColorCombos = numUniqueColorCombos;
+        }
+
+        MUInt15 indexes[2] = { zero15, zero15 };
+        MUInt15 maxIndex[2] = { he.numUniqueColors[0] - ParallelMath::MakeUInt15(1), he.numUniqueColors[1] - ParallelMath::MakeUInt15(1) };
+
+        int block1Starts[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[0], block);
+        // Visit the sector 0 x sector 1 color pairings. Each pairing is scored before the indexes advance so that (0, 0) is included.
+        for (int combo = 0; combo < maxUniqueColorCombos; combo++)
+        {
+            MUInt15 index0 = indexes[0];
+            MUInt15 index1 = indexes[1];
+            MUInt15 nextIndex0 = index0 + ParallelMath::MakeUInt15(1);
+            ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[0], nextIndex0);
+            ParallelMath::ConditionalSet(nextIndex0, index0Overflow, ParallelMath::MakeUInt15(0));
+            indexes[1] = ParallelMath::Min(maxIndex[1], index1 + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(1)));
+            indexes[0] = nextIndex0;
+
+            int ci0[ParallelMath::ParallelSize];
+            int ci1[ParallelMath::ParallelSize];
+            MUInt15 color0;
+            MUInt15 color1;
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                ci0[block] = ParallelMath::Extract(index0, block);
+                ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];
+                ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));
+                ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));
+            }
+
+            MFloat totalError = ParallelMath::MakeFloatZero();
+            MUInt16 sectorBits = ParallelMath::MakeUInt16(0);
+            MUInt16 signBits = ParallelMath::MakeUInt16(0);
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat errorCI0;
+                MFloat errorCI1;
+                MUInt16 signBits0;
+                MUInt16 signBits1;
+
+                for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                {
+                    ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));
+                    ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));
+                    ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));
+                    ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));
+                }
+
+                totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);
+                MUInt16 bitPosition = ParallelMath::MakeUInt16(1 << px);
+                ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));
+                sectorBits = sectorBits | ParallelMath::SelectOrZero(error1Better, bitPosition);
+                signBits = signBits | (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));
+            }
+
+            ParallelMath::FloatCompFlag totalErrorBetter = ParallelMath::Less(totalError, bestError);
+            ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(totalErrorBetter);
+            if (ParallelMath::AnySet(totalErrorBetter16))
+            {
+                bestIsThisMode = bestIsThisMode | totalErrorBetter16;
+                ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));
+                ParallelMath::ConditionalSet(bestColors[0], totalErrorBetter16, color0);
+                ParallelMath::ConditionalSet(bestColors[1], totalErrorBetter16, color1);
+                ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);
+                ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);
+                bestError = ParallelMath::Min(totalError, bestError);
+            }
+        }
+    }
+    // Emit the blocks for which some H-mode candidate lowered bestError.
+    if (ParallelMath::AnySet(bestIsThisMode))
+    {
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (!ParallelMath::Extract(bestIsThisMode, block))
+                continue;
+
+            ParallelMath::ScalarUInt16 blockBestColors[2] = { ParallelMath::Extract(bestColors[0], block), ParallelMath::Extract(bestColors[1], block) };
+            ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);
+            ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);
+            ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);
+
+            EmitHModeBlock(outputBuffer + block * 8, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolatedBase[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)
+{
+    // We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:
+    //
+    // T mode: C1, C2+M, Transparent, C2-M
+    // H mode: C1+M, C1-M, Transparent, C2-M
+    //
+    // So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.
+    // The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.
+    //
+    // Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,
+    // but unlike opaque blocks, we can't flip them.
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    ParallelMath::FloatCompFlag isTransparentF[16];
+    for (int px = 0; px < 16; px++)
+        isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+    ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
+    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(0);
+
+    ParallelMath::Int16CompFlag isIsolated[16];
+    ParallelMath::Int16CompFlag isLine[16];
+
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);
+        isIsolated[px] = isIsolatedBase[px] & isOpaque;
+        isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;
+    }
+
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
+            lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);
+        }
+        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
+        numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(1));
+    }
+
+    MUInt15 isolatedAverageQuantized[3];
+    MUInt15 hModeIsolatedQuantized[8][3];
+    MUInt15 isolatedAverageTargets[3];
+    {
+        int divisors[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
+
+        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
+        for (int ch = 0; ch < 3; ch++)
+        {
+            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
+
+            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
+            if (!isFakeBT709)
+                numerator = numerator + addend;
+
+            MUInt15 hModeIsolatedNumerators[8];
+            for (int table = 0; table < 8; table++)
+            {
+                // FIXME: Handle fake BT.709 correctly
+                MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));
+
+                hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;
+            }
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int divisor = divisors[block];
+                if (divisor == 0)
+                {
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
+                    for (int table = 0; table < 8; table++)
+                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, 0);
+                }
+                else
+                {
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
+                    for (int table = 0; table < 8; table++)
+                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);
+                }
+            }
+
+            isolatedAverageTargets[ch] = numerator;
+        }
+    }
+
+    if (isFakeBT709)
+        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
+
+    for (int table = 0; table < 8; table++)
+        for (int ch = 0; ch < 3; ch++)
+            hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), hModeIsolatedQuantized[table][ch]);
+
+    MUInt15 isolatedColor[3];
+    for (int ch = 0; ch < 3; ch++)
+        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
+
+    MFloat isolatedError[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (isFakeBT709)
+            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
+        else if (isUniform)
+            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
+        else
+            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
+
+        ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+    }
+
+    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
+    MUInt15 bestIsolatedColor = ParallelMath::MakeUInt15(0);
+    MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(0);
+    ParallelMath::Int16CompFlag bestUseHMode = ParallelMath::MakeBoolInt16(false);
+
+    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
+    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
+
+    int16_t clusterMaxLine = 0;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
+        if (blockMaxLine > clusterMaxLine)
+            clusterMaxLine = blockMaxLine;
+    }
+
+    int16_t clusterMinLine = -clusterMaxLine;
+
+    int lineDivisors[ParallelMath::ParallelSize];
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
+
+    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
+
+    for (int table = 0; table < 8; table++)
+    {
+        int numUniqueColors[ParallelMath::ParallelSize];
+        MUInt15 uniqueQuantizedColors[31];
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            numUniqueColors[block] = 0;
+
+        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
+        MUInt15 modifierOffset = (modifier + modifier);
+
+        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += 2)
+        {
+            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
+            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
+
+            MUInt15 quantized[3];
+            if (isFakeBT709)
+            {
+                MUInt15 targets[3];
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                    targets[ch] = numerator;
+                }
+
+                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
+            }
+            else
+            {
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                }
+            }
+
+            MUInt15 packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
+                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
+                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
+            }
+        }
+
+        // Stripe unfilled unique colors
+        int maxUniqueColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (numUniqueColors[block] > maxUniqueColors)
+                maxUniqueColors = numUniqueColors[block];
+        }
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
+
+            int numUnique = numUniqueColors[block];
+            for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
+                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
+        }
+
+        MFloat hModeErrors[16];
+        MUInt15 hModeUnquantizedColor[3];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];
+
+            MUInt15 unquantizedCh = (quantizedChannel << 4) | quantizedChannel;
+            hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+        }
+
+        for (int px = 0; px < 16; px++)
+        {
+            hModeErrors[px] = isUniform ? ComputeErrorUniform(hModeUnquantizedColor, pixels[px]) : ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);
+            ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+        }
+
+        MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][0] << 10) | (hModeIsolatedQuantized[table][1] << 5) | hModeIsolatedQuantized[table][2];
+        ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & 1) == 0) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);
+
+        for (int ci = 0; ci < maxUniqueColors; ci++)
+        {
+            MUInt15 lineColors[2][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], 10 - (ch * 5)) & ParallelMath::MakeUInt15(15));
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
+                lineColors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+            }
+
+            MUInt15 bestLineSelector[16];
+            MFloat bestLineError[16];
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat lineErrors[2];
+                for (int i = 0; i < 2; i++)
+                    lineErrors[i] = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
+
+                ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[0], lineErrors[1]));
+                bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(3));
+                bestLineError[px] = ParallelMath::Min(lineErrors[0], lineErrors[1]);
+
+                ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+            }
+
+            // One case considered here was if it was possible to force H mode to be valid when the line color is unused.
+            // That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,
+            // which is always checked after a swap.
+            MFloat tModeError = ParallelMath::MakeFloatZero();
+            MFloat hModeError = ParallelMath::MakeFloatZero();
+            for (int px = 0; px < 16; px++)
+            {
+                tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);
+                hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);
+            }
+
+            ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);
+
+            MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];
+
+            ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);
+
+            ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);
+            ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;
+
+            MFloat roundBestError = tModeError;
+            ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);
+
+            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));
+            ParallelMath::FloatCompFlag useHModeF = ParallelMath::Int16FlagToFloat(useHMode);
+
+            if (ParallelMath::AnySet(errorBetter))
+            {
+                MSInt32 selectors = ParallelMath::MakeSInt32(0);
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 selector = bestLineSelector[px];
+
+                    MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);
+                    ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));
+
+                    ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(0));
+                    ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(2));
+                    selectors = selectors | (ParallelMath::ToInt32(selector) << (px * 2));
+                }
+
+                bestError = ParallelMath::Min(bestError, roundBestError);
+                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
+                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
+                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
+                ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);
+                ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);
+                
+                bestIsThisMode = bestIsThisMode | errorBetter;
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (ParallelMath::Extract(bestIsThisMode, block))
+        {
+            uint32_t lowBits = 0;
+            uint32_t highBits = 0;
+
+            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
+            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
+
+            for (int ch = 0; ch < 3; ch++)
+                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
+
+            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
+            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+
+            ParallelMath::ScalarUInt16 lineColor[3];
+            for (int ch = 0; ch < 3; ch++)
+                lineColor[ch] = (blockBestLineColor >> (10 - (ch * 5))) & 15;
+
+            if (ParallelMath::Extract(bestIsHMode, block))
+            {
+                // T mode: C1, C2+M, Transparent, C2-M
+                // H mode: C1+M, C1-M, Transparent, C2-M
+                static const ParallelMath::ScalarUInt16 selectorRemapSector[4] = { 1, 0, 1, 0 };
+                static const ParallelMath::ScalarUInt16 selectorRemapSign[4] = { 1, 0, 0, 1 };
+
+                // Remap selectors
+                ParallelMath::ScalarUInt16 signBits = 0;
+                ParallelMath::ScalarUInt16 sectorBits = 0;
+                int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+                for (int px = 0; px < 16; px++)
+                {
+                    int32_t selector = (blockBestSelectors >> (px * 2)) & 3;
+                    sectorBits |= (selectorRemapSector[selector] << px);
+                    signBits |= (selectorRemapSign[selector] << px);
+                }
+
+                ParallelMath::ScalarUInt16 blockColors[2] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };
+
+                EmitHModeBlock(outputBuffer + block * 8, blockColors, sectorBits, signBits, blockBestTable, false);
+            }
+            else
+                EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);
+        }
+    }
+}
+
+
+cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)
+{
+    // Widens a quantized planar coefficient to 8 bits by replicating its top bits into the
+    // low bits: channel 1 holds 7-bit values (1 pad bit), every other channel 6-bit values (2 pad bits).
+    const int padBits = (ch == 1) ? 1 : 2;
+    return (coeff << padBits) | (ParallelMath::RightShift(coeff, 8 - 2 * padBits));
+}
+
+void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    // Tries an ETC2 planar-mode encoding: fits a plane O + x*H' + y*V' to each channel by least squares, quantizes the coefficients,
+    // and for every block whose total error beats bestError, lowers bestError and writes 8 bytes at outputBuffer + block * 8.
+    // NOTE: If it's desired to do this in another color space, the best way to do it would probably be
+    // to do everything in that color space and then transform it back to RGB.
+    // We solve for H' = (H-O)/4 and V' = (V-O)/4 to simplify the math; the substitution is undone after the solve.
+    // Per-pixel error: (x*H' + y*V' + O - C)^2, with C the pixel value and x = px % 4, y = px / 4
+    MFloat h[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+    MFloat v[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+    MFloat o[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0); // fit on preWeightedPixels, then convert the coefficients back to RGB
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0); // when set, the per-channel weights are not applied to the error
+
+    MFloat totalError = ParallelMath::MakeFloatZero();
+    MUInt15 bestCoeffs[3][3];	// [Channel][Coeff], coefficients ordered O, H, V
+    for (int ch = 0; ch < 3; ch++)
+    {
+        float fhh = 0.f; // Coefficients of the expanded quadratic error (see the formula after the loop); these depend only on pixel coordinates
+        float fho = 0.f;
+        float fhv = 0.f;
+        float foo = 0.f;
+        float fov = 0.f;
+        float fvv = 0.f;
+        MFloat fc = ParallelMath::MakeFloatZero(); // constant term; only referenced by the commented-out total error below
+        MFloat fh = ParallelMath::MakeFloatZero(); // linear terms; these depend on pixel data, hence vector types
+        MFloat fv = ParallelMath::MakeFloatZero();
+        MFloat fo = ParallelMath::MakeFloatZero();
+
+        float &foh = fho; // Aliases: each symmetric pair shares one accumulator, so every cross term ends up counted twice
+        float &fvh = fhv;
+        float &fvo = fov;
+
+        for (int px = 0; px < 16; px++)
+        {
+            float x = static_cast<float>(px % 4);
+            float y = static_cast<float>(px / 4);
+            MFloat c = isFakeBT709 ? preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]);
+
+            // (x*H + y*V + O - C)^2 expanded as a product: each group below is one term of the first factor times every term of the second
+            fhh += x * x;
+            fhv += x * y;
+            fho += x;
+            fh = fh - c * x;
+
+            fvh += y * x;
+            fvv += y * y;
+            fvo += y;
+            fv = fv - c * y;
+
+            foh += x;
+            fov += y;
+            foo += 1;
+            fo = fo - c;
+
+            fh = fh - c * x;
+            fv = fv - c * y;
+            fo = fo - c;
+            fc = fc + c * c;
+        }
+
+        //float totalError = fhh * h * h + fho * h*o + fhv * h*v + foo * o * o + fov * o*v + fvv * v * v + fh * h + fv * v + fo * o + fc;
+
+        // error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc
+        // derror/dh = 2*fhh*h + fho*o + fhv*v + fh
+        // derror/dv = fhv*h + fov*o + 2*fvv*v + fv
+        // derror/do = fho*h + 2*foo*o + fov*v + fo
+
+        // Solve system of equations (all three derivatives equal to zero) by elimination
+        // h o v 1 = 0
+        // -------
+        // d e f g  R0
+        // i j k l  R1
+        // m n p q  R2
+
+        float d = 2.0f * fhh;
+        float e = fho;
+        float f = fhv;
+        MFloat gD = fh;
+
+        float i = fhv;
+        float j = fov;
+        float k = 2.0f * fvv;
+        MFloat lD = fv;
+
+        float m = fho;
+        float n = 2.0f * foo;
+        float p = fov;
+        MFloat qD = fo;
+
+        {
+            // Factor out first column from R1 and R2
+            float r0to1 = -i / d; // the divisors d, k1 and n2 are built only from the fixed pixel coordinates, never from pixel data
+            float r0to2 = -m / d;
+
+            // 0 j1 k1 l1D
+            float j1 = j + r0to1 * e;
+            float k1 = k + r0to1 * f;
+            MFloat l1D = lD + gD * r0to1;
+
+            // 0 n1 p1 q1D
+            float n1 = n + r0to2 * e;
+            float p1 = p + r0to2 * f;
+            MFloat q1D = qD + gD * r0to2;
+
+            // Factor out third column from R2
+            float r1to2 = -p1 / k1;
+
+            // 0 n2 0 q2D
+            float n2 = n1 + r1to2 * j1;
+            MFloat q2D = q1D + l1D * r1to2;
+
+            o[ch] = -q2D / n2;
+
+            // Factor out second column from R1
+            // 0 n2 0 q2D
+
+            float r2to1 = -j1 / n2;
+
+            // 0 0 k1 l2D
+            // 0 n2 0 q2D
+            MFloat l2D = l1D + q2D * r2to1;
+
+            float elim2 = -f / k1;
+            float elim1 = -e / n2;
+
+            // d 0 0 g2D
+            MFloat g2D = gD + l2D * elim2 + q2D * elim1;
+
+            // n2*o + q2 = 0
+            // o = -q2 / n2
+            h[ch] = -g2D / d;
+            v[ch] = -l2D / k1;
+        }
+
+        // Undo the local transformation: H = 4*H' + O, V = 4*V' + O
+        h[ch] = h[ch] * 4.0f + o[ch];
+        v[ch] = v[ch] * 4.0f + o[ch];
+    }
+
+    if (isFakeBT709)
+    {
+        MFloat oRGB[3];
+        MFloat hRGB[3];
+        MFloat vRGB[3];
+
+        ConvertFromFakeBT709(oRGB, o);
+        ConvertFromFakeBT709(hRGB, h);
+        ConvertFromFakeBT709(vRGB, v);
+
+        // Twiddling in fake BT.709 is a mess, just round off for now (the precision is pretty good anyway)
+        {
+            ParallelMath::RoundTowardNearestForScope rtn;
+
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MFloat fcoeffs[3] = { oRGB[ch], hRGB[ch], vRGB[ch] };
+
+                for (int c = 0; c < 3; c++)
+                {
+                    MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
+                    if (ch == 1)
+                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f)); // channel 1 (green) coefficients are 7-bit
+                    else
+                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f)); // channels 0 and 2 (red, blue) coefficients are 6-bit
+                    fcoeffs[c] = coeff;
+                }
+
+                for (int c = 0; c < 3; c++)
+                    bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);
+            }
+        }
+
+        MUInt15 reconstructed[16][3];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][0], ch);
+            MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][1], ch);
+            MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][2], ch);
+
+            MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+            MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+            MFloat error = ParallelMath::MakeFloatZero(); // NOTE(review): unused in this branch; the error is computed from 'reconstructed' below
+
+            MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2; // 4*O, plus 2 as the rounding bias for the >> 2 below
+
+            for (int px = 0; px < 16; px++)
+            {
+                MUInt15 pxv = ParallelMath::MakeUInt15(px);
+                MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
+                MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
+
+                MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
+                MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
+                reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
+            }
+        }
+
+        totalError = ParallelMath::MakeFloatZero();
+        for (int px = 0; px < 16; px++)
+            totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);
+    }
+    else
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MFloat fcoeffs[3] = { o[ch], h[ch], v[ch] };
+            MUInt15 coeffRanges[3][2]; // [Coeff][0 = rounded down, 1 = rounded up]
+
+            for (int c = 0; c < 3; c++)
+            {
+                MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
+                if (ch == 1)
+                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f)); // channel 1 (green) coefficients are 7-bit
+                else
+                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f)); // channels 0 and 2 (red, blue) coefficients are 6-bit
+                fcoeffs[c] = coeff;
+            }
+
+            {
+                ParallelMath::RoundDownForScope rd;
+                for (int c = 0; c < 3; c++)
+                    coeffRanges[c][0] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);
+            }
+
+            {
+                ParallelMath::RoundUpForScope ru;
+                for (int c = 0; c < 3; c++)
+                    coeffRanges[c][1] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);
+            }
+
+            MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);
+            for (int io = 0; io < 2; io++) // Try all 8 combinations of rounded-down/rounded-up O, H and V and keep the lowest error per block
+            {
+                MUInt15 dO = DecodePlanarCoeff(coeffRanges[0][io], ch);
+
+                for (int ih = 0; ih < 2; ih++)
+                {
+                    MUInt15 dH = DecodePlanarCoeff(coeffRanges[1][ih], ch);
+                    MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+                    for (int iv = 0; iv < 2; iv++)
+                    {
+                        MUInt15 dV = DecodePlanarCoeff(coeffRanges[2][iv], ch);
+                        MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+                        MFloat error = ParallelMath::MakeFloatZero();
+
+                        MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2; // 4*O, plus 2 as the rounding bias for the >> 2 below
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 pxv = ParallelMath::MakeUInt15(px);
+                            MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
+                            MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
+
+                            MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
+                            MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
+                            MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
+
+                            MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);
+
+                            MFloat deltaF = ParallelMath::ToFloat(delta);
+                            error = error + deltaF * deltaF;
+                        }
+
+                        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));
+                        if (ParallelMath::AnySet(errorBetter))
+                        {
+                            bestChannelError = ParallelMath::Min(error, bestChannelError);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][0], errorBetter, coeffRanges[0][io]);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][1], errorBetter, coeffRanges[1][ih]);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][2], errorBetter, coeffRanges[2][iv]);
+                        }
+                    }
+                }
+            }
+
+            if (!isUniform) // Scale the channel's squared error by its squared weight
+            {
+                switch (ch)
+                {
+                case 0:
+                    bestChannelError = bestChannelError * (options.redWeight * options.redWeight);
+                    break;
+                case 1:
+                    bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);
+                    break;
+                case 2:
+                    bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);
+                    break;
+                default:
+                    break;
+                }
+            }
+
+            totalError = totalError + bestChannelError;
+        }
+    }
+
+    ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
+    if (ParallelMath::AnySet(errorBetter))
+    {
+        bestError = ParallelMath::Min(bestError, totalError);
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (!ParallelMath::Extract(errorBetter, block))
+                continue;
+
+            int ro = ParallelMath::Extract(bestCoeffs[0][0], block);
+            int rh = ParallelMath::Extract(bestCoeffs[0][1], block);
+            int rv = ParallelMath::Extract(bestCoeffs[0][2], block);
+
+            int go = ParallelMath::Extract(bestCoeffs[1][0], block);
+            int gh = ParallelMath::Extract(bestCoeffs[1][1], block);
+            int gv = ParallelMath::Extract(bestCoeffs[1][2], block);
+
+            int bo = ParallelMath::Extract(bestCoeffs[2][0], block);
+            int bh = ParallelMath::Extract(bestCoeffs[2][1], block);
+            int bv = ParallelMath::Extract(bestCoeffs[2][2], block);
+
+            int go1 = go >> 6; // Split the coefficients that are stored in several non-contiguous bit fields below
+            int go2 = go & 63;
+
+            int bo1 = bo >> 5;
+            int bo2 = (bo >> 3) & 3;
+            int bo3 = bo & 7;
+
+            int rh1 = (rh >> 1);
+            int rh2 = rh & 1;
+
+            int fakeR = ro >> 2; // "fake" values: presumably what a differential-mode decoder would read as base/delta from these bits — confirm against the ETC2 spec
+            int fakeDR = go1 | ((ro & 3) << 1);
+
+            int fakeG = (go2 >> 2);
+            int fakeDG = ((go2 & 3) << 1) | bo1;
+
+            int fakeB = bo2;
+            int fakeDB = bo3 >> 1;
+
+            uint32_t highBits = 0;
+            uint32_t lowBits = 0;
+
+            // Avoid overflowing R
+            if ((fakeDR & 4) != 0 && fakeR + fakeDR < 8)
+                highBits |= 1 << (63 - 32); // NOTE(review): 1 << 31 shifts an int into its sign bit; an unsigned literal would be cleaner
+
+            // Avoid overflowing G
+            if ((fakeDG & 4) != 0 && fakeG + fakeDG < 8)
+                highBits |= 1 << (55 - 32);
+
+            // Overflow B
+            if (fakeB + fakeDB < 4)
+            {
+                // Overflow low
+                highBits |= 1 << (42 - 32);
+            }
+            else
+            {
+                // Overflow high
+                highBits |= 7 << (45 - 32);
+            }
+
+            highBits |= ro << (57 - 32);
+            highBits |= go1 << (56 - 32);
+            highBits |= go2 << (49 - 32);
+            highBits |= bo1 << (48 - 32);
+            highBits |= bo2 << (43 - 32);
+            highBits |= bo3 << (39 - 32);
+            highBits |= rh1 << (34 - 32);
+            highBits |= 1 << (33 - 32); // bit 33 is always set (presumably the differential-mode flag — confirm against the ETC2 spec)
+            highBits |= rh2 << (32 - 32);
+
+            lowBits |= gh << 25; // NOTE(review): gh is an int of up to 127, so this shift reaches bit 31; consider casting to uint32_t first
+            lowBits |= bh << 19;
+            lowBits |= rv << 13;
+            lowBits |= gv << 6;
+            lowBits |= bv << 0;
+
+            for (int i = 0; i < 4; i++) // Emit highBits then lowBits, each most significant byte first
+                outputBuffer[block * 8 + i] = (highBits >> (24 - i * 8)) & 0xff;
+            for (int i = 0; i < 4; i++)
+                outputBuffer[block * 8 + i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha)
+{ // Compresses ParallelSize blocks at once: the candidate encoders below run in turn, each given outputBuffer and the running per-block bestError; punchthroughAlpha enables the transparent-pixel handling.
+    ParallelMath::Int16CompFlag pixelIsTransparent[16]; // one flag vector per pixel position
+    ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);
+    ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);
+
+    if (punchthroughAlpha)
+    {
+        const float fThreshold = std::max<float>(std::min<float>(1.0f, options.threshold), 0.0f) * 255.0f; // options.threshold clamped to [0, 1], scaled to 0..255
+
+        // +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent
+        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + 1.0f)));
+
+        for (int px = 0; px < 16; px++)
+        {
+            MUInt15 alpha;
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][3]); // channel 3 is used as alpha
+
+            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);
+            anyTransparent = (anyTransparent | isTransparent);
+            allTransparent = (allTransparent & isTransparent);
+            pixelIsTransparent[px] = isTransparent;
+        }
+    }
+    else
+    {
+        for (int px = 0; px < 16; px++)
+            pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);
+
+        allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+
+    ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData); // NOTE(review): unchecked downcast; relies on the caller passing an ETC2CompressionDataInternal
+
+    MUInt15 pixels[16][3];
+    MFloat preWeightedPixels[16][3];
+    ExtractBlocks(pixels, preWeightedPixels, pixelBlocks, options);
+
+    if (ParallelMath::AnySet(anyTransparent)) // Zero the color of transparent pixels in both representations
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];
+            ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);
+
+            for (int ch = 0; ch < 3; ch++)
+            {
+                ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(0));
+                ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(0.0f));
+            }
+        }
+    }
+
+    if (!ParallelMath::AllSet(allTransparent)) // skipped only when AllSet reports the all-transparent flag for every block
+        EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);
+
+    MFloat chromaDelta[16][2]; // per-pixel 2D chroma coordinate relative to the block's coordinate sum (both scaled by the pixel count, see below)
+
+    MUInt15 numOpaque = ParallelMath::MakeUInt15(16); // 16 minus the number of transparent pixels
+    for (int px = 0; px < 16; px++)
+        numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(1));
+
+    if (options.flags & cvtt::Flags::Uniform)
+    {
+        MSInt16 chromaCoordinates3[16][2]; // [0] = ch0 - ch2, [1] = ch0 - 2*ch1 + ch2
+        for (int px = 0; px < 16; px++)
+        {
+            chromaCoordinates3[px][0] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
+            chromaCoordinates3[px][1] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][1] << 1) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
+        }
+
+        MSInt16 chromaCoordinateCentroid[2] = { ParallelMath::MakeSInt16(0), ParallelMath::MakeSInt16(0) }; // sum over all 16 pixels (not divided by the count)
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 2; ch++)
+                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
+        }
+
+        if (punchthroughAlpha) // delta = coordinate * numOpaque - sum: the coordinate is scaled up instead of dividing the sum
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                {
+                    MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque));
+                    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];
+                    chromaDelta[px][ch] = ParallelMath::ToFloat(delta);
+                }
+            }
+        }
+        else
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                    chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << 4) - chromaCoordinateCentroid[ch]); // same as above with a fixed count of 16
+            }
+        }
+
+        const MFloat rcpSqrt3 = ParallelMath::MakeFloat(0.57735026918962576450914878050196f); // 1 / sqrt(3)
+
+        for (int px = 0; px < 16; px++)
+            chromaDelta[px][1] = chromaDelta[px][1] * rcpSqrt3;
+    }
+    else
+    {
+        const float chromaAxis0[3] = { internalData->m_chromaSideAxis0[0], internalData->m_chromaSideAxis0[1], internalData->m_chromaSideAxis0[2] };
+        const float chromaAxis1[3] = { internalData->m_chromaSideAxis1[0], internalData->m_chromaSideAxis1[1], internalData->m_chromaSideAxis1[2] };
+
+        MFloat chromaCoordinates3[16][2]; // projection of the pre-weighted pixel onto the two chroma axes
+        for (int px = 0; px < 16; px++)
+        {
+            const MFloat &px0 = preWeightedPixels[px][0];
+            const MFloat &px1 = preWeightedPixels[px][1];
+            const MFloat &px2 = preWeightedPixels[px][2];
+
+            chromaCoordinates3[px][0] = px0 * chromaAxis0[0] + px1 * chromaAxis0[1] + px2 * chromaAxis0[2];
+            chromaCoordinates3[px][1] = px0 * chromaAxis1[0] + px1 * chromaAxis1[1] + px2 * chromaAxis1[2];
+        }
+
+        MFloat chromaCoordinateCentroid[2] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() }; // sum over all 16 pixels (not divided by the count)
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 2; ch++)
+                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
+        }
+
+        if (punchthroughAlpha) // delta = coordinate * numOpaque - sum, as in the uniform path
+        {
+            const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                {
+                    MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;
+                    MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];
+                    chromaDelta[px][ch] = delta;
+                }
+            }
+        }
+        else
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                    chromaDelta[px][ch] = chromaCoordinates3[px][ch] * 16.0f - chromaCoordinateCentroid[ch];
+            }
+        }
+    }
+
+
+    MFloat covXX = ParallelMath::MakeFloatZero(); // sums of products of the chroma deltas (entries of a symmetric 2x2 matrix)
+    MFloat covYY = ParallelMath::MakeFloatZero();
+    MFloat covXY = ParallelMath::MakeFloatZero();
+
+    for (int px = 0; px < 16; px++)
+    {
+        MFloat nx = chromaDelta[px][0];
+        MFloat ny = chromaDelta[px][1];
+
+        covXX = covXX + nx * nx;
+        covYY = covYY + ny * ny;
+        covXY = covXY + nx * ny;
+    }
+
+    MFloat halfTrace = (covXX + covYY) * 0.5f;
+    MFloat det = covXX * covYY - covXY * covXY;
+
+    MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det)); // clamped at zero before the square root
+
+    MFloat ev = halfTrace + mm; // larger eigenvalue of [[covXX, covXY], [covXY, covYY]]
+
+    MFloat dx = (covYY - ev + covXY); // (dx, dy) satisfies the sum of the two rows of (M - ev*I) * d = 0, i.e. it lies along the eigenvector for ev
+    MFloat dy = -(covXX - ev + covXY);
+
+    // If evenly distributed, pick an arbitrary plane
+    ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());
+    ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(1.f));
+
+    ParallelMath::Int16CompFlag sectorAssignments[16]; // set where the pixel's chroma delta has a negative dot product with (dx, dy)
+    for (int px = 0; px < 16; px++)
+        sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][0] * dx + chromaDelta[px][1] * dy, ParallelMath::MakeFloatZero()));
+
+    if (!ParallelMath::AllSet(allTransparent))
+    {
+        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
+
+        // Flip sector assignments
+        for (int px = 0; px < 16; px++)
+            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+
+        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
+
+        EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options); // runs with the flipped assignments
+
+        CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, internalData->m_drs, options, true);
+    }
+
+    if (ParallelMath::AnySet(anyTransparent))
+    {
+        if (!ParallelMath::AllSet(allTransparent)) // same condition as the opaque pass above, so this undoes the flip made there
+        {
+            // Flip sector assignments
+            for (int px = 0; px < 16; px++)
+                sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+        }
+
+        // Reset the error of any transparent blocks to max and retry with punchthrough modes
+        ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));
+
+        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
+
+        // Flip sector assignments
+        for (int px = 0; px < 16; px++)
+            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+
+        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
+
+        CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, static_cast<ETC2CompressionDataInternal*>(compressionData)->m_drs, options); // NOTE(review): repeats the cast already stored in internalData
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, const Options &options)
+{
+    // Gathers channel 3 (alpha) of every pixel from each of the ParallelSize input blocks into one
+    // vector per pixel position, then hands the gathered values to the shared alpha encoder.
+    MUInt15 alphaValues[16];
+
+    for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+        for (int pixelIndex = 0; pixelIndex < 16; pixelIndex++)
+            ParallelMath::PutUInt15(alphaValues[pixelIndex], lane, pixelBlocks[lane].m_pixels[pixelIndex][3]);
+
+    CompressETC2AlphaBlockInternal(outputBuffer, alphaValues, false, false, options); // is11Bit = false, isSigned = false
+}
+// Searches all 16 modifier tables, 10 offset pairings per table and 2 multipliers per pairing for the lowest-error encoding of the 16 values in each lane, then writes 8 bytes per lane to outputBuffer.
+void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options)
+{
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? 2047 : 255);
+    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
+    // Per-lane minimum and maximum of the 16 input values.
+    for (int px = 0; px < 16; px++)
+    {
+        minAlpha = ParallelMath::Min(minAlpha, pixels[px]);
+        maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);
+    }
+    // Range width and doubled midpoint of the inputs; both feed the candidate generation below.
+    MUInt15 alphaSpan = maxAlpha - minAlpha;
+    MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;
+    // Best candidate seen so far in each lane; the error starts at 0x7fffffff so any real candidate can replace it.
+    MUInt31 bestTotalError = ParallelMath::MakeUInt31(0x7fffffff);
+    MUInt15 bestTableIndex = ParallelMath::MakeUInt15(0);
+    MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(0);
+    MUInt15 bestMultiplier = ParallelMath::MakeUInt15(0);
+    MUInt15 bestIndexes[16];
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+    // r selects which positive-table entries bound the candidate offset range (see maxOffset/minOffset below).
+    const int numAlphaRanges = 10;
+    for (uint16_t tableIndex = 0; tableIndex < 16; tableIndex++)
+    {
+        for (int r = 0; r < numAlphaRanges; r++)
+        {
+            int subrange = r % 3;
+            int mainRange = r / 3;
+            // The negative bound mirrors a positive table entry as -(entry) - 1, so offsetSpan is >= 1 provided table entries are non-negative.
+            int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - (subrange & 1)];
+            int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - ((subrange >> 1) & 1)] - 1;
+            uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);
+
+            MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);
+            MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);
+            MUInt15 voffsetSpan = ParallelMath::MakeUInt15(offsetSpan);
+            // Starting multiplier = alphaSpan / offsetSpan, computed lane by lane with scalar division.
+            MUInt15 minMultiplier = ParallelMath::MakeUInt15(0);
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);
+
+                uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;
+                ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);
+            }
+
+            if (is11Bit)
+            {
+                // Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 8
+                minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(112)) & ParallelMath::MakeUInt15(120);
+            }
+            else
+            {
+                // We cap at 1 and 14 so both multipliers are valid and dividable
+                // Cases where offset span is 0 should be caught by multiplier 1 of table 13
+                minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(14)), ParallelMath::MakeUInt15(1));
+            }
+            // Try the starting multiplier and the next one up (+1, or +8 in 11-bit mode where multipliers are kept pre-scaled by 8).
+            for (uint16_t multiplierOffset = 0; multiplierOffset < 2; multiplierOffset++)
+            {
+                MUInt15 multiplier = minMultiplier;
+
+                if (is11Bit)
+                {
+                    if (multiplierOffset == 1)
+                        multiplier = multiplier + ParallelMath::MakeUInt15(8);
+                    else
+                        multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(1));
+                }
+                else
+                {
+                    if (multiplierOffset == 1)
+                        multiplier = multiplier + ParallelMath::MakeUInt15(1);
+                }
+
+                MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);
+                MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));
+
+                // codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 2
+                MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;
+
+                MUInt15 baseAlpha;
+                if (is11Bit)
+                {
+                    // In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.
+                    if (isSigned)
+                        unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(8);
+
+                    // -128 is illegal for some reason
+                    MSInt16 minBaseAlphaTimes2 = isSigned ? ParallelMath::MakeSInt16(16) : ParallelMath::MakeSInt16(0);
+
+                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(4095));
+                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, 1) & ParallelMath::MakeUInt15(2040);
+
+                    if (!isSigned)
+                        baseAlpha = baseAlpha + ParallelMath::MakeUInt15(4);
+                }
+                else
+                {
+                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(0))), ParallelMath::MakeUInt15(510));
+                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(1), 1);
+                }
+                // Quantize every value against this candidate and accumulate the squared error.
+                MUInt15 indexes[16];
+                MUInt31 totalError = ParallelMath::MakeUInt31(0);
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 quantizedValues;
+                    QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);
+
+                    if (is11Bit)
+                    {
+                        MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);
+                        MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);
+                        totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);
+                    }
+                    else
+                        totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));
+                }
+                // Adopt this candidate in every lane where it strictly lowers the total error.
+                ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));
+                if (ParallelMath::AnySet(isBetter))
+                {
+                    ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);
+                    ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));
+                    ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);
+                    ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);
+
+                    for (int px = 0; px < 16; px++)
+                        ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);
+                }
+
+                // TODO: Do one refine pass
+            }
+        }
+    }
+    // 11-bit multipliers were tracked pre-scaled by 8; shift them back down before packing.
+    if (is11Bit)
+    {
+        bestMultiplier = ParallelMath::RightShift(bestMultiplier, 3);
+
+        if (isSigned)
+            bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(0x80);
+    }
+    // Pack each lane: byte 0 = base codeword, byte 1 = multiplier (high nibble) | table index, bytes 2..7 = sixteen 3-bit indexes, MSB first.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        uint8_t *output = outputBuffer + block * 8;
+
+        output[0] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));
+        // NOTE(review): in 11-bit mode baseAlpha ranges up to 2044, yet output[0] above keeps only its low 8 bits with no >> 3 -- confirm this is intended.
+        ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);
+        ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);
+
+        output[1] = static_cast<uint8_t>((multiplier << 4) | tableIndex);
+        // pixelSelectorOrder transposes the 4x4 grid of indexes before they are written out.
+        static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+        ParallelMath::ScalarUInt16 indexes[16];
+        for (int px = 0; px < 16; px++)
+            indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);
+        // Bit accumulator: push 3 bits per index and flush a byte whenever at least 8 bits are pending.
+        int outputOffset = 2;
+        int outputBits = 0;
+        int numOutputBits = 0;
+        for (int s = 0; s < 16; s++)
+        {
+            outputBits = (outputBits << 3) | indexes[s];
+            numOutputBits += 3;
+
+            if (numOutputBits >= 8)
+            {
+                output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - 8));
+                numOutputBits -= 8;
+
+                outputBits &= ((1 << numOutputBits) - 1);
+            }
+        }
+        // 16 indexes * 3 bits = 48 bits = exactly 6 bytes after the 2 header bytes.
+        assert(outputOffset == 8 && numOutputBits == 0);
+    }
+}
+// Remaps 11-bit EAC samples into a non-negative range that fits a UInt15, then runs the shared alpha/EAC search in 11-bit mode.
+void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options)
+{
+    // Signed input is capped at 1023 and biased by 1024; unsigned input is capped at 2047.
+    // The floor is 1 for signed and 0 for unsigned, giving 1..2047 and 0..2047 respectively,
+    // which keeps the unquantized base value representable as a UInt15.
+    const MSInt16 upperLimit = ParallelMath::MakeSInt16(isSigned ? 1023 : 2047);
+    const MSInt16 lowerLimit = ParallelMath::MakeSInt16(isSigned ? 1 : 0);
+    const MSInt16 signedBias = ParallelMath::MakeSInt16(1024);
+
+    MUInt15 remapped[16];
+    for (int texel = 0; texel < 16; texel++)
+    {
+        // Gather this texel from every input block, one block per lane.
+        MSInt16 gathered;
+        for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+            ParallelMath::PutSInt16(gathered, lane, inputBlocks[lane].m_pixels[texel]);
+
+        // Cap from above, apply the signed bias, then clamp from below.
+        MSInt16 capped = ParallelMath::Min(gathered, upperLimit);
+        if (isSigned)
+            capped = capped + signedBias;
+
+        remapped[texel] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(lowerLimit, capped));
+    }
+
+    // The third argument (is11Bit) is always true for EAC.
+    CompressETC2AlphaBlockInternal(outputBuffer, remapped, true, isSigned, options);
+}
+// Encodes the input blocks as plain ETC1 (punchthrough disabled), using the DifferentialResolveStorage held by compressionData.
+void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options)
+{
+    // Unpack the inputs into per-channel parallel form plus their pre-weighted float copies.
+    MUInt15 rgb[16][3];
+    MFloat weightedRgb[16][3];
+    ExtractBlocks(rgb, weightedRgb, inputBlocks, options);
+
+    // The running best error starts at FLT_MAX.
+    MFloat lowestError = ParallelMath::MakeFloat(FLT_MAX);
+    CompressETC1BlockInternal(lowestError, outputBuffer, rgb, weightedRgb, static_cast<ETC1CompressionDataInternal*>(compressionData)->m_drs, options, false);
+}
+// Gathers RGB from every input block into parallel form and fills preWeightedPixels according to the fake-BT.709 / Uniform flags or the per-channel weights in options.
+void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options)
+{
+    const bool useFakeBT709 = (options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0;
+    const bool useUniformWeights = (options.flags & cvtt::Flags::Uniform) != 0;
+    for (int texel = 0; texel < 16; texel++)
+    {
+        MUInt15 *rgb = pixels[texel];
+        MFloat *weighted = preWeightedPixels[texel];
+        for (int ch = 0; ch < 3; ch++)
+            for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+                ParallelMath::PutUInt15(rgb[ch], lane, inputBlocks[lane].m_pixels[texel][ch]);
+
+        if (useFakeBT709) // checked first, so it takes priority over the Uniform flag
+        {
+            ConvertToFakeBT709(weighted, rgb);
+            continue;
+        }
+        if (useUniformWeights)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                weighted[ch] = ParallelMath::ToFloat(rgb[ch]);
+            continue;
+        }
+        weighted[0] = ParallelMath::ToFloat(rgb[0]) * options.redWeight;
+        weighted[1] = ParallelMath::ToFloat(rgb[1]) * options.greenWeight;
+        weighted[2] = ParallelMath::ToFloat(rgb[2]) * options.blueWeight;
+    }
+}
+// Truncates sectorCumulative to 5 bits (differential) or 4 bits per channel, then adds 1 to whichever channels give the closest match in fake BT.709 space.
+void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
+{
+    for (int ch = 0; ch < 3; ch++)
+    {
+        const MUInt15& cu15 = sectorCumulative[ch];
+
+        if (isDifferential)
+        {
+            //quantized[ch] = (cu * 31 + (cu >> 3)) >> 11;
+            quantized[ch] = ParallelMath::ToUInt15(
+                ParallelMath::RightShift(
+                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
+                    , 11)
+            );
+        }
+        else
+        {
+            //quantized[ch] = (cu * 30 + (cu >> 3)) >> 12;
+            quantized[ch] = ParallelMath::ToUInt15(
+                ParallelMath::RightShift(
+                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
+                    , 12)
+            );
+        }
+    }
+    // 8-bit reconstructions of the truncated level and of the next level up, scaled by 8 before comparison with sectorCumulative.
+    MFloat lowOctantRGBFloat[3];
+    MFloat highOctantRGBFloat[3];
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MUInt15 unquantized;
+        MUInt15 unquantizedNext;
+        if (isDifferential)
+        {
+            unquantized = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
+            MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(31), quantized[ch] + ParallelMath::MakeUInt15(1));
+            unquantizedNext = (quantizedNext << 3) | ParallelMath::RightShift(quantizedNext, 2);
+        }
+        else
+        {
+            unquantized = (quantized[ch] << 4) | quantized[ch];
+            unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
+        }
+        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << 3);
+        highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << 3);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
+
+    MFloat cumulativeYUV[3];
+    ConvertToFakeBT709(cumulativeYUV, sectorCumulative);
+    // Bit ch of octant selects the rounded-up level for channel ch.
+    for (uint16_t octant = 0; octant < 8; octant++)
+    {
+        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
+        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
+        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
+
+        MFloat octantYUV[3];
+        ConvertToFakeBT709(octantYUV, r, g, b);
+
+        MFloat delta[3];
+        for (int ch = 0; ch < 3; ch++)
+            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
+        // Squared Euclidean distance over all three components (the middle term was previously delta[1] + delta[1]).
+        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
+        bestError = ParallelMath::Min(error, bestError);
+    }
+    // Apply the winning octant: add 1 to each channel whose bit is set.
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
+}
+// Table-driven variant: truncates sectorCumulative per channel and uses Tables::FakeBT709::g_rounding16, indexed by 4 fractional bits per channel, to decide which channels round up.
+void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
+{
+    // sectorCumulative range is 0..2040 (11 bits)
+    // After the fill-in below, differential mode keeps bits 6..10 (5-bit result) and the other mode keeps bits 7..10 (4-bit result).
+
+    MUInt15 rOffset;
+    MUInt15 gOffset;
+    MUInt15 bOffset;
+    MUInt15 quantizedBase[3];
+    MUInt15 upperBound;
+    // Add the top bits back into the bottom so the maximum input 2040 expands to 2047.
+    MUInt15 sectorCumulativeFillIn[3];
+    for (int ch = 0; ch < 3; ch++)
+        sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], 8);
+    // Lookup index layout: r fraction in bits 8..11, g fraction in bits 4..7, b fraction in bits 0..3.
+    if (isDifferential)
+    {
+        rOffset = (sectorCumulativeFillIn[0] << 6) & ParallelMath::MakeUInt15(0xf00);
+        gOffset = (sectorCumulativeFillIn[1] << 2) & ParallelMath::MakeUInt15(0x0f0); // was << 4, which picked fraction bits 0..3 instead of 2..5 like r and b
+        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 2) & ParallelMath::MakeUInt15(0x00f);
+
+        for (int ch = 0; ch < 3; ch++)
+            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 6);
+
+        upperBound = ParallelMath::MakeUInt15(31);
+    }
+    else
+    {
+        rOffset = (sectorCumulativeFillIn[0] << 5) & ParallelMath::MakeUInt15(0xf00);
+        gOffset = (sectorCumulativeFillIn[1] << 1) & ParallelMath::MakeUInt15(0x0f0);
+        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 3) & ParallelMath::MakeUInt15(0x00f);
+
+        for (int ch = 0; ch < 3; ch++)
+            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 7);
+
+        upperBound = ParallelMath::MakeUInt15(15);
+    }
+
+    MUInt15 lookupIndex = (rOffset | gOffset | bOffset);
+    // The table lookup is scalar, so it is done lane by lane.
+    MUInt15 octant;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);
+    // Bit 0 / 1 / 2 of the table entry rounds r / g / b up by one level.
+    quantizedBase[0] = quantizedBase[0] + (octant & ParallelMath::MakeUInt15(1));
+    quantizedBase[1] = quantizedBase[1] + (ParallelMath::RightShift(octant, 1) & ParallelMath::MakeUInt15(1));
+    quantizedBase[2] = quantizedBase[2] + (ParallelMath::RightShift(octant, 2) & ParallelMath::MakeUInt15(1));
+    // Rounding up from the top level would overflow the field, so clamp to the largest legal value.
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);
+}
+// For 4-bit-per-channel colors: adds 1 to whichever channels of quantized bring its reconstruction (times granularity * 2) closest to targets in fake BT.709 space.
+void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 targets[3], const MUInt15 &granularity)
+{
+    MFloat lowOctantRGBFloat[3];
+    MFloat highOctantRGBFloat[3];
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MUInt15 unquantized = (quantized[ch] << 4) | quantized[ch];
+        MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
+
+        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << 1);
+        highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << 1);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
+
+    MFloat cumulativeYUV[3];
+    ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[0]), ParallelMath::ToFloat(targets[1]), ParallelMath::ToFloat(targets[2]));
+    // Bit ch of octant selects the rounded-up level for channel ch.
+    for (uint16_t octant = 0; octant < 8; octant++)
+    {
+        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
+        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
+        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
+
+        MFloat octantYUV[3];
+        ConvertToFakeBT709(octantYUV, r, g, b);
+
+        MFloat delta[3];
+        for (int ch = 0; ch < 3; ch++)
+            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
+        // Squared Euclidean distance over all three components (the middle term was previously delta[1] + delta[1]).
+        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
+        bestError = ParallelMath::Min(error, bestError);
+    }
+    // Apply the winning octant: add 1 to each channel whose bit is set.
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
+}
+// Integer-color overload: widens each channel to float, then applies the per-channel conversion.
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3])
+{
+    const MFloat red = ParallelMath::ToFloat(color[0]);
+    const MFloat green = ParallelMath::ToFloat(color[1]);
+    const MFloat blue = ParallelMath::ToFloat(color[2]);
+    ConvertToFakeBT709(yuv, red, green, blue);
+}
+
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3])
+{
+    ConvertToFakeBT709(yuv, color[0], color[1], color[2]); // array overload: unpack and forward
+}
+
+// Per-channel overload: applies a fixed 3x3 linear transform to (pr, pg, pb) and stores the result in yuv.
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat &pr, const MFloat &pg, const MFloat &pb)
+{
+    // Work on copies of the inputs, taken before anything is written to yuv.
+    MFloat red = pr;
+    MFloat green = pg;
+    MFloat blue = pb;
+    yuv[0] = red * 0.368233989135369f + green * 1.23876274963149f + blue * 0.125054068802017f;
+    yuv[1] = red * 0.5f - green * 0.4541529f - blue * 0.04584709f;
+    yuv[2] = red * -0.081014709086133f - green * 0.272538676238785f + blue * 0.353553390593274f;
+}
+// Converts a fake BT.709 triple in yuv to RGB using fixed coefficients and stores the result in rgb.
+void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3])
+{
+    // All three inputs are read before rgb is written.
+    MFloat luma = yuv[0] * 0.57735026466774571071f;
+    MFloat chromaFirst = yuv[1];
+    MFloat chromaSecond = yuv[2];
+    rgb[0] = luma + chromaFirst * 1.5748000207960953486f;
+    rgb[1] = luma - chromaFirst * 0.46812425854364753669f - chromaSecond * 0.26491652528157560861f;
+    rgb[2] = luma + chromaSecond * 2.6242146882856944069f;
+}
+
+// Quantizes value against one candidate (tableIndex, baseValue, multiplier): writes the chosen modifier index to outIndexes and the reconstructed, clamped value to outQuantizedValues.
+void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)
+{
+    MSInt16 offset = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);
+    MSInt16 offsetTimes2 = offset + offset;
+
+    // ETC2's offset tables all have a reflect about 0.5*multiplier
+    MSInt16 offsetAboutReflectorTimes2 = offsetTimes2 + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);
+
+    MUInt15 absOffsetAboutReflectorTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(offsetAboutReflectorTimes2));
+    MUInt15 lookupIndex = ParallelMath::RightShift(absOffsetAboutReflectorTimes2, 1);
+    // The division and table lookups are scalar, so they run lane by lane; multiplier must be non-zero in every lane.
+    MUInt15 positiveIndex;
+    MUInt15 positiveOffsetUnmultiplied;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        uint16_t blockLookupIndex = ParallelMath::Extract(lookupIndex, block) / ParallelMath::Extract(multiplier, block);
+        if (blockLookupIndex >= Tables::ETC2::g_alphaRoundingTableWidth)
+            blockLookupIndex = Tables::ETC2::g_alphaRoundingTableWidth - 1;
+        uint16_t index = Tables::ETC2::g_alphaRoundingTables[tableIndex][blockLookupIndex];
+        ParallelMath::PutUInt15(positiveIndex, block, index);
+        ParallelMath::PutUInt15(positiveOffsetUnmultiplied, block, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][index]);
+
+        // TODO: This is suboptimal when the offset is capped.  We should detect 0 and 255 values and always map them to the maximum offsets.
+        // Doing that will also affect refinement though.
+    }
+    // Assuming an arithmetic shift, signBits is all ones for negative offsets; XOR with it turns a positive table entry p into -p - 1.
+    MSInt16 signBits = ParallelMath::RightShift(offsetAboutReflectorTimes2, 15);
+    MSInt16 offsetUnmultiplied = ParallelMath::LosslessCast<MSInt16>::Cast(positiveOffsetUnmultiplied) ^ signBits;
+    MSInt16 quantizedOffset = ParallelMath::CompactMultiply(offsetUnmultiplied, multiplier);
+
+    MSInt16 offsetValue = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + quantizedOffset;
+    // Clamp the reconstruction to the range for the mode: 1..2047 signed 11-bit, 0..2047 unsigned 11-bit, 0..255 otherwise.
+    if (is11Bit)
+    {
+        if (isSigned)
+            outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(1), offsetValue)));
+        else
+            outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(2047), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
+    }
+    else
+        outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(255), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), offsetValue)));
+
+    MUInt15 indexSub = ParallelMath::LosslessCast<MUInt15>::Cast(signBits) & ParallelMath::MakeUInt15(4);
+    // Non-negative offsets yield positiveIndex + 4; negative offsets yield positiveIndex itself.
+    outIndexes = positiveIndex + ParallelMath::MakeUInt15(4) - indexSub;
+}
+
+// Packs one T-mode block (8 bytes, most significant byte first) from two colors, sixteen 2-bit selectors in packedSelectors, a 3-bit table index and the opaque flag.
+void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)
+{
+    static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+    // All bit packing is done on unsigned values so that no shift can reach the sign bit of an int.
+    uint32_t lowBits = 0;
+    uint32_t highBits = 0;
+    // The isolated red channel is split into two 2-bit halves that are stored separately.
+    const uint32_t rh = ((isolatedColor[0] >> 2) & 3);
+    const uint32_t rl = (isolatedColor[0] & 3);
+
+    if (rh + rl < 4)
+    {
+        // Overflow low
+        highBits |= 1u << (58 - 32);
+    }
+    else
+    {
+        // Overflow high
+        highBits |= 7u << (61 - 32);
+    }
+
+    highBits |= rh << (59 - 32);
+    highBits |= rl << (56 - 32);
+    highBits |= static_cast<uint32_t>(isolatedColor[1]) << (52 - 32);
+    highBits |= static_cast<uint32_t>(isolatedColor[2]) << (48 - 32);
+    highBits |= static_cast<uint32_t>(lineColor[0]) << (44 - 32);
+    highBits |= static_cast<uint32_t>(lineColor[1]) << (40 - 32);
+    highBits |= static_cast<uint32_t>(lineColor[2]) << (36 - 32);
+    highBits |= static_cast<uint32_t>((table >> 1) & 3) << (34 - 32);
+    if (opaque)
+        highBits |= 1u << (33 - 32);
+    highBits |= static_cast<uint32_t>(table & 1) << (32 - 32);
+    // Selector low bits go to bits 0..15 and high bits to bits 16..31, visiting pixels in selectorOrder.
+    for (int px = 0; px < 16; px++)
+    {
+        const uint32_t sel = (static_cast<uint32_t>(packedSelectors) >> (2 * selectorOrder[px])) & 3;
+        if ((sel & 0x1) != 0)
+            lowBits |= (1u << px);
+        if ((sel & 0x2) != 0)
+            lowBits |= (1u << (16 + px));
+    }
+
+    for (int i = 0; i < 4; i++)
+        outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;
+    for (int i = 0; i < 4; i++)
+        outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
+}
+// Packs one H-mode block (8 bytes, most significant byte first); falls back to an equivalent T-mode block when both base colors are equal.
+void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)
+{
+    if (blockColors[0] == blockColors[1])
+    {
+        // Base colors are the same.
+        // If the table low bit isn't 1, then we can't encode this, because swapping the block colors will have no effect
+        // on their order.
+        // Instead, we encode this as T mode where all of the indexes are on the line.
+
+        ParallelMath::ScalarUInt16 lineColor[3];
+        ParallelMath::ScalarUInt16 isolatedColor[3];
+        // NOTE(review): these use a 5-bit mask while the main path below uses & 15 on the same fields -- confirm the top bit is always clear.
+        lineColor[0] = isolatedColor[0] = (blockColors[0] >> 10) & 0x1f;
+        lineColor[1] = isolatedColor[1] = (blockColors[0] >> 5) & 0x1f;
+        lineColor[2] = isolatedColor[2] = (blockColors[0] >> 0) & 0x1f;
+        // Every selector starts with its low bit set; the sign bit becomes the selector's high bit.
+        uint32_t packedSelectors = 0x55555555u;
+        for (int px = 0; px < 16; px++)
+            packedSelectors |= static_cast<uint32_t>((signBits >> px) & 1) << ((px * 2) + 1);
+        // Built unsigned so that px == 15 does not shift into the sign bit of an int.
+        EmitTModeBlock(outputBuffer, lineColor, isolatedColor, static_cast<int32_t>(packedSelectors), table, opaque);
+        return;
+    }
+
+    static const int selectorOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+    // Each base color is read as three 4-bit channels stored at a 5-bit stride (red highest).
+    int16_t colors[2][3];
+    for (int sector = 0; sector < 2; sector++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+            colors[sector][ch] = (blockColors[sector] >> ((2 - ch) * 5)) & 15;
+    }
+
+    uint32_t lowBits = 0;
+    uint32_t highBits = 0;
+    // The low table bit is not stored; the colors are swapped when it disagrees with their numeric order, and sectorBits is inverted to match.
+    if (((table & 1) == 1) != (blockColors[0] > blockColors[1]))
+    {
+        for (int ch = 0; ch < 3; ch++)
+            std::swap(colors[0][ch], colors[1][ch]);
+        sectorBits ^= 0xffff;
+    }
+
+    int r1 = colors[0][0];
+    int g1a = colors[0][1] >> 1;
+    int g1b = (colors[0][1] & 1);
+    int b1a = colors[0][2] >> 3;
+    int b1b = colors[0][2] & 7;
+    int r2 = colors[1][0];
+    int g2 = colors[1][1];
+    int b2 = colors[1][2];
+
+    // Avoid overflowing R
+    if ((g1a & 4) != 0 && r1 + g1a < 8)
+        highBits |= 1u << (63 - 32);
+
+    int fakeDG = b1b >> 1;
+    int fakeG = b1a | (g1b << 1);
+
+    if (fakeG + fakeDG < 4)
+    {
+        // Overflow low
+        highBits |= 1 << (50 - 32);
+    }
+    else
+    {
+        // Overflow high
+        highBits |= 7 << (53 - 32);
+    }
+    // Table bits 2 and 1 are stored explicitly as da and db.
+    int da = (table >> 2) & 1;
+    int db = (table >> 1) & 1;
+
+    highBits |= r1 << (59 - 32);
+    highBits |= g1a << (56 - 32);
+    highBits |= g1b << (52 - 32);
+    highBits |= b1a << (51 - 32);
+    highBits |= b1b << (47 - 32);
+    highBits |= r2 << (43 - 32);
+    highBits |= g2 << (39 - 32);
+    highBits |= b2 << (35 - 32);
+    highBits |= da << (34 - 32);
+    if (opaque)
+        highBits |= 1 << (33 - 32);
+    highBits |= db << (32 - 32);
+    // Sign bits fill bits 0..15 and sector bits fill bits 16..31, visiting pixels in selectorOrder.
+    for (int px = 0; px < 16; px++)
+    {
+        const uint32_t sectorBit = (sectorBits >> selectorOrder[px]) & 1;
+        const uint32_t signBit = (signBits >> selectorOrder[px]) & 1;
+
+        lowBits |= (signBit << px);
+        lowBits |= (sectorBit << (16 + px));
+    }
+
+    uint8_t *output = outputBuffer;
+
+    for (int i = 0; i < 4; i++)
+        output[i] = (highBits >> (24 - i * 8)) & 0xff;
+    for (int i = 0; i < 4; i++)
+        output[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
+}
+// Packs one ETC1-layout block (8 bytes, most significant byte first): two base colors (4-bit each, or 5-bit base + 3-bit delta when blockBestD != 0), two table indexes, the diff bit, the flip bit and sixteen 2-bit selector codes.
+void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent)
+{
+    uint32_t highBits = 0;
+    uint32_t lowBits = 0;
+    // Color fields are widened to uint32_t before shifting so the top field cannot overflow a signed int.
+    if (blockBestD == 0)
+    {
+        highBits |= static_cast<uint32_t>(blockBestColors[0][0]) << 28;
+        highBits |= static_cast<uint32_t>(blockBestColors[1][0]) << 24;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][1]) << 20;
+        highBits |= static_cast<uint32_t>(blockBestColors[1][1]) << 16;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][2]) << 12;
+        highBits |= static_cast<uint32_t>(blockBestColors[1][2]) << 8;
+    }
+    else
+    {
+        highBits |= static_cast<uint32_t>(blockBestColors[0][0]) << 27;
+        highBits |= static_cast<uint32_t>((blockBestColors[1][0] - blockBestColors[0][0]) & 7) << 24;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][1]) << 19;
+        highBits |= static_cast<uint32_t>((blockBestColors[1][1] - blockBestColors[0][1]) & 7) << 16;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][2]) << 11;
+        highBits |= static_cast<uint32_t>((blockBestColors[1][2] - blockBestColors[0][2]) & 7) << 8;
+    }
+    // The diff bit is left clear when transparent is set.
+    highBits |= (blockBestTables[0] << 5);
+    highBits |= (blockBestTables[1] << 2);
+    if (!transparent)
+        highBits |= (blockBestD << 1);
+    highBits |= blockBestFlip;
+    // Maps a selector (0..3) to the 2-bit code that is stored in the block.
+    static const uint8_t modifierCodes[4] = { 3, 2, 0, 1 };
+
+    // Scatter each sector's eight selectors to their pixel positions for the chosen flip.
+    uint8_t unpackedSelectorCodes[16];
+    for (int sector = 0; sector < 2; sector++)
+    {
+        int blockSectorBestSelectors = blockBestSelectors[sector];
+
+        for (int px = 0; px < 8; px++)
+        {
+            int selector = (blockSectorBestSelectors >> (2 * px)) & 3;
+            unpackedSelectorCodes[g_flipTables[blockBestFlip][sector][px]] = modifierCodes[selector];
+            // Only the stored code is needed below; the raw selector is not kept.
+        }
+    }
+
+    static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+    // Bit 0 of every code fills lowBits bits 0..15 and bit 1 fills bits 16..31.
+    for (int sb = 0; sb < 2; sb++)
+        for (int px = 0; px < 16; px++)
+            lowBits |= static_cast<uint32_t>((unpackedSelectorCodes[pixelSelectorOrder[px]] >> sb) & 1) << (px + sb * 16);
+
+    for (int i = 0; i < 4; i++)
+        outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;
+    for (int i = 0; i < 4; i++)
+        outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
+}
+
+void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)
+{
+	int numTries = 0;
+
+    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
+    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
+
+    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
+    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
+    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
+    MUInt15 bestFlip = zeroU15;
+    MUInt15 bestD = zeroU15;
+
+    MUInt15 sectorPixels[2][2][8][3];
+    MFloat sectorPreWeightedPixels[2][2][8][3];
+    MUInt15 sectorCumulative[2][2][3];
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    for (int flip = 0; flip < 2; flip++)
+	{
+		for (int sector = 0; sector < 2; sector++)
+		{
+			for (int ch = 0; ch < 3; ch++)
+				sectorCumulative[flip][sector][ch] = zeroU15;
+
+			for (int px = 0; px < 8; px++)
+			{
+				for (int ch = 0; ch < 3; ch++)
+				{
+					MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
+					sectorPixels[flip][sector][px][ch] = pixelChannelValue;
+                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
+					sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
+				}
+			}
+		}
+	}
+
+	static const MSInt16 modifierTables[8][4] =
+	{
+		{ ParallelMath::MakeSInt16(-8), ParallelMath::MakeSInt16(-2), ParallelMath::MakeSInt16(2), ParallelMath::MakeSInt16(8) },
+		{ ParallelMath::MakeSInt16(-17), ParallelMath::MakeSInt16(-5), ParallelMath::MakeSInt16(5), ParallelMath::MakeSInt16(17) },
+		{ ParallelMath::MakeSInt16(-29), ParallelMath::MakeSInt16(-9), ParallelMath::MakeSInt16(9), ParallelMath::MakeSInt16(29) },
+		{ ParallelMath::MakeSInt16(-42), ParallelMath::MakeSInt16(-13), ParallelMath::MakeSInt16(13), ParallelMath::MakeSInt16(42) },
+		{ ParallelMath::MakeSInt16(-60), ParallelMath::MakeSInt16(-18), ParallelMath::MakeSInt16(18), ParallelMath::MakeSInt16(60) },
+		{ ParallelMath::MakeSInt16(-80), ParallelMath::MakeSInt16(-24), ParallelMath::MakeSInt16(24), ParallelMath::MakeSInt16(80) },
+		{ ParallelMath::MakeSInt16(-106), ParallelMath::MakeSInt16(-33), ParallelMath::MakeSInt16(33), ParallelMath::MakeSInt16(106) },
+		{ ParallelMath::MakeSInt16(-183), ParallelMath::MakeSInt16(-47), ParallelMath::MakeSInt16(47), ParallelMath::MakeSInt16(183) },
+	};
+
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    int minD = punchthrough ? 1 : 0;
+
+	for (int flip = 0; flip < 2; flip++)
+	{
+		drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;
+
+		MFloat bestIndError[2] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };
+		MUInt16 bestIndSelectors[2] = { ParallelMath::MakeUInt16(0), ParallelMath::MakeUInt16(0) };
+		MUInt15 bestIndColors[2] = { zeroU15, zeroU15 };
+		MUInt15 bestIndTable[2] = { zeroU15, zeroU15 };
+
+		for (int d = minD; d < 2; d++)
+		{
+			for (int sector = 0; sector < 2; sector++)
+			{
+				const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;
+
+				for (int table = 0; table < 8; table++)
+				{
+					int16_t numOffsets = *potentialOffsets++;
+
+					MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];
+
+                    MUInt15 quantized[3];
+                    for (int oi = 0; oi < numOffsets; oi++)
+                    {
+                        if (!isFakeBT709)
+                        {
+						    for (int ch = 0; ch < 3; ch++)
+						    {
+                                // cu is in range 0..2040
+                                MUInt15 cu15 = ParallelMath::Min(
+                                    ParallelMath::MakeUInt15(2040),
+                                    ParallelMath::ToUInt15(
+                                        ParallelMath::Max(
+                                            ParallelMath::MakeSInt16(0),
+                                            ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
+                                        )
+                                    )
+                                );
+
+                                if (d == 1)
+                                {
+                                    //quantized[ch] = (cu * 31 + (cu >> 3) + 1024) >> 11;
+                                    quantized[ch] = ParallelMath::ToUInt15(
+                                        ParallelMath::RightShift(
+                                            (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(1024)
+                                            , 11)
+                                        );
+                                }
+                                else
+                                {
+                                    //quantized[ch] = (cu * 30 + (cu >> 3) + 2048) >> 12;
+                                    quantized[ch] = ParallelMath::ToUInt15(
+                                        ParallelMath::RightShift(
+                                        (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(2048)
+                                            , 12)
+                                    );
+                                }
+						    }
+                        }
+                        else
+                        {
+                            MUInt15 offsetCumulative[3];
+						    for (int ch = 0; ch < 3; ch++)
+						    {
+                                // cu is in range 0..2040
+                                MUInt15 cu15 = ParallelMath::Min(
+                                    ParallelMath::MakeUInt15(2040),
+                                    ParallelMath::ToUInt15(
+                                        ParallelMath::Max(
+                                            ParallelMath::MakeSInt16(0),
+                                            ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
+                                        )
+                                    )
+                                );
+
+                                offsetCumulative[ch] = cu15;
+						    }
+
+                            if ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != 0)
+                                ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == 1);
+                            else
+                                ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == 1);
+                        }
+
+						possibleColors[oi] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
+					}
+
+					potentialOffsets += numOffsets;
+
+                    ParallelMath::UInt15 numUniqueColors;
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        uint16_t blockNumUniqueColors = 1;
+                        for (int i = 1; i < numOffsets; i++)
+                        {
+                            uint16_t color = ParallelMath::Extract(possibleColors[i], block);
+                            if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
+                                ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
+                        }
+
+                        ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
+                    }
+
+                    int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
+                    for (int block = 1; block < ParallelMath::ParallelSize; block++)
+                        maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
+
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
+                        for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
+                            ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
+                    }
+
+					for (int i = 0; i < maxUniqueColors; i++)
+					{
+						MFloat error = ParallelMath::MakeFloatZero();
+						MUInt16 selectors = ParallelMath::MakeUInt16(0);
+                        MUInt15 quantized = possibleColors[i];
+						TestHalfBlock(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == 1, options);
+
+						if (d == 0)
+						{
+                            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));
+							if (ParallelMath::AnySet(errorBetter))
+							{
+								bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);
+								ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);
+                                ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, quantized);
+                                ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));
+							}
+						}
+						else
+						{
+                            ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
+
+							MUInt15 storageIndexes = drs.diffNumAttempts[sector];
+                            drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));
+
+                            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                            {
+                                int storageIndex = ParallelMath::Extract(storageIndexes, block);
+
+                                ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
+                                ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
+                                ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
+                                ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
+                            }
+						}
+					}
+				}
+			}
+
+			if (d == 0)
+			{
+				MFloat bestIndErrorTotal = bestIndError[0] + bestIndError[1];
+                ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));
+				if (ParallelMath::AnySet(errorBetter))
+				{
+                    bestIsThisMode = bestIsThisMode | errorBetter;
+
+					bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);
+					ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));
+                    ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));
+					for (int sector = 0; sector < 2; sector++)
+					{
+                        ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);
+                        ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);
+                        ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);
+					}
+				}
+			}
+			else
+			{
+                ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };
+                FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);
+			}
+		}
+	}
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (!ParallelMath::Extract(bestIsThisMode, block))
+            continue;
+
+        uint32_t highBits = 0;
+        uint32_t lowBits = 0;
+
+        int blockBestFlip = ParallelMath::Extract(bestFlip, block);
+        int blockBestD = ParallelMath::Extract(bestD, block);
+        int blockBestTables[2] = { ParallelMath::Extract(bestTables[0], block), ParallelMath::Extract(bestTables[1], block) };
+        ParallelMath::ScalarUInt16 blockBestSelectors[2] = { ParallelMath::Extract(bestSelectors[0], block), ParallelMath::Extract(bestSelectors[1], block) };
+
+        int colors[2][3];
+        for (int sector = 0; sector < 2; sector++)
+        {
+            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
+            for (int ch = 0; ch < 3; ch++)
+                colors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
+        }
+
+        EmitETC1Block(outputBuffer + block * 8, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);
+    }
+}
+
+
+void cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage &drs, const Options &options)
+{ // Tests candidate half-block colors for both flip orientations using per-pixel transparency flags, then emits an 8-byte block (d = 1, transparent = true) for every lane flagged in bestIsThisMode.
+	int numTries = 0; // NOTE(review): never referenced again in this function.
+
+    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
+    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
+
+    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
+    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
+    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
+    MUInt15 bestFlip = zeroU15;
+
+    MUInt15 sectorPixels[2][2][8][3]; // indexed [flip][sector][px][ch]
+    ParallelMath::Int16CompFlag sectorTransparent[2][2][8]; // indexed [flip][sector][px]
+    MFloat sectorPreWeightedPixels[2][2][8][3]; // indexed [flip][sector][px][ch]
+    MUInt15 sectorCumulative[2][2][3]; // per-channel sum over the 8 pixels of a sector
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    for (int flip = 0; flip < 2; flip++) // Gather pixel data per (flip, sector) through g_flipTables and accumulate channel sums.
+	{
+		for (int sector = 0; sector < 2; sector++)
+		{
+			for (int ch = 0; ch < 3; ch++)
+				sectorCumulative[flip][sector][ch] = zeroU15;
+
+			for (int px = 0; px < 8; px++)
+			{
+				for (int ch = 0; ch < 3; ch++)
+				{
+					MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
+					sectorPixels[flip][sector][px][ch] = pixelChannelValue;
+                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
+					sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue; // transparent pixels are included in the sum as well
+				}
+
+                sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];
+			}
+		}
+	}
+
+	static const MUInt15 modifiers[8] = // One magnitude per table index; scales the candidate offsets and is passed to TestHalfBlockPunchthrough.
+	{
+		ParallelMath::MakeUInt15(8),
+		ParallelMath::MakeUInt15(17),
+		ParallelMath::MakeUInt15(29),
+		ParallelMath::MakeUInt15(42),
+		ParallelMath::MakeUInt15(60),
+		ParallelMath::MakeUInt15(80),
+		ParallelMath::MakeUInt15(106),
+		ParallelMath::MakeUInt15(183),
+	};
+
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0); // NOTE(review): not read anywhere else in this function.
+
+    const int maxSectorCumulativeOffsets = 17; // Capacity of possibleColors: om spans -sectorMaxOpaque..sectorMaxOpaque and the count is at most 8 (one per px), i.e. 2 * 8 + 1.
+
+	for (int flip = 0; flip < 2; flip++)
+	{
+        ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) }; // NOTE(review): sector 1 starts false, so the AND loop below presumably can never mark it ignorable - confirm this is intended.
+
+        for (int sector = 0; sector < 2; sector++)
+            for (int px = 0; px < 8; px++)
+                canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];
+
+		drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15; // attempt storage is restarted for each flip
+
+		for (int sector = 0; sector < 2; sector++)
+		{
+            MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(0);
+            for (int px = 0; px < 8; px++)
+                sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(1)); // NOTE(review): assuming SelectOrZero yields the value where the flag is set, this counts transparent pixels despite the "NumOpaque" name - confirm polarity.
+
+            int sectorMaxOpaque = 0;
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block)); // largest count across all lanes
+
+            int sectorNumOpaqueMultipliers = sectorMaxOpaque * 2 + 1; // number of om values iterated below
+
+            MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(1), sectorNumOpaque) << 8; // Max(1, ...) keeps the scalar division below from dividing by zero
+            MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << 7; // half of (count * 256), used as the rounding term of the numerator
+
+            MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);
+            MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(0) - sectorNumOpaqueSigned;
+
+            MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(255), sectorNumOpaque));
+
+			for (int table = 0; table < 8; table++)
+			{
+				MUInt15 possibleColors[maxSectorCumulativeOffsets];
+
+                MUInt15 quantized[3];
+                for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)
+                {
+                    MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned); // clamp the shared loop value to each lane's own [-count, count] range
+                    MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);
+
+                    for (int ch = 0; ch < 3; ch++)
+                    {
+                        // cu is in range 0..255*numOpaque (at most 0..2040)
+                        MUInt15 cu15 = ParallelMath::Min(
+                            sectorCumulativeMax,
+                            ParallelMath::ToUInt15(
+                                ParallelMath::Max(
+                                    ParallelMath::MakeSInt16(0),
+                                    ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset
+                                )
+                            )
+                        );
+
+                        //quantized[ch] = (cu * 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)
+                        MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15); // cu * 32 - cu
+                        MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, 3);
+                        MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);
+                        for (int block = 0; block < ParallelMath::ParallelSize; block++) // the division is performed one lane at a time with scalar integer division
+                            ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));
+                    }
+
+					possibleColors[om + sectorMaxOpaque] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10); // pack as ch0 | ch1 << 5 | ch2 << 10
+				}
+
+                ParallelMath::UInt15 numUniqueColors;
+                for (int block = 0; block < ParallelMath::ParallelSize; block++) // per lane: compact possibleColors in place, dropping entries equal to the previously kept one
+                {
+                    uint16_t blockNumUniqueColors = 1;
+                    for (int i = 1; i < sectorNumOpaqueMultipliers; i++)
+                    {
+                        uint16_t color = ParallelMath::Extract(possibleColors[i], block);
+                        if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
+                            ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
+                    }
+
+                    ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
+                }
+
+                int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
+                for (int block = 1; block < ParallelMath::ParallelSize; block++)
+                    maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
+
+                for (int block = 0; block < ParallelMath::ParallelSize; block++) // pad lanes that have fewer unique colors with their first color, up to maxUniqueColors entries
+                {
+                    uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
+                    for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
+                        ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
+                }
+
+				for (int i = 0; i < maxUniqueColors; i++)
+				{
+					MFloat error = ParallelMath::MakeFloatZero();
+					MUInt16 selectors = ParallelMath::MakeUInt16(0);
+                    MUInt15 quantized = possibleColors[i];
+					TestHalfBlockPunchthrough(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);
+
+                    ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors); // false for lanes where entry i is padding
+
+					MUInt15 storageIndexes = drs.diffNumAttempts[sector];
+                    drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1)); // only in-bounds lanes advance their attempt count
+
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++) // results are written for every lane; a lane that did not advance reuses the same slot on the next iteration
+                    {
+                        int storageIndex = ParallelMath::Extract(storageIndexes, block);
+
+                        ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
+                        ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
+                        ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
+                        ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
+                    }
+                }
+            }
+        }
+
+        MUInt15 bestDDummy = ParallelMath::MakeUInt15(0); // FindBestDifferentialCombination takes a bestD output; its value is not read here (EmitETC1Block below is passed a literal 1)
+        FindBestDifferentialCombination(flip, 1, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);
+	}
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (!ParallelMath::Extract(bestIsThisMode, block)) // leave the output untouched for lanes whose bestIsThisMode flag is not set
+            continue;
+
+        int blockBestColors[2][3];
+        int blockBestTables[2];
+        ParallelMath::ScalarUInt16 blockBestSelectors[2];
+        for (int sector = 0; sector < 2; sector++)
+        {
+            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
+            for (int ch = 0; ch < 3; ch++)
+                blockBestColors[sector][ch] = (sectorColor >> (ch * 5)) & 31; // unpack the three 5-bit channels
+
+            blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);
+            blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);
+        }
+
+        EmitETC1Block(outputBuffer + block * 8, ParallelMath::Extract(bestFlip, block), 1, blockBestColors, blockBestTables, blockBestSelectors, true); // each lane writes at an 8-byte stride
+    }
+}
+
+
+cvtt::ETC1CompressionData *cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context)
+{ // Obtains storage from allocFunc and constructs an ETC1CompressionDataInternal in it; returns NULL when allocFunc returns a null pointer.
+    void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal));
+    if (!buffer)
+        return NULL;
+    new (buffer) cvtt::Internal::ETCComputer::ETC1CompressionDataInternal(context); // placement new; context is stored so ReleaseETC1Data can pass it to freeFunc
+    return static_cast<ETC1CompressionData*>(buffer);
+}
+
+void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
+{ // Runs the destructor and hands the storage to freeFunc. NOTE(review): compressionData is dereferenced without a null check.
+    cvtt::Internal::ETCComputer::ETC1CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC1CompressionDataInternal*>(compressionData);
+    void *context = internalData->m_context; // read before the destructor runs, because freeFunc needs it afterwards
+    internalData->~ETC1CompressionDataInternal();
+    freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC1CompressionDataInternal)); // same size that AllocETC1Data requested
+}
+
+cvtt::ETC2CompressionData *cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options)
+{ // Obtains storage from allocFunc and constructs an ETC2CompressionDataInternal in it; returns NULL when allocFunc returns a null pointer.
+    void *buffer = allocFunc(context, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal));
+    if (!buffer)
+        return NULL;
+    new (buffer) cvtt::Internal::ETCComputer::ETC2CompressionDataInternal(context, options); // placement new; the constructor derives the chroma axes from options
+    return static_cast<ETC2CompressionData*>(buffer);
+}
+
+void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
+{ // Runs the destructor and hands the storage to freeFunc. NOTE(review): compressionData is dereferenced without a null check.
+    cvtt::Internal::ETCComputer::ETC2CompressionDataInternal* internalData = static_cast<cvtt::Internal::ETCComputer::ETC2CompressionDataInternal*>(compressionData);
+    void *context = internalData->m_context; // read before the destructor runs, because freeFunc needs it afterwards
+    internalData->~ETC2CompressionDataInternal();
+    freeFunc(context, compressionData, sizeof(cvtt::Internal::ETCComputer::ETC2CompressionDataInternal)); // same size that AllocETC2Data requested
+}
+
+cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void *context, const cvtt::Options &options)
+    : m_context(context)
+{ // Stores the allocator context and derives two axes, both orthogonal to the (red, green, blue) weight vector, into m_chromaSideAxis0/1.
+    const float cd[3] = { options.redWeight, options.greenWeight, options.blueWeight };
+    const float rotCD[3] = { cd[1], cd[2], cd[0] }; // channel-rotated copy of the weights (g, b, r)
+
+    const float offs = -(rotCD[0] * cd[0] + rotCD[1] * cd[1] + rotCD[2] * cd[2]) / (cd[0] * cd[0] + cd[1] * cd[1] + cd[2] * cd[2]); // -(rotCD . cd) / (cd . cd); NOTE(review): divides by zero when all three weights are 0
+
+    const float chromaAxis0[3] = { rotCD[0] + cd[0] * offs, rotCD[1] + cd[1] * offs, rotCD[2] + cd[2] * offs }; // rotCD with its component along cd removed
+
+    const float chromaAxis1Unnormalized[3] = // cross product chromaAxis0 x cd
+    {
+        chromaAxis0[1] * cd[2] - chromaAxis0[2] * cd[1],
+        chromaAxis0[2] * cd[0] - chromaAxis0[0] * cd[2],
+        chromaAxis0[0] * cd[1] - chromaAxis0[1] * cd[0]
+    };
+
+    const float ca0LengthSq = (chromaAxis0[0] * chromaAxis0[0] + chromaAxis0[1] * chromaAxis0[1] + chromaAxis0[2] * chromaAxis0[2]);
+    const float ca1UNLengthSq = (chromaAxis1Unnormalized[0] * chromaAxis1Unnormalized[0] + chromaAxis1Unnormalized[1] * chromaAxis1Unnormalized[1] + chromaAxis1Unnormalized[2] * chromaAxis1Unnormalized[2]);
+    const float lengthRatio = static_cast<float>(std::sqrt(ca0LengthSq / ca1UNLengthSq)); // scales chromaAxis1 to the length of chromaAxis0; NOTE(review): 0/0 when the three weights are equal, since chromaAxis0 is then the zero vector
+
+    const float chromaAxis1[3] = { chromaAxis1Unnormalized[0] * lengthRatio, chromaAxis1Unnormalized[1] * lengthRatio, chromaAxis1Unnormalized[2] * lengthRatio };
+
+    for (int i = 0; i < 3; i++)
+    {
+        m_chromaSideAxis0[i] = chromaAxis0[i];
+        m_chromaSideAxis1[i] = chromaAxis1[i];
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC.h b/thirdparty/cvtt/ConvectionKernels_ETC.h
new file mode 100644
index 0000000000..5e3c4d74fd
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC.h
@@ -0,0 +1,126 @@
+#pragma once
+#ifndef __CVTT_CONVECTIONKERNELS_ETC_H__
+#define __CVTT_CONVECTIONKERNELS_ETC_H__
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    struct Options;
+
+    namespace Internal
+    {
+        class ETCComputer // Groups the ETC1/ETC2/EAC block compression routines; every member function is static.
+        {
+        public:
+            static void CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options);
+            static void CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha);
+            static void CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, const Options &options);
+            static void CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options);
+
+            static ETC2CompressionData *AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options); // returns NULL when allocFunc returns a null pointer
+            static void ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc); // compressionData must be non-null; it is dereferenced unconditionally
+
+            static ETC1CompressionData *AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context); // returns NULL when allocFunc returns a null pointer
+            static void ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc); // compressionData must be non-null; it is dereferenced unconditionally
+
+        private:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+            struct DifferentialResolveStorage // Scratch storage for per-sector candidate attempts, consumed by FindBestDifferentialCombination.
+            {
+                static const unsigned int MaxAttemptsPerSector = 57 + 81 + 81 + 81 + 81 + 81 + 81 + 81; // presumably the per-table entry counts of Tables::ETC1::g_potentialOffsets4 (57, then 81 x 7) - confirm
+
+                MUInt15 diffNumAttempts[2]; // per-sector, per-lane count of stored attempts
+                MFloat diffErrors[2][MaxAttemptsPerSector];
+                MUInt16 diffSelectors[2][MaxAttemptsPerSector];
+                MUInt15 diffColors[2][MaxAttemptsPerSector]; // packed colors: ch0 | ch1 << 5 | ch2 << 10
+                MUInt15 diffTables[2][MaxAttemptsPerSector];
+
+                uint16_t attemptSortIndexes[2][MaxAttemptsPerSector];
+            };
+
+            struct HModeEval
+            {
+                MFloat errors[62][16];
+                MUInt16 signBits[62];
+                MUInt15 uniqueQuantizedColors[62];
+                MUInt15 numUniqueColors[2];
+            };
+
+            struct ETC1CompressionDataInternal : public cvtt::ETC1CompressionData
+            {
+                explicit ETC1CompressionDataInternal(void *context)
+                    : m_context(context)
+                {
+                }
+
+                DifferentialResolveStorage m_drs;
+                void *m_context; // allocator context; ReleaseETC1Data reads it back to call freeFunc
+            };
+
+            struct ETC2CompressionDataInternal : public cvtt::ETC2CompressionData
+            {
+                explicit ETC2CompressionDataInternal(void *context, const cvtt::Options &options);
+
+                HModeEval m_h;
+                DifferentialResolveStorage m_drs;
+
+                void *m_context; // allocator context; ReleaseETC2Data reads it back to call freeFunc
+                float m_chromaSideAxis0[3]; // computed by the constructor from the red/green/blue weights in options
+                float m_chromaSideAxis1[3]; // computed by the constructor from the red/green/blue weights in options
+            };
+
+            static MFloat ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3]);
+            static MFloat ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat pixelB[3], const Options options); // NOTE(review): options is taken by value here, unlike the const Options & used by the other functions
+            static MFloat ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat pixelB[3]);
+
+            static void TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options);
+            static void TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options);
+            static void FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs);
+
+            static ParallelMath::Int16CompFlag ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b);
+            static ParallelMath::Int16CompFlag ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b);
+            static bool ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b);
+            static bool ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b);
+
+            static void EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options);
+            static void EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options);
+
+            static void EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options);
+
+            static MUInt15 DecodePlanarCoeff(const MUInt15 &coeff, int ch);
+            static void EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options);
+
+            static void CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage& compressionData, const Options &options, bool punchthrough);
+            static void CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage& compressionData, const Options &options); // the definition names the DifferentialResolveStorage parameter drs
+            static void CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options);
+
+            static void ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options);
+
+            static void ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential);
+            static void ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential);
+            static void ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 target[3], const MUInt15 &granularity);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3]);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3]);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MFloat &r, const MFloat &g, const MFloat &b);
+            static void ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3]);
+
+            static void QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues);
+
+            static void EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque);
+            static void EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque);
+            static void EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent);
+
+            static const int g_flipTables[2][2][8]; // indexed [flip][sector][px]; each entry is an index into the 16-entry pixel arrays
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC1.h b/thirdparty/cvtt/ConvectionKernels_ETC1.h
new file mode 100644
index 0000000000..775e41669f
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC1.h
@@ -0,0 +1,29 @@
+#include <stdint.h>
+
+namespace cvtt
+{
+    namespace Tables
+    {
+        namespace ETC1
+        {
+            const int16_t g_potentialOffsets4[] = // Flat array of 8 variable-length rows stored back to back.
+            { // Each row below starts with its own element count (57 or 81), followed by that many offsets in ascending order.
+                57, -64, -58, -54, -52, -48, -46, -44, -42, -40, -38, -36, -34, -32, -30, -28, -26, -24, -22, -20, -18, -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 52, 54, 58, 64,
+                81, -136, -124, -114, -112, -102, -100, -92, -90, -88, -80, -78, -76, -70, -68, -66, -64, -58, -56, -54, -52, -48, -46, -44, -42, -40, -36, -34, -32, -30, -26, -24, -22, -20, -18, -14, -12, -10, -8, -4, -2, 0, 2, 4, 8, 10, 12, 14, 18, 20, 22, 24, 26, 30, 32, 34, 36, 40, 42, 44, 46, 48, 52, 54, 56, 58, 64, 66, 68, 70, 76, 78, 80, 88, 90, 92, 100, 102, 112, 114, 124, 136,
+                81, -232, -212, -194, -192, -174, -172, -156, -154, -152, -136, -134, -132, -118, -116, -114, -112, -98, -96, -94, -92, -80, -78, -76, -74, -72, -60, -58, -56, -54, -42, -40, -38, -36, -34, -22, -20, -18, -16, -4, -2, 0, 2, 4, 16, 18, 20, 22, 34, 36, 38, 40, 42, 54, 56, 58, 60, 72, 74, 76, 78, 80, 92, 94, 96, 98, 112, 114, 116, 118, 132, 134, 136, 152, 154, 156, 172, 174, 192, 194, 212, 232,
+                81, -336, -307, -281, -278, -252, -249, -226, -223, -220, -197, -194, -191, -171, -168, -165, -162, -142, -139, -136, -133, -116, -113, -110, -107, -104, -87, -84, -81, -78, -61, -58, -55, -52, -49, -32, -29, -26, -23, -6, -3, 0, 3, 6, 23, 26, 29, 32, 49, 52, 55, 58, 61, 78, 81, 84, 87, 104, 107, 110, 113, 116, 133, 136, 139, 142, 162, 165, 168, 171, 191, 194, 197, 220, 223, 226, 249, 252, 278, 281, 307, 336,
+                81, -480, -438, -402, -396, -360, -354, -324, -318, -312, -282, -276, -270, -246, -240, -234, -228, -204, -198, -192, -186, -168, -162, -156, -150, -144, -126, -120, -114, -108, -90, -84, -78, -72, -66, -48, -42, -36, -30, -12, -6, 0, 6, 12, 30, 36, 42, 48, 66, 72, 78, 84, 90, 108, 114, 120, 126, 144, 150, 156, 162, 168, 186, 192, 198, 204, 228, 234, 240, 246, 270, 276, 282, 312, 318, 324, 354, 360, 396, 402, 438, 480,
+                81, -640, -584, -536, -528, -480, -472, -432, -424, -416, -376, -368, -360, -328, -320, -312, -304, -272, -264, -256, -248, -224, -216, -208, -200, -192, -168, -160, -152, -144, -120, -112, -104, -96, -88, -64, -56, -48, -40, -16, -8, 0, 8, 16, 40, 48, 56, 64, 88, 96, 104, 112, 120, 144, 152, 160, 168, 192, 200, 208, 216, 224, 248, 256, 264, 272, 304, 312, 320, 328, 360, 368, 376, 416, 424, 432, 472, 480, 528, 536, 584, 640,
+                81, -848, -775, -709, -702, -636, -629, -570, -563, -556, -497, -490, -483, -431, -424, -417, -410, -358, -351, -344, -337, -292, -285, -278, -271, -264, -219, -212, -205, -198, -153, -146, -139, -132, -125, -80, -73, -66, -59, -14, -7, 0, 7, 14, 59, 66, 73, 80, 125, 132, 139, 146, 153, 198, 205, 212, 219, 264, 271, 278, 285, 292, 337, 344, 351, 358, 410, 417, 424, 431, 483, 490, 497, 556, 563, 570, 629, 636, 702, 709, 775, 848,
+                81, -1464, -1328, -1234, -1192, -1098, -1056, -1004, -962, -920, -868, -826, -784, -774, -732, -690, -648, -638, -596, -554, -544, -512, -502, -460, -418, -408, -376, -366, -324, -314, -282, -272, -230, -188, -178, -146, -136, -94, -84, -52, -42, 0, 42, 52, 84, 94, 136, 146, 178, 188, 230, 272, 282, 314, 324, 366, 376, 408, 418, 460, 502, 512, 544, 554, 596, 638, 648, 690, 732, 774, 784, 826, 868, 920, 962, 1004, 1056, 1098, 1192, 1234, 1328, 1464
+            };
+            // Equals the largest per-row count prefix used in g_potentialOffsets4 above (rows hold 57 or 81 offsets).
+            const unsigned int g_maxPotentialOffsets = 81;
+            // Eight increasing modifier magnitudes; the values are identical to Tables::ETC2::g_thModifierTable in ConvectionKernels_ETC2.h.
+            const int16_t g_thModifierTable[8] =
+            {
+                3, 6, 11, 16, 23, 32, 41, 64
+            };
+        }
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC2.h b/thirdparty/cvtt/ConvectionKernels_ETC2.h
new file mode 100644
index 0000000000..4befc8e8c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC2.h
@@ -0,0 +1,35 @@
+#include <stdint.h>
+
+namespace cvtt
+{
+    namespace Tables
+    {
+        namespace ETC2
+        {
+            const int16_t g_thModifierTable[8] = // Eight increasing modifier magnitudes; values appear to match the ETC2 T/H-mode distance table -- confirm against the Khronos spec.
+            {
+                3, 6, 11, 16, 23, 32, 41, 64
+            };
+            // 16 tables of 4 increasing non-negative modifiers; presumably the positive half of each ETC2/EAC alpha modifier table -- TODO confirm against the code that reads it.
+            const int16_t g_alphaModifierTablePositive[16][4] =
+            {
+                { 2, 5, 8, 14, },
+                { 2, 6, 9, 12, },
+                { 1, 4, 7, 12, },
+                { 1, 3, 5, 12, },
+                { 2, 5, 7, 11, },
+                { 2, 6, 8, 10, },
+                { 3, 6, 7, 10, },
+                { 2, 4, 7, 10, },
+                { 1, 5, 7, 9, },
+                { 1, 4, 7, 9, },
+                { 1, 3, 7, 9, },
+                { 1, 4, 6, 9, },
+                { 2, 3, 6, 9, },
+                { 0, 1, 2, 9, },
+                { 3, 5, 7, 8, },
+                { 2, 4, 6, 8, },
+            };
+        }
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h b/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h
new file mode 100644
index 0000000000..a4f5a3ddfa
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h
@@ -0,0 +1,27 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace ETC2 {
+    const int g_alphaRoundingTableWidth = 13;
+    const uint8_t g_alphaRoundingTables[16][13] =
+    {
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3 },
+        { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3 },
+    };
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h b/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h
new file mode 100644
index 0000000000..c1276553b2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h
@@ -0,0 +1,181 @@
+#pragma once
+#ifndef __CVTT_ENDPOINTREFINER_H__
+#define __CVTT_ENDPOINTREFINER_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        // Accumulates per-channel least-squares sums and solves for a, b where v = a*t + b,
+        // with t = index / (indexRange - 1). The fit evaluated at T=0 and T=1 gives the two endpoints.
+        // Least squares from totals (tv, tt, t, v, w are the running sums stored in the members below):
+        // a = (tv - t*v/w)/(tt - t*t/w)
+        // b = (v - a*t)/w
+        template<int TVectorSize>
+        class EndpointRefiner
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            // Running sums; ContributePW and ContributeUnweightedPW show exactly how each one is accumulated.
+            MFloat m_tv[TVectorSize]; // per-channel sum of t * v
+            MFloat m_v[TVectorSize]; // per-channel sum of v
+            MFloat m_tt; // sum of weight * t * t (weight is 1 for unweighted contributions)
+            MFloat m_t; // sum of weight * t (weight is 1 for unweighted contributions)
+            MFloat m_w; // sum of the explicit weights passed to ContributePW
+            int m_wu; // number of ContributeUnweightedPW calls; added to m_w when solving
+
+            float m_rcpMaxIndex; // 1 / (indexRange - 1), set by Init
+            float m_channelWeights[TVectorSize]; // copy of the weights passed to Init
+            float m_rcpChannelWeights[TVectorSize]; // 1 / weight per channel, or 1 where the weight is zero
+            // Resets every sum to zero and caches the index scale plus the channel weights and their reciprocals.
+            void Init(int indexRange, const float channelWeights[TVectorSize])
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_tv[ch] = ParallelMath::MakeFloatZero();
+                    m_v[ch] = ParallelMath::MakeFloatZero();
+                }
+                m_tt = ParallelMath::MakeFloatZero();
+                m_t = ParallelMath::MakeFloatZero();
+                m_w = ParallelMath::MakeFloatZero();
+                // NOTE(review): indexRange == 1 would divide by zero here; callers presumably pass at least 2 -- TODO confirm.
+                m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_channelWeights[ch] = channelWeights[ch];
+                    m_rcpChannelWeights[ch] = 1.0f;
+                    if (m_channelWeights[ch] != 0.0f)
+                        m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch];
+                }
+
+                m_wu = 0;
+            }
+            // Adds one pixel with an explicit weight; every channel value is multiplied by weight before it is accumulated.
+            void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
+            {
+                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat v = pwFloatPixel[ch] * weight;
+
+                    m_tv[ch] = m_tv[ch] + t * v;
+                    m_v[ch] = m_v[ch] + v;
+                }
+                m_tt = m_tt + weight * t * t;
+                m_t = m_t + weight * t;
+                m_w = m_w + weight;
+            }
+            // Adds one pixel with an implicit weight of 1 (counted in m_wu); only the first numRealChannels channels are accumulated.
+            void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
+            {
+                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MFloat v = pwFloatPixel[ch];
+
+                    m_tv[ch] = m_tv[ch] + t * v;
+                    m_v[ch] = m_v[ch] + v;
+                }
+                m_tt = m_tt + t * t;
+                m_t = m_t + t;
+                m_wu++;
+            }
+            // Convenience overload: contributes all TVectorSize channels.
+            void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
+            {
+                ContributeUnweightedPW(floatPixel, index, TVectorSize);
+            }
+            // Solves the fit and writes endPoint[0] (line at t=0) and endPoint[1] (line at t=1), each multiplied by the reciprocal channel weight.
+            void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
+            {
+                // a = (tv - t*v/w)/(tt - t*t/w)
+                // b = (v - a*t)/w
+                MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu));
+                // MakeSafeDenominator presumably keeps w away from zero before the reciprocal -- TODO confirm in ParallelMath.
+                ParallelMath::MakeSafeDenominator(w);
+                MFloat wRcp = ParallelMath::Reciprocal(w);
+                // Algebraically tt - t*t/w, i.e. the denominator of a in the formula above.
+                MFloat adenom = (m_tt * w - m_t * m_t) * wRcp;
+                // Where the denominator is exactly zero, substitute 1 so the division below is defined; those results are replaced further down.
+                ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero());
+                ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f));
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    /*
+                    if (adenom == 0.0)
+                    p1 = p2 = er.v / er.w;
+                    else
+                    {
+                    float4 a = (er.tv - er.t*er.v / er.w) / adenom;
+                    float4 b = (er.v - a * er.t) / er.w;
+                    p1 = b;
+                    p2 = a + b;
+                    }
+                    */
+
+                    MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom;
+                    MFloat b = (m_v[ch] - a * m_t) * wRcp;
+
+                    MFloat p1 = b;
+                    MFloat p2 = a + b;
+                    // Degenerate fit: both endpoints become v/w (p2 copies the already-patched p1).
+                    ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp));
+                    ParallelMath::ConditionalSet(p2, adenomZero, p1);
+
+                    // Unweight: m_rcpChannelWeights holds 1/weight, or 1 when the weight is zero
+                    float inverseWeight = m_rcpChannelWeights[ch];
+
+                    endPoint[0][ch] = p1 * inverseWeight;
+                    endPoint[1][ch] = p2 * inverseWeight;
+                }
+            }
+            // Refined endpoints clamped to [0, 255] and rounded to MUInt15. NOTE(review): numRealChannels is not read here; all TVectorSize channels are converted.
+            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                MFloat floatEndPoint[2][TVectorSize];
+                GetRefinedEndpoints(floatEndPoint);
+
+                for (int epi = 0; epi < 2; epi++)
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode);
+            }
+            // Convenience overload forwarding TVectorSize as the channel count.
+            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode);
+            }
+            // Refined endpoints clamped to [-31743, 31743] (isSigned) or [0, 31743] (otherwise), rounded, and stored as MSInt16.
+            void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                MFloat floatEndPoint[2][TVectorSize];
+                GetRefinedEndpoints(floatEndPoint);
+
+                for (int epi = 0; epi < 2; epi++)
+                {
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                    {
+                        MFloat f = floatEndPoint[epi][ch];
+                        if (isSigned)
+                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode));
+                        else
+                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode));
+                    }
+                }
+            }
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h b/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h
new file mode 100644
index 0000000000..e09dfd248c
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h
@@ -0,0 +1,153 @@
+#pragma once
+#ifndef __CVTT_ENDPOINTSELECTOR_H__
+#define __CVTT_ENDPOINTSELECTOR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+#include "ConvectionKernels_PackedCovarianceMatrix.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        static const int NumEndpointSelectorPasses = 3; // Count of the pass indices EndpointSelector::ContributePass handles: 0 (centroid), 1 (direction), 2 (min/max)
+
+        template<int TVectorSize, int TIterationCount>
+        class EndpointSelector
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            // Zeroes the centroid, direction and weight total; min/max start at +FLT_MAX / -FLT_MAX so the first ContributeMinMax replaces them.
+            EndpointSelector()
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_centroid[ch] = ParallelMath::MakeFloatZero();
+                    m_direction[ch] = ParallelMath::MakeFloatZero();
+                }
+                m_weightTotal = ParallelMath::MakeFloatZero();
+                m_minDist = ParallelMath::MakeFloat(FLT_MAX);
+                m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);
+            }
+            // Routes one pixel to the accumulator for the given pass: 0 = centroid, 1 = direction (covariance), 2 = min/max extent. Other pass values are ignored.
+            void ContributePass(const MFloat *value, int pass, const MFloat &weight)
+            {
+                if (pass == 0)
+                    ContributeCentroid(value, weight);
+                else if (pass == 1)
+                    ContributeDirection(value, weight);
+                else if (pass == 2)
+                    ContributeMinMax(value);
+            }
+            // Finalizes pass 0 (centroid) or pass 1 (direction); pass 2 has no finishing step.
+            void FinishPass(int pass)
+            {
+                if (pass == 0)
+                    FinishCentroid();
+                else if (pass == 1)
+                    FinishDirection();
+            }
+            // Returns base = centroid + direction*minDist and offset = max - min, each divided by the channel weight (1 is substituted for a zero weight).
+            UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
+            {
+                MFloat unweightedBase[TVectorSize];
+                MFloat unweightedOffset[TVectorSize];
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
+                    MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
+                    // A zero channel weight would turn the divisions below into inf/NaN, so fall back to 1.
+                    float safeWeight = channelWeights[ch];
+                    if (safeWeight == 0.f)
+                        safeWeight = 1.0f;
+                    // Fix: divide by safeWeight; dividing by the raw channelWeights[ch] left the guard above without effect.
+                    unweightedBase[ch] = min / safeWeight;
+                    unweightedOffset[ch] = (max - min) / safeWeight;
+                }
+
+                return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
+            }
+
+        private:
+            void ContributeCentroid(const MFloat *value, const MFloat &weight) // Pass 0: accumulates the weighted sum of values and the total weight.
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_centroid[ch] = m_centroid[ch] + value[ch] * weight;
+                m_weightTotal = m_weightTotal + weight;
+            }
+            // Turns the weighted sum into a mean by dividing by the total weight (passed through MakeSafeDenominator first).
+            void FinishCentroid()
+            {
+                MFloat denom = m_weightTotal;
+                ParallelMath::MakeSafeDenominator(denom);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_centroid[ch] = m_centroid[ch] / denom;
+            }
+            // Pass 1: adds the weighted offset of this value from the centroid to the covariance accumulator.
+            void ContributeDirection(const MFloat *value, const MFloat &weight)
+            {
+                MFloat diff[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    diff[ch] = value[ch] - m_centroid[ch];
+
+                m_covarianceMatrix.Add(diff, weight);
+            }
+            // Estimates the principal direction: starts from an all-ones vector and applies m_covarianceMatrix.Product TIterationCount times, rescaling each time.
+            void FinishDirection()
+            {
+                MFloat approx[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    approx[ch] = ParallelMath::MakeFloat(1.0f);
+
+                for (int i = 0; i < TIterationCount; i++)
+                {
+                    MFloat product[TVectorSize];
+                    m_covarianceMatrix.Product(product, approx);
+
+                    MFloat largestComponent = product[0];
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        largestComponent = ParallelMath::Max(largestComponent, product[ch]);
+
+                    // product = largestComponent*newApprox
+                    ParallelMath::MakeSafeDenominator(largestComponent);
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        approx[ch] = product[ch] / largestComponent;
+                }
+
+                // Normalize
+                MFloat approxLen = ParallelMath::MakeFloatZero();
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    approxLen = approxLen + approx[ch] * approx[ch];
+
+                approxLen = ParallelMath::Sqrt(approxLen);
+
+                ParallelMath::MakeSafeDenominator(approxLen);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_direction[ch] = approx[ch] / approxLen;
+            }
+            // Pass 2: projects (value - centroid) onto the direction and widens the [m_minDist, m_maxDist] range to include it.
+            void ContributeMinMax(const MFloat *value)
+            {
+                MFloat dist = ParallelMath::MakeFloatZero();
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]);
+
+                m_minDist = ParallelMath::Min(m_minDist, dist);
+                m_maxDist = ParallelMath::Max(m_maxDist, dist);
+            }
+
+            ParallelMath::Float m_centroid[TVectorSize]; // weighted sum during pass 0, weighted mean after FinishCentroid
+            ParallelMath::Float m_direction[TVectorSize]; // normalized axis written by FinishDirection
+            PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix; // accumulated in pass 1
+            ParallelMath::Float m_weightTotal; // sum of weights seen in pass 0
+
+            ParallelMath::Float m_minDist; // smallest projection seen in pass 2
+            ParallelMath::Float m_maxDist; // largest projection seen in pass 2
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h b/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h
new file mode 100644
index 0000000000..1eb924befe
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h
@@ -0,0 +1,282 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace FakeBT709 {
+    const uint8_t g_rounding16[] =
+    {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+    };
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp b/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp
new file mode 100644
index 0000000000..b3d1b5497e
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp
@@ -0,0 +1,66 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_IndexSelector.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        const ParallelMath::UInt16 g_weightReciprocals[17] =    // Indexed by range (number of index values); each entry is 32768 / (range - 1), rounded to nearest.
+        {
+            ParallelMath::MakeUInt16(0),        // range 0  (range - 1 = -1): no reciprocal, stored as 0
+            ParallelMath::MakeUInt16(0),        // range 1  (range - 1 = 0):  no reciprocal, stored as 0
+            ParallelMath::MakeUInt16(32768),    // range 2  (range - 1 = 1)
+            ParallelMath::MakeUInt16(16384),    // range 3  (range - 1 = 2)
+            ParallelMath::MakeUInt16(10923),    // range 4  (range - 1 = 3)
+            ParallelMath::MakeUInt16(8192),     // range 5  (range - 1 = 4)
+            ParallelMath::MakeUInt16(6554),     // range 6  (range - 1 = 5)
+            ParallelMath::MakeUInt16(5461),     // range 7  (range - 1 = 6)
+            ParallelMath::MakeUInt16(4681),     // range 8  (range - 1 = 7)
+            ParallelMath::MakeUInt16(4096),     // range 9  (range - 1 = 8)
+            ParallelMath::MakeUInt16(3641),     // range 10 (range - 1 = 9)
+            ParallelMath::MakeUInt16(3277),     // range 11 (range - 1 = 10)
+            ParallelMath::MakeUInt16(2979),     // range 12 (range - 1 = 11)
+            ParallelMath::MakeUInt16(2731),     // range 13 (range - 1 = 12)
+            ParallelMath::MakeUInt16(2521),     // range 14 (range - 1 = 13)
+            ParallelMath::MakeUInt16(2341),     // range 15 (range - 1 = 14)
+            ParallelMath::MakeUInt16(2185),     // range 16 (range - 1 = 15)
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelector.h b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
new file mode 100644
index 0000000000..0f9d209183
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
@@ -0,0 +1,147 @@
+#pragma once
+#ifndef __CVTT_INDEXSELECTOR_H__
+#define __CVTT_INDEXSELECTOR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        extern const ParallelMath::UInt16 g_weightReciprocals[17];
+
+        template<int TVectorSize>
+        class IndexSelector
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+
+            template<class TInterpolationEPType, class TColorEPType>
+            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
+            {
+                // Stores the interpolation endpoints and precomputes the origin/axis that SelectIndexLDR projects pixels onto.
+                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space,
+                // so indexes are selected using the color-space endpoints.
+                m_isUniform = true;
+                for (int ch = 1; ch < TVectorSize; ch++)
+                {
+                    if (channelWeights[ch] != channelWeights[0])
+                        m_isUniform = false;    // Any weight differing from channel 0's makes the weighting non-uniform.
+                }
+
+                // To work with channel weights, we need something where:
+                // pxDiff = px - ep[0]
+                // epDiff = ep[1] - ep[0]
+                //
+                // weightedEPDiff = epDiff * channelWeights
+                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
+                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
+                // index = normalizedIndex * maxValue
+                //
+                // Equivalent to:
+                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
+                // index = dot(axis, pxDiff)
+
+                for (int ep = 0; ep < 2; ep++)
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);    // Kept for the Reconstruct* methods.
+
+                m_range = range;
+                m_maxValue = static_cast<float>(range - 1);    // Largest selectable index.
+
+                MFloat epDiffWeighted[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
+                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
+                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
+                }
+
+                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
+
+                ParallelMath::MakeSafeDenominator(lenSquared);    // Defined elsewhere; presumably guards the division below against a zero length (identical endpoints) - confirm.
+
+                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;    // The "axis" of the derivation above.
+            }
+
+            template<bool TSigned>
+            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
+            {
+                // Convenience overload for callers whose single endpoint array serves as both the
+                // interpolation endpoints and the color-space endpoints.
+                //
+                // The general Init() performs the LosslessCast to MAInt16 itself when it fills
+                // m_endPoint, so no separately converted copy is built here. TSigned is unused.
+                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
+            }
+
+            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)    // Interpolates the stored endpoints at `index` using weights on a 0..64 scale; writes pixel[0..numRealChannels).
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));    // ~ index * 64 / (range - 1), rounded: the table holds 32768 / (range - 1) and 32768 >> 9 == 64.
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));    // endpoint 0 scaled by (64 - weight)
+                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));    // endpoint 1 scaled by weight
+                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));    // Add half (32) and divide by 64 to round.
+                }
+            }
+
+            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)    // Same interpolation as ReconstructLDR_BC7 but with weights on a 0..256 scale; writes pixel[0..numRealChannels).
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));    // ~ index * 256 / (range - 1), rounded: 32768 >> 7 == 256.
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));    // endpoint 0 scaled by (256 - weight)
+                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));    // endpoint 1 scaled by weight
+                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));    // Add half (128) and divide by 256 to round.
+                }
+            }
+
+            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)    // Reconstructs all TVectorSize channels.
+            {
+                ReconstructLDR_BC7(index, pixel, TVectorSize);    // Forward to the 3-argument overload.
+            }
+
+            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)    // Reconstructs all TVectorSize channels.
+            {
+                ReconstructLDRPrecise(index, pixel, TVectorSize);    // Forward to the 3-argument overload.
+            }
+
+            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const    // Picks an index by projecting `pixel` (TVectorSize channels) onto the axis prepared by Init().
+            {
+                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];    // dot(pixel - origin, axis), accumulated channel by channel.
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
+
+                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);    // Clamp to [0, range - 1] before converting to an integer index.
+            }
+
+        protected:
+            MAInt16 m_endPoint[2][TVectorSize];    // Interpolation endpoints stored by Init(); protected so IndexSelectorHDR can read them.
+
+        private:
+            MFloat m_origin[TVectorSize];    // Color-space endpoint 0 converted to float.
+            MFloat m_axis[TVectorSize];      // Projection axis; dot(pixel - m_origin, m_axis) yields the unrounded index.
+            int m_range;                     // Number of index values, as passed to Init().
+            float m_maxValue;                // range - 1, the largest index.
+            bool m_isUniform;                // True when all channel weights are equal; written by Init() but not read within this class.
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h b/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h
new file mode 100644
index 0000000000..84795cd689
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h
@@ -0,0 +1,155 @@
+#pragma once
+#ifndef __CVTT_INDEXSELECTORHDR_H__
+#define __CVTT_INDEXSELECTORHDR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_IndexSelector.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v);
+        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v);
+
+        template<int TVectorSize>
+        class IndexSelectorHDR : public IndexSelector<TVectorSize>
+        {
+        public:
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt31 MUInt31;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::Float MFloat;
+
+        private:
+
+            MUInt15 InvertSingle(const MUInt15& anIndex) const    // Maps an index to (largest index - index) in the lanes flagged by m_isInverted; other lanes pass through.
+            {
+                MUInt15 inverted = m_maxValueMinusOne - anIndex;    // m_maxValueMinusOne holds range - 1 (see InitHDR).
+                return ParallelMath::Select(m_isInverted, inverted, anIndex);
+            }
+
+            void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const    // Interpolates the signed endpoints at `index` (no inversion applied); writes TVectorSize channels.
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));    // ~ index * 64 / (range - 1), rounded.
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);    // Endpoints come from the base class, i.e. from IndexSelector::Init().
+                    MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
+
+                    MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);    // Products are accumulated as 32-bit values.
+
+                    pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6);    // Add half (32) and divide by 64 to round.
+
+                    pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));    // UnscaleHDRValueSigned is only declared in this header; its exact mapping is not visible here.
+                }
+            }
+
+            void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const    // Interpolates the unsigned endpoints at `index` (no inversion applied); writes TVectorSize channels.
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));    // ~ index * 64 / (range - 1), rounded.
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);    // Endpoints come from the base class, i.e. from IndexSelector::Init().
+                    MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
+
+                    MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);    // Products are accumulated as 31-bit unsigned values.
+
+                    pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6);    // Add half (32) and divide by 64 to round.
+
+                    pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));    // The UInt15 result is returned through the MSInt16 output array.
+                }
+            }
+
+            MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const    // Squared difference between pixel[ch] and the cached interpolator value for `index`.
+            {
+                MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];    // Cache is filled by InitHDR() when fastIndexing is false.
+                return diff * diff;
+            }
+
+            MFloat ErrorForInterpolator(int index, const MFloat *pixel) const    // Sum over all TVectorSize channels of the squared per-channel error for `index`.
+            {
+                MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    error = error + ErrorForInterpolatorComponent(index, ch, pixel);    // Accumulated in channel order.
+                return error;
+            }
+
+        public:
+
+            void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights)    // HDR-specific setup; reads m_endPoint, so IndexSelector::Init() must already have run.
+            {
+                assert(range <= 16);    // m_reconstructedInterpolators has 16 rows and g_weightReciprocals has 17 entries.
+
+                m_range = range;    // This class keeps its own m_range; the base class's one is private.
+
+                m_isInverted = ParallelMath::MakeBoolInt16(false);    // No lane is inverted until ConditionalInvert() is called.
+                m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1));    // Despite the name this is range - 1, i.e. the largest index.
+
+                if (!fastIndexing)    // The interpolator cache is only needed by SelectIndexHDRSlow().
+                {
+                    for (int i = 0; i < range; i++)
+                    {
+                        MSInt16 recon2CL[TVectorSize];
+
+                        if (isSigned)
+                            ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
+                        else
+                            ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
+
+                        for (int ch = 0; ch < TVectorSize; ch++)
+                            m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch];    // Cached values are pre-multiplied by the channel weight.
+                    }
+                }
+            }
+
+            void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const    // Signed reconstruction that honours the per-lane inversion flag.
+            {
+                ReconstructHDRSignedUninverted(InvertSingle(index), pixel);    // Undo the inversion first, then interpolate.
+            }
+
+            void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const    // Unsigned reconstruction that honours the per-lane inversion flag.
+            {
+                ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);    // Undo the inversion first, then interpolate.
+            }
+
+            void ConditionalInvert(const ParallelMath::Int16CompFlag &invert)    // Replaces the per-lane flag that makes InvertSingle() flip indexes.
+            {
+                m_isInverted = invert;    // Overwrites (does not combine with) the previous flag.
+            }
+
+            MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const    // Exhaustive search over the cached interpolators; the rounding-scope argument is unused.
+            {
+                MUInt15 index = ParallelMath::MakeUInt15(0);
+
+                MFloat bestError = ErrorForInterpolator(0, pixel);    // Requires InitHDR() to have been called with fastIndexing == false.
+                for (int i = 1; i < m_range; i++)
+                {
+                    MFloat error = ErrorForInterpolator(i, pixel);
+                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);    // Strict comparison: on a tie the lower index is kept.
+                    ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
+                    bestError = ParallelMath::Min(bestError, error);
+                }
+
+                return InvertSingle(index);    // Apply the per-lane inversion to the chosen index.
+            }
+
+            MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const    // Uses the base class's axis projection instead of the exhaustive search.
+            {
+                return InvertSingle(this->SelectIndexLDR(pixel, rtn));    // Apply the per-lane inversion to the projected index.
+            }
+
+        private:
+            MFloat m_reconstructedInterpolators[16][TVectorSize];    // Weighted float value of every index; filled by InitHDR() only when fastIndexing is false.
+            ParallelMath::Int16CompFlag m_isInverted;                // Per-lane flag: indexes are flipped where set.
+            MUInt15 m_maxValueMinusOne;                              // Holds range - 1 (the largest index), despite the name.
+            int m_range;                                             // Number of index values; set by InitHDR(), separate from the base class's private m_range.
+        };
+    }
+}
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h b/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h
new file mode 100644
index 0000000000..7ac3d4fdda
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h
@@ -0,0 +1,68 @@
+#pragma once
+#ifndef __CVTT_COVARIANCEMATRIX_H__
+#define __CVTT_COVARIANCEMATRIX_H__
+
+namespace cvtt
+{
+    namespace Internal
+    {
+
+        template<int TMatrixSize>
+        class PackedCovarianceMatrix
+        {
+        public:
+            // Lower triangle stored row by row; each row starts at the m_values index listed here:
+            // 0: xx,  1: xy, yy,  3: xz, yz, zz,  6: xw, yw, zw, ww,  ... etc.
+            // Element (row, col) with col <= row is at row * (row + 1) / 2 + col.
+            // NOTE(review): this header uses ParallelMath but includes nothing itself;
+            // it appears to rely on the includer providing ConvectionKernels_ParallelMath.h.
+            static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;    // Number of stored entries (lower triangle including the diagonal).
+
+            typedef ParallelMath::Float MFloat;
+
+            PackedCovarianceMatrix()    // Starts with every entry at zero.
+            {
+                for (int i = 0; i < PyramidSize; i++)
+                    m_values[i] = ParallelMath::MakeFloatZero();
+            }
+
+            void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight)    // Accumulates weight * vec[row] * vec[col] into each stored entry; reads vec[0..TMatrixSize).
+            {
+                int index = 0;
+                for (int row = 0; row < TMatrixSize; row++)
+                {
+                    for (int col = 0; col <= row; col++)    // Only col <= row: the upper triangle is never stored.
+                    {
+                        m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
+                        index++;
+                    }
+                }
+            }
+
+            void Product(MFloat *outVec, const MFloat *inVec)    // outVec = M * inVec, treating the packed triangle as a full symmetric matrix.
+            {
+                for (int row = 0; row < TMatrixSize; row++)
+                {
+                    MFloat sum = ParallelMath::MakeFloatZero();
+
+                    int index = (row * (row + 1)) >> 1;    // First stored entry of this row.
+                    for (int col = 0; col < TMatrixSize; col++)
+                    {
+                        sum = sum + inVec[col] * m_values[index];
+                        if (col >= row)
+                            index += col + 1;    // At or past the diagonal: jump to (col + 1, row), the mirrored entry in the next stored row.
+                        else
+                            index++;             // Left of the diagonal: next entry in the same stored row.
+                    }
+
+                    outVec[row] = sum;
+                }
+            }
+
+        private:
+            ParallelMath::Float m_values[PyramidSize];    // Packed lower triangle, laid out as described at the top of the class.
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ParallelMath.h b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
new file mode 100644
index 0000000000..9e25280f45
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
@@ -0,0 +1,1816 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+#pragma once
+#ifndef __CVTT_PARALLELMATH_H__
+#define __CVTT_PARALLELMATH_H__
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_Config.h"
+
+#ifdef CVTT_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#include <float.h>
+#include <assert.h>
+#include <string.h>
+#include <algorithm>
+#include <math.h>
+
+#define UNREFERENCED_PARAMETER(n) ((void)n)
+
+// Parallel math implementation
+//
+// After preprocessor defs are handled, what this should do is expose the following types:
+// SInt16 - Signed 16-bit integer
+// UInt16 - Unsigned 16-bit integer
+// UInt15 - Unsigned 15-bit integer
+// SInt32 - Signed 32-bit integer
+// UInt31 - Unsigned 31-bit integer
+// AInt16 - 16-bit integer of unknown signedness (only used for storage)
+// Int16CompFlag - Comparison flags from comparing 16-bit integers
+// Int32CompFlag - Comparison flags from comparing 32-bit integers
+// FloatCompFlag - Comparison flags from comparing 32-bit floats
+//
+// The reason for these distinctions is that, depending on the instruction set, signed or unsigned versions of certain ops
+// (particularly max, min, compares, and right shift) may not be available.  In cases where ops are not available, it's
+// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers.  The 15-bit and 31-bit uint types
+// can elide the bit flips if unsigned versions are not available.
+
+namespace cvtt
+{
+#ifdef CVTT_USE_SSE2
+    // SSE2 version
+    struct ParallelMath
+    {
+        typedef uint16_t ScalarUInt16;
+        typedef int16_t ScalarSInt16;
+
+        template<unsigned int TRoundingMode>    // TRoundingMode: one of the _MM_ROUND_* values, OR'd into the control/status register.
+        struct RoundForScope
+        {
+            unsigned int m_oldCSR;    // Register value captured by the constructor and written back by the destructor.
+
+            RoundForScope()
+            {
+                m_oldCSR = _mm_getcsr();
+                _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));    // Replace only the rounding-control bits.
+            }
+
+            ~RoundForScope()
+            {
+                _mm_setcsr(m_oldCSR);    // Writes back the whole saved value, not just the rounding bits.
+            }
+        };
+
+        struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>    // Scope guard selecting _MM_ROUND_TOWARD_ZERO.
+        {
+        };
+
+        struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>    // Scope guard selecting _MM_ROUND_NEAREST.
+        {
+        };
+
+        struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>    // Scope guard selecting _MM_ROUND_UP.
+        {
+        };
+
+        struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>    // Scope guard selecting _MM_ROUND_DOWN.
+        {
+        };
+
+        static const int ParallelSize = 8;    // Lanes per value: one __m128i of 16-bit ints, two __m128i of 32-bit ints, or two __m128 of floats.
+
+        enum Int16Subtype    // Tag used as the TSubtype argument of VInt16 / VInt32.
+        {
+            IntSubtype_Signed,               // SInt16 / SInt32
+            IntSubtype_UnsignedFull,         // UInt16 / UInt32
+            IntSubtype_UnsignedTruncated,    // UInt15 / UInt31
+            IntSubtype_Abstract,             // AInt16 / AInt32
+        };
+
+        template<int TSubtype>    // TSubtype is an Int16Subtype tag; it only distinguishes types, the storage is identical.
+        struct VInt16
+        {
+            __m128i m_value;    // One 128-bit register holding the 16-bit lanes.
+
+            inline VInt16 operator+(int16_t other) const    // Adds the same scalar to every lane.
+            {
+                VInt16 result;
+                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
+                return result;
+            }
+
+            inline VInt16 operator+(const VInt16 &other) const    // Lane-wise 16-bit add.
+            {
+                VInt16 result;
+                result.m_value = _mm_add_epi16(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator|(const VInt16 &other) const    // Bitwise OR of the whole register.
+            {
+                VInt16 result;
+                result.m_value = _mm_or_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator&(const VInt16 &other) const    // Bitwise AND of the whole register.
+            {
+                VInt16 result;
+                result.m_value = _mm_and_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator-(const VInt16 &other) const    // Lane-wise 16-bit subtract.
+            {
+                VInt16 result;
+                result.m_value = _mm_sub_epi16(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator<<(int bits) const    // Shifts every 16-bit lane left by the same bit count.
+            {
+                VInt16 result;
+                result.m_value = _mm_slli_epi16(m_value, bits);
+                return result;
+            }
+
+            inline VInt16 operator^(const VInt16 &other) const    // Bitwise XOR of the whole register.
+            {
+                VInt16 result;
+                result.m_value = _mm_xor_si128(m_value, other.m_value);
+                return result;
+            }
+        };
+
+        typedef VInt16<IntSubtype_Signed> SInt16;
+        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
+        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
+        typedef VInt16<IntSubtype_Abstract> AInt16;
+
+        template<int TSubtype>    // TSubtype is an Int16Subtype tag; it only distinguishes types, the storage is identical.
+        struct VInt32
+        {
+            __m128i m_values[2];    // Two 128-bit registers holding the 32-bit lanes.
+
+            inline VInt32 operator+(const VInt32& other) const    // Lane-wise 32-bit add on both registers.
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline VInt32 operator-(const VInt32& other) const    // Lane-wise 32-bit subtract on both registers.
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline VInt32 operator<<(const int other) const    // Shifts every 32-bit lane left by the same bit count.
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
+                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
+                return result;
+            }
+
+            inline VInt32 operator|(const VInt32& other) const    // Bitwise OR of both registers.
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        typedef VInt32<IntSubtype_Signed> SInt32;
+        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
+        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
+        typedef VInt32<IntSubtype_Abstract> AInt32;
+
+        template<class TTargetType>    // Converts a VInt16/VInt32 to TTargetType (another subtype of the same width) without changing the stored bits.
+        struct LosslessCast
+        {
+#ifdef CVTT_PERMIT_ALIASING
+            template<int TSrcSubtype>
+            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
+            {
+                return reinterpret_cast<const TTargetType&>(src);    // View the source as the target type; keeps const and makes no copy.
+            }
+
+            template<int TSrcSubtype>
+            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
+            {
+                return reinterpret_cast<const TTargetType&>(src);    // View the source as the target type; keeps const and makes no copy.
+            }
+#else
+            template<int TSrcSubtype>
+            static TTargetType Cast(const VInt32<TSrcSubtype> &src)    // Copying variant: transfers both registers into a fresh TTargetType.
+            {
+                TTargetType result;
+                result.m_values[0] = src.m_values[0];
+                result.m_values[1] = src.m_values[1];
+                return result;
+            }
+
+            template<int TSrcSubtype>
+            static TTargetType Cast(const VInt16<TSrcSubtype> &src)    // Copying variant: transfers the single register into a fresh TTargetType.
+            {
+                TTargetType result;
+                result.m_value = src.m_value;
+                return result;
+            }
+#endif
+        };
+
+        struct Int64    // Storage only: no operators are defined on it here.
+        {
+            __m128i m_values[4];    // Four 128-bit registers.
+        };
+
+        struct Float    // Packed single-precision floats with lane-wise arithmetic operators.
+        {
+            __m128 m_values[2];    // Two 128-bit registers of packed floats.
+
+            inline Float operator+(const Float &other) const    // Lane-wise add.
+            {
+                Float result;
+                result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator+(float other) const    // Adds the same scalar to every lane.
+            {
+                Float result;
+                result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+
+            inline Float operator-(const Float& other) const    // Lane-wise subtract.
+            {
+                Float result;
+                result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator-() const    // Negation, computed as 0 - x.
+            {
+                Float result;
+                result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
+                result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
+                return result;
+            }
+
+            inline Float operator*(const Float& other) const    // Lane-wise multiply.
+            {
+                Float result;
+                result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator*(float other) const    // Multiplies every lane by the same scalar.
+            {
+                Float result;
+                result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+
+            inline Float operator/(const Float &other) const    // Lane-wise divide; no check for zero divisors is made here.
+            {
+                Float result;
+                result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator/(float other) const    // Divides every lane by the same scalar.
+            {
+                Float result;
+                result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+        };
+
+        struct Int16CompFlag    // Result of comparing 16-bit lanes; combined with & and |, consumed as a bit mask by Select().
+        {
+            __m128i m_value;    // Mask bits; presumably all-ones or all-zeros per 16-bit lane (producers are outside this view).
+
+            inline Int16CompFlag operator&(const Int16CompFlag &other) const    // Set only where both flags are set.
+            {
+                Int16CompFlag result;
+                result.m_value = _mm_and_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline Int16CompFlag operator|(const Int16CompFlag &other) const    // Set where either flag is set.
+            {
+                Int16CompFlag result;
+                result.m_value = _mm_or_si128(m_value, other.m_value);
+                return result;
+            }
+        };
+
+        struct Int32CompFlag    // Result of comparing 32-bit lanes; combined with & and |.
+        {
+            __m128i m_values[2];    // Mask bits in two registers; presumably all-ones or all-zeros per 32-bit lane (producers are outside this view).
+
+            inline Int32CompFlag operator&(const Int32CompFlag &other) const    // Set only where both flags are set.
+            {
+                Int32CompFlag result;
+                result.m_values[0] = _mm_and_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_and_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Int32CompFlag operator|(const Int32CompFlag &other) const    // Set where either flag is set.
+            {
+                Int32CompFlag result;
+                result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        struct FloatCompFlag    // Result of comparing float lanes; combined with & and |, consumed as a bit mask by Select().
+        {
+            __m128 m_values[2];    // Mask bits held in float registers; presumably all-ones or all-zeros per lane (producers are outside this view).
+
+            inline FloatCompFlag operator&(const FloatCompFlag &other) const    // Set only where both flags are set.
+            {
+                FloatCompFlag result;
+                result.m_values[0] = _mm_and_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_and_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline FloatCompFlag operator|(const FloatCompFlag &other) const    // Set where either flag is set.
+            {
+                FloatCompFlag result;
+                result.m_values[0] = _mm_or_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        // Lane-wise 16-bit addition (wrapping) for any VInt16 subtype.
+        template<int TSubtype>
+        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
+        {
+            VInt16<TSubtype> sum;
+            sum.m_value = _mm_add_epi16(a.m_value, b.m_value);
+            return sum;
+        }
+
+        // Lane-wise 16-bit subtraction (wrapping) for any VInt16 subtype.
+        template<int TSubtype>
+        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
+        {
+            VInt16<TSubtype> difference;
+            difference.m_value = _mm_sub_epi16(a.m_value, b.m_value);
+            return difference;
+        }
+
+        // Bitwise blend: bits set in `flag` are taken from `a`, the remaining bits from `b`.
+        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
+        {
+            Float chosen;
+            for (int half = 0; half < 2; half++)
+            {
+                const __m128 fromA = _mm_and_ps(flag.m_values[half], a.m_values[half]);
+                const __m128 fromB = _mm_andnot_ps(flag.m_values[half], b.m_values[half]);
+                chosen.m_values[half] = _mm_or_ps(fromA, fromB);
+            }
+            return chosen;
+        }
+
+        // Bitwise blend: bits set in `flag` are taken from `a`, the remaining bits from `b`.
+        template<int TSubtype>
+        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
+        {
+            const __m128i fromA = _mm_and_si128(flag.m_value, a.m_value);
+            const __m128i fromB = _mm_andnot_si128(flag.m_value, b.m_value);
+
+            VInt16<TSubtype> chosen;
+            chosen.m_value = _mm_or_si128(fromA, fromB);
+            return chosen;
+        }
+
+        // Keeps the bits of `a` that are set in `flag`; every other bit becomes zero.
+        template<int TSubtype>
+        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
+        {
+            VInt16<TSubtype> masked;
+            masked.m_value = _mm_and_si128(flag.m_value, a.m_value);
+            return masked;
+        }
+
+        // Overwrites `dest` with `src` wherever `flag` bits are set; other bits of `dest` are kept.
+        template<int TSubtype>
+        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
+        {
+            const __m128i kept = _mm_andnot_si128(flag.m_value, dest.m_value);
+            const __m128i taken = _mm_and_si128(flag.m_value, src.m_value);
+            dest.m_value = _mm_or_si128(kept, taken);
+        }
+
+        // Overwrites `dest` with `src` in the 32-bit lanes whose matching 16-bit flag lane is set.
+        template<int TSubtype>
+        static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src)
+        {
+            // Duplicate each 16-bit flag lane so that it covers a whole 32-bit lane.
+            const __m128i wideFlags[2] =
+            {
+                _mm_unpacklo_epi16(flag.m_value, flag.m_value),
+                _mm_unpackhi_epi16(flag.m_value, flag.m_value)
+            };
+
+            for (int half = 0; half < 2; half++)
+            {
+                const __m128i kept = _mm_andnot_si128(wideFlags[half], dest.m_values[half]);
+                const __m128i taken = _mm_and_si128(wideFlags[half], src.m_values[half]);
+                dest.m_values[half] = _mm_or_si128(kept, taken);
+            }
+        }
+
+        // Mask-on-mask variant: copies `src` bits into `dest` wherever `flag` bits are set.
+        static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src)
+        {
+            const __m128i kept = _mm_andnot_si128(flag.m_value, dest.m_value);
+            const __m128i taken = _mm_and_si128(flag.m_value, src.m_value);
+            dest.m_value = _mm_or_si128(kept, taken);
+        }
+
+        // Negates the lanes of `v` selected by `flag`.
+        // For an all-ones flag lane this computes (~v) + 1 == -v; for a zero lane it is v + 0.
+        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
+        {
+            const __m128i flipped = _mm_xor_si128(flag.m_value, v.m_value);
+            const __m128i carry = _mm_srli_epi16(flag.m_value, 15);
+
+            SInt16 negated;
+            negated.m_value = _mm_add_epi16(flipped, carry);
+            return negated;
+        }
+
+        // Inverse of ConditionalSet: `dest` is kept where `flag` bits are set and replaced by `src` elsewhere.
+        template<int TSubtype>
+        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
+        {
+            const __m128i kept = _mm_and_si128(flag.m_value, dest.m_value);
+            const __m128i taken = _mm_andnot_si128(flag.m_value, src.m_value);
+            dest.m_value = _mm_or_si128(kept, taken);
+        }
+
+        // Overwrites `dest` with `src` wherever `flag` bits are set; other bits of `dest` are kept.
+        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
+        {
+            for (int half = 0; half < 2; half++)
+            {
+                const __m128 kept = _mm_andnot_ps(flag.m_values[half], dest.m_values[half]);
+                const __m128 taken = _mm_and_ps(flag.m_values[half], src.m_values[half]);
+                dest.m_values[half] = _mm_or_ps(kept, taken);
+            }
+        }
+
+        // Inverse of ConditionalSet: `dest` is kept where `flag` bits are set and replaced by `src` elsewhere.
+        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
+        {
+            for (int half = 0; half < 2; half++)
+            {
+                const __m128 kept = _mm_and_ps(flag.m_values[half], dest.m_values[half]);
+                const __m128 taken = _mm_andnot_ps(flag.m_values[half], src.m_values[half]);
+                dest.m_values[half] = _mm_or_ps(kept, taken);
+            }
+        }
+
+        // Replaces every lane of `v` that compares equal to zero with 1.0f.
+        static void MakeSafeDenominator(Float& v)
+        {
+            const FloatCompFlag isZero = Equal(v, MakeFloatZero());
+            ConditionalSet(v, isZero, MakeFloat(1.0f));
+        }
+
+        // Keeps the low `precision` bits of each lane and sign-extends them back to 16 bits
+        // (shift left, then arithmetic shift right). A precision of 16 returns `v` unchanged.
+        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
+        {
+            const int lostBits = 16 - precision;
+            if (lostBits != 0)
+            {
+                SInt16 truncated;
+                truncated.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
+                return truncated;
+            }
+
+            return v;
+        }
+
+        // Keeps the low `precision` bits of each lane and zero-fills the rest
+        // (shift left, then logical shift right). A precision of 16 returns `v` unchanged.
+        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
+        {
+            const int lostBits = 16 - precision;
+            if (lostBits != 0)
+            {
+                UInt16 truncated;
+                truncated.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
+                return truncated;
+            }
+
+            return v;
+        }
+
+        // Unsigned 16-bit lane-wise minimum, built on the signed min: both inputs get their
+        // sign bit flipped before the compare and the result is flipped back.
+        static UInt16 Min(const UInt16 &a, const UInt16 &b)
+        {
+            const __m128i signBit = _mm_set1_epi16(-32768);
+            const __m128i biasedA = _mm_xor_si128(a.m_value, signBit);
+            const __m128i biasedB = _mm_xor_si128(b.m_value, signBit);
+
+            UInt16 smallest;
+            smallest.m_value = _mm_xor_si128(_mm_min_epi16(biasedA, biasedB), signBit);
+            return smallest;
+        }
+
+        // Signed 16-bit lane-wise minimum.
+        static SInt16 Min(const SInt16 &a, const SInt16 &b)
+        {
+            SInt16 smallest;
+            smallest.m_value = _mm_min_epi16(a.m_value, b.m_value);
+            return smallest;
+        }
+
+        // Lane-wise minimum of UInt15 values using the signed 16-bit compare.
+        static UInt15 Min(const UInt15 &a, const UInt15 &b)
+        {
+            UInt15 smallest;
+            smallest.m_value = _mm_min_epi16(a.m_value, b.m_value);
+            return smallest;
+        }
+
+        // Lane-wise float minimum over both register halves.
+        static Float Min(const Float &a, const Float &b)
+        {
+            Float smallest;
+            smallest.m_values[0] = _mm_min_ps(a.m_values[0], b.m_values[0]);
+            smallest.m_values[1] = _mm_min_ps(a.m_values[1], b.m_values[1]);
+            return smallest;
+        }
+
+        // Unsigned 16-bit lane-wise maximum, built on the signed max: both inputs get their
+        // sign bit flipped before the compare and the result is flipped back.
+        static UInt16 Max(const UInt16 &a, const UInt16 &b)
+        {
+            const __m128i signBit = _mm_set1_epi16(-32768);
+            const __m128i biasedA = _mm_xor_si128(a.m_value, signBit);
+            const __m128i biasedB = _mm_xor_si128(b.m_value, signBit);
+
+            UInt16 largest;
+            largest.m_value = _mm_xor_si128(_mm_max_epi16(biasedA, biasedB), signBit);
+            return largest;
+        }
+
+        // Signed 16-bit lane-wise maximum.
+        static SInt16 Max(const SInt16 &a, const SInt16 &b)
+        {
+            SInt16 largest;
+            largest.m_value = _mm_max_epi16(a.m_value, b.m_value);
+            return largest;
+        }
+
+        // Lane-wise maximum of UInt15 values using the signed 16-bit compare.
+        static UInt15 Max(const UInt15 &a, const UInt15 &b)
+        {
+            UInt15 largest;
+            largest.m_value = _mm_max_epi16(a.m_value, b.m_value);
+            return largest;
+        }
+
+        // Lane-wise float maximum over both register halves.
+        static Float Max(const Float &a, const Float &b)
+        {
+            Float largest;
+            largest.m_values[0] = _mm_max_ps(a.m_values[0], b.m_values[0]);
+            largest.m_values[1] = _mm_max_ps(a.m_values[1], b.m_values[1]);
+            return largest;
+        }
+
+        // Limits every lane to [min, max]: the upper bound is applied first, then the lower bound.
+        static Float Clamp(const Float &v, float min, float max)
+        {
+            const __m128 upper = _mm_set1_ps(max);
+            const __m128 lower = _mm_set1_ps(min);
+
+            Float clamped;
+            for (int half = 0; half < 2; half++)
+                clamped.m_values[half] = _mm_max_ps(_mm_min_ps(v.m_values[half], upper), lower);
+            return clamped;
+        }
+
+        // Lane-wise reciprocal via _mm_rcp_ps, which yields an approximation rather than an exact 1/x.
+        static Float Reciprocal(const Float &v)
+        {
+            Float inverse;
+            inverse.m_values[0] = _mm_rcp_ps(v.m_values[0]);
+            inverse.m_values[1] = _mm_rcp_ps(v.m_values[1]);
+            return inverse;
+        }
+
+        // Gathers one channel of one pixel from each of 8 input blocks into the 8 lanes of `chOut`
+        // (block i ends up in lane i).
+        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
+        {
+            int16_t lanes[8];
+            for (int block = 0; block < 8; block++)
+                lanes[block] = inputBlocks[block].m_pixels[pxOffset][channel];
+
+            // _mm_set_epi16 takes its arguments from the highest lane down to the lowest.
+            chOut.m_value = _mm_set_epi16(lanes[7], lanes[6], lanes[5], lanes[4], lanes[3], lanes[2], lanes[1], lanes[0]);
+        }
+
+        // Gathers one channel of one pixel from each of 8 input blocks into the 8 lanes of `chOut`
+        // (block i ends up in lane i).
+        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
+        {
+            int16_t lanes[8];
+            for (int block = 0; block < 8; block++)
+                lanes[block] = inputBlocks[block].m_pixels[pxOffset][channel];
+
+            // _mm_set_epi16 takes its arguments from the highest lane down to the lowest.
+            chOut.m_value = _mm_set_epi16(lanes[7], lanes[6], lanes[5], lanes[4], lanes[3], lanes[2], lanes[1], lanes[0]);
+        }
+
+        // Builds a Float with every lane set to `v`.
+        static Float MakeFloat(float v)
+        {
+            const __m128 splat = _mm_set1_ps(v);
+
+            Float f;
+            f.m_values[0] = splat;
+            f.m_values[1] = splat;
+            return f;
+        }
+
+        // Builds a Float with every lane set to zero.
+        static Float MakeFloatZero()
+        {
+            const __m128 zero = _mm_setzero_ps();
+
+            Float f;
+            f.m_values[0] = zero;
+            f.m_values[1] = zero;
+            return f;
+        }
+
+        // Broadcasts `v` to all eight 16-bit lanes.
+        static UInt16 MakeUInt16(uint16_t v)
+        {
+            const short lane = static_cast<short>(v);
+
+            UInt16 splat;
+            splat.m_value = _mm_set1_epi16(lane);
+            return splat;
+        }
+
+        // Broadcasts `v` to all eight 16-bit lanes.
+        static SInt16 MakeSInt16(int16_t v)
+        {
+            const short lane = static_cast<short>(v);
+
+            SInt16 splat;
+            splat.m_value = _mm_set1_epi16(lane);
+            return splat;
+        }
+
+        // Broadcasts `v` to all eight 16-bit lanes.
+        static AInt16 MakeAInt16(int16_t v)
+        {
+            const short lane = static_cast<short>(v);
+
+            AInt16 splat;
+            splat.m_value = _mm_set1_epi16(lane);
+            return splat;
+        }
+
+        // Broadcasts `v` to all eight 16-bit lanes.
+        static UInt15 MakeUInt15(uint16_t v)
+        {
+            const short lane = static_cast<short>(v);
+
+            UInt15 splat;
+            splat.m_value = _mm_set1_epi16(lane);
+            return splat;
+        }
+
+        // Broadcasts `v` to all 32-bit lanes of both register halves.
+        static SInt32 MakeSInt32(int32_t v)
+        {
+            SInt32 splat;
+            for (int half = 0; half < 2; half++)
+                splat.m_values[half] = _mm_set1_epi32(v);
+            return splat;
+        }
+
+        // Broadcasts `v` to all 32-bit lanes of both register halves.
+        static UInt31 MakeUInt31(uint32_t v)
+        {
+            UInt31 splat;
+            for (int half = 0; half < 2; half++)
+                splat.m_values[half] = _mm_set1_epi32(v);
+            return splat;
+        }
+
+        // Reads 16-bit lane `offset` by viewing the register as an array of uint16_t.
+        static uint16_t Extract(const UInt16 &v, int offset)
+        {
+            const uint16_t *lanes = reinterpret_cast<const uint16_t*>(&v.m_value);
+            return lanes[offset];
+        }
+
+        // Reads 16-bit lane `offset` by viewing the register as an array of int16_t.
+        static int16_t Extract(const SInt16 &v, int offset)
+        {
+            const int16_t *lanes = reinterpret_cast<const int16_t*>(&v.m_value);
+            return lanes[offset];
+        }
+
+        // Reads 16-bit lane `offset` by viewing the register as an array of uint16_t.
+        static uint16_t Extract(const UInt15 &v, int offset)
+        {
+            const uint16_t *lanes = reinterpret_cast<const uint16_t*>(&v.m_value);
+            return lanes[offset];
+        }
+
+        // Reads 16-bit lane `offset` by viewing the register as an array of int16_t.
+        static int16_t Extract(const AInt16 &v, int offset)
+        {
+            const int16_t *lanes = reinterpret_cast<const int16_t*>(&v.m_value);
+            return lanes[offset];
+        }
+
+        // Reads 32-bit lane `offset`: bits above the low two pick the register, the low two bits the lane in it.
+        static int32_t Extract(const SInt32 &v, int offset)
+        {
+            const int32_t *lanes = reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2]);
+            return lanes[offset & 3];
+        }
+
+        // Reads float lane `offset`: bits above the low two pick the register, the low two bits the lane in it.
+        static float Extract(const Float &v, int offset)
+        {
+            const float *lanes = reinterpret_cast<const float*>(&v.m_values[offset >> 2]);
+            return lanes[offset & 3];
+        }
+
+        // Returns true when 16-bit flag lane `offset` holds any non-zero bit.
+        static bool Extract(const ParallelMath::Int16CompFlag &v, int offset)
+        {
+            const int16_t *lanes = reinterpret_cast<const int16_t*>(&v.m_value);
+            return lanes[offset] != 0;
+        }
+
+        // Writes `v` into 16-bit lane `offset` by addressing `dest` as an array of uint16_t.
+        static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
+        {
+            uint16_t *lanes = reinterpret_cast<uint16_t*>(&dest);
+            lanes[offset] = v;
+        }
+
+        // Writes `v` into 16-bit lane `offset` by addressing `dest` as an array of uint16_t.
+        static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
+        {
+            uint16_t *lanes = reinterpret_cast<uint16_t*>(&dest);
+            lanes[offset] = v;
+        }
+
+        // Writes `v` into 16-bit lane `offset` by addressing `dest` as an array of int16_t.
+        static void PutSInt16(SInt16 &dest, int offset, int16_t v)
+        {
+            int16_t *lanes = reinterpret_cast<int16_t*>(&dest);
+            lanes[offset] = v;
+        }
+
+        // Reads float lane `offset` by addressing the whole Float as one flat array of floats.
+        static float ExtractFloat(const Float& v, int offset)
+        {
+            const float *lanes = reinterpret_cast<const float*>(&v);
+            return lanes[offset];
+        }
+
+        // Writes `v` into float lane `offset` by addressing the whole Float as one flat array of floats.
+        static void PutFloat(Float &dest, int offset, float v)
+        {
+            float *lanes = reinterpret_cast<float*>(&dest);
+            lanes[offset] = v;
+        }
+
+        // Sets flag lane `offset` to all ones (-1) for true or to zero for false.
+        static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v)
+        {
+            int16_t *lanes = reinterpret_cast<int16_t*>(&dest);
+            lanes[offset] = v ? -1 : 0;
+        }
+
+        // Lane-wise a < b using the signed 32-bit compare.
+        static Int32CompFlag Less(const UInt31 &a, const UInt31 &b)
+        {
+            Int32CompFlag isLess;
+            for (int half = 0; half < 2; half++)
+                isLess.m_values[half] = _mm_cmplt_epi32(a.m_values[half], b.m_values[half]);
+            return isLess;
+        }
+
+        // Lane-wise a < b using the signed 16-bit compare.
+        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
+        {
+            Int16CompFlag isLess;
+            isLess.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
+            return isLess;
+        }
+
+        // Lane-wise a < b using the signed 16-bit compare.
+        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
+        {
+            Int16CompFlag isLess;
+            isLess.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
+            return isLess;
+        }
+
+        // Lane-wise a <= b.
+        // SSE2 has no 16-bit "less or equal" compare, so the mask is the union of the
+        // strict less-than mask and the equality mask. (Using only _mm_cmplt_epi16 would
+        // report false for equal lanes.)
+        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
+        {
+            const __m128i isLess = _mm_cmplt_epi16(a.m_value, b.m_value);
+            const __m128i isEqual = _mm_cmpeq_epi16(a.m_value, b.m_value);
+
+            Int16CompFlag result;
+            result.m_value = _mm_or_si128(isLess, isEqual);
+            return result;
+        }
+
+        // Lane-wise float a < b.
+        static FloatCompFlag Less(const Float &a, const Float &b)
+        {
+            FloatCompFlag isLess;
+            isLess.m_values[0] = _mm_cmplt_ps(a.m_values[0], b.m_values[0]);
+            isLess.m_values[1] = _mm_cmplt_ps(a.m_values[1], b.m_values[1]);
+            return isLess;
+        }
+
+        // Lane-wise float a <= b.
+        static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
+        {
+            FloatCompFlag isLessOrEqual;
+            isLessOrEqual.m_values[0] = _mm_cmple_ps(a.m_values[0], b.m_values[0]);
+            isLessOrEqual.m_values[1] = _mm_cmple_ps(a.m_values[1], b.m_values[1]);
+            return isLessOrEqual;
+        }
+
+        // Lane-wise 16-bit equality for any VInt16 subtype.
+        template<int TSubtype>
+        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
+        {
+            Int16CompFlag isEqual;
+            isEqual.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
+            return isEqual;
+        }
+
+        // Lane-wise float equality.
+        static FloatCompFlag Equal(const Float &a, const Float &b)
+        {
+            FloatCompFlag isEqual;
+            isEqual.m_values[0] = _mm_cmpeq_ps(a.m_values[0], b.m_values[0]);
+            isEqual.m_values[1] = _mm_cmpeq_ps(a.m_values[1], b.m_values[1]);
+            return isEqual;
+        }
+
+        // Mask equality: XOR marks the bits that differ, and Not() inverts that into "same".
+        static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b)
+        {
+            Int16CompFlag differs;
+            differs.m_value = _mm_xor_si128(a.m_value, b.m_value);
+            return Not(differs);
+        }
+
+        // Zero-extends each 16-bit lane to 32 bits (interleaving with zero) and converts to float.
+        static Float ToFloat(const UInt16 &v)
+        {
+            const __m128i zero = _mm_setzero_si128();
+            const __m128i lowLanes = _mm_unpacklo_epi16(v.m_value, zero);
+            const __m128i highLanes = _mm_unpackhi_epi16(v.m_value, zero);
+
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(lowLanes);
+            converted.m_values[1] = _mm_cvtepi32_ps(highLanes);
+            return converted;
+        }
+
+        // Zero-extends each 16-bit lane to a 32-bit lane by interleaving with zero.
+        static UInt31 ToUInt31(const UInt16 &v)
+        {
+            const __m128i zero = _mm_setzero_si128();
+
+            UInt31 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
+            return widened;
+        }
+
+        // Zero-extends each 16-bit lane to a 32-bit lane by interleaving with zero.
+        static SInt32 ToInt32(const UInt16 &v)
+        {
+            const __m128i zero = _mm_setzero_si128();
+
+            SInt32 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
+            return widened;
+        }
+
+        // Zero-extends each 16-bit lane to a 32-bit lane by interleaving with zero.
+        static SInt32 ToInt32(const UInt15 &v)
+        {
+            const __m128i zero = _mm_setzero_si128();
+
+            SInt32 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, zero);
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, zero);
+            return widened;
+        }
+
+        // Sign-extends each 16-bit lane to 32 bits: the value is placed in the upper half of a
+        // 32-bit lane and then shifted back down arithmetically.
+        static SInt32 ToInt32(const SInt16 &v)
+        {
+            const __m128i zero = _mm_setzero_si128();
+            const __m128i lowInTopHalf = _mm_unpacklo_epi16(zero, v.m_value);
+            const __m128i highInTopHalf = _mm_unpackhi_epi16(zero, v.m_value);
+
+            SInt32 widened;
+            widened.m_values[0] = _mm_srai_epi32(lowInTopHalf, 16);
+            widened.m_values[1] = _mm_srai_epi32(highInTopHalf, 16);
+            return widened;
+        }
+
+        // Sign-extends each 16-bit lane to 32 bits (upper-half placement plus arithmetic shift)
+        // and converts the result to float.
+        static Float ToFloat(const SInt16 &v)
+        {
+            const __m128i zero = _mm_setzero_si128();
+            const __m128i lowLanes = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v.m_value), 16);
+            const __m128i highLanes = _mm_srai_epi32(_mm_unpackhi_epi16(zero, v.m_value), 16);
+
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(lowLanes);
+            converted.m_values[1] = _mm_cvtepi32_ps(highLanes);
+            return converted;
+        }
+
+        // Zero-extends each 16-bit lane to 32 bits (interleaving with zero) and converts to float.
+        static Float ToFloat(const UInt15 &v)
+        {
+            const __m128i zero = _mm_setzero_si128();
+            const __m128i lowLanes = _mm_unpacklo_epi16(v.m_value, zero);
+            const __m128i highLanes = _mm_unpackhi_epi16(v.m_value, zero);
+
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(lowLanes);
+            converted.m_values[1] = _mm_cvtepi32_ps(highLanes);
+            return converted;
+        }
+
+        // Converts each 32-bit integer lane to float (signed conversion).
+        static Float ToFloat(const UInt31 &v)
+        {
+            Float converted;
+            for (int half = 0; half < 2; half++)
+                converted.m_values[half] = _mm_cvtepi32_ps(v.m_values[half]);
+            return converted;
+        }
+
+        // Narrows a float-lane mask to a 16-bit-lane mask: the two halves are reinterpreted as
+        // 32-bit integers and packed with signed saturation.
+        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
+        {
+            const __m128i lowBits = _mm_castps_si128(v.m_values[0]);
+            const __m128i highBits = _mm_castps_si128(v.m_values[1]);
+
+            Int16CompFlag narrowed;
+            narrowed.m_value = _mm_packs_epi32(lowBits, highBits);
+            return narrowed;
+        }
+
+        // Widens a 16-bit-lane mask to a float-lane mask by duplicating every 16-bit lane
+        // into both halves of a 32-bit lane.
+        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
+        {
+            const __m128i lowDoubled = _mm_unpacklo_epi16(v.m_value, v.m_value);
+            const __m128i highDoubled = _mm_unpackhi_epi16(v.m_value, v.m_value);
+
+            FloatCompFlag widened;
+            widened.m_values[0] = _mm_castsi128_ps(lowDoubled);
+            widened.m_values[1] = _mm_castsi128_ps(highDoubled);
+            return widened;
+        }
+
+        // Narrows a 32-bit-lane mask to a 16-bit-lane mask by packing both halves with signed saturation.
+        static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v)
+        {
+            Int16CompFlag narrowed;
+            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
+            return narrowed;
+        }
+
+        // Builds a uniform mask: all bits set when `b` is true, all bits clear otherwise.
+        static Int16CompFlag MakeBoolInt16(bool b)
+        {
+            Int16CompFlag mask;
+            mask.m_value = b ? _mm_set1_epi16(-1) : _mm_setzero_si128();
+            return mask;
+        }
+
+        // Builds a uniform mask: all bits set when `b` is true, all bits clear otherwise.
+        static FloatCompFlag MakeBoolFloat(bool b)
+        {
+            const __m128 bits = b ? _mm_castsi128_ps(_mm_set1_epi32(-1)) : _mm_setzero_ps();
+
+            FloatCompFlag mask;
+            mask.m_values[0] = bits;
+            mask.m_values[1] = bits;
+            return mask;
+        }
+
+        // Computes a & ~b. Note that _mm_andnot_si128 inverts its FIRST operand, hence the swapped order.
+        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
+        {
+            Int16CompFlag masked;
+            masked.m_value = _mm_andnot_si128(b.m_value, a.m_value);
+            return masked;
+        }
+
+        // Bitwise complement of the mask (XOR with all ones).
+        static Int16CompFlag Not(const Int16CompFlag &b)
+        {
+            const __m128i allOnes = _mm_set1_epi32(-1);
+
+            Int16CompFlag inverted;
+            inverted.m_value = _mm_xor_si128(b.m_value, allOnes);
+            return inverted;
+        }
+
+        // Bitwise complement of the mask (XOR with all ones) over both register halves.
+        static Int32CompFlag Not(const Int32CompFlag &b)
+        {
+            const __m128i allOnes = _mm_set1_epi32(-1);
+
+            Int32CompFlag inverted;
+            for (int half = 0; half < 2; half++)
+                inverted.m_values[half] = _mm_xor_si128(b.m_values[half], allOnes);
+            return inverted;
+        }
+
+        // Converts float lanes to unsigned 16-bit lanes.
+        // The values are biased by -32768 so the signed-saturating pack can be used, and the
+        // bias is undone afterwards by flipping the sign bit of every 16-bit lane.
+        // The `roundingMode` argument is unused here; _mm_cvtps_epi32 rounds per the current MXCSR mode.
+        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
+        {
+            const __m128 bias = _mm_set1_ps(-32768);
+            const __m128i biasedLo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], bias));
+            const __m128i biasedHi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], bias));
+            const __m128i biasedPacked = _mm_packs_epi32(biasedLo, biasedHi);
+
+            UInt16 converted;
+            converted.m_value = _mm_xor_si128(biasedPacked, _mm_set1_epi16(-32768));
+            return converted;
+        }
+
+        // Converts float lanes to 32-bit integers and narrows them to 16 bits with signed saturation.
+        // The `roundingMode` argument is unused here; _mm_cvtps_epi32 rounds per the current MXCSR mode.
+        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
+        {
+            const __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
+            const __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
+
+            // Pack once, straight into the result (the previous unused duplicate pack is gone).
+            UInt15 result;
+            result.m_value = _mm_packs_epi32(lo, hi);
+            return result;
+        }
+
+        // Converts float lanes to 32-bit integers and narrows them to 16 bits with signed saturation.
+        // The `roundingMode` argument is unused here; _mm_cvtps_epi32 rounds per the current MXCSR mode.
+        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
+        {
+            const __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
+            const __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
+
+            // Pack once, straight into the result (the previous unused duplicate pack is gone).
+            SInt16 result;
+            result.m_value = _mm_packs_epi32(lo, hi);
+            return result;
+        }
+
+        // Lane-wise square root over both register halves.
+        static Float Sqrt(const Float &f)
+        {
+            Float root;
+            root.m_values[0] = _mm_sqrt_ps(f.m_values[0]);
+            root.m_values[1] = _mm_sqrt_ps(f.m_values[1]);
+            return root;
+        }
+
+        // Lane-wise absolute value of signed 16-bit lanes, returned as unsigned.
+        // Negative lanes are complemented (XOR with the all-ones sign mask) and incremented
+        // by their sign bit; non-negative lanes pass through unchanged.
+        static UInt16 Abs(const SInt16 &a)
+        {
+            const __m128i signMask = _mm_srai_epi16(a.m_value, 15);
+            const __m128i signBit = _mm_srli_epi16(a.m_value, 15);
+            const __m128i complemented = _mm_xor_si128(a.m_value, signMask);
+
+            UInt16 magnitude;
+            magnitude.m_value = _mm_add_epi16(complemented, signBit);
+            return magnitude;
+        }
+
+        // Lane-wise float absolute value: clears the sign bit, using -0.0f as the sign-bit mask.
+        static Float Abs(const Float& a)
+        {
+            const __m128 signBitMask = _mm_set1_ps(-0.0f);
+
+            Float magnitude;
+            for (int half = 0; half < 2; half++)
+                magnitude.m_values[half] = _mm_andnot_ps(signBitMask, a.m_values[half]);
+            return magnitude;
+        }
+
+        // Lane-wise (a - b)^2, keeping the low 16 bits of each product.
+        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
+        {
+            const __m128i delta = _mm_sub_epi16(a.m_value, b.m_value);
+
+            UInt16 squared;
+            squared.m_value = _mm_mullo_epi16(delta, delta);
+            return squared;
+        }
+
+        // Lane-wise (a - b)^2 of signed 16-bit lanes, returned as float.
+        // The difference is formed as max - min so it is non-negative, then squared as an
+        // unsigned 16x16 -> 32-bit product (high and low halves interleaved into 32-bit lanes).
+        // NOTE(review): the final conversion is a signed int32 -> float conversion, so a square
+        // of 2^31 or more would come out negative - confirm callers keep |a - b| small enough.
+        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
+        {
+            const __m128i larger = _mm_max_epi16(a.m_value, b.m_value);
+            const __m128i smaller = _mm_min_epi16(a.m_value, b.m_value);
+            const __m128i absDiff = _mm_sub_epi16(larger, smaller);
+
+            const __m128i productHi = _mm_mulhi_epu16(absDiff, absDiff);
+            const __m128i productLo = _mm_mullo_epi16(absDiff, absDiff);
+
+            Float squared;
+            squared.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(productLo, productHi));
+            squared.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(productLo, productHi));
+            return squared;
+        }
+
+        // Expands eight 16-bit half-float bit patterns into eight 32-bit floats by rearranging
+        // the sign, exponent and mantissa fields with integer operations.
+        // Each output float is assembled from two 16-bit pieces: `highBits` (sign, rebased
+        // exponent, top 7 mantissa bits) and `lowBits` (the remaining 3 mantissa bits).
+        static Float TwosCLHalfToFloat(const SInt16 &v)
+        {
+            // NOTE(review): absV is computed but never read below; the field extraction uses
+            // v.m_value directly. Confirm whether absV was meant to feed the masks.
+            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
+
+            // Split the half-float bit pattern into its sign (bit 15), mantissa (bits 0-9)
+            // and exponent (bits 10-14) fields.
+            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
+            __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
+            __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
+
+            // Lanes with a zero exponent field are treated as denormal (this includes zero).
+            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
+
+            // Convert exponent to high-bits: move the field to where the float exponent sits
+            // in the upper 16 bits and add 14336 == (127 - 15) << 7 to switch the exponent bias.
+            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
+
+            // Upper 16 bits of a float with the same sign, the smallest rebased exponent and a
+            // zero mantissa; only kept for denormal lanes (zero elsewhere).
+            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
+
+            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
+            __m128i lowBits = _mm_slli_epi16(mantissa, 13);
+
+            // Interleave the 16-bit halves into 32-bit float bit patterns.
+            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
+            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
+
+            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
+            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
+
+            // Subtracting the correction removes the implicit leading one that the assembled
+            // bit pattern carries for denormal lanes; for other lanes the correction is 0.0f.
+            Float result;
+            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
+            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
+
+            return result;
+        }
+
+        // Squared difference between `a` (expanded with TwosCLHalfToFloat) and the float vector `b`.
+        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
+        {
+            const Float expandedA = TwosCLHalfToFloat(a);
+            const Float delta = expandedA - b;
+
+            return delta * delta;
+        }
+
+        // Squared difference between `a` and `b` after both are expanded with TwosCLHalfToFloat.
+        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
+        {
+            const Float expandedA = TwosCLHalfToFloat(a);
+            const Float expandedB = TwosCLHalfToFloat(b);
+            const Float delta = expandedA - expandedB;
+
+            return delta * delta;
+        }
+
+        // Squared difference between `a` (expanded with TwosCLHalfToFloat and scaled by `aWeight`)
+        // and the float vector `b`.
+        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
+        {
+            const Float weightedA = TwosCLHalfToFloat(a) * aWeight;
+            const Float delta = weightedA - b;
+
+            return delta * delta;
+        }
+
+        // Logical (zero-filling) right shift of every 16-bit lane by `bits`.
+        static UInt16 RightShift(const UInt16 &v, int bits)
+        {
+            UInt16 shifted;
+            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        // Logical (zero-filling) right shift of every 32-bit lane by `bits`.
+        static UInt31 RightShift(const UInt31 &v, int bits)
+        {
+            UInt31 shifted;
+            for (int half = 0; half < 2; half++)
+                shifted.m_values[half] = _mm_srli_epi32(v.m_values[half], bits);
+            return shifted;
+        }
+
+        // Arithmetic (sign-preserving) right shift of every 16-bit lane by `bits`.
+        static SInt16 RightShift(const SInt16 &v, int bits)
+        {
+            SInt16 shifted;
+            shifted.m_value = _mm_srai_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        // Logical (zero-filling) right shift of every 16-bit lane by `bits`.
+        static UInt15 RightShift(const UInt15 &v, int bits)
+        {
+            UInt15 shifted;
+            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        // Arithmetic (sign-preserving) right shift of every 32-bit lane by `bits`.
+        static SInt32 RightShift(const SInt32 &v, int bits)
+        {
+            SInt32 shifted;
+            for (int half = 0; half < 2; half++)
+                shifted.m_values[half] = _mm_srai_epi32(v.m_values[half], bits);
+            return shifted;
+        }
+
+        // Narrows 32-bit lanes to 16-bit lanes with signed saturation.
+        static SInt16 ToSInt16(const SInt32 &v)
+        {
+            SInt16 narrowed;
+            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
+            return narrowed;
+        }
+
+        // Reinterprets the lanes as SInt16; the register bits are copied unchanged.
+        static SInt16 ToSInt16(const UInt16 &v)
+        {
+            SInt16 reinterpreted;
+            reinterpreted.m_value = v.m_value;
+            return reinterpreted;
+        }
+
+        // Reinterprets the lanes as SInt16; the register bits are copied unchanged.
+        static SInt16 ToSInt16(const UInt15 &v)
+        {
+            SInt16 reinterpreted;
+            reinterpreted.m_value = v.m_value;
+            return reinterpreted;
+        }
+
+        // Truncates 32-bit lanes to their low 16 bits.
+        // Each lane is first sign-extended from bit 15 (shift left, arithmetic shift right) so
+        // that the signed-saturating pack never saturates and the low 16 bits survive intact.
+        static UInt16 ToUInt16(const UInt32 &v)
+        {
+            __m128i signExtended[2];
+            for (int half = 0; half < 2; half++)
+                signExtended[half] = _mm_srai_epi32(_mm_slli_epi32(v.m_values[half], 16), 16);
+
+            UInt16 truncated;
+            truncated.m_value = _mm_packs_epi32(signExtended[0], signExtended[1]);
+            return truncated;
+        }
+
+        // Truncates 32-bit lanes to their low 16 bits.
+        // Each lane is first sign-extended from bit 15 (shift left, arithmetic shift right) so
+        // that the signed-saturating pack never saturates and the low 16 bits survive intact.
+        static UInt16 ToUInt16(const UInt31 &v)
+        {
+            __m128i signExtended[2];
+            for (int half = 0; half < 2; half++)
+                signExtended[half] = _mm_srai_epi32(_mm_slli_epi32(v.m_values[half], 16), 16);
+
+            UInt16 truncated;
+            truncated.m_value = _mm_packs_epi32(signExtended[0], signExtended[1]);
+            return truncated;
+        }
+
+        // Narrows 32-bit lanes to 16-bit lanes with signed saturation.
+        static UInt15 ToUInt15(const UInt31 &v)
+        {
+            UInt15 narrowed;
+            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
+            return narrowed;
+        }
+
+        // Reinterprets the lanes as UInt15; the register bits are copied unchanged.
+        static UInt15 ToUInt15(const SInt16 &v)
+        {
+            UInt15 reinterpreted;
+            reinterpreted.m_value = v.m_value;
+            return reinterpreted;
+        }
+
+        // Reinterprets the lanes as UInt15; the register bits are copied unchanged.
+        static UInt15 ToUInt15(const UInt16 &v)
+        {
+            UInt15 reinterpreted;
+            reinterpreted.m_value = v.m_value;
+            return reinterpreted;
+        }
+
+        // Widening signed multiply: 16x16 -> 32 bits per lane. The high and low product halves
+        // are computed separately and interleaved into 32-bit lanes.
+        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
+        {
+            const __m128i productHi = _mm_mulhi_epi16(a.m_value, b.m_value);
+            const __m128i productLo = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            SInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo, productHi);
+            product.m_values[1] = _mm_unpackhi_epi16(productLo, productHi);
+            return product;
+        }
+
+        // Widening signed multiply: 16x16 -> 32 bits per lane. The high and low product halves
+        // are computed separately and interleaved into 32-bit lanes.
+        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
+        {
+            const __m128i productHi = _mm_mulhi_epi16(a.m_value, b.m_value);
+            const __m128i productLo = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            SInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo, productHi);
+            product.m_values[1] = _mm_unpackhi_epi16(productLo, productHi);
+            return product;
+        }
+
+        // Argument-order convenience overload: forwards to XMultiply(SInt16, UInt15).
+        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
+        {
+            const SInt32 product = XMultiply(b, a);
+            return product;
+        }
+
+        // Widening unsigned multiply: 16x16 -> 32 bits per lane. The high and low product halves
+        // are computed separately and interleaved into 32-bit lanes.
+        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
+        {
+            const __m128i productHi = _mm_mulhi_epu16(a.m_value, b.m_value);
+            const __m128i productLo = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            UInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo, productHi);
+            product.m_values[1] = _mm_unpackhi_epi16(productLo, productHi);
+            return product;
+        }
+
+        // Lane-wise multiply that keeps only the low 16 bits of each product.
+        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
+        {
+            UInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        // Lane-wise multiply that keeps only the low 16 bits of each product.
+        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
+        {
+            UInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        // Lane-wise multiply that keeps only the low 16 bits of each product.
+        static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b)
+        {
+            SInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        // Lane-wise multiply that keeps only the low 16 bits of each product.
+        static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b)
+        {
+            SInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        // Widening unsigned multiply: 16x16 -> 32 bits per lane. The high and low product halves
+        // are computed separately and interleaved into 32-bit lanes.
+        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
+        {
+            const __m128i productHi = _mm_mulhi_epu16(a.m_value, b.m_value);
+            const __m128i productLo = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            UInt31 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo, productHi);
+            product.m_values[1] = _mm_unpackhi_epi16(productLo, productHi);
+            return product;
+        }
+
+        // Widening unsigned multiply: 16x16 -> 32 bits per lane. The high and low product halves
+        // are computed separately and interleaved into 32-bit lanes.
+        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
+        {
+            const __m128i productHi = _mm_mulhi_epu16(a.m_value, b.m_value);
+            const __m128i productLo = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            UInt31 product;
+            product.m_values[0] = _mm_unpacklo_epi16(productLo, productHi);
+            product.m_values[1] = _mm_unpackhi_epi16(productLo, productHi);
+            return product;
+        }
+
+        // Argument-order convenience overload: forwards to XMultiply(UInt16, UInt15).
+        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
+        {
+            const UInt31 product = XMultiply(b, a);
+            return product;
+        }
+
+        // True when at least one byte of the mask has its top bit set.
+        static bool AnySet(const Int16CompFlag &v)
+        {
+            const int byteSignBits = _mm_movemask_epi8(v.m_value);
+            return byteSignBits != 0;
+        }
+
+        // True when all 16 bytes of the mask have their top bit set.
+        static bool AllSet(const Int16CompFlag &v)
+        {
+            const int byteSignBits = _mm_movemask_epi8(v.m_value);
+            return byteSignBits == 0xffff;
+        }
+
+        // True when at least one float lane, in either register half, has its sign bit set.
+        static bool AnySet(const FloatCompFlag &v)
+        {
+            const int lowSignBits = _mm_movemask_ps(v.m_values[0]);
+            const int highSignBits = _mm_movemask_ps(v.m_values[1]);
+            return (lowSignBits | highSignBits) != 0;
+        }
+
+        // True when every float lane, in both register halves, has its sign bit set.
+        static bool AllSet(const FloatCompFlag &v)
+        {
+            const int lowSignBits = _mm_movemask_ps(v.m_values[0]);
+            const int highSignBits = _mm_movemask_ps(v.m_values[1]);
+            return (lowSignBits & highSignBits) == 0xf;
+        }
+    };
+
+#else
+    // Scalar version: every "parallel" value holds exactly one lane (ParallelSize == 1).
+    struct ParallelMath
+    {
+        struct RoundTowardZeroForScope // Empty tag type; here it only selects a RoundAndConvertToInt overload.
+        {
+        };
+
+        struct RoundTowardNearestForScope
+        {
+        };
+
+        struct RoundUpForScope
+        {
+        };
+
+        struct RoundDownForScope
+        {
+        };
+        // Lane count: one value per variable in the scalar build.
+        static const int ParallelSize = 1;
+
+        enum Int16Subtype
+        {
+            IntSubtype_Signed,
+            IntSubtype_UnsignedFull,
+            IntSubtype_UnsignedTruncated,
+            IntSubtype_Abstract,
+        };
+        // Every integer flavour is a plain int32_t here, so the distinct names only document intent.
+        typedef int32_t SInt16;
+        typedef int32_t UInt15;
+        typedef int32_t UInt16;
+        typedef int32_t AInt16;
+
+        typedef int32_t SInt32;
+        typedef int32_t UInt31;
+        typedef int32_t UInt32;
+        typedef int32_t AInt32;
+
+        typedef int32_t ScalarUInt16;
+        typedef int32_t ScalarSInt16;
+
+        typedef float Float;
+        // Casting between integer flavours is the identity because they all share one type.
+        template<class TTargetType>
+        struct LosslessCast
+        {
+            static const int32_t& Cast(const int32_t &src)
+            {
+                return src;
+            }
+        };
+        // Comparison results are plain bools.
+        typedef bool Int16CompFlag;
+        typedef bool FloatCompFlag;
+
+        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
+        {
+            return a + b;
+        }
+
+        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
+        {
+            return a - b;
+        }
+        // Select / ConditionalSet family: with a single lane these reduce to ordinary branches.
+        static float Select(bool flag, float a, float b)
+        {
+            return flag ? a : b;
+        }
+
+        static int32_t Select(bool flag, int32_t a, int32_t b)
+        {
+            return flag ? a : b;
+        }
+
+        static int32_t SelectOrZero(bool flag, int32_t a)
+        {
+            return flag ? a : 0;
+        }
+
+        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static void ConditionalSet(bool& dest, bool flag, bool src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static int32_t ConditionalNegate(bool flag, int32_t v)
+        {
+            return (flag) ? -v : v;
+        }
+
+        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
+        {
+            if (!flag)
+                dest = src;
+        }
+
+        static void ConditionalSet(float& dest, bool flag, float src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static void NotConditionalSet(float& dest, bool flag, float src)
+        {
+            if (!flag)
+                dest = src;
+        }
+        // Replaces an exact zero with 1 so the value can safely be divided by.
+        static void MakeSafeDenominator(float& v)
+        {
+            if (v == 0.0f)
+                v = 1.0f;
+        }
+
+        static int32_t SignedRightShift(int32_t v, int bits)
+        {
+            return v >> bits;
+        }
+        // Keeps the low `precision` bits of v and propagates the top kept bit via SignedRightShift.
+        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
+        {
+            v = static_cast<int32_t>(static_cast<uint32_t>(v) << (32 - precision)); // Shift as unsigned: overflowing a signed left shift is undefined.
+            return SignedRightShift(v, 32 - precision);
+        }
+
+        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
+        {
+            return v & ((1 << precision) - 1);
+        }
+
+        static int32_t Min(int32_t a, int32_t b)
+        {
+            if (a < b)
+                return a;
+            return b;
+        }
+
+        static float Min(float a, float b)
+        {
+            if (a < b)
+                return a;
+            return b;
+        }
+
+        static int32_t Max(int32_t a, int32_t b)
+        {
+            if (a > b)
+                return a;
+            return b;
+        }
+
+        static float Max(float a, float b)
+        {
+            if (a > b)
+                return a;
+            return b;
+        }
+
+        static float Abs(float a)
+        {
+            return fabsf(a);
+        }
+
+        static int32_t Abs(int32_t a)
+        {
+            if (a < 0)
+                return -a;
+            return a;
+        }
+
+        static float Clamp(float v, float min, float max)
+        {
+            if (v < min)
+                return min;
+            if (v > max)
+                return max;
+            return v;
+        }
+
+        static float Reciprocal(float v)
+        {
+            return 1.0f / v;
+        }
+        // Input loaders: read one channel of one pixel from the first block of the array.
+        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
+        {
+            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
+        }
+
+        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
+        {
+            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
+        }
+
+        static float MakeFloat(float v)
+        {
+            return v;
+        }
+
+        static float MakeFloatZero()
+        {
+            return 0.0f;
+        }
+
+        static int32_t MakeUInt16(uint16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeSInt16(int16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeAInt16(int16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeUInt15(uint16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeSInt32(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeUInt31(int32_t v)
+        {
+            return v;
+        }
+        // Lane accessors: the lane offset is ignored because there is only one lane.
+        static int32_t Extract(int32_t v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static bool Extract(bool v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static float Extract(float v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static float ExtractFloat(float v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static void PutFloat(float &dest, int offset, float v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutBoolInt16(bool &dest, int offset, bool v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static bool Less(int32_t a, int32_t b)
+        {
+            return a < b;
+        }
+
+        static bool Less(float a, float b)
+        {
+            return a < b;
+        }
+        // Inclusive comparison: equal operands must compare true, unlike Less.
+        static bool LessOrEqual(int32_t a, int32_t b)
+        {
+            return a <= b;
+        }
+
+        static bool LessOrEqual(float a, float b)
+        {
+            return a <= b;
+        }
+
+        static bool Equal(int32_t a, int32_t b)
+        {
+            return a == b;
+        }
+
+        static bool Equal(float a, float b)
+        {
+            return a == b;
+        }
+
+        static float ToFloat(int32_t v)
+        {
+            return static_cast<float>(v);
+        }
+
+        static int32_t ToUInt31(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToInt32(int32_t v)
+        {
+            return v;
+        }
+
+        static bool FloatFlagToInt16(bool v)
+        {
+            return v;
+        }
+
+        static bool Int32FlagToInt16(bool v)
+        {
+            return v;
+        }
+
+        static bool Int16FlagToFloat(bool v)
+        {
+            return v;
+        }
+
+        static bool MakeBoolInt16(bool b)
+        {
+            return b;
+        }
+
+        static bool MakeBoolFloat(bool b)
+        {
+            return b;
+        }
+
+        static bool AndNot(bool a, bool b)
+        {
+            return a && !b;
+        }
+
+        static bool Not(bool b)
+        {
+            return !b;
+        }
+        // Float-to-int conversion; the tag pointer only selects the rounding rule and is never dereferenced.
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
+        {
+            UNREFERENCED_PARAMETER(rtz);
+            return static_cast<int>(v);
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
+        {
+            UNREFERENCED_PARAMETER(ru);
+            return static_cast<int>(ceilf(v));
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
+        {
+            UNREFERENCED_PARAMETER(rd);
+            return static_cast<int>(floorf(v));
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
+        {
+            UNREFERENCED_PARAMETER(rtn);
+            return static_cast<int>(floorf(v + 0.5f)); // Nearest is implemented as floor(v + 0.5).
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        static float Sqrt(float f)
+        {
+            return sqrtf(f);
+        }
+
+        static int32_t SqDiffUInt8(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+
+        static int32_t SqDiffInt16(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+
+        static int32_t SqDiffSInt16(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+        // Builds a float by assembling single-precision bits from the half-float exponent/mantissa fields of |v|.
+        static float TwosCLHalfToFloat(int32_t v)
+        {
+            int32_t absV = (v < 0) ? -v : v;
+            // NOTE(review): signBits is taken from absV, so it is zero whenever |v| < 32768 and negative inputs lose their sign; confirm against the SIMD path.
+            int32_t signBits = (absV & -32768);
+            int32_t mantissa = (absV & 0x03ff);
+            int32_t exponent = (absV & 0x7c00);
+
+            bool isDenormal = (exponent == 0);
+
+            // Convert exponent to high-bits
+            exponent = (exponent >> 3) + 14336;
+
+            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
+
+            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
+
+            float f, correction;
+            memcpy(&f, &fBits, 4);
+            memcpy(&correction, &denormalCorrection, 4);
+
+            return f - correction;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
+        {
+            Float fa = TwosCLHalfToFloat(a);
+
+            Float diff = fa - b;
+            return diff * diff;
+        }
+
+        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
+        {
+            Float fa = TwosCLHalfToFloat(a);
+            Float fb = TwosCLHalfToFloat(b);
+
+            Float diff = fa - fb;
+            return diff * diff;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
+        {
+            Float fa = TwosCLHalfToFloat(a) * aWeight;
+
+            Float diff = fa - b;
+            return diff * diff;
+        }
+
+        static int32_t RightShift(int32_t v, int bits)
+        {
+            return SignedRightShift(v, bits);
+        }
+
+        static int32_t ToSInt16(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToUInt16(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToUInt15(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t XMultiply(int32_t a, int32_t b)
+        {
+            return a * b;
+        }
+
+        static int32_t CompactMultiply(int32_t a, int32_t b)
+        {
+            return a * b;
+        }
+
+        static bool AnySet(bool v)
+        {
+            return v;
+        }
+
+        static bool AllSet(bool v)
+        {
+            return v;
+        }
+    };
+
+#endif
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC.cpp b/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
new file mode 100644
index 0000000000..23f1bd3314
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
@@ -0,0 +1,1054 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_S3TC.h"
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_BCCommon.h"
+#include "ConvectionKernels_EndpointRefiner.h"
+#include "ConvectionKernels_EndpointSelector.h"
+#include "ConvectionKernels_IndexSelector.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+#include "ConvectionKernels_S3TC_SingleColor.h"
+
+void cvtt::Internal::S3TCComputer::Init(MFloat& error)
+{
+    error = ParallelMath::MakeFloat(FLT_MAX); // Start at the largest finite float so a later Less(error, bestError) test passes for any smaller error.
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v) // In place: v becomes the re-expanded form of its 6-bit code.
+{
+    MUInt15 sixBit = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10)); // (v * 253 + 512) >> 10
+    v = (sixBit << 2) | ParallelMath::RightShift(sixBit, 4); // Expand by appending the code's top two bits below it.
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v) // In place: v becomes the re-expanded form of its 5-bit code.
+{
+    MUInt15 fiveBit = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11)); // (v * 249 + 1024) >> 11
+    v = (fiveBit << 3) | ParallelMath::RightShift(fiveBit, 2); // Expand by appending the code's top three bits below it.
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3]) // Quantizes a 3-channel endpoint in place.
+{
+    QuantizeTo5Bits(endPoint[0]); // Channels 0 and 2 get 5 bits,
+    QuantizeTo6Bits(endPoint[1]); // channel 1 gets 6 bits.
+    QuantizeTo5Bits(endPoint[2]);
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span)
+{
+    return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f; // 3% of |span|; callers pass this as the `d` term of ParanoidDiff.
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d) // Returns (|a - b| + d)^2.
+{
+    MFloat magnitude = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
+    MFloat padded = magnitude + d;
+    return padded * padded;
+}
+
+void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
+    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    // Tries encoding the block as its average color via the single-color tables; the best* outputs are updated per lane where the error beats bestError.
+    UNREFERENCED_PARAMETER(floatPixels); // Not read by this function.
+    UNREFERENCED_PARAMETER(rtn); // Not read by this function.
+    float channelWeightsSq[3] = { channelWeights[0] * channelWeights[0], channelWeights[1] * channelWeights[1], channelWeights[2] * channelWeights[2] };
+
+    MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+            totals[ch] = totals[ch] + pixels[px][ch];
+    }
+    // Rounded mean of the 16 pixels per channel: (sum + 8) >> 4.
+    MUInt15 average[3];
+    for (int ch = 0; ch < 3; ch++)
+        average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
+    // Table choice: channel 1 uses gTable, channels 0 and 2 use rbTable; the _p variants are used when S3TC_Paranoid is set.
+    const Tables::S3TCSC::TableEntry* rbTable = NULL;
+    const Tables::S3TCSC::TableEntry* gTable = NULL;
+    if (flags & cvtt::Flags::S3TC_Paranoid)
+    {
+        if (range == 4)
+        {
+            rbTable = Tables::S3TCSC::g_singleColor5_3_p;
+            gTable = Tables::S3TCSC::g_singleColor6_3_p;
+        }
+        else
+        {
+            assert(range == 3);
+            rbTable = Tables::S3TCSC::g_singleColor5_2_p;
+            gTable = Tables::S3TCSC::g_singleColor6_2_p;
+        }
+    }
+    else
+    {
+        if (range == 4)
+        {
+            rbTable = Tables::S3TCSC::g_singleColor5_3;
+            gTable = Tables::S3TCSC::g_singleColor6_3;
+        }
+        else
+        {
+            assert(range == 3);
+            rbTable = Tables::S3TCSC::g_singleColor5_2;
+            gTable = Tables::S3TCSC::g_singleColor6_2;
+        }
+    }
+    // Table lookups are scalar, so each lane is extracted, looked up and written back individually.
+    MUInt15 interpolated[3];
+    MUInt15 eps[2][3];
+    MSInt16 spans[3];
+    for (int i = 0; i < ParallelMath::ParallelSize; i++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            uint16_t avg = ParallelMath::Extract(average[ch], i);
+            const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
+            ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
+            ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
+            ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
+            ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
+        }
+    }
+    // Weighted error of representing every pixel by the table's actual color.
+    MFloat error = ParallelMath::MakeFloatZero();
+    if (flags & cvtt::Flags::S3TC_Paranoid)
+    {
+        MFloat spanParanoidFactors[3];
+        for (int ch = 0; ch < 3; ch++)
+            spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
+
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
+        }
+    }
+    else
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
+        }
+    }
+
+    ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
+    ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
+
+    if (ParallelMath::AnySet(better16))
+    {
+        bestError = ParallelMath::Min(bestError, error);
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
+        // Every pixel is assigned index 1 for this candidate.
+        MUInt15 vindexes = ParallelMath::MakeUInt15(1);
+        for (int px = 0; px < 16; px++)
+            ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
+        // Explicit narrowing, matching the cast used in TestEndpoints.
+        ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
+    }
+}
+
+void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
+    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    float channelWeightsSq[3];
+    // Quantizes the candidate endpoints, picks an index per pixel, measures the error and updates the best* outputs per lane where it improves.
+    for (int ch = 0; ch < 3; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    MUInt15 endPoints[2][3];
+    // Work on a copy so the caller's unquantized endpoints stay untouched.
+    for (int ep = 0; ep < 2; ep++)
+        for (int ch = 0; ch < 3; ch++)
+            endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
+
+    QuantizeTo565(endPoints[0]);
+    QuantizeTo565(endPoints[1]);
+
+    IndexSelector<3> selector;
+    selector.Init<false>(channelWeights, endPoints, range);
+
+    MUInt15 indexes[16];
+    // Per-channel paranoid term derived from the span between the two quantized endpoints.
+    MFloat paranoidFactors[3];
+    for (int ch = 0; ch < 3; ch++)
+        paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
+
+    MFloat error = ParallelMath::MakeFloatZero();
+    AggregatedError<3> aggError;
+    for (int px = 0; px < 16; px++)
+    {
+        MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
+        indexes[px] = index;
+        // The refiner is optional; callers may pass NULL to skip accumulation.
+        if (refiner)
+            refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
+
+        MUInt15 reconstructed[3];
+        selector.ReconstructLDRPrecise(index, reconstructed);
+        // Paranoid mode sums its own metric into `error`; otherwise the error goes through aggError and is finalized after the loop.
+        if (flags & Flags::S3TC_Paranoid)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
+        }
+        else
+            BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
+    }
+
+    if (!(flags & Flags::S3TC_Paranoid))
+        error = aggError.Finalize(flags, channelWeightsSq);
+
+    ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
+    // Only lanes flagged in `better` are overwritten below.
+    if (ParallelMath::AnySet(better))
+    {
+        ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
+
+        ParallelMath::ConditionalSet(bestError, better, error);
+
+        for (int ep = 0; ep < 2; ep++)
+            for (int ch = 0; ch < 3; ch++)
+                ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
+
+        for (int px = 0; px < 16; px++)
+            ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
+
+        ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
+    }
+}
+
+void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
+    const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
+    const ParallelMath::RoundTowardNearestForScope* rtn)
+{
+    UNREFERENCED_PARAMETER(alphaTest);
+    // flags is forwarded to TestEndpoints at the end, so it is not marked unreferenced.
+    // Feeds counts[i] consecutive sorted inputs to the refiner under index i, then tests the refined endpoints.
+    EndpointRefiner<3> refiner;
+
+    refiner.Init(nCounts, channelWeights);
+    // NOTE(review): validity is tested with the per-index counter n rather than the running input index e; confirm against callers.
+    bool escape = false;
+    int e = 0;
+    for (int i = 0; i < nCounts; i++)
+    {
+        for (int n = 0; n < counts[i]; n++)
+        {
+            ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
+            if (!ParallelMath::AnySet(valid))
+            {
+                escape = true;
+                break;
+            }
+            // Fully valid lanes take the unweighted path; mixed lanes contribute with a 1/0 weight per lane.
+            if (ParallelMath::AllSet(valid))
+                refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
+            else
+            {
+                MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
+                refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
+            }
+        }
+
+        if (escape)
+            break;
+    }
+
+    MUInt15 endPoints[2][3];
+    refiner.GetRefinedEndpointsLDR(endPoints, rtn);
+    // No refiner is passed on: the candidate is only scored, not refined further.
+    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
+}
+
+void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
+{
+    UNREFERENCED_PARAMETER(flags);
+    ParallelMath::RoundTowardNearestForScope roundNearest;
+    // Single channel with unit weight.
+    float unitWeight[1] = { 1.0f };
+
+    MUInt15 channelValues[16];
+    MFloat channelValuesF[16];
+
+    for (int px = 0; px < 16; px++)
+    {
+        ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, channelValues[px]);
+        channelValuesF[px] = ParallelMath::ToFloat(channelValues[px]);
+    }
+    // Fixed endpoints 0 and 255 with a 16-step range.
+    MUInt15 fixedEndpoints[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
+
+    IndexSelector<1> quantizer;
+    quantizer.Init<false>(unitWeight, fixedEndpoints, 16);
+
+    MUInt15 codes[16];
+
+    for (int px = 0; px < 16; px++)
+        codes[px] = quantizer.SelectIndexLDR(&channelValuesF[px], &roundNearest);
+    // Emit 8 bytes per lane: the even pixel goes in the low nibble, the odd pixel in the high nibble.
+    for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+    {
+        for (int byteIdx = 0; byteIdx < 8; byteIdx++)
+        {
+            int lowNibble = ParallelMath::Extract(codes[byteIdx * 2], lane);
+            int highNibble = ParallelMath::Extract(codes[byteIdx * 2 + 1], lane);
+
+            packedBlocks[byteIdx] = static_cast<uint8_t>(lowNibble | (highNibble << 4));
+        }
+
+        packedBlocks += packedBlockStride;
+    }
+}
+
+void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
+{
+    if (maxTweakRounds < 1)
+        maxTweakRounds = 1;
+
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    float oneWeight[1] = { 1.0f };
+
+    MUInt15 pixels[16];
+    MFloat floatPixels[16];
+
+    MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
+    MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
+
+    for (int px = 0; px < 16; px++)
+    {
+        ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
+
+        if (isSigned)
+            pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
+
+        floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
+    }
+
+    MUInt15 sortedPixels[16];
+    for (int px = 0; px < 16; px++)
+        sortedPixels[px] = pixels[px];
+
+    for (int sortEnd = 15; sortEnd > 0; sortEnd--)
+    {
+        for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
+        {
+            MUInt15 a = sortedPixels[sortOffset];
+            MUInt15 b = sortedPixels[sortOffset + 1];
+
+            sortedPixels[sortOffset] = ParallelMath::Min(a, b);
+            sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
+        }
+    }
+
+    MUInt15 zero = ParallelMath::MakeUInt15(0);
+    MUInt15 one = ParallelMath::MakeUInt15(1);
+
+    MUInt15 bestIsFullRange = zero;
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestEP[2] = { zero, zero };
+    MUInt15 bestIndexes[16] = {
+        zero, zero, zero, zero,
+        zero, zero, zero, zero,
+        zero, zero, zero, zero,
+        zero, zero, zero, zero
+    };
+
+    // Full-precision
+    {
+        MUInt15 minEP = sortedPixels[0];
+        MUInt15 maxEP = sortedPixels[15];
+
+        MFloat base[1] = { ParallelMath::ToFloat(minEP) };
+        MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
+
+        UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
+
+        int numTweakRounds = BCCommon::TweakRoundsForRange(8);
+        if (numTweakRounds > maxTweakRounds)
+            numTweakRounds = maxTweakRounds;
+
+        for (int tweak = 0; tweak < numTweakRounds; tweak++)
+        {
+            MUInt15 ep[2][1];
+
+            ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
+
+            for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
+            {
+                EndpointRefiner<1> refiner;
+                refiner.Init(8, oneWeight);
+
+                if (isSigned)
+                    for (int epi = 0; epi < 2; epi++)
+                        ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
+
+                IndexSelector<1> indexSelector;
+                indexSelector.Init<false>(oneWeight, ep, 8);
+
+                MUInt15 indexes[16];
+
+                AggregatedError<1> aggError;
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
+
+                    MUInt15 reconstructedPixel;
+
+                    indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
+                    BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
+
+                    if (refinePass != numRefineRounds - 1)
+                        refiner.ContributeUnweightedPW(&floatPixels[px], index);
+
+                    indexes[px] = index;
+                }
+                MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
+
+                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                if (ParallelMath::AnySet(errorBetter16))
+                {
+                    bestError = ParallelMath::Min(error, bestError);
+                    ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
+                    for (int px = 0; px < 16; px++)
+                        ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
+
+                    for (int epi = 0; epi < 2; epi++)
+                        ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
+                }
+
+                if (refinePass != numRefineRounds - 1)
+                    refiner.GetRefinedEndpointsLDR(ep, &rtn);
+            }
+        }
+    }
+
+    // Reduced precision with special endpoints
+    {
+        MUInt15 bestHeuristicMin = sortedPixels[0];
+        MUInt15 bestHeuristicMax = sortedPixels[15];
+
+        ParallelMath::Int16CompFlag canTryClipping;
+
+        // In reduced precision, we want to try putting endpoints at the reserved indexes at the ends.
+        // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
+        // This will usually not find anything, but it's cheap to check.
+
+        {
+            MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
+            MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
+
+            MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
+            canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
+        }
+
+        if (ParallelMath::AnySet(canTryClipping))
+        {
+            MUInt15 lowClearances[16];
+            MUInt15 highClearances[16];
+            MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
+
+            lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
+
+            for (int px = 1; px < 16; px++)
+            {
+                lowClearances[px] = sortedPixels[px - 1];
+                highClearances[px] = highTerminal - sortedPixels[16 - px];
+            }
+
+            for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
+            {
+                uint16_t numSkippedLow = firstIndex;
+
+                MUInt15 lowClearance = lowClearances[firstIndex];
+
+                for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
+                {
+                    uint16_t numSkippedHigh = 15 - lastIndex;
+                    uint16_t numSkipped = numSkippedLow + numSkippedHigh;
+
+                    MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
+
+                    ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
+
+                    if (!ParallelMath::AnySet(areMoreSkipped))
+                        continue;
+
+                    MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
+                    MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
+
+                    MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
+
+                    ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
+                    ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
+                    ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
+                }
+            }
+        }
+
+        MUInt15 bestSimpleMin = one;
+        MUInt15 bestSimpleMax = highTerminalMinusOne;
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
+            ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
+        }
+
+        MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
+        MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
+
+        int minEPRange = 2;
+        if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
+            minEPRange = 1;
+
+        int maxEPRange = 2;
+        if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
+            maxEPRange = 1;
+
+        for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
+        {
+            for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
+            {
+                MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
+                MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
+
+                UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
+
+                int numTweakRounds = BCCommon::TweakRoundsForRange(6);
+                if (numTweakRounds > maxTweakRounds)
+                    numTweakRounds = maxTweakRounds;
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    MUInt15 ep[2][1];
+
+                    ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
+
+                    for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
+                    {
+                        EndpointRefiner<1> refiner;
+                        refiner.Init(6, oneWeight);
+
+                        if (isSigned)
+                            for (int epi = 0; epi < 2; epi++)
+                                ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
+
+                        IndexSelector<1> indexSelector;
+                        indexSelector.Init<false>(oneWeight, ep, 6);
+
+                        MUInt15 indexes[16];
+                        MFloat error = ParallelMath::MakeFloatZero();
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
+
+                            MUInt15 reconstructedPixel;
+
+                            indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
+
+                            MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
+                            MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
+                            MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
+
+                            MFloat bestPixelError = zeroError;
+                            MUInt15 index = ParallelMath::MakeUInt15(6);
+
+                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
+                            bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
+
+                            ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
+
+                            if (ParallelMath::AllSet(selectedIndexBetter))
+                            {
+                                if (refinePass != numRefineRounds - 1)
+                                    refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
+                            }
+                            else
+                            {
+                                MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
+
+                                if (refinePass != numRefineRounds - 1)
+                                    refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
+                            }
+
+                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
+                            bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
+
+                            error = error + bestPixelError;
+
+                            indexes[px] = index;
+                        }
+
+                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                        if (ParallelMath::AnySet(errorBetter16))
+                        {
+                            bestError = ParallelMath::Min(error, bestError);
+                            ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
+
+                            for (int epi = 0; epi < 2; epi++)
+                                ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
+                        }
+
+                        if (refinePass != numRefineRounds - 1)
+                            refiner.GetRefinedEndpointsLDR(ep, &rtn);
+                    }
+                }
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int ep0 = ParallelMath::Extract(bestEP[0], block);
+        int ep1 = ParallelMath::Extract(bestEP[1], block);
+        int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
+
+        if (isSigned)
+        {
+            ep0 -= 127;
+            ep1 -= 127;
+
+            assert(ep0 >= -127 && ep0 <= 127);
+            assert(ep1 >= -127 && ep1 <= 127);
+        }
+
+
+        bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
+
+        if (swapEndpoints)
+            std::swap(ep0, ep1);
+
+        uint16_t dumpBits = 0;
+        int dumpBitsOffset = 0;
+        int dumpByteOffset = 2;
+        packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
+        packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
+
+        int maxValue = (isFullRange != 0) ? 7 : 5;
+
+        for (int px = 0; px < 16; px++)
+        {
+            int index = ParallelMath::Extract(bestIndexes[px], block);
+
+            if (swapEndpoints && index <= maxValue)
+                index = maxValue - index;
+
+            if (index != 0)
+            {
+                if (index == maxValue)
+                    index = 1;
+                else if (index < maxValue)
+                    index++;
+            }
+
+            assert(index >= 0 && index < 8);
+
+            dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
+            dumpBitsOffset += 3;
+
+            if (dumpBitsOffset >= 8)
+            {
+                assert(dumpByteOffset < 8);
+                packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
+                dumpBits >>= 8;
+                dumpBitsOffset -= 8;
+                dumpByteOffset++;
+            }
+        }
+
+        assert(dumpBitsOffset == 0);
+        assert(dumpByteOffset == 8);
+
+        packedBlocks += packedBlockStride;
+    }
+}
+// Encodes ParallelMath::ParallelSize pixel blocks at once; each block is written as 8 bytes (two 16-bit 5:6:5 endpoints, then sixteen 2-bit indexes) and packedBlocks advances by packedBlockStride per block.
+void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
+{
+    ParallelMath::RoundTowardNearestForScope rtn; // Its address is passed to the helpers below that take an rtn argument.
+    // Both round counts are clamped to a minimum of 1.
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    if (maxTweakRounds < 1)
+        maxTweakRounds = 1;
+
+    EndpointSelector<3, 8> endpointSelector;
+
+    MUInt15 pixels[16][4];
+    MFloat floatPixels[16][4];
+
+    MFloat preWeightedPixels[16][4];
+    // Convert the inputs into pixels[px][ch] for all 16 pixels and 4 channels.
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
+    }
+    // Float copies are taken here, before the optional alpha thresholding below rewrites pixels[px][3].
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
+    }
+    // With alphaTest, alpha is binarized: values below the rounded threshold become 0, everything else 255.
+    if (alphaTest)
+    {
+        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
+            pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
+        }
+    }
+
+    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
+
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(255); // NOTE(review): minAlpha is updated in the loop below but never read again in this function.
+
+    for (int px = 0; px < 16; px++)
+        minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
+    // Per-pixel weights: 1.0 by default; with alphaTest, pixels whose alpha is below 255 get weight 0.
+    MFloat pixelWeights[16];
+    for (int px = 0; px < 16; px++)
+    {
+        pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
+        if (alphaTest)
+        {
+            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
+
+            ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
+        }
+    }
+    // Feed every pre-weighted pixel, with its weight, into the endpoint selector once per pass.
+    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+    {
+        for (int px = 0; px < 16; px++)
+            endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
+
+        endpointSelector.FinishPass(pass);
+    }
+
+    UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
+    // Best-so-far state; handed to the Test* helpers below as non-const references/arrays.
+    MUInt15 bestEndpoints[2][3];
+    MUInt15 bestIndexes[16];
+    MUInt15 bestRange = ParallelMath::MakeUInt15(0);
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    for (int ep = 0; ep < 2; ep++)
+        for (int ch = 0; ch < 3; ch++)
+            bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
+    // Exhaustive mode: sort the pixels, then call TestCounts for every count tuple summing to 16 (a single count of 16 is never tried).
+    if (exhaustive)
+    {
+        MSInt16 sortBins[16];
+
+        {
+            // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
+            // and pack the original indexes into the low bits.
+
+            MUInt15 sortEP[2][3];
+            ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
+
+            IndexSelector<3> sortSelector;
+            sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
+
+            for (int16_t px = 0; px < 16; px++)
+            {
+                MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
+
+                if (alphaTest)
+                {
+                    ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
+
+                    ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
+                }
+                // The low 4 bits carry the original pixel position (read back later with "& 15").
+                sortBin = sortBin + ParallelMath::MakeSInt16(px);
+
+                sortBins[px] = sortBin;
+            }
+        }
+
+        // Sort bins into ascending order using adjacent Min/Max exchanges
+        for (int sortEnd = 1; sortEnd < 16; sortEnd++)
+        {
+            for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
+            {
+                MSInt16 a = sortBins[sortLoc];
+                MSInt16 b = sortBins[sortLoc - 1];
+
+                sortBins[sortLoc] = ParallelMath::Max(a, b);
+                sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
+            }
+        }
+        // firstElement = number of leading negative bins (the bins forced to -16 + px above).
+        MUInt15 firstElement = ParallelMath::MakeUInt15(0);
+        for (uint16_t e = 0; e < 16; e++)
+        {
+            ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
+            ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
+            if (!ParallelMath::AnySet(isInvalid))
+                break;
+        }
+
+        MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
+
+        MUInt15 sortedInputs[16][4];
+        MFloat floatSortedInputs[16][4];
+        MFloat pwFloatSortedInputs[16][4];
+        // Start from all-zero sorted inputs; only the slots for non-negative bins are filled in below.
+        for (int e = 0; e < 16; e++)
+        {
+            for (int ch = 0; ch < 4; ch++)
+                sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
+        }
+        // Copy each remaining pixel, looked up by its original position, into slot 15 - e (reverse bin order).
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
+            {
+                ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
+                int originalIndex = (sortBin & 15);
+
+                for (int ch = 0; ch < 4; ch++)
+                    ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
+            }
+        }
+        // Float and channel-weighted float versions of the sorted inputs.
+        for (int e = 0; e < 16; e++)
+        {
+            for (int ch = 0; ch < 4; ch++)
+            {
+                MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
+                floatSortedInputs[e][ch] = f;
+                pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
+            }
+        }
+        // Enumerate 4-entry count tuples with n0 + n1 + n2 + n3 == 16; no single entry is allowed to reach 16.
+        for (int n0 = 0; n0 <= 15; n0++)
+        {
+            int remainingFor1 = 16 - n0;
+            if (remainingFor1 == 16)
+                remainingFor1 = 15;
+
+            for (int n1 = 0; n1 <= remainingFor1; n1++)
+            {
+                int remainingFor2 = 16 - n1 - n0;
+                if (remainingFor2 == 16)
+                    remainingFor2 = 15;
+
+                for (int n2 = 0; n2 <= remainingFor2; n2++)
+                {
+                    int n3 = 16 - n2 - n1 - n0;
+
+                    if (n3 == 16)
+                        continue;
+
+                    int counts[4] = { n0, n1, n2, n3 };
+
+                    TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+                }
+            }
+        }
+
+        TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+        // With alphaTest, repeat the enumeration and the single-color test using 3 counts / range 3.
+        if (alphaTest)
+        {
+            for (int n0 = 0; n0 <= 15; n0++)
+            {
+                int remainingFor1 = 16 - n0;
+                if (remainingFor1 == 16)
+                    remainingFor1 = 15;
+
+                for (int n1 = 0; n1 <= remainingFor1; n1++)
+                {
+                    int n2 = 16 - n1 - n0;
+
+                    if (n2 == 16)
+                        continue;
+
+                    int counts[3] = { n0, n1, n2 };
+
+                    TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+                }
+            }
+
+            TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+        }
+    }
+    else
+    {
+        int minRange = alphaTest ? 3 : 4; // Range 3 is only tried when alphaTest is set; range 4 is always tried.
+
+        for (int range = minRange; range <= 4; range++)
+        {
+            int tweakRounds = BCCommon::TweakRoundsForRange(range);
+            if (tweakRounds > maxTweakRounds)
+                tweakRounds = maxTweakRounds;
+
+            for (int tweak = 0; tweak < tweakRounds; tweak++)
+            {
+                MUInt15 endPoints[2][3];
+
+                ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
+                // Each refine round tests the current endpoints; every round but the last then asks the refiner for new ones.
+                for (int refine = 0; refine < numRefineRounds; refine++)
+                {
+                    EndpointRefiner<3> refiner;
+                    refiner.Init(range, channelWeights);
+
+                    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
+
+                    if (refine != numRefineRounds - 1)
+                        refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
+                }
+            }
+        }
+    }
+    // Write out the best result found for each of the ParallelSize blocks.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
+        assert(range == 3 || range == 4);
+
+        ParallelMath::ScalarUInt16 compressedEP[2];
+        for (int ep = 0; ep < 2; ep++)
+        {
+            ParallelMath::ScalarUInt16 endPoint[3];
+            for (int ch = 0; ch < 3; ch++)
+                endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
+            // Pack to 16 bits: top 5 bits of channel 0, top 6 bits of channel 1, top 5 bits of channel 2.
+            int compressed = (endPoint[0] & 0xf8) << 8;
+            compressed |= (endPoint[1] & 0xfc) << 3;
+            compressed |= (endPoint[2] & 0xf8) >> 3;
+
+            compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
+        }
+        // indexOrder maps each value taken from bestIndexes to the 2-bit code that is written out.
+        int indexOrder[4];
+
+        if (range == 4) // Range 4 is stored with compressedEP[0] > compressedEP[1], or with equal endpoints and every code 0.
+        {
+            if (compressedEP[0] == compressedEP[1])
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 0;
+                indexOrder[2] = 0;
+                indexOrder[3] = 0;
+            }
+            else if (compressedEP[0] < compressedEP[1])
+            {
+                std::swap(compressedEP[0], compressedEP[1]);
+                indexOrder[0] = 1;
+                indexOrder[1] = 3;
+                indexOrder[2] = 2;
+                indexOrder[3] = 0;
+            }
+            else
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 2;
+                indexOrder[2] = 3;
+                indexOrder[3] = 1;
+            }
+        }
+        else
+        {
+            assert(range == 3);
+            // Range 3 is stored with compressedEP[0] <= compressedEP[1]; index 3 always maps to code 3.
+            if (compressedEP[0] > compressedEP[1])
+            {
+                std::swap(compressedEP[0], compressedEP[1]);
+                indexOrder[0] = 1;
+                indexOrder[1] = 2;
+                indexOrder[2] = 0;
+            }
+            else
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 2;
+                indexOrder[2] = 1;
+            }
+            indexOrder[3] = 3;
+        }
+        // Bytes 0-3: both 16-bit endpoints, low byte first.
+        packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
+        packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
+        packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
+        packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
+        // Bytes 4-7: four 2-bit codes per byte, with the first pixel of each group in the lowest bits.
+        for (int i = 0; i < 16; i += 4)
+        {
+            int packedIndexes = 0;
+            for (int subi = 0; subi < 4; subi++)
+            {
+                ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
+                packedIndexes |= (indexOrder[index] << (subi * 2));
+            }
+
+            packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
+        }
+
+        packedBlocks += packedBlockStride;
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC.h b/thirdparty/cvtt/ConvectionKernels_S3TC.h
new file mode 100644
index 0000000000..aa197229c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC.h
@@ -0,0 +1,51 @@
+#pragma once
+#ifndef __CVTT_S3TC_H__
+#define __CVTT_S3TC_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt // Forward declarations only; nothing is defined in this namespace block.
+{
+    namespace Internal
+    {
+        template<int TVectorSize>
+        class EndpointRefiner; // Referenced further down as EndpointRefiner<3>* in TestEndpoints.
+    }
+
+    struct PixelBlockU8; // Referenced further down only through const PixelBlockU8* parameters.
+}
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        class S3TCComputer // Every member declared here is either a typedef or a static function.
+        {
+        public:
+            typedef ParallelMath::Float MFloat; // Short aliases for the ParallelMath types used in the signatures below.
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            // Helpers. bestError, bestEndpoints, bestIndexes and bestRange are taken as non-const (writable) arguments by each Test* function.
+            static void Init(MFloat& error);
+            static void QuantizeTo6Bits(MUInt15& v);
+            static void QuantizeTo5Bits(MUInt15& v);
+            static void QuantizeTo565(MUInt15 endPoint[3]);
+            static MFloat ParanoidFactorForSpan(const MSInt16& span);
+            static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d);
+            static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
+                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
+                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
+                const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
+                const ParallelMath::RoundTowardNearestForScope* rtn);
+            static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride);
+            static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds);
+            static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds); // Writes 8 bytes per block: two 16-bit endpoints, then 2-bit indexes.
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h b/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h
new file mode 100644
index 0000000000..c772b163c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h
@@ -0,0 +1,304 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace S3TCSC {
+
+struct TableEntry
+{
+    uint8_t m_min;
+    uint8_t m_max;
+    uint8_t m_actualColor;
+    uint8_t m_span;
+};
+
+TableEntry g_singleColor5_3[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 2, 8 }, { 0, 8, 2, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 10, 8 }, { 0, 33, 11, 33 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 18, 8 }, { 8, 41, 19, 33 }, { 24, 16, 21, 8 }, { 24, 16, 21, 8 }, { 33, 0, 22, 33 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 41, 29, 17 }, { 33, 24, 30, 9 }, { 33, 24, 30, 9 },
+    { 24, 49, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 33, 41, 35, 8 }, { 33, 41, 35, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 },
+    { 49, 24, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 43, 8 }, { 33, 66, 44, 33 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 51, 8 }, { 41, 74, 52, 33 }, { 57, 49, 54, 8 }, { 57, 49, 54, 8 }, { 66, 33, 55, 33 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 74, 62, 17 }, { 66, 57, 63, 9 },
+    { 66, 57, 63, 9 }, { 57, 82, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 66, 74, 68, 8 }, { 66, 74, 68, 8 }, { 74, 66, 71, 8 }, { 74, 66, 71, 8 },
+    { 74, 66, 71, 8 }, { 82, 57, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 76, 8 }, { 66, 99, 77, 33 }, { 82, 74, 79, 8 }, { 82, 74, 79, 8 },
+    { 82, 74, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 84, 8 }, { 74, 107, 85, 33 }, { 90, 82, 87, 8 }, { 90, 82, 87, 8 },
+    { 99, 66, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 107, 95, 17 },
+    { 99, 90, 96, 9 }, { 99, 90, 96, 9 }, { 90, 115, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 99, 107, 101, 8 }, { 99, 107, 101, 8 }, { 107, 99, 104, 8 },
+    { 107, 99, 104, 8 }, { 107, 99, 104, 8 }, { 115, 90, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 109, 8 }, { 99, 132, 110, 33 }, { 115, 107, 112, 8 },
+    { 115, 107, 112, 8 }, { 115, 107, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 117, 8 }, { 107, 140, 118, 33 }, { 123, 115, 120, 8 },
+    { 123, 115, 120, 8 }, { 132, 99, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 },
+    { 123, 140, 128, 17 }, { 132, 123, 129, 9 }, { 132, 123, 129, 9 }, { 123, 148, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 132, 140, 134, 8 }, { 132, 140, 134, 8 },
+    { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 148, 123, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 142, 8 }, { 132, 165, 143, 33 },
+    { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 150, 8 }, { 140, 173, 151, 33 },
+    { 156, 148, 153, 8 }, { 156, 148, 153, 8 }, { 165, 132, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 159, 9 }, { 156, 165, 159, 9 },
+    { 156, 165, 159, 9 }, { 156, 173, 161, 17 }, { 165, 156, 162, 9 }, { 165, 156, 162, 9 }, { 156, 181, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 165, 173, 167, 8 },
+    { 165, 173, 167, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 181, 156, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 181, 175, 8 },
+    { 165, 198, 176, 33 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 189, 183, 8 },
+    { 173, 206, 184, 33 }, { 189, 181, 186, 8 }, { 189, 181, 186, 8 }, { 198, 165, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 198, 192, 9 },
+    { 189, 198, 192, 9 }, { 189, 198, 192, 9 }, { 189, 206, 194, 17 }, { 198, 189, 195, 9 }, { 198, 189, 195, 9 }, { 189, 214, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 198, 206, 200, 8 }, { 198, 206, 200, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 214, 189, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 214, 208, 8 }, { 198, 231, 209, 33 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 222, 216, 8 }, { 206, 239, 217, 33 }, { 222, 214, 219, 8 }, { 222, 214, 219, 8 }, { 231, 198, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 239, 227, 17 }, { 231, 222, 228, 9 }, { 231, 222, 228, 9 }, { 222, 247, 230, 25 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 239, 233, 8 }, { 231, 239, 233, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 247, 222, 238, 25 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 247, 241, 8 }, { 239, 247, 241, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 255, 249, 8 }, { 247, 255, 249, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_3[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 4, 1, 4 }, { 4, 0, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 5, 4 }, { 8, 4, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 12, 9, 4 }, { 12, 8, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 13, 4 }, { 16, 12, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 20, 17, 4 }, { 20, 16, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 21, 4 }, { 24, 20, 22, 4 }, { 0, 69, 23, 69 },
+    { 24, 24, 24, 0 }, { 24, 28, 25, 4 }, { 28, 24, 26, 4 }, { 8, 65, 27, 57 }, { 28, 28, 28, 0 }, { 28, 32, 29, 4 }, { 32, 28, 30, 4 }, { 12, 69, 31, 57 },
+    { 32, 32, 32, 0 }, { 32, 36, 33, 4 }, { 36, 32, 34, 4 }, { 20, 65, 35, 45 }, { 36, 36, 36, 0 }, { 36, 40, 37, 4 }, { 40, 36, 38, 4 }, { 24, 69, 39, 45 },
+    { 40, 40, 40, 0 }, { 40, 44, 41, 4 }, { 44, 40, 42, 4 }, { 32, 65, 43, 33 }, { 44, 44, 44, 0 }, { 44, 48, 45, 4 }, { 48, 44, 46, 4 }, { 36, 69, 47, 33 },
+    { 48, 48, 48, 0 }, { 48, 52, 49, 4 }, { 52, 48, 50, 4 }, { 44, 65, 51, 21 }, { 52, 52, 52, 0 }, { 52, 56, 53, 4 }, { 56, 52, 54, 4 }, { 48, 69, 55, 21 },
+    { 56, 56, 56, 0 }, { 56, 60, 57, 4 }, { 60, 56, 58, 4 }, { 56, 65, 59, 9 }, { 60, 60, 60, 0 }, { 60, 65, 61, 5 }, { 65, 56, 62, 9 }, { 65, 60, 63, 5 },
+    { 60, 73, 64, 13 }, { 65, 65, 65, 0 }, { 65, 69, 66, 4 }, { 69, 65, 67, 4 }, { 73, 60, 68, 13 }, { 69, 69, 69, 0 }, { 69, 73, 70, 4 }, { 73, 69, 71, 4 },
+    { 81, 56, 72, 25 }, { 73, 73, 73, 0 }, { 73, 77, 74, 4 }, { 77, 73, 75, 4 }, { 85, 60, 76, 25 }, { 77, 77, 77, 0 }, { 77, 81, 78, 4 }, { 81, 77, 79, 4 },
+    { 93, 56, 80, 37 }, { 81, 81, 81, 0 }, { 81, 85, 82, 4 }, { 85, 81, 83, 4 }, { 97, 60, 84, 37 }, { 85, 85, 85, 0 }, { 85, 89, 86, 4 }, { 89, 85, 87, 4 },
+    { 105, 56, 88, 49 }, { 89, 89, 89, 0 }, { 89, 93, 90, 4 }, { 93, 89, 91, 4 }, { 109, 60, 92, 49 }, { 93, 93, 93, 0 }, { 93, 97, 94, 4 }, { 97, 93, 95, 4 },
+    { 77, 134, 96, 57 }, { 97, 97, 97, 0 }, { 97, 101, 98, 4 }, { 101, 97, 99, 4 }, { 85, 130, 100, 45 }, { 101, 101, 101, 0 }, { 101, 105, 102, 4 }, { 105, 101, 103, 4 },
+    { 89, 134, 104, 45 }, { 105, 105, 105, 0 }, { 105, 109, 106, 4 }, { 109, 105, 107, 4 }, { 97, 130, 108, 33 }, { 109, 109, 109, 0 }, { 109, 113, 110, 4 }, { 113, 109, 111, 4 },
+    { 101, 134, 112, 33 }, { 113, 113, 113, 0 }, { 113, 117, 114, 4 }, { 117, 113, 115, 4 }, { 109, 130, 116, 21 }, { 117, 117, 117, 0 }, { 117, 121, 118, 4 }, { 121, 117, 119, 4 },
+    { 113, 134, 120, 21 }, { 121, 121, 121, 0 }, { 121, 125, 122, 4 }, { 125, 121, 123, 4 }, { 121, 130, 124, 9 }, { 125, 125, 125, 0 }, { 125, 130, 126, 5 }, { 130, 121, 127, 9 },
+    { 130, 125, 128, 5 }, { 125, 138, 129, 13 }, { 130, 130, 130, 0 }, { 130, 134, 131, 4 }, { 134, 130, 132, 4 }, { 138, 125, 133, 13 }, { 134, 134, 134, 0 }, { 134, 138, 135, 4 },
+    { 138, 134, 136, 4 }, { 146, 121, 137, 25 }, { 138, 138, 138, 0 }, { 138, 142, 139, 4 }, { 142, 138, 140, 4 }, { 150, 125, 141, 25 }, { 142, 142, 142, 0 }, { 142, 146, 143, 4 },
+    { 146, 142, 144, 4 }, { 158, 121, 145, 37 }, { 146, 146, 146, 0 }, { 146, 150, 147, 4 }, { 150, 146, 148, 4 }, { 162, 125, 149, 37 }, { 150, 150, 150, 0 }, { 150, 154, 151, 4 },
+    { 154, 150, 152, 4 }, { 170, 121, 153, 49 }, { 154, 154, 154, 0 }, { 154, 158, 155, 4 }, { 158, 154, 156, 4 }, { 174, 125, 157, 49 }, { 158, 158, 158, 0 }, { 158, 162, 159, 4 },
+    { 162, 158, 160, 4 }, { 142, 199, 161, 57 }, { 162, 162, 162, 0 }, { 162, 166, 163, 4 }, { 166, 162, 164, 4 }, { 150, 195, 165, 45 }, { 166, 166, 166, 0 }, { 166, 170, 167, 4 },
+    { 170, 166, 168, 4 }, { 154, 199, 169, 45 }, { 170, 170, 170, 0 }, { 170, 174, 171, 4 }, { 174, 170, 172, 4 }, { 162, 195, 173, 33 }, { 174, 174, 174, 0 }, { 174, 178, 175, 4 },
+    { 178, 174, 176, 4 }, { 166, 199, 177, 33 }, { 178, 178, 178, 0 }, { 178, 182, 179, 4 }, { 182, 178, 180, 4 }, { 174, 195, 181, 21 }, { 182, 182, 182, 0 }, { 182, 186, 183, 4 },
+    { 186, 182, 184, 4 }, { 178, 199, 185, 21 }, { 186, 186, 186, 0 }, { 186, 190, 187, 4 }, { 190, 186, 188, 4 }, { 186, 195, 189, 9 }, { 190, 190, 190, 0 }, { 190, 195, 191, 5 },
+    { 195, 186, 192, 9 }, { 195, 190, 193, 5 }, { 190, 203, 194, 13 }, { 195, 195, 195, 0 }, { 195, 199, 196, 4 }, { 199, 195, 197, 4 }, { 203, 190, 198, 13 }, { 199, 199, 199, 0 },
+    { 199, 203, 200, 4 }, { 203, 199, 201, 4 }, { 211, 186, 202, 25 }, { 203, 203, 203, 0 }, { 203, 207, 204, 4 }, { 207, 203, 205, 4 }, { 215, 190, 206, 25 }, { 207, 207, 207, 0 },
+    { 207, 211, 208, 4 }, { 211, 207, 209, 4 }, { 223, 186, 210, 37 }, { 211, 211, 211, 0 }, { 211, 215, 212, 4 }, { 215, 211, 213, 4 }, { 227, 190, 214, 37 }, { 215, 215, 215, 0 },
+    { 215, 219, 216, 4 }, { 219, 215, 217, 4 }, { 235, 186, 218, 49 }, { 219, 219, 219, 0 }, { 219, 223, 220, 4 }, { 223, 219, 221, 4 }, { 239, 190, 222, 49 }, { 223, 223, 223, 0 },
+    { 223, 227, 224, 4 }, { 227, 223, 225, 4 }, { 247, 186, 226, 61 }, { 227, 227, 227, 0 }, { 227, 231, 228, 4 }, { 231, 227, 229, 4 }, { 251, 190, 230, 61 }, { 231, 231, 231, 0 },
+    { 231, 235, 232, 4 }, { 235, 231, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 236, 4 }, { 239, 235, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 243, 240, 4 }, { 243, 239, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 244, 4 }, { 247, 243, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 251, 248, 4 }, { 251, 247, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 252, 4 }, { 255, 251, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_2[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
+    { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
+    { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
+    { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
+    { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
+    { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
+    { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
+    { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
+    { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
+    { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
+    { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
+    { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
+    { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
+    { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
+    { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
+    { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
+    { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
+    { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
+    { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
+    { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_2[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
+    { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
+    { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
+    { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
+    { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
+    { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
+    { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
+    { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
+    { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
+    { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
+    { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
+    { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
+    { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
+    { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
+    { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
+    { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
+    { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
+    { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
+    { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
+    { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
+    { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
+    { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_3_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 2, 8 }, { 0, 8, 2, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 10, 8 }, { 0, 33, 11, 33 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 18, 8 }, { 8, 41, 19, 33 }, { 24, 16, 21, 8 }, { 24, 16, 21, 8 }, { 33, 0, 22, 33 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 41, 29, 17 }, { 33, 24, 30, 9 }, { 33, 24, 30, 9 },
+    { 24, 49, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 33, 41, 35, 8 }, { 33, 41, 35, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 },
+    { 49, 24, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 43, 8 }, { 33, 66, 44, 33 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 51, 8 }, { 41, 74, 52, 33 }, { 57, 49, 54, 8 }, { 57, 49, 54, 8 }, { 66, 33, 55, 33 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 74, 62, 17 }, { 66, 57, 63, 9 },
+    { 66, 57, 63, 9 }, { 57, 82, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 66, 74, 68, 8 }, { 66, 74, 68, 8 }, { 74, 66, 71, 8 }, { 74, 66, 71, 8 },
+    { 74, 66, 71, 8 }, { 82, 57, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 76, 8 }, { 66, 99, 77, 33 }, { 82, 74, 79, 8 }, { 82, 74, 79, 8 },
+    { 82, 74, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 84, 8 }, { 74, 107, 85, 33 }, { 90, 82, 87, 8 }, { 90, 82, 87, 8 },
+    { 99, 66, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 107, 95, 17 },
+    { 99, 90, 96, 9 }, { 99, 90, 96, 9 }, { 90, 115, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 99, 107, 101, 8 }, { 99, 107, 101, 8 }, { 107, 99, 104, 8 },
+    { 107, 99, 104, 8 }, { 107, 99, 104, 8 }, { 115, 90, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 109, 8 }, { 99, 132, 110, 33 }, { 115, 107, 112, 8 },
+    { 115, 107, 112, 8 }, { 115, 107, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 117, 8 }, { 107, 140, 118, 33 }, { 123, 115, 120, 8 },
+    { 123, 115, 120, 8 }, { 132, 99, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 },
+    { 123, 140, 128, 17 }, { 132, 123, 129, 9 }, { 132, 123, 129, 9 }, { 123, 148, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 132, 140, 134, 8 }, { 132, 140, 134, 8 },
+    { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 148, 123, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 142, 8 }, { 132, 165, 143, 33 },
+    { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 150, 8 }, { 140, 173, 151, 33 },
+    { 156, 148, 153, 8 }, { 156, 148, 153, 8 }, { 165, 132, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 159, 9 }, { 156, 165, 159, 9 },
+    { 156, 165, 159, 9 }, { 156, 173, 161, 17 }, { 165, 156, 162, 9 }, { 165, 156, 162, 9 }, { 156, 181, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 165, 173, 167, 8 },
+    { 165, 173, 167, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 181, 156, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 181, 175, 8 },
+    { 165, 198, 176, 33 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 189, 183, 8 },
+    { 173, 206, 184, 33 }, { 189, 181, 186, 8 }, { 189, 181, 186, 8 }, { 198, 165, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 198, 192, 9 },
+    { 189, 198, 192, 9 }, { 189, 198, 192, 9 }, { 189, 206, 194, 17 }, { 198, 189, 195, 9 }, { 198, 189, 195, 9 }, { 189, 214, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 198, 206, 200, 8 }, { 198, 206, 200, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 214, 189, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 214, 208, 8 }, { 198, 231, 209, 33 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 222, 216, 8 }, { 206, 239, 217, 33 }, { 222, 214, 219, 8 }, { 222, 214, 219, 8 }, { 231, 198, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 239, 227, 17 }, { 231, 222, 228, 9 }, { 231, 222, 228, 9 }, { 222, 247, 230, 25 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 239, 233, 8 }, { 231, 239, 233, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 247, 222, 238, 25 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 247, 241, 8 }, { 239, 247, 241, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 255, 249, 8 }, { 247, 255, 249, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_3_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 4, 1, 4 }, { 4, 0, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 5, 4 }, { 8, 4, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 12, 9, 4 }, { 12, 8, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 13, 4 }, { 16, 12, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 20, 17, 4 }, { 20, 16, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 21, 4 }, { 24, 20, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 28, 25, 4 }, { 28, 24, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 29, 4 }, { 32, 28, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 36, 33, 4 }, { 36, 32, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 37, 4 }, { 40, 36, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 44, 41, 4 }, { 44, 40, 42, 4 }, { 32, 65, 43, 33 }, { 44, 44, 44, 0 }, { 44, 48, 45, 4 }, { 48, 44, 46, 4 }, { 36, 69, 47, 33 },
+    { 48, 48, 48, 0 }, { 48, 52, 49, 4 }, { 52, 48, 50, 4 }, { 44, 65, 51, 21 }, { 52, 52, 52, 0 }, { 52, 56, 53, 4 }, { 56, 52, 54, 4 }, { 48, 69, 55, 21 },
+    { 56, 56, 56, 0 }, { 56, 60, 57, 4 }, { 60, 56, 58, 4 }, { 56, 65, 59, 9 }, { 60, 60, 60, 0 }, { 60, 65, 61, 5 }, { 65, 56, 62, 9 }, { 65, 60, 63, 5 },
+    { 60, 73, 64, 13 }, { 65, 65, 65, 0 }, { 65, 69, 66, 4 }, { 69, 65, 67, 4 }, { 73, 60, 68, 13 }, { 69, 69, 69, 0 }, { 69, 73, 70, 4 }, { 73, 69, 71, 4 },
+    { 81, 56, 72, 25 }, { 73, 73, 73, 0 }, { 73, 77, 74, 4 }, { 77, 73, 75, 4 }, { 85, 60, 76, 25 }, { 77, 77, 77, 0 }, { 77, 81, 78, 4 }, { 81, 77, 79, 4 },
+    { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 82, 4 }, { 85, 81, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 86, 4 }, { 89, 85, 87, 4 },
+    { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 90, 4 }, { 93, 89, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 94, 4 }, { 97, 93, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 98, 4 }, { 101, 97, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 102, 4 }, { 105, 101, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 106, 4 }, { 109, 105, 107, 4 }, { 97, 130, 108, 33 }, { 109, 109, 109, 0 }, { 109, 113, 110, 4 }, { 113, 109, 111, 4 },
+    { 101, 134, 112, 33 }, { 113, 113, 113, 0 }, { 113, 117, 114, 4 }, { 117, 113, 115, 4 }, { 109, 130, 116, 21 }, { 117, 117, 117, 0 }, { 117, 121, 118, 4 }, { 121, 117, 119, 4 },
+    { 113, 134, 120, 21 }, { 121, 121, 121, 0 }, { 121, 125, 122, 4 }, { 125, 121, 123, 4 }, { 121, 130, 124, 9 }, { 125, 125, 125, 0 }, { 125, 130, 126, 5 }, { 130, 121, 127, 9 },
+    { 130, 125, 128, 5 }, { 125, 138, 129, 13 }, { 130, 130, 130, 0 }, { 130, 134, 131, 4 }, { 134, 130, 132, 4 }, { 138, 125, 133, 13 }, { 134, 134, 134, 0 }, { 134, 138, 135, 4 },
+    { 138, 134, 136, 4 }, { 146, 121, 137, 25 }, { 138, 138, 138, 0 }, { 138, 142, 139, 4 }, { 142, 138, 140, 4 }, { 150, 125, 141, 25 }, { 142, 142, 142, 0 }, { 142, 146, 143, 4 },
+    { 146, 142, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 147, 4 }, { 150, 146, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 154, 151, 4 },
+    { 154, 150, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 155, 4 }, { 158, 154, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 162, 159, 4 },
+    { 162, 158, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 163, 4 }, { 166, 162, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 170, 167, 4 },
+    { 170, 166, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 171, 4 }, { 174, 170, 172, 4 }, { 162, 195, 173, 33 }, { 174, 174, 174, 0 }, { 174, 178, 175, 4 },
+    { 178, 174, 176, 4 }, { 166, 199, 177, 33 }, { 178, 178, 178, 0 }, { 178, 182, 179, 4 }, { 182, 178, 180, 4 }, { 174, 195, 181, 21 }, { 182, 182, 182, 0 }, { 182, 186, 183, 4 },
+    { 186, 182, 184, 4 }, { 178, 199, 185, 21 }, { 186, 186, 186, 0 }, { 186, 190, 187, 4 }, { 190, 186, 188, 4 }, { 186, 195, 189, 9 }, { 190, 190, 190, 0 }, { 190, 195, 191, 5 },
+    { 195, 186, 192, 9 }, { 195, 190, 193, 5 }, { 190, 203, 194, 13 }, { 195, 195, 195, 0 }, { 195, 199, 196, 4 }, { 199, 195, 197, 4 }, { 203, 190, 198, 13 }, { 199, 199, 199, 0 },
+    { 199, 203, 200, 4 }, { 203, 199, 201, 4 }, { 211, 186, 202, 25 }, { 203, 203, 203, 0 }, { 203, 207, 204, 4 }, { 207, 203, 205, 4 }, { 215, 190, 206, 25 }, { 207, 207, 207, 0 },
+    { 207, 211, 208, 4 }, { 211, 207, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 212, 4 }, { 215, 211, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
+    { 215, 219, 216, 4 }, { 219, 215, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 220, 4 }, { 223, 219, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
+    { 223, 227, 224, 4 }, { 227, 223, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 228, 4 }, { 231, 227, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 235, 232, 4 }, { 235, 231, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 236, 4 }, { 239, 235, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 243, 240, 4 }, { 243, 239, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 244, 4 }, { 247, 243, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 251, 248, 4 }, { 251, 247, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 252, 4 }, { 255, 251, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_2_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
+    { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
+    { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
+    { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
+    { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
+    { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
+    { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
+    { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
+    { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
+    { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
+    { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
+    { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
+    { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
+    { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
+    { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
+    { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
+    { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
+    { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
+    { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
+    { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_2_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
+    { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
+    { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
+    { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
+    { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
+    { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
+    { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
+    { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
+    { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
+    { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
+    { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
+    { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
+    { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
+    { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
+    { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
+    { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
+    { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
+    { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
+    { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
+    { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
+    { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
+    { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp b/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp
new file mode 100644
index 0000000000..ad59988655
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp
@@ -0,0 +1,48 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if defined(CVTT_SINGLE_FILE) // Single-file build: compile every implementation file as part of this translation unit.
+#define CVTT_SINGLE_FILE_IMPL // Satisfies the "!CVTT_SINGLE_FILE || CVTT_SINGLE_FILE_IMPL" guard used by the included sources (see ConvectionKernels_Util.cpp).
+
+#include "ConvectionKernels_API.cpp"
+#include "ConvectionKernels_BC67.cpp"
+#include "ConvectionKernels_BC6H_IO.cpp"
+#include "ConvectionKernels_BC7_PrioData.cpp"
+#include "ConvectionKernels_BCCommon.cpp"
+#include "ConvectionKernels_ETC.cpp"
+#include "ConvectionKernels_IndexSelector.cpp"
+#include "ConvectionKernels_S3TC.cpp"
+#include "ConvectionKernels_Util.cpp"
+
+#endif // CVTT_SINGLE_FILE
diff --git a/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h b/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h
new file mode 100644
index 0000000000..371cbe54bf
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include "ConvectionKernels_Util.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        template<int TVectorSize> // TVectorSize: number of channels stored per endpoint.
+        class UnfinishedEndpoints // Holds a per-channel base and offset; the Finish*() methods turn them into two concrete endpoints.
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            // Default constructor: the body is empty, so m_base / m_offset hold whatever MFloat's default construction provides.
+            UnfinishedEndpoints()
+            {
+            }
+            // Copies the first TVectorSize entries of 'base' and of 'offset' into this object.
+            UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_base[ch] = base[ch];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_offset[ch] = offset[ch];
+            }
+            // Copy constructor: element-wise copy of both arrays.
+            UnfinishedEndpoints(const UnfinishedEndpoints& other)
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_base[ch] = other.m_base[ch];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_offset[ch] = other.m_offset[ch];
+            }
+            // Per channel: evaluates m_base + m_offset * factor for both tweak factors, clamps to [0, 31743], rounds to UInt15 and writes the results to outEP0 / outEP1 as SInt16.
+            void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                float tweakFactors[2];
+                Util::ComputeTweakFactors(tweak, range, tweakFactors); // tweakFactors[0] is used for endpoint 0, tweakFactors[1] for endpoint 1.
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MUInt15 channelEPs[2];
+                    for (int epi = 0; epi < 2; epi++)
+                    {
+                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f); // 31743 == 0x7BFF; presumably the largest finite half-float bit pattern -- TODO confirm.
+                        channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode);
+                    }
+
+                    outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]);
+                    outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]);
+                }
+            }
+            // Same computation as FinishHDRUnsigned, but clamps to [-31743, 31743] and rounds directly to SInt16.
+            void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
+            {
+                float tweakFactors[2];
+                Util::ComputeTweakFactors(tweak, range, tweakFactors);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MSInt16 channelEPs[2];
+                    for (int epi = 0; epi < 2; epi++)
+                    {
+                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f); // Symmetric clamp for the signed case.
+                        channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode);
+                    }
+
+                    outEP0[ch] = channelEPs[0];
+                    outEP1[ch] = channelEPs[1];
+                }
+            }
+            // LDR variant: clamps each tweaked endpoint to [0, 255] and writes it as UInt15; unlike the HDR variants it creates its own rounding-mode scope object.
+            void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
+            {
+                ParallelMath::RoundTowardNearestForScope roundingMode;
+
+                float tweakFactors[2];
+                Util::ComputeTweakFactors(tweak, range, tweakFactors);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f);
+                    MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f);
+                    outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode);
+                    outEP1[ch] = ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode);
+                }
+            }
+            // Returns a copy with TNewVectorSize channels: existing channels are copied, any extra channel gets base = filler and offset = 0.
+            template<int TNewVectorSize>
+            UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
+            {
+                MFloat newBase[TNewVectorSize];
+                MFloat newOffset[TNewVectorSize];
+
+                for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++) // Copies min(TNewVectorSize, TVectorSize) channels.
+                {
+                    newBase[ch] = m_base[ch];
+                    newOffset[ch] = m_offset[ch];
+                }
+
+                MFloat fillerV = ParallelMath::MakeFloat(filler);
+
+                for (int ch = TVectorSize; ch < TNewVectorSize; ch++) // Runs only when the new vector is larger than the current one.
+                {
+                    newBase[ch] = fillerV;
+                    newOffset[ch] = ParallelMath::MakeFloatZero();
+                }
+
+                return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset);
+            }
+            // Stored state: the Finish*() methods evaluate m_base[ch] + m_offset[ch] * factor for each channel.
+        private:
+            MFloat m_base[TVectorSize];
+            MFloat m_offset[TVectorSize];
+        };
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_Util.cpp b/thirdparty/cvtt/ConvectionKernels_Util.cpp
new file mode 100644
index 0000000000..d9c25c7845
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Util.cpp
@@ -0,0 +1,88 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+#include <algorithm>
+
+namespace cvtt
+{
+    namespace Util
+    {
+        // Signed input blocks are converted into unsigned space, with the maximum value being 254: each channel value is clamped to at least -127 and then biased by +127.
+        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
+        {
+            for (size_t block = 0; block < ParallelMath::ParallelSize; block++) // One iteration per block in the batch.
+            {
+                const PixelBlockS8& inputSignedBlock = inputSigned[block];
+                PixelBlockU8& inputNormalizedBlock = inputNormalized[block];
+                // Visits m_pixels[0..15][0..3] of the current block.
+                for (size_t px = 0; px < 16; px++)
+                {
+                    for (size_t ch = 0; ch < 4; ch++)
+                        inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127); // Inputs below -127 map to 0, the same as -127.
+                }
+            }
+        }
+
+        void FillWeights(const Options &options, float channelWeights[4]) // Fills channelWeights[0..3] from 'options'.
+        {
+            if (options.flags & Flags::Uniform) // Uniform flag set: every channel gets the same weight of 1.0.
+                channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f;
+            else // Otherwise copy the per-channel weights in red, green, blue, alpha order.
+            {
+                channelWeights[0] = options.redWeight;
+                channelWeights[1] = options.greenWeight;
+                channelWeights[2] = options.blueWeight;
+                channelWeights[3] = options.alphaWeight;
+            }
+        }
+
+        void ComputeTweakFactors(int tweak, int range, float *outFactors) // Writes two factors to outFactors[0] and outFactors[1]; only the low two bits of 'tweak' are read.
+        {
+            int totalUnits = range - 1;
+            int minOutsideUnits = ((tweak >> 1) & 1); // Bit 1 of tweak (0 or 1).
+            int maxOutsideUnits = (tweak & 1); // Bit 0 of tweak (0 or 1).
+            int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits;
+            // NOTE(review): insideUnits is not checked for zero (e.g. range == 2 with tweak == 1) -- confirm callers never pass such values.
+            outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits); // Zero when bit 1 is clear, otherwise -1 / insideUnits.
+            outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f; // One when bit 0 is clear, otherwise 1 + 1 / insideUnits.
+        }
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_Util.h b/thirdparty/cvtt/ConvectionKernels_Util.h
new file mode 100644
index 0000000000..c07b9bf2aa
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Util.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    struct PixelBlockU8;
+    struct PixelBlockS8;
+    struct Options;
+}
+
+namespace cvtt
+{
+    namespace Util
+    {
+        // Signed input blocks are converted into unsigned space, with the maximum value being 254
+        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize]);
+        void FillWeights(const Options &options, float channelWeights[4]); // Sets all four weights to 1.0 when Flags::Uniform is set, otherwise copies the red/green/blue/alpha weights from 'options'.
+        void ComputeTweakFactors(int tweak, int range, float *outFactors); // Writes two floats to outFactors, derived from the low two bits of 'tweak' and from 'range'.
+    }
+}
diff --git a/thirdparty/cvtt/etc_notes.txt b/thirdparty/cvtt/etc_notes.txt
new file mode 100644
index 0000000000..bb041a8435
--- /dev/null
+++ b/thirdparty/cvtt/etc_notes.txt
@@ -0,0 +1,27 @@
+The ETC1 compressor uses modified cluster fit:
+
+Assume that there exists an ideal base color and set of selectors for a given table.
+For a given table and set of selectors, the ideal base color can be determined by subtracting the offsets from each pixel and averaging them.
+Doing that is equivalent to subtracting the average offset from the average color.
+Because positive and negative selectors of the same magnitude cancel out, the search space of possible average offsets is reduced: 57 unique offsets for the first table and 81 for the others.
+Most of the offsets result in the same color as another average offset due to quantization of the base color, so those can be de-duplicated.
+So:
+- Start with a high-precision average color.
+- Apply precomputed luma offsets to it.
+- Quantize and de-duplicate the base colors.
+- Find the ideal selectors for each base color.
+
+Differential mode is solved by just finding the best legal combination from those attempts.
+
+There are several scenarios where this is not ideal:
+- Clamping behavior can sometimes be leveraged for a more accurate block.
+- Differentials can sometimes be moved slightly closer to become legal.
+- This only works when MSE is the error metric (i.e. not normal maps)
+- This only works when pixel weights are of equal importance (i.e. not using weight by alpha or edge deblocking)
+
+T and H modes work by computing a chrominance line, splitting the block's pixels into two clusters at the chrominance midpoint, and using those cluster assignments to determine the averages.
+
+Planar mode is just solved algebraically.
+
+If you want to emulate etc2comp's default settings, add the flag ETC_UseFakeBT709 to use its modified Rec. 709 error coefficients.
+Doing that will significantly slow down encoding because it requires much more complicated quantization math.
+\ No newline at end of file
diff --git a/thirdparty/harfbuzz/src/hb-aat-layout-common.hh b/thirdparty/harfbuzz/src/hb-aat-layout-common.hh
index 1dcbe92904..1db0f1df92 100644
--- a/thirdparty/harfbuzz/src/hb-aat-layout-common.hh
+++ b/thirdparty/harfbuzz/src/hb-aat-layout-common.hh
@@ -839,7 +839,7 @@ struct StateTableDriver
     }
 
     if (!c->in_place)
-      buffer->swap_buffers ();
+      buffer->sync ();
   }
 
   public:
diff --git a/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh b/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh
index d745c11431..0bf9bd2912 100644
--- a/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh
+++ b/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh
@@ -146,7 +146,7 @@ struct DuctileGlyphAction
   HBUINT32	variationAxis;	/* The 4-byte tag identifying the ductile axis.
 				 * This would normally be 0x64756374 ('duct'),
 				 * but you may use any axis the font contains. */
-  HBFixed	minimumLimit;	/* The lowest value for the ductility axis tha
+  HBFixed	minimumLimit;	/* The lowest value for the ductility axis that
 				 * still yields an acceptable appearance. Normally
 				 * this will be 1.0. */
   HBFixed	noStretchValue; /* This is the default value that corresponds to
diff --git a/thirdparty/harfbuzz/src/hb-algs.hh b/thirdparty/harfbuzz/src/hb-algs.hh
index 446d87e28b..3a3ab08046 100644
--- a/thirdparty/harfbuzz/src/hb-algs.hh
+++ b/thirdparty/harfbuzz/src/hb-algs.hh
@@ -36,6 +36,7 @@
 
 #include <algorithm>
 #include <initializer_list>
+#include <functional>
 #include <new>
 
 /*
@@ -210,12 +211,23 @@ struct
 }
 HB_FUNCOBJ (hb_bool);
 
+template <typename T>
+static inline
+T hb_coerce (const T v) { return v; } /* Same type requested: return the value unchanged. */
+template <typename T, typename V,
+	  hb_enable_if (!hb_is_same (hb_decay<T>, hb_decay<V>) && std::is_pointer<V>::value)>
+static inline
+T hb_coerce (const V v) { return *v; } /* V is a pointer type that differs from T after decay: return the pointee as a T. */
+
 struct
 {
   private:
 
   template <typename T> constexpr auto
-  impl (const T& v, hb_priority<1>) const HB_RETURN (uint32_t, hb_deref (v).hash ())
+  impl (const T& v, hb_priority<2>) const HB_RETURN (uint32_t, hb_deref (v).hash ())
+
+  template <typename T> constexpr auto
+  impl (const T& v, hb_priority<1>) const HB_RETURN (uint32_t, std::hash<hb_decay<decltype (hb_deref (v))>>{} (hb_deref (v)))
 
   template <typename T,
 	    hb_enable_if (std::is_integral<T>::value)> constexpr auto
@@ -435,23 +447,29 @@ struct
   private:
 
   template <typename T1, typename T2> auto
-  impl (T1&& v1, T2 &&v2, hb_priority<2>) const HB_AUTO_RETURN
+  impl (T1&& v1, T2 &&v2, hb_priority<3>) const HB_AUTO_RETURN
   (
     std::forward<T2> (v2).cmp (std::forward<T1> (v1)) == 0
   )
 
   template <typename T1, typename T2> auto
-  impl (T1&& v1, T2 &&v2, hb_priority<1>) const HB_AUTO_RETURN
+  impl (T1&& v1, T2 &&v2, hb_priority<2>) const HB_AUTO_RETURN
   (
     std::forward<T1> (v1).cmp (std::forward<T2> (v2)) == 0
   )
 
   template <typename T1, typename T2> auto
-  impl (T1&& v1, T2 &&v2, hb_priority<0>) const HB_AUTO_RETURN
+  impl (T1&& v1, T2 &&v2, hb_priority<1>) const HB_AUTO_RETURN
   (
     std::forward<T1> (v1) == std::forward<T2> (v2)
   )
 
+  template <typename T1, typename T2> auto
+  impl (T1&& v1, T2 &&v2, hb_priority<0>) const HB_AUTO_RETURN
+  (
+    std::forward<T2> (v2) == std::forward<T1> (v1)
+  )
+
   public:
 
   template <typename T1, typename T2> auto
@@ -472,6 +490,10 @@ struct hb_pair_t
   typedef T2 second_t;
   typedef hb_pair_t<T1, T2> pair_t;
 
+  template <typename U1 = T1, typename U2 = T2,
+	    hb_enable_if (std::is_default_constructible<U1>::value &&
+			  std::is_default_constructible<U2>::value)>
+  hb_pair_t () : first (), second () {}
   hb_pair_t (T1 a, T2 b) : first (a), second (b) {}
 
   template <typename Q1, typename Q2,
@@ -870,7 +892,7 @@ hb_bsearch_impl (unsigned *pos, /* Out */
 #pragma GCC diagnostic ignored "-Wcast-align"
     V* p = (V*) (((const char *) base) + (mid * stride));
 #pragma GCC diagnostic pop
-    int c = compar ((const void *) hb_addressof (key), (const void *) p, ds...);
+    int c = compar ((const void *) std::addressof (key), (const void *) p, ds...);
     if (c < 0)
       max = mid - 1;
     else if (c > 0)
diff --git a/thirdparty/harfbuzz/src/hb-array.hh b/thirdparty/harfbuzz/src/hb-array.hh
index 0beffb078f..1d1476d7cd 100644
--- a/thirdparty/harfbuzz/src/hb-array.hh
+++ b/thirdparty/harfbuzz/src/hb-array.hh
@@ -412,7 +412,7 @@ bool hb_array_t<T>::operator == (const hb_array_t<T> &o) const
   return true;
 }
 
-/* TODO Specialize opeator== for hb_bytes_t and hb_ubytes_t. */
+/* TODO Specialize operator== for hb_bytes_t and hb_ubytes_t. */
 
 template <>
 inline uint32_t hb_array_t<const char>::hash () const {
diff --git a/thirdparty/harfbuzz/src/hb-bimap.hh b/thirdparty/harfbuzz/src/hb-bimap.hh
index d466af8b60..a9e1278de7 100644
--- a/thirdparty/harfbuzz/src/hb-bimap.hh
+++ b/thirdparty/harfbuzz/src/hb-bimap.hh
@@ -33,20 +33,6 @@
 /* Bi-directional map */
 struct hb_bimap_t
 {
-  /* XXX(remove) */
-  void init ()
-  {
-    forw_map.init ();
-    back_map.init ();
-  }
-
-  /* XXX(remove) */
-  void fini ()
-  {
-    forw_map.fini ();
-    back_map.fini ();
-  }
-
   void reset ()
   {
     forw_map.reset ();
diff --git a/thirdparty/harfbuzz/src/hb-buffer.cc b/thirdparty/harfbuzz/src/hb-buffer.cc
index be3161a54d..e50afcb203 100644
--- a/thirdparty/harfbuzz/src/hb-buffer.cc
+++ b/thirdparty/harfbuzz/src/hb-buffer.cc
@@ -86,7 +86,46 @@ hb_segment_properties_hash (const hb_segment_properties_t *p)
 	 (intptr_t) (p->language);
 }
 
+/**
+ * hb_segment_properties_overlay:
+ * @p: #hb_segment_properties_t to fill in.
+ * @src: #hb_segment_properties_t to fill in from.
+ *
+ * Fills in missing fields of @p from @src in a considered manner.
+ *
+ * First, if @p does not have direction set, direction is copied from @src.
+ *
+ * Next, if @p and @src have the same direction (which can be unset), if @p
+ * does not have script set, script is copied from @src.
+ *
+ * Finally, if @p and @src have the same direction and script (which either
+ * can be unset), if @p does not have language set, language is copied from
+ * @src.
+ *
+ * Since: 3.3.0
+ **/
+void
+hb_segment_properties_overlay (hb_segment_properties_t *p,
+			       const hb_segment_properties_t *src)
+{
+  if (unlikely (!p || !src))
+    return;
 
+  if (!p->direction)
+    p->direction = src->direction;
+
+  if (p->direction != src->direction)
+    return;
+
+  if (!p->script)
+    p->script = src->script;
+
+  if (p->script != src->script)
+    return;
+
+  if (!p->language)
+    p->language = src->language;
+}
 
 /* Here is how the buffer works internally:
  *
@@ -96,14 +135,14 @@ hb_segment_properties_hash (const hb_segment_properties_t *p)
  * As an optimization, both info and out_info may point to the
  * same piece of memory, which is owned by info.  This remains the
  * case as long as out_len doesn't exceed i at any time.
- * In that case, swap_buffers() is mostly no-op and the glyph operations
+ * In that case, sync() is mostly no-op and the glyph operations
  * operate mostly in-place.
  *
  * As soon as out_info gets longer than info, out_info is moved over
  * to an alternate buffer (which we reuse the pos buffer for), and its
  * current contents (out_len entries) are copied to the new place.
  *
- * This should all remain transparent to the user.  swap_buffers() then
+ * This should all remain transparent to the user.  sync() then
  * switches info over to out_info and does housekeeping.
  */
 
@@ -217,11 +256,24 @@ hb_buffer_t::get_scratch_buffer (unsigned int *size)
 /* HarfBuzz-Internal API */
 
 void
+hb_buffer_t::similar (const hb_buffer_t &src)
+{
+  hb_unicode_funcs_destroy (unicode);
+  unicode = hb_unicode_funcs_reference (src.unicode);
+  flags = src.flags;
+  cluster_level = src.cluster_level;
+  replacement = src.replacement;
+  invisible = src.invisible;
+  not_found = src.not_found;
+}
+
+void
 hb_buffer_t::reset ()
 {
   hb_unicode_funcs_destroy (unicode);
   unicode = hb_unicode_funcs_reference (hb_unicode_funcs_get_default ());
   flags = HB_BUFFER_FLAG_DEFAULT;
+  cluster_level = HB_BUFFER_CLUSTER_LEVEL_DEFAULT;
   replacement = HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT;
   invisible = 0;
   not_found = 0;
@@ -232,11 +284,10 @@ hb_buffer_t::reset ()
 void
 hb_buffer_t::clear ()
 {
+  content_type = HB_BUFFER_CONTENT_TYPE_INVALID;
   hb_segment_properties_t default_props = HB_SEGMENT_PROPERTIES_DEFAULT;
   props = default_props;
-  scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
 
-  content_type = HB_BUFFER_CONTENT_TYPE_INVALID;
   successful = true;
   have_output = false;
   have_positions = false;
@@ -244,16 +295,44 @@ hb_buffer_t::clear ()
   idx = 0;
   len = 0;
   out_len = 0;
-  out_info = info;
 
-  serial = 0;
+  out_info = info;
 
   memset (context, 0, sizeof context);
   memset (context_len, 0, sizeof context_len);
 
   deallocate_var_all ();
+  serial = 0;
+  scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
+}
+
+void
+hb_buffer_t::enter ()
+{
+  deallocate_var_all ();
+  serial = 0;
+  scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
+  if (likely (!hb_unsigned_mul_overflows (len, HB_BUFFER_MAX_LEN_FACTOR)))
+  {
+    max_len = hb_max (len * HB_BUFFER_MAX_LEN_FACTOR,
+		      (unsigned) HB_BUFFER_MAX_LEN_MIN);
+  }
+  if (likely (!hb_unsigned_mul_overflows (len, HB_BUFFER_MAX_OPS_FACTOR)))
+  {
+    max_ops = hb_max (len * HB_BUFFER_MAX_OPS_FACTOR,
+		      (unsigned) HB_BUFFER_MAX_OPS_MIN);
+  }
+}
+void
+hb_buffer_t::leave ()
+{
+  max_len = HB_BUFFER_MAX_LEN_DEFAULT;
+  max_ops = HB_BUFFER_MAX_OPS_DEFAULT;
+  deallocate_var_all ();
+  serial = 0;
 }
 
+
 void
 hb_buffer_t::add (hb_codepoint_t  codepoint,
 		  unsigned int    cluster)
@@ -307,7 +386,7 @@ hb_buffer_t::clear_positions ()
 }
 
 void
-hb_buffer_t::swap_buffers ()
+hb_buffer_t::sync ()
 {
   assert (have_output);
 
@@ -494,33 +573,6 @@ done:
 }
 
 void
-hb_buffer_t::unsafe_to_break_impl (unsigned int start, unsigned int end)
-{
-  unsigned int cluster = UINT_MAX;
-  cluster = _infos_find_min_cluster (info, start, end, cluster);
-  _unsafe_to_break_set_mask (info, start, end, cluster);
-}
-void
-hb_buffer_t::unsafe_to_break_from_outbuffer (unsigned int start, unsigned int end)
-{
-  if (!have_output)
-  {
-    unsafe_to_break_impl (start, end);
-    return;
-  }
-
-  assert (start <= out_len);
-  assert (idx <= end);
-
-  unsigned int cluster = UINT_MAX;
-  cluster = _infos_find_min_cluster (out_info, start, out_len, cluster);
-  cluster = _infos_find_min_cluster (info, idx, end, cluster);
-
-  _unsafe_to_break_set_mask (out_info, start, out_len, cluster);
-  _unsafe_to_break_set_mask (info, idx, end, cluster);
-}
-
-void
 hb_buffer_t::guess_segment_properties ()
 {
   assert_unicode ();
@@ -565,12 +617,11 @@ DEFINE_NULL_INSTANCE (hb_buffer_t) =
   HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT,
   0, /* invisible */
   0, /* not_found */
-  HB_BUFFER_SCRATCH_FLAG_DEFAULT,
-  HB_BUFFER_MAX_LEN_DEFAULT,
-  HB_BUFFER_MAX_OPS_DEFAULT,
+
 
   HB_BUFFER_CONTENT_TYPE_INVALID,
   HB_SEGMENT_PROPERTIES_DEFAULT,
+
   false, /* successful */
   false, /* have_output */
   true  /* have_positions */
@@ -610,6 +661,46 @@ hb_buffer_create ()
 }
 
 /**
+ * hb_buffer_create_similar:
+ * @src: An #hb_buffer_t
+ *
+ * Creates a new #hb_buffer_t, similar to hb_buffer_create(). The only
+ * difference is that the buffer is configured similarly to @src.
+ *
+ * Return value: (transfer full):
+ * A newly allocated #hb_buffer_t, similar to hb_buffer_create().
+ *
+ * Since: 3.3.0
+ **/
+hb_buffer_t *
+hb_buffer_create_similar (const hb_buffer_t *src)
+{
+  hb_buffer_t *buffer = hb_buffer_create ();
+
+  buffer->similar (*src);
+
+  return buffer;
+}
+
+/**
+ * hb_buffer_reset:
+ * @buffer: An #hb_buffer_t
+ *
+ * Resets the buffer to its initial status, as if it was just newly created
+ * with hb_buffer_create().
+ *
+ * Since: 0.9.2
+ **/
+void
+hb_buffer_reset (hb_buffer_t *buffer)
+{
+  if (unlikely (hb_object_is_immutable (buffer)))
+    return;
+
+  buffer->reset ();
+}
+
+/**
  * hb_buffer_get_empty:
  *
  * Fetches an empty #hb_buffer_t.
@@ -1157,24 +1248,6 @@ hb_buffer_get_not_found_glyph (hb_buffer_t    *buffer)
 
 
 /**
- * hb_buffer_reset:
- * @buffer: An #hb_buffer_t
- *
- * Resets the buffer to its initial status, as if it was just newly created
- * with hb_buffer_create().
- *
- * Since: 0.9.2
- **/
-void
-hb_buffer_reset (hb_buffer_t *buffer)
-{
-  if (unlikely (hb_object_is_immutable (buffer)))
-    return;
-
-  buffer->reset ();
-}
-
-/**
  * hb_buffer_clear_contents:
  * @buffer: An #hb_buffer_t
  *
@@ -1749,6 +1822,8 @@ hb_buffer_append (hb_buffer_t *buffer,
   if (!buffer->have_positions && source->have_positions)
     buffer->clear_positions ();
 
+  hb_segment_properties_overlay (&buffer->props, &source->props);
+
   memcpy (buffer->info + orig_len, source->info + start, (end - start) * sizeof (buffer->info[0]));
   if (buffer->have_positions)
     memcpy (buffer->pos + orig_len, source->pos + start, (end - start) * sizeof (buffer->pos[0]));
diff --git a/thirdparty/harfbuzz/src/hb-buffer.h b/thirdparty/harfbuzz/src/hb-buffer.h
index a183cb9d4a..9fbd7b1ec3 100644
--- a/thirdparty/harfbuzz/src/hb-buffer.h
+++ b/thirdparty/harfbuzz/src/hb-buffer.h
@@ -76,18 +76,68 @@ typedef struct hb_glyph_info_t {
  * @HB_GLYPH_FLAG_UNSAFE_TO_BREAK: Indicates that if input text is broken at the
  * 				   beginning of the cluster this glyph is part of,
  * 				   then both sides need to be re-shaped, as the
- * 				   result might be different.  On the flip side,
- * 				   it means that when this flag is not present,
- * 				   then it's safe to break the glyph-run at the
- * 				   beginning of this cluster, and the two sides
- * 				   represent the exact same result one would get
- * 				   if breaking input text at the beginning of
- * 				   this cluster and shaping the two sides
- * 				   separately.  This can be used to optimize
- * 				   paragraph layout, by avoiding re-shaping
- * 				   of each line after line-breaking, or limiting
- * 				   the reshaping to a small piece around the
- * 				   breaking point only.
+ * 				   result might be different.
+ * 				   On the flip side, it means that when this
+ * 				   flag is not present, then it is safe to break
+ * 				   the glyph-run at the beginning of this
+ * 				   cluster, and the two sides will represent the
+ * 				   exact same result one would get if breaking
+ * 				   input text at the beginning of this cluster
+ * 				   and shaping the two sides separately.
+ * 				   This can be used to optimize paragraph
+ * 				   layout, by avoiding re-shaping of each line
+ * 				   after line-breaking.
+ * @HB_GLYPH_FLAG_UNSAFE_TO_CONCAT: Indicates that if input text is changed on one
+ * 				   side of the beginning of the cluster this glyph
+ * 				   is part of, then the shaping results for the
+ * 				   other side might change.
+ * 				   Note that the absence of this flag will NOT by
+ * 				   itself mean that it IS safe to concat text.
+ * 				   Only two pieces of text both of which are clear of
+ * 				   this flag can be concatenated safely.
+ * 				   This can be used to optimize paragraph
+ * 				   layout, by avoiding re-shaping of each line
+ * 				   after line-breaking, by limiting the
+ * 				   reshaping to a small piece around the
+ * 				   breaking position only, even if the breaking
+ * 				   position carries the
+ * 				   #HB_GLYPH_FLAG_UNSAFE_TO_BREAK or when
+ * 				   hyphenation or other text transformation
+ * 				   happens at line-break position, in the following
+ * 				   way:
+ * 				   1. Iterate back from the line-break position
+ * 				   until the first cluster start position that is
+ * 				   NOT unsafe-to-concat, 2. shape the segment from
+ * 				   there till the end of line, 3. check whether the
+ * 				   resulting glyph-run also is clear of the
+ * 				   unsafe-to-concat at its start-of-text position;
+ * 				   if it is, just splice it into place and the line
+ * 				   is shaped; If not, move on to a position further
+ * 				   back that is clear of unsafe-to-concat and retry
+ * 				   from there, and repeat.
+ * 				   At the start of next line a similar algorithm can
+ * 				   be implemented. That is: 1. Iterate forward from
+ * 				   the line-break position until the first cluster
+ * 				   start position that is NOT unsafe-to-concat, 2.
+ * 				   shape the segment from beginning of the line to
+ * 				   that position, 3. check whether the resulting
+ * 				   glyph-run also is clear of the unsafe-to-concat
+ * 				   at its end-of-text position; if it is, just splice
+ * 				   it into place and the beginning is shaped; If not,
+ * 				   move on to a position further forward that is clear
+ * 				   of unsafe-to-concat and retry up to there, and repeat.
+ * 				   A slight complication will arise in the
+ * 				   implementation of the algorithm above,
+ * 				   because while our buffer API has a way to
+ * 				   return flags for position corresponding to
+ * 				   start-of-text, there is currently no position
+ * 				   corresponding to end-of-text.  This limitation
+ * 				   can be alleviated by shaping more text than needed
+ * 				   and looking for unsafe-to-concat flag within text
+ * 				   clusters.
+ * 				   The #HB_GLYPH_FLAG_UNSAFE_TO_BREAK flag will
+ * 				   always imply this flag.
+ * 				   Since: 3.3.0
  * @HB_GLYPH_FLAG_DEFINED: All the currently defined flags.
  *
  * Flags for #hb_glyph_info_t.
@@ -96,8 +146,9 @@ typedef struct hb_glyph_info_t {
  */
 typedef enum { /*< flags >*/
   HB_GLYPH_FLAG_UNSAFE_TO_BREAK		= 0x00000001,
+  HB_GLYPH_FLAG_UNSAFE_TO_CONCAT	= 0x00000002,
 
-  HB_GLYPH_FLAG_DEFINED			= 0x00000001 /* OR of all defined flags */
+  HB_GLYPH_FLAG_DEFINED			= 0x00000003 /* OR of all defined flags */
 } hb_glyph_flags_t;
 
 HB_EXTERN hb_glyph_flags_t
@@ -170,6 +221,9 @@ hb_segment_properties_equal (const hb_segment_properties_t *a,
 HB_EXTERN unsigned int
 hb_segment_properties_hash (const hb_segment_properties_t *p);
 
+HB_EXTERN void
+hb_segment_properties_overlay (hb_segment_properties_t *p,
+			       const hb_segment_properties_t *src);
 
 
 /**
@@ -185,6 +239,13 @@ HB_EXTERN hb_buffer_t *
 hb_buffer_create (void);
 
 HB_EXTERN hb_buffer_t *
+hb_buffer_create_similar (const hb_buffer_t *src);
+
+HB_EXTERN void
+hb_buffer_reset (hb_buffer_t *buffer);
+
+
+HB_EXTERN hb_buffer_t *
 hb_buffer_get_empty (void);
 
 HB_EXTERN hb_buffer_t *
@@ -391,8 +452,9 @@ HB_EXTERN hb_codepoint_t
 hb_buffer_get_not_found_glyph (hb_buffer_t    *buffer);
 
 
-HB_EXTERN void
-hb_buffer_reset (hb_buffer_t *buffer);
+/*
+ * Content API.
+ */
 
 HB_EXTERN void
 hb_buffer_clear_contents (hb_buffer_t *buffer);
diff --git a/thirdparty/harfbuzz/src/hb-buffer.hh b/thirdparty/harfbuzz/src/hb-buffer.hh
index 0f8140f1b3..ac45f090a5 100644
--- a/thirdparty/harfbuzz/src/hb-buffer.hh
+++ b/thirdparty/harfbuzz/src/hb-buffer.hh
@@ -67,8 +67,8 @@ enum hb_buffer_scratch_flags_t {
   HB_BUFFER_SCRATCH_FLAG_HAS_DEFAULT_IGNORABLES		= 0x00000002u,
   HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK		= 0x00000004u,
   HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT		= 0x00000008u,
-  HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK		= 0x00000010u,
-  HB_BUFFER_SCRATCH_FLAG_HAS_CGJ			= 0x00000020u,
+  HB_BUFFER_SCRATCH_FLAG_HAS_CGJ			= 0x00000010u,
+  HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS		= 0x00000020u,
 
   /* Reserved for complex shapers' internal use. */
   HB_BUFFER_SCRATCH_FLAG_COMPLEX0			= 0x01000000u,
@@ -87,18 +87,21 @@ struct hb_buffer_t
 {
   hb_object_header_t header;
 
-  /* Information about how the text in the buffer should be treated */
+  /*
+   * Information about how the text in the buffer should be treated.
+   */
+
   hb_unicode_funcs_t *unicode; /* Unicode functions */
   hb_buffer_flags_t flags; /* BOT / EOT / etc. */
   hb_buffer_cluster_level_t cluster_level;
   hb_codepoint_t replacement; /* U+FFFD or something else. */
   hb_codepoint_t invisible; /* 0 or something else. */
   hb_codepoint_t not_found; /* 0 or something else. */
-  hb_buffer_scratch_flags_t scratch_flags; /* Have space-fallback, etc. */
-  unsigned int max_len; /* Maximum allowed len. */
-  int max_ops; /* Maximum allowed operations. */
 
-  /* Buffer contents */
+  /*
+   * Buffer contents
+   */
+
   hb_buffer_content_type_t content_type;
   hb_segment_properties_t props; /* Script, language, direction */
 
@@ -115,8 +118,6 @@ struct hb_buffer_t
   hb_glyph_info_t     *out_info;
   hb_glyph_position_t *pos;
 
-  unsigned int serial;
-
   /* Text before / after the main buffer contents.
    * Always in Unicode, and ordered outward.
    * Index 0 is for "pre-context", 1 for "post-context". */
@@ -124,7 +125,25 @@ struct hb_buffer_t
   hb_codepoint_t context[2][CONTEXT_LENGTH];
   unsigned int context_len[2];
 
-  /* Debugging API */
+
+  /*
+   * Managed by enter / leave
+   */
+
+#ifndef HB_NDEBUG
+  uint8_t allocated_var_bits;
+#endif
+  uint8_t serial;
+  hb_buffer_scratch_flags_t scratch_flags; /* Have space-fallback, etc. */
+  unsigned int max_len; /* Maximum allowed len. */
+  int max_ops; /* Maximum allowed operations. */
+  /* The bits here reflect current allocations of the bytes in glyph_info_t's var1 and var2. */
+
+
+  /*
+   * Messaging callback
+   */
+
 #ifndef HB_NO_BUFFER_MESSAGE
   hb_buffer_message_func_t message_func;
   void *message_data;
@@ -134,11 +153,6 @@ struct hb_buffer_t
   static constexpr unsigned message_depth = 0u;
 #endif
 
-  /* Internal debugging. */
-  /* The bits here reflect current allocations of the bytes in glyph_info_t's var1 and var2. */
-#ifndef HB_NDEBUG
-  uint8_t allocated_var_bits;
-#endif
 
 
   /* Methods */
@@ -190,12 +204,17 @@ struct hb_buffer_t
   hb_glyph_info_t &prev ()      { return out_info[out_len ? out_len - 1 : 0]; }
   hb_glyph_info_t prev () const { return out_info[out_len ? out_len - 1 : 0]; }
 
+  HB_INTERNAL void similar (const hb_buffer_t &src);
   HB_INTERNAL void reset ();
   HB_INTERNAL void clear ();
 
+  /* Called around shape() */
+  HB_INTERNAL void enter ();
+  HB_INTERNAL void leave ();
+
   unsigned int backtrack_len () const { return have_output ? out_len : idx; }
   unsigned int lookahead_len () const { return len - idx; }
-  unsigned int next_serial () { return serial++; }
+  uint8_t next_serial () { return ++serial ? serial : ++serial; }
 
   HB_INTERNAL void add (hb_codepoint_t  codepoint,
 			unsigned int    cluster);
@@ -252,7 +271,7 @@ struct hb_buffer_t
 
   HB_INTERNAL void guess_segment_properties ();
 
-  HB_INTERNAL void swap_buffers ();
+  HB_INTERNAL void sync ();
   HB_INTERNAL void clear_output ();
   HB_INTERNAL void clear_positions ();
 
@@ -366,15 +385,83 @@ struct hb_buffer_t
   /* Merge clusters for deleting current glyph, and skip it. */
   HB_INTERNAL void delete_glyph ();
 
-  void unsafe_to_break (unsigned int start,
-			unsigned int end)
+
+  /* Adds glyph flags in mask to infos with clusters between start and end.
+   * The start index will be from out-buffer if from_out_buffer is true.
+   * If interior is true, then the cluster having the minimum value is skipped. */
+  void _set_glyph_flags (hb_mask_t mask,
+			 unsigned start = 0,
+			 unsigned end = (unsigned) -1,
+			 bool interior = false,
+			 bool from_out_buffer = false)
   {
-    if (end - start < 2)
+    end = hb_min (end, len);
+
+    if (interior && !from_out_buffer && end - start < 2)
       return;
-    unsafe_to_break_impl (start, end);
+
+    scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS;
+
+    if (!from_out_buffer || !have_output)
+    {
+      if (!interior)
+      {
+	for (unsigned i = start; i < end; i++)
+	  info[i].mask |= mask;
+      }
+      else
+      {
+	unsigned cluster = _infos_find_min_cluster (info, start, end);
+	_infos_set_glyph_flags (info, start, end, cluster, mask);
+      }
+    }
+    else
+    {
+      assert (start <= out_len);
+      assert (idx <= end);
+
+      if (!interior)
+      {
+	for (unsigned i = start; i < out_len; i++)
+	  out_info[i].mask |= mask;
+	for (unsigned i = idx; i < end; i++)
+	  info[i].mask |= mask;
+      }
+      else
+      {
+	unsigned cluster = _infos_find_min_cluster (info, idx, end);
+	cluster = _infos_find_min_cluster (out_info, start, out_len, cluster);
+
+	_infos_set_glyph_flags (out_info, start, out_len, cluster, mask);
+	_infos_set_glyph_flags (info, idx, end, cluster, mask);
+      }
+    }
+  }
+
+  void unsafe_to_break (unsigned int start = 0, unsigned int end = -1)
+  {
+    _set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK | HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		      start, end,
+		      true);
+  }
+  void unsafe_to_concat (unsigned int start = 0, unsigned int end = -1)
+  {
+    _set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		      start, end,
+		      true);
+  }
+  void unsafe_to_break_from_outbuffer (unsigned int start = 0, unsigned int end = -1)
+  {
+    _set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK | HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		      start, end,
+		      true, true);
+  }
+  void unsafe_to_concat_from_outbuffer (unsigned int start = 0, unsigned int end = -1)
+  {
+    _set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		      start, end,
+		      false, true);
   }
-  HB_INTERNAL void unsafe_to_break_impl (unsigned int start, unsigned int end);
-  HB_INTERNAL void unsafe_to_break_from_outbuffer (unsigned int start, unsigned int end);
 
 
   /* Internal methods */
@@ -465,36 +552,31 @@ struct hb_buffer_t
   set_cluster (hb_glyph_info_t &inf, unsigned int cluster, unsigned int mask = 0)
   {
     if (inf.cluster != cluster)
-    {
-      if (mask & HB_GLYPH_FLAG_UNSAFE_TO_BREAK)
-	inf.mask |= HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-      else
-	inf.mask &= ~HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-    }
+      inf.mask = (inf.mask & ~HB_GLYPH_FLAG_DEFINED) | (mask & HB_GLYPH_FLAG_DEFINED);
     inf.cluster = cluster;
   }
-
+  void
+  _infos_set_glyph_flags (hb_glyph_info_t *infos,
+			  unsigned int start, unsigned int end,
+			  unsigned int cluster,
+			  hb_mask_t mask)
+  {
+    for (unsigned int i = start; i < end; i++)
+      if (cluster != infos[i].cluster)
+      {
+	scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS;
+	infos[i].mask |= mask;
+      }
+  }
   static unsigned
   _infos_find_min_cluster (const hb_glyph_info_t *infos,
 			   unsigned start, unsigned end,
-			   unsigned cluster)
+			   unsigned cluster = UINT_MAX)
   {
     for (unsigned int i = start; i < end; i++)
       cluster = hb_min (cluster, infos[i].cluster);
     return cluster;
   }
-  void
-  _unsafe_to_break_set_mask (hb_glyph_info_t *infos,
-			     unsigned int start, unsigned int end,
-			     unsigned int cluster)
-  {
-    for (unsigned int i = start; i < end; i++)
-      if (cluster != infos[i].cluster)
-      {
-	scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK;
-	infos[i].mask |= HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-      }
-  }
 
   void clear_glyph_flags (hb_mask_t mask = 0)
   {
diff --git a/thirdparty/harfbuzz/src/hb-cff-interp-common.hh b/thirdparty/harfbuzz/src/hb-cff-interp-common.hh
index c251e2d0ed..641de0eff2 100644
--- a/thirdparty/harfbuzz/src/hb-cff-interp-common.hh
+++ b/thirdparty/harfbuzz/src/hb-cff-interp-common.hh
@@ -217,9 +217,6 @@ inline unsigned int OpCode_Size (op_code_t op) { return Is_OpCode_ESC (op) ? 2:
 
 struct number_t
 {
-  void init () { set_real (0.0); }
-  void fini () {}
-
   void set_int (int v)       { value = v; }
   int to_int () const        { return value; }
 
@@ -245,7 +242,7 @@ struct number_t
   }
 
   protected:
-  double value;
+  double value = 0.;
 };
 
 /* byte string */
@@ -380,10 +377,8 @@ struct cff_stack_t
     count = 0;
     elements.init ();
     elements.resize (kSizeLimit);
-    for (unsigned int i = 0; i < elements.length; i++)
-      elements[i].init ();
   }
-  void fini () { elements.fini_deep (); }
+  void fini () { elements.fini (); }
 
   ELEM& operator [] (unsigned int i)
   {
@@ -523,9 +518,6 @@ struct arg_stack_t : cff_stack_t<ARG, 513>
 /* an operator prefixed by its operands in a byte string */
 struct op_str_t
 {
-  void init () {}
-  void fini () {}
-
   op_code_t  op;
   byte_str_t str;
 };
@@ -553,7 +545,7 @@ struct parsed_values_t
     opStart = 0;
     values.init ();
   }
-  void fini () { values.fini_deep (); }
+  void fini () { values.fini (); }
 
   void add_op (op_code_t op, const byte_str_ref_t& str_ref = byte_str_ref_t ())
   {
diff --git a/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh b/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh
index 52d778ffe2..ef299369b5 100644
--- a/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh
+++ b/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh
@@ -94,12 +94,6 @@ struct biased_subrs_t
 
 struct point_t
 {
-  void init ()
-  {
-    x.init ();
-    y.init ();
-  }
-
   void set_int (int _x, int _y)
   {
     x.set_int (_x);
@@ -128,7 +122,7 @@ struct cs_interp_env_t : interp_env_t<ARG>
     hstem_count = 0;
     vstem_count = 0;
     hintmask_size = 0;
-    pt.init ();
+    pt.set_int (0, 0);
     callStack.init ();
     globalSubrs.init (globalSubrs_);
     localSubrs.init (localSubrs_);
@@ -841,7 +835,6 @@ struct path_procs_t
     if (likely (env.argStack.get_count () == 11))
     {
       point_t d;
-      d.init ();
       for (unsigned int i = 0; i < 10; i += 2)
 	d.move (env.eval_arg (i), env.eval_arg (i+1));
 
diff --git a/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh b/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh
index d961566447..766183760e 100644
--- a/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh
+++ b/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh
@@ -35,18 +35,6 @@ using namespace OT;
 
 struct blend_arg_t : number_t
 {
-  void init ()
-  {
-    number_t::init ();
-    deltas.init ();
-  }
-
-  void fini ()
-  {
-    number_t::fini ();
-    deltas.fini_deep ();
-  }
-
   void set_int (int v) { reset_blends (); number_t::set_int (v); }
   void set_fixed (int32_t v) { reset_blends (); number_t::set_fixed (v); }
   void set_real (double v) { reset_blends (); number_t::set_real (v); }
@@ -202,7 +190,7 @@ struct cff2_cs_opset_t : cs_opset_t<blend_arg_t, OPSET, cff2_cs_interp_env_t, PA
     switch (op) {
       case OpCode_callsubr:
       case OpCode_callgsubr:
-	/* a subroutine number shoudln't be a blended value */
+	/* a subroutine number shouldn't be a blended value */
 	if (unlikely (env.argStack.peek ().blending ()))
 	{
 	  env.set_error ();
diff --git a/thirdparty/harfbuzz/src/hb-common.cc b/thirdparty/harfbuzz/src/hb-common.cc
index 26c8ad0f49..249a8a8010 100644
--- a/thirdparty/harfbuzz/src/hb-common.cc
+++ b/thirdparty/harfbuzz/src/hb-common.cc
@@ -29,10 +29,31 @@
 #include "hb.hh"
 #include "hb-machinery.hh"
 
+#if !defined(HB_NO_SETLOCALE) && (!defined(HAVE_NEWLOCALE) || !defined(HAVE_USELOCALE))
+#define HB_NO_SETLOCALE 1
+#endif
+
+#ifndef HB_NO_SETLOCALE
+
 #include <locale.h>
+#ifdef HAVE_XLOCALE_H
+#include <xlocale.h> // Needed on BSD/OS X for uselocale
+#endif
+
+#ifdef WIN32
+#define hb_locale_t _locale_t
+#else
+#define hb_locale_t locale_t
+#endif
+#define hb_setlocale setlocale
+#define hb_uselocale uselocale
+
+#else
+
+#define hb_locale_t void *
+#define hb_setlocale(Category, Locale) "C"
+#define hb_uselocale(Locale) ((hb_locale_t) 0)
 
-#ifdef HB_NO_SETLOCALE
-#define setlocale(Category, Locale) "C"
 #endif
 
 /**
@@ -122,7 +143,7 @@ hb_tag_from_string (const char *str, int len)
  * @tag: #hb_tag_t to convert
  * @buf: (out caller-allocates) (array fixed-size=4) (element-type uint8_t): Converted string
  *
- * Converts an #hb_tag_t to a string and returns it in @buf. 
+ * Converts an #hb_tag_t to a string and returns it in @buf.
  * Strings will be four characters long.
  *
  * Since: 0.9.5
@@ -151,13 +172,13 @@ const char direction_strings[][4] = {
  * @str: (array length=len) (element-type uint8_t): String to convert
  * @len: Length of @str, or -1 if it is %NULL-terminated
  *
- * Converts a string to an #hb_direction_t. 
+ * Converts a string to an #hb_direction_t.
  *
  * Matching is loose and applies only to the first letter. For
  * examples, "LTR" and "left-to-right" will both return #HB_DIRECTION_LTR.
  *
  * Unmatched strings will return #HB_DIRECTION_INVALID.
- * 
+ *
  * Return value: The #hb_direction_t matching @str
  *
  * Since: 0.9.2
@@ -413,7 +434,7 @@ hb_language_get_default ()
   hb_language_t language = default_language;
   if (unlikely (language == HB_LANGUAGE_INVALID))
   {
-    language = hb_language_from_string (setlocale (LC_CTYPE, nullptr), -1);
+    language = hb_language_from_string (hb_setlocale (LC_CTYPE, nullptr), -1);
     (void) default_language.cmpexch (HB_LANGUAGE_INVALID, language);
   }
 
@@ -1039,6 +1060,47 @@ hb_variation_from_string (const char *str, int len,
   return false;
 }
 
+#ifndef HB_NO_SETLOCALE
+
+static inline void free_static_C_locale ();
+
+static struct hb_C_locale_lazy_loader_t : hb_lazy_loader_t<hb_remove_pointer<hb_locale_t>,
+							     hb_C_locale_lazy_loader_t>
+{
+  static hb_locale_t create ()
+  {
+    hb_locale_t l = newlocale (LC_ALL_MASK, "C", NULL);
+    if (!l)
+      return l;
+
+    hb_atexit (free_static_C_locale);
+
+    return l;
+  }
+  static void destroy (hb_locale_t l)
+  {
+    freelocale (l);
+  }
+  static hb_locale_t get_null ()
+  {
+    return (hb_locale_t) 0;
+  }
+} static_C_locale;
+
+static inline
+void free_static_C_locale ()
+{
+  static_C_locale.free_instance ();
+}
+
+static hb_locale_t
+get_C_locale ()
+{
+  return static_C_locale.get_unconst ();
+}
+
+#endif
+
 /**
  * hb_variation_to_string:
  * @variation: an #hb_variation_t to convert
@@ -1064,7 +1126,11 @@ hb_variation_to_string (hb_variation_t *variation,
   while (len && s[len - 1] == ' ')
     len--;
   s[len++] = '=';
+
+  hb_locale_t oldlocale HB_UNUSED;
+  oldlocale = hb_uselocale (get_C_locale ());
   len += hb_max (0, snprintf (s + len, ARRAY_LENGTH (s) - len, "%g", (double) variation->value));
+  (void) hb_uselocale (oldlocale);
 
   assert (len < ARRAY_LENGTH (s));
   len = hb_min (len, size - 1);
diff --git a/thirdparty/harfbuzz/src/hb-coretext.cc b/thirdparty/harfbuzz/src/hb-coretext.cc
index a512f3b8b7..5f383064c4 100644
--- a/thirdparty/harfbuzz/src/hb-coretext.cc
+++ b/thirdparty/harfbuzz/src/hb-coretext.cc
@@ -481,8 +481,8 @@ struct active_feature_t {
 	   a->rec.setting < b->rec.setting ? -1 : a->rec.setting > b->rec.setting ? 1 :
 	   0;
   }
-  bool operator== (const active_feature_t *f) {
-    return cmp (this, f) == 0;
+  bool operator== (const active_feature_t& f) const {
+    return cmp (this, &f) == 0;
   }
 };
 
@@ -677,7 +677,7 @@ _hb_coretext_shape (hb_shape_plan_t    *shape_plan,
       {
 	active_features.push (event->feature);
       } else {
-	active_feature_t *feature = active_features.find (&event->feature);
+	active_feature_t *feature = active_features.lsearch (event->feature);
 	if (feature)
 	  active_features.remove (feature - active_features.arrayZ);
       }
@@ -1213,7 +1213,8 @@ resize_and_retry:
     }
   }
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
 #undef FAIL
 
diff --git a/thirdparty/harfbuzz/src/hb-directwrite.cc b/thirdparty/harfbuzz/src/hb-directwrite.cc
index dea87b8cd0..f177ff31c0 100644
--- a/thirdparty/harfbuzz/src/hb-directwrite.cc
+++ b/thirdparty/harfbuzz/src/hb-directwrite.cc
@@ -762,7 +762,8 @@ retry_getglyphs:
 
   if (isRightToLeft) hb_buffer_reverse (buffer);
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
   delete [] clusterMap;
   delete [] glyphIndices;
diff --git a/thirdparty/harfbuzz/src/hb-draw.h b/thirdparty/harfbuzz/src/hb-draw.h
index bddc876399..f82cc34842 100644
--- a/thirdparty/harfbuzz/src/hb-draw.h
+++ b/thirdparty/harfbuzz/src/hb-draw.h
@@ -50,7 +50,7 @@ typedef void (*hb_draw_close_path_func_t) (void *user_data);
  *
  * Glyph draw callbacks.
  *
- * _move_to, _line_to and _cubic_to calls are nessecary to be defined but we
+ * _move_to, _line_to and _cubic_to calls are necessary to be defined but we
  * translate _quadratic_to calls to _cubic_to if the callback isn't defined.
  *
  * Since: EXPERIMENTAL
diff --git a/thirdparty/harfbuzz/src/hb-face.cc b/thirdparty/harfbuzz/src/hb-face.cc
index 2c0087370c..5365598636 100644
--- a/thirdparty/harfbuzz/src/hb-face.cc
+++ b/thirdparty/harfbuzz/src/hb-face.cc
@@ -143,7 +143,7 @@ hb_face_create_for_tables (hb_reference_table_func_t  reference_table_func,
 
 typedef struct hb_face_for_data_closure_t {
   hb_blob_t *blob;
-  unsigned int  index;
+  uint16_t  index;
 } hb_face_for_data_closure_t;
 
 static hb_face_for_data_closure_t *
@@ -156,7 +156,7 @@ _hb_face_for_data_closure_create (hb_blob_t *blob, unsigned int index)
     return nullptr;
 
   closure->blob = blob;
-  closure->index = index;
+  closure->index = (uint16_t) (index & 0xFFFFu);
 
   return closure;
 }
@@ -195,9 +195,19 @@ _hb_face_for_data_reference_table (hb_face_t *face HB_UNUSED, hb_tag_t tag, void
  * @index: The index of the face within @blob
  *
  * Constructs a new face object from the specified blob and
- * a face index into that blob. This is used for blobs of
- * file formats such as Dfont and TTC that can contain more
- * than one face.
+ * a face index into that blob.
+ *
+ * The face index is used for blobs of file formats such as TTC and
+ * DFont that can contain more than one face.  Face indices within
+ * such collections are zero-based.
+ *
+ * <note>Note: If the blob font format is not a collection, @index
+ * is ignored.  Otherwise, only the lower 16-bits of @index are used.
+ * The unmodified @index can be accessed via hb_face_get_index().</note>
+ *
+ * <note>Note: The high 16-bits of @index, if non-zero, are used by
+ * hb_font_create() to load named-instances in variable fonts.  See
+ * hb_font_create() for details.</note>
  *
  * Return value: (transfer full): The new face object
  *
@@ -420,7 +430,8 @@ hb_face_reference_blob (hb_face_t *face)
  * Assigns the specified face-index to @face. Fails if the
  * face is immutable.
  *
- * <note>Note: face indices within a collection are zero-based.</note>
+ * <note>Note: changing the index has no effect on the face itself.
+ * This only changes the value returned by hb_face_get_index().</note>
  *
  * Since: 0.9.2
  **/
diff --git a/thirdparty/harfbuzz/src/hb-font.cc b/thirdparty/harfbuzz/src/hb-font.cc
index fa8da96395..350fcac139 100644
--- a/thirdparty/harfbuzz/src/hb-font.cc
+++ b/thirdparty/harfbuzz/src/hb-font.cc
@@ -631,7 +631,7 @@ hb_font_funcs_destroy (hb_font_funcs_t *ffuncs)
  * @destroy: (nullable): A callback to call when @data is not needed anymore
  * @replace: Whether to replace an existing data with the same key
  *
- * Attaches a user-data key/data pair to the specified font-functions structure. 
+ * Attaches a user-data key/data pair to the specified font-functions structure.
  *
  * Return value: %true if success, %false otherwise
  *
@@ -821,7 +821,7 @@ hb_font_get_glyph (hb_font_t      *font,
  * @glyph: (out): The glyph ID retrieved
  *
  * Fetches the nominal glyph ID for a Unicode code point in the
- * specified font. 
+ * specified font.
  *
  * This version of the function should not be used to fetch glyph IDs
  * for code points modified by variation selectors. For variation-selector
@@ -940,7 +940,7 @@ hb_font_get_glyph_v_advance (hb_font_t      *font,
  * @advance_stride: The stride between successive advances
  *
  * Fetches the advances for a sequence of glyph IDs in the specified
- * font, for horizontal text segments. 
+ * font, for horizontal text segments.
  *
  * Since: 1.8.6
  **/
@@ -964,7 +964,7 @@ hb_font_get_glyph_h_advances (hb_font_t*            font,
  * @advance_stride: (out): The stride between successive advances
  *
  * Fetches the advances for a sequence of glyph IDs in the specified
- * font, for vertical text segments.  
+ * font, for vertical text segments.
  *
  * Since: 1.8.6
  **/
@@ -1278,7 +1278,7 @@ hb_font_get_glyph_origin_for_direction (hb_font_t      *font,
  * @font: #hb_font_t to work upon
  * @glyph: The glyph ID to query
  * @direction: The direction of the text segment
- * @x: (inout): Input = The original X coordinate 
+ * @x: (inout): Input = The original X coordinate
  *     Output = The X coordinate plus the X-coordinate of the origin
  * @y: (inout): Input = The original Y coordinate
  *     Output = The Y coordinate plus the Y-coordinate of the origin
@@ -1306,7 +1306,7 @@ hb_font_add_glyph_origin_for_direction (hb_font_t      *font,
  * @font: #hb_font_t to work upon
  * @glyph: The glyph ID to query
  * @direction: The direction of the text segment
- * @x: (inout): Input = The original X coordinate 
+ * @x: (inout): Input = The original X coordinate
  *     Output = The X coordinate minus the X-coordinate of the origin
  * @y: (inout): Input = The original Y coordinate
  *     Output = The Y coordinate minus the Y-coordinate of the origin
@@ -1477,6 +1477,8 @@ DEFINE_NULL_INSTANCE (hb_font_t) =
 
   1000, /* x_scale */
   1000, /* y_scale */
+  0., /* slant */
+  0., /* slant_xy */
   1<<16, /* x_mult */
   1<<16, /* y_mult */
 
@@ -1521,6 +1523,13 @@ _hb_font_create (hb_face_t *face)
  *
  * Constructs a new font object from the specified face.
  *
+ * <note>Note: If @face's index value (as passed to hb_face_create())
+ * has non-zero top 16-bits, those bits minus one are passed to
+ * hb_font_set_var_named_instance(), effectively loading a named-instance
+ * of a variable font, instead of the default-instance.  This allows
+ * specifying which named-instance to load by default when creating the
+ * face.</note>
+ *
  * Return value: (transfer full): The new font object
  *
  * Since: 0.9.2
@@ -1535,6 +1544,11 @@ hb_font_create (hb_face_t *face)
   hb_ot_font_set_funcs (font);
 #endif
 
+#ifndef HB_NO_VAR
+  if (face && face->index >> 16)
+    hb_font_set_var_named_instance (font, (face->index >> 16) - 1);
+#endif
+
   return font;
 }
 
@@ -1578,6 +1592,7 @@ hb_font_create_sub_font (hb_font_t *parent)
 
   font->x_scale = parent->x_scale;
   font->y_scale = parent->y_scale;
+  font->slant = parent->slant;
   font->mults_changed ();
   font->x_ppem = parent->x_ppem;
   font->y_ppem = parent->y_ppem;
@@ -1668,12 +1683,12 @@ hb_font_destroy (hb_font_t *font)
 /**
  * hb_font_set_user_data: (skip)
  * @font: #hb_font_t to work upon
- * @key: The user-data key 
+ * @key: The user-data key
  * @data: A pointer to the user data
  * @destroy: (nullable): A callback to call when @data is not needed anymore
  * @replace: Whether to replace an existing data with the same key
  *
- * Attaches a user-data key/data pair to the specified font object. 
+ * Attaches a user-data key/data pair to the specified font object.
  *
  * Return value: %true if success, %false otherwise
  *
@@ -1875,7 +1890,7 @@ hb_font_set_funcs (hb_font_t         *font,
  * @font_data: (destroy destroy) (scope notified): Data to attach to @font
  * @destroy: (nullable): The function to call when @font_data is not needed anymore
  *
- * Replaces the user data attached to a font, updating the font's 
+ * Replaces the user data attached to a font, updating the font's
  * @destroy callback.
  *
  * Since: 0.9.2
@@ -1949,7 +1964,7 @@ hb_font_get_scale (hb_font_t *font,
  * @x_ppem: Horizontal ppem value to assign
  * @y_ppem: Vertical ppem value to assign
  *
- * Sets the horizontal and vertical pixels-per-em (ppem) of a font. 
+ * Sets the horizontal and vertical pixels-per-em (ppem) of a font.
  *
  * Since: 0.9.2
  **/
@@ -1971,7 +1986,7 @@ hb_font_set_ppem (hb_font_t    *font,
  * @x_ppem: (out): Horizontal ppem value
  * @y_ppem: (out): Vertical ppem value
  *
- * Fetches the horizontal and vertical points-per-em (ppem) of a font. 
+ * Fetches the horizontal and vertical pixels-per-em (ppem) of a font.
  *
  * Since: 0.9.2
  **/
@@ -2015,7 +2030,7 @@ hb_font_set_ptem (hb_font_t *font,
  *
  * Return value: Point size.  A value of zero means "not set."
  *
- * Since: 0.9.2
+ * Since: 1.6.0
  **/
 float
 hb_font_get_ptem (hb_font_t *font)
@@ -2023,6 +2038,49 @@ hb_font_get_ptem (hb_font_t *font)
   return font->ptem;
 }
 
+/**
+ * hb_font_set_synthetic_slant:
+ * @font: #hb_font_t to work upon
+ * @slant: synthetic slant value.
+ *
+ * Sets the "synthetic slant" of a font.  By default it is zero.
+ * Synthetic slant is the graphical skew that the renderer
+ * applies to the font at rendering time.
+ *
+ * HarfBuzz needs to know this value to adjust shaping results,
+ * metrics, and style values to match the slanted rendering.
+ *
+ * <note>Note: The slant value is a ratio.  For example, a
+ * 20% slant would be represented as a 0.2 value.</note>
+ *
+ * Since: 3.3.0
+ **/
+HB_EXTERN void
+hb_font_set_synthetic_slant (hb_font_t *font, float slant)
+{
+  if (hb_object_is_immutable (font))
+    return;
+
+  font->slant = slant;
+  font->mults_changed ();
+}
+
+/**
+ * hb_font_get_synthetic_slant:
+ * @font: #hb_font_t to work upon
+ *
+ * Fetches the "synthetic slant" of a font.
+ *
+ * Return value: Synthetic slant.  By default it is zero.
+ *
+ * Since: 3.3.0
+ **/
+HB_EXTERN float
+hb_font_get_synthetic_slant (hb_font_t *font)
+{
+  return font->slant;
+}
+
 #ifndef HB_NO_VAR
 /*
  * Variations
@@ -2036,6 +2094,10 @@ hb_font_get_ptem (hb_font_t *font)
  *
  * Applies a list of font-variation settings to a font.
  *
+ * Note that this overrides all existing variations set on @font.
+ * Axes not included in @variations will be effectively set to their
+ * default values.
+ *
  * Since: 1.4.2
  */
 void
@@ -2091,6 +2153,10 @@ hb_font_set_variations (hb_font_t            *font,
  * Applies a list of variation coordinates (in design-space units)
  * to a font.
  *
+ * Note that this overrides all existing variations set on @font.
+ * Axes not included in @coords will be effectively set to their
+ * default values.
+ *
  * Since: 1.4.2
  */
 void
@@ -2154,6 +2220,10 @@ hb_font_set_var_named_instance (hb_font_t *font,
  * Applies a list of variation coordinates (in normalized units)
  * to a font.
  *
+ * Note that this overrides all existing variations set on @font.
+ * Axes not included in @coords will be effectively set to their
+ * default values.
+ *
  * <note>Note: Coordinates should be normalized to 2.14.</note>
  *
  * Since: 1.4.2
@@ -2196,14 +2266,19 @@ hb_font_set_var_coords_normalized (hb_font_t    *font,
 /**
  * hb_font_get_var_coords_normalized:
  * @font: #hb_font_t to work upon
- * @length: Number of coordinates retrieved
+ * @length: (out): Number of coordinates retrieved
  *
  * Fetches the list of normalized variation coordinates currently
  * set on a font.
  *
+ * Note that this returned array may only contain values for some
+ * (or none) of the axes; omitted axes effectively have zero values.
+ *
  * Return value is valid as long as variation coordinates of the font
  * are not modified.
  *
+ * Return value: coordinates array
+ *
  * Since: 1.4.2
  */
 const int *
@@ -2216,18 +2291,24 @@ hb_font_get_var_coords_normalized (hb_font_t    *font,
   return font->coords;
 }
 
-#ifdef HB_EXPERIMENTAL_API
 /**
  * hb_font_get_var_coords_design:
  * @font: #hb_font_t to work upon
- * @length: (out): number of coordinates
+ * @length: (out): Number of coordinates retrieved
+ *
+ * Fetches the list of variation coordinates (in design-space units) currently
+ * set on a font.
+ *
+ * Note that this returned array may only contain values for some
+ * (or none) of the axes; omitted axes effectively have their default
+ * values.
  *
  * Return value is valid as long as variation coordinates of the font
  * are not modified.
  *
  * Return value: coordinates array
  *
- * Since: EXPERIMENTAL
+ * Since: 3.3.0
  */
 const float *
 hb_font_get_var_coords_design (hb_font_t *font,
@@ -2239,7 +2320,6 @@ hb_font_get_var_coords_design (hb_font_t *font,
   return font->design_coords;
 }
 #endif
-#endif
 
 #ifndef HB_DISABLE_DEPRECATED
 /*
diff --git a/thirdparty/harfbuzz/src/hb-font.h b/thirdparty/harfbuzz/src/hb-font.h
index 15dc126523..a3bbb2e37b 100644
--- a/thirdparty/harfbuzz/src/hb-font.h
+++ b/thirdparty/harfbuzz/src/hb-font.h
@@ -1024,6 +1024,12 @@ HB_EXTERN float
 hb_font_get_ptem (hb_font_t *font);
 
 HB_EXTERN void
+hb_font_set_synthetic_slant (hb_font_t *font, float slant);
+
+HB_EXTERN float
+hb_font_get_synthetic_slant (hb_font_t *font);
+
+HB_EXTERN void
 hb_font_set_variations (hb_font_t *font,
 			const hb_variation_t *variations,
 			unsigned int variations_length);
@@ -1033,11 +1039,9 @@ hb_font_set_var_coords_design (hb_font_t *font,
 			       const float *coords,
 			       unsigned int coords_length);
 
-#ifdef HB_EXPERIMENTAL_API
 HB_EXTERN const float *
 hb_font_get_var_coords_design (hb_font_t *font,
 			       unsigned int *length);
-#endif
 
 HB_EXTERN void
 hb_font_set_var_coords_normalized (hb_font_t *font,
diff --git a/thirdparty/harfbuzz/src/hb-font.hh b/thirdparty/harfbuzz/src/hb-font.hh
index 1b7f445e8b..0d73589e8c 100644
--- a/thirdparty/harfbuzz/src/hb-font.hh
+++ b/thirdparty/harfbuzz/src/hb-font.hh
@@ -109,6 +109,8 @@ struct hb_font_t
 
   int32_t x_scale;
   int32_t y_scale;
+  float slant;
+  float slant_xy;
   int64_t x_mult;
   int64_t y_mult;
 
@@ -617,6 +619,7 @@ struct hb_font_t
     signed upem = face->get_upem ();
     x_mult = ((int64_t) x_scale << 16) / upem;
     y_mult = ((int64_t) y_scale << 16) / upem;
+    slant_xy = y_scale ? slant * x_scale / y_scale : 0.f;
   }
 
   hb_position_t em_mult (int16_t v, int64_t mult)
diff --git a/thirdparty/harfbuzz/src/hb-graphite2.cc b/thirdparty/harfbuzz/src/hb-graphite2.cc
index 42420ac0b0..63dc18b466 100644
--- a/thirdparty/harfbuzz/src/hb-graphite2.cc
+++ b/thirdparty/harfbuzz/src/hb-graphite2.cc
@@ -439,7 +439,8 @@ _hb_graphite2_shape (hb_shape_plan_t    *shape_plan HB_UNUSED,
   if (feats) gr_featureval_destroy (feats);
   gr_seg_destroy (seg);
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
   return true;
 }
diff --git a/thirdparty/harfbuzz/src/hb-iter.hh b/thirdparty/harfbuzz/src/hb-iter.hh
index ad2e45e3c5..43a3098f65 100644
--- a/thirdparty/harfbuzz/src/hb-iter.hh
+++ b/thirdparty/harfbuzz/src/hb-iter.hh
@@ -90,8 +90,8 @@ struct hb_iter_t
    * it will be returning pointer to temporary rvalue.
    * TODO Use a wrapper return type to fix for non-reference type. */
   template <typename T = item_t,
-	    hb_enable_if (hb_is_reference (T))>
-  hb_remove_reference<item_t>* operator -> () const { return hb_addressof (**thiz()); }
+	    hb_enable_if (std::is_reference<T>::value)>
+  hb_remove_reference<item_t>* operator -> () const { return std::addressof (**thiz()); }
   item_t operator * () const { return thiz()->__item__ (); }
   item_t operator * () { return thiz()->__item__ (); }
   item_t operator [] (unsigned i) const { return thiz()->__item_at__ (i); }
@@ -289,7 +289,7 @@ struct hb_is_source_of
 {
   private:
   template <typename Iter2 = Iter,
-	    hb_enable_if (hb_is_convertible (typename Iter2::item_t, hb_add_lvalue_reference<hb_add_const<Item>>))>
+	    hb_enable_if (hb_is_convertible (typename Iter2::item_t, hb_add_lvalue_reference<const Item>))>
   static hb_true_type impl (hb_priority<2>);
   template <typename Iter2 = Iter>
   static auto impl (hb_priority<1>) -> decltype (hb_declval (Iter2) >> hb_declval (Item &), hb_true_type ());
diff --git a/thirdparty/harfbuzz/src/hb-kern.hh b/thirdparty/harfbuzz/src/hb-kern.hh
index 3f952fe7fc..9ea945caed 100644
--- a/thirdparty/harfbuzz/src/hb-kern.hh
+++ b/thirdparty/harfbuzz/src/hb-kern.hh
@@ -49,6 +49,10 @@ struct hb_kern_machine_t
 	     hb_mask_t    kern_mask,
 	     bool         scale = true) const
   {
+    if (!buffer->message (font, "start kern"))
+      return;
+
+    buffer->unsafe_to_concat ();
     OT::hb_ot_apply_context_t c (1, font, buffer);
     c.set_lookup_mask (kern_mask);
     c.set_lookup_props (OT::LookupFlag::IgnoreMarks);
@@ -67,7 +71,8 @@ struct hb_kern_machine_t
       }
 
       skippy_iter.reset (idx, 1);
-      if (!skippy_iter.next ())
+      unsigned unsafe_to;
+      if (!skippy_iter.next (&unsafe_to))
       {
 	idx++;
 	continue;
@@ -125,6 +130,8 @@ struct hb_kern_machine_t
     skip:
       idx = skippy_iter.idx;
     }
+
+    (void) buffer->message (font, "end kern");
   }
 
   const Driver &driver;
diff --git a/thirdparty/harfbuzz/src/hb-machinery.hh b/thirdparty/harfbuzz/src/hb-machinery.hh
index 010c2570d7..5046ac1933 100644
--- a/thirdparty/harfbuzz/src/hb-machinery.hh
+++ b/thirdparty/harfbuzz/src/hb-machinery.hh
@@ -244,19 +244,19 @@ struct hb_lazy_loader_t : hb_data_wrapper_t<Data, WheresData>
   {
     Stored *p = (Stored *) hb_calloc (1, sizeof (Stored));
     if (likely (p))
-      p->init (data);
+      p = new (p) Stored (data);
     return p;
   }
   static Stored *create ()
   {
     Stored *p = (Stored *) hb_calloc (1, sizeof (Stored));
     if (likely (p))
-      p->init ();
+      p = new (p) Stored ();
     return p;
   }
   static void destroy (Stored *p)
   {
-    p->fini ();
+    p->~Stored ();
     hb_free (p);
   }
 
diff --git a/thirdparty/harfbuzz/src/hb-map.hh b/thirdparty/harfbuzz/src/hb-map.hh
index 793dcf22ca..9341637eac 100644
--- a/thirdparty/harfbuzz/src/hb-map.hh
+++ b/thirdparty/harfbuzz/src/hb-map.hh
@@ -37,13 +37,10 @@
 template <typename K, typename V,
 	  typename k_invalid_t = K,
 	  typename v_invalid_t = V,
-	  k_invalid_t kINVALID = hb_is_pointer (K) ? 0 : std::is_signed<K>::value ? hb_int_min (K) : (K) -1,
-	  v_invalid_t vINVALID = hb_is_pointer (V) ? 0 : std::is_signed<V>::value ? hb_int_min (V) : (V) -1>
+	  k_invalid_t kINVALID = std::is_pointer<K>::value ? 0 : std::is_signed<K>::value ? hb_int_min (K) : (K) -1,
+	  v_invalid_t vINVALID = std::is_pointer<V>::value ? 0 : std::is_signed<V>::value ? hb_int_min (V) : (V) -1>
 struct hb_hashmap_t
 {
-  static constexpr K INVALID_KEY   = kINVALID;
-  static constexpr V INVALID_VALUE = vINVALID;
-
   hb_hashmap_t ()  { init (); }
   ~hb_hashmap_t () { fini (); }
 
@@ -64,24 +61,40 @@ struct hb_hashmap_t
     hb_copy (o, *this);
   }
 
-  static_assert (std::is_trivially_copyable<K>::value, "");
-  static_assert (std::is_trivially_copyable<V>::value, "");
-  static_assert (std::is_trivially_destructible<K>::value, "");
-  static_assert (std::is_trivially_destructible<V>::value, "");
-
   struct item_t
   {
     K key;
     V value;
     uint32_t hash;
 
-    void clear () { key = kINVALID; value = vINVALID; hash = 0; }
+    void clear ()
+    {
+      new (std::addressof (key)) K ();
+      key = hb_coerce<K> (kINVALID);
+      new (std::addressof (value)) V ();
+      value = hb_coerce<V> (vINVALID);
+      hash = 0;
+    }
 
     bool operator == (const K &o) { return hb_deref (key) == hb_deref (o); }
     bool operator == (const item_t &o) { return *this == o.key; }
-    bool is_unused () const    { return key == kINVALID; }
-    bool is_tombstone () const { return key != kINVALID && value == vINVALID; }
-    bool is_real () const { return key != kINVALID && value != vINVALID; }
+    bool is_unused () const
+    {
+      const K inv = hb_coerce<K> (kINVALID);
+      return key == inv;
+    }
+    bool is_tombstone () const
+    {
+      const K kinv = hb_coerce<K> (kINVALID);
+      const V vinv = hb_coerce<V> (vINVALID);
+      return key != kinv && value == vinv;
+    }
+    bool is_real () const
+    {
+      const K kinv = hb_coerce<K> (kINVALID);
+      const V vinv = hb_coerce<V> (vINVALID);
+      return key != kinv && value != vinv;
+    }
     hb_pair_t<K, V> get_pair() const { return hb_pair_t<K, V> (key, value); }
   };
 
@@ -118,8 +131,13 @@ struct hb_hashmap_t
   }
   void fini_shallow ()
   {
-    hb_free (items);
-    items = nullptr;
+    if (likely (items)) {
+      unsigned size = mask + 1;
+      for (unsigned i = 0; i < size; i++)
+        items[i].~item_t ();
+      hb_free (items);
+      items = nullptr;
+    }
     population = occupancy = 0;
   }
   void fini ()
@@ -163,10 +181,15 @@ struct hb_hashmap_t
     /* Insert back old items. */
     if (old_items)
       for (unsigned int i = 0; i < old_size; i++)
+      {
 	if (old_items[i].is_real ())
+	{
 	  set_with_hash (old_items[i].key,
 			 old_items[i].hash,
 			 std::move (old_items[i].value));
+	}
+	old_items[i].~item_t ();
+      }
 
     hb_free (old_items);
 
@@ -178,22 +201,22 @@ struct hb_hashmap_t
 
   V get (K key) const
   {
-    if (unlikely (!items)) return vINVALID;
+    if (unlikely (!items)) return hb_coerce<V> (vINVALID);
     unsigned int i = bucket_for (key);
-    return items[i].is_real () && items[i] == key ? items[i].value : vINVALID;
+    return items[i].is_real () && items[i] == key ? items[i].value : hb_coerce<V> (vINVALID);
   }
 
-  void del (K key) { set (key, vINVALID); }
+  void del (K key) { set (key, hb_coerce<V> (vINVALID)); }
 
   /* Has interface. */
-  static constexpr V SENTINEL = vINVALID;
   typedef V value_t;
   value_t operator [] (K k) const { return get (k); }
   bool has (K k, V *vp = nullptr) const
   {
     V v = (*this)[k];
     if (vp) *vp = v;
-    return v != SENTINEL;
+    const V vinv = hb_coerce<V> (vINVALID);
+    return v != vinv;
   }
   /* Projection. */
   V operator () (K k) const { return get (k); }
@@ -248,11 +271,13 @@ struct hb_hashmap_t
   bool set_with_hash (K key, uint32_t hash, VV&& value)
   {
     if (unlikely (!successful)) return false;
-    if (unlikely (key == kINVALID)) return true;
+    const K kinv = hb_coerce<K> (kINVALID);
+    if (unlikely (key == kinv)) return true;
     if (unlikely ((occupancy + occupancy / 2) >= mask && !resize ())) return false;
     unsigned int i = bucket_for_hash (key, hash);
 
-    if (value == vINVALID && items[i].key != key)
+    const V vinv = hb_coerce<V> (vINVALID);
+    if (value == vinv && items[i].key != key)
       return true; /* Trying to delete non-existent key. */
 
     if (!items[i].is_unused ())
diff --git a/thirdparty/harfbuzz/src/hb-meta.hh b/thirdparty/harfbuzz/src/hb-meta.hh
index 0ea5774a9f..3fea5d995e 100644
--- a/thirdparty/harfbuzz/src/hb-meta.hh
+++ b/thirdparty/harfbuzz/src/hb-meta.hh
@@ -29,6 +29,7 @@
 
 #include "hb.hh"
 
+#include <memory>
 #include <type_traits>
 #include <utility>
 
@@ -85,30 +86,13 @@ template <>             struct hb_priority<0> {};
 template <typename T> struct hb_type_identity_t { typedef T type; };
 template <typename T> using hb_type_identity = typename hb_type_identity_t<T>::type;
 
-struct
-{
-  template <typename T> constexpr T*
-  operator () (T& arg) const
-  {
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wcast-align"
-    /* https://en.cppreference.com/w/cpp/memory/addressof */
-    return reinterpret_cast<T*> (
-	     &const_cast<char&> (
-		reinterpret_cast<const volatile char&> (arg)));
-#pragma GCC diagnostic pop
-  }
-}
-HB_FUNCOBJ (hb_addressof);
-
 template <typename T> static inline T hb_declval ();
 #define hb_declval(T) (hb_declval<T> ())
 
 template <typename T> struct hb_match_const		: hb_type_identity_t<T>, hb_false_type	{};
 template <typename T> struct hb_match_const<const T>	: hb_type_identity_t<T>, hb_true_type	{};
 template <typename T> using hb_remove_const = typename hb_match_const<T>::type;
-template <typename T> using hb_add_const = const T;
-#define hb_is_const(T) hb_match_const<T>::value
+
 template <typename T> struct hb_match_reference		: hb_type_identity_t<T>, hb_false_type	{};
 template <typename T> struct hb_match_reference<T &>	: hb_type_identity_t<T>, hb_true_type	{};
 template <typename T> struct hb_match_reference<T &&>	: hb_type_identity_t<T>, hb_true_type	{};
@@ -119,14 +103,13 @@ template <typename T> using hb_add_lvalue_reference = decltype (_hb_try_add_lval
 template <typename T> auto _hb_try_add_rvalue_reference (hb_priority<1>) -> hb_type_identity<T&&>;
 template <typename T> auto _hb_try_add_rvalue_reference (hb_priority<0>) -> hb_type_identity<T>;
 template <typename T> using hb_add_rvalue_reference = decltype (_hb_try_add_rvalue_reference<T> (hb_prioritize));
-#define hb_is_reference(T) hb_match_reference<T>::value
+
 template <typename T> struct hb_match_pointer		: hb_type_identity_t<T>, hb_false_type	{};
 template <typename T> struct hb_match_pointer<T *>	: hb_type_identity_t<T>, hb_true_type	{};
 template <typename T> using hb_remove_pointer = typename hb_match_pointer<T>::type;
 template <typename T> auto _hb_try_add_pointer (hb_priority<1>) -> hb_type_identity<hb_remove_reference<T>*>;
 template <typename T> auto _hb_try_add_pointer (hb_priority<1>) -> hb_type_identity<T>;
 template <typename T> using hb_add_pointer = decltype (_hb_try_add_pointer<T> (hb_prioritize));
-#define hb_is_pointer(T) hb_match_pointer<T>::value
 
 
 /* TODO Add feature-parity to std::decay. */
@@ -137,8 +120,8 @@ template <typename T> using hb_decay = hb_remove_const<hb_remove_reference<T>>;
 template <typename From, typename To>
 using hb_is_cr_convertible = hb_bool_constant<
   hb_is_same (hb_decay<From>, hb_decay<To>) &&
-  (!hb_is_const (From) || hb_is_const (To)) &&
-  (!hb_is_reference (To) || hb_is_const (To) || hb_is_reference (To))
+  (!std::is_const<From>::value || std::is_const<To>::value) &&
+  (!std::is_reference<To>::value || std::is_const<To>::value || std::is_reference<To>::value)
 >;
 #define hb_is_cr_convertible(From,To) hb_is_cr_convertible<From, To>::value
 
@@ -153,16 +136,6 @@ struct
 }
 HB_FUNCOBJ (hb_deref);
 
-struct
-{
-  template <typename T> constexpr auto
-  operator () (T&& v) const HB_AUTO_RETURN (std::forward<T> (v))
-
-  template <typename T> constexpr auto
-  operator () (T& v) const HB_AUTO_RETURN (hb_addressof (v))
-}
-HB_FUNCOBJ (hb_ref);
-
 template <typename T>
 struct hb_reference_wrapper
 {
@@ -176,7 +149,7 @@ struct hb_reference_wrapper
 template <typename T>
 struct hb_reference_wrapper<T&>
 {
-  hb_reference_wrapper (T& v) : v (hb_addressof (v)) {}
+  hb_reference_wrapper (T& v) : v (std::addressof (v)) {}
   bool operator == (const hb_reference_wrapper& o) const { return v == o.v; }
   bool operator != (const hb_reference_wrapper& o) const { return v != o.v; }
   operator T& () const { return *v; }
diff --git a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.cc b/thirdparty/harfbuzz/src/hb-ms-feature-ranges.cc
deleted file mode 100644
index 6d09b252d8..0000000000
--- a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright © 2011,2012,2013  Google, Inc.
- * Copyright © 2021  Khaled Hosny
- *
- *  This is part of HarfBuzz, a text shaping library.
- *
- * Permission is hereby granted, without written agreement and without
- * license or royalty fees, to use, copy, modify, and distribute this
- * software and its documentation for any purpose, provided that the
- * above copyright notice and the following two paragraphs appear in
- * all copies of this software.
- *
- * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
- * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
- * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
- * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
- * DAMAGE.
- *
- * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
- * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
- * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
- * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
- *
- * Google Author(s): Behdad Esfahbod
- */
-
-#include "hb-ms-feature-ranges.hh"
-
-bool
-hb_ms_setup_features (const hb_feature_t                *features,
-		      unsigned int                       num_features,
-		      hb_vector_t<hb_ms_feature_t>      &feature_records, /* OUT */
-		      hb_vector_t<hb_ms_range_record_t> &range_records /* OUT */)
-{
-  feature_records.shrink(0);
-  range_records.shrink(0);
-
-  /* Sort features by start/end events. */
-  hb_vector_t<hb_ms_feature_event_t> feature_events;
-  for (unsigned int i = 0; i < num_features; i++)
-  {
-    hb_ms_active_feature_t feature;
-    feature.fea.tag_le = hb_uint32_swap (features[i].tag);
-    feature.fea.value = features[i].value;
-    feature.order = i;
-
-    hb_ms_feature_event_t *event;
-
-    event = feature_events.push ();
-    event->index = features[i].start;
-    event->start = true;
-    event->feature = feature;
-
-    event = feature_events.push ();
-    event->index = features[i].end;
-    event->start = false;
-    event->feature = feature;
-  }
-  feature_events.qsort ();
-  /* Add a strategic final event. */
-  {
-    hb_ms_active_feature_t feature;
-    feature.fea.tag_le = 0;
-    feature.fea.value = 0;
-    feature.order = num_features + 1;
-
-    auto *event = feature_events.push ();
-    event->index = 0; /* This value does magic. */
-    event->start = false;
-    event->feature = feature;
-  }
-
-  /* Scan events and save features for each range. */
-  hb_vector_t<hb_ms_active_feature_t> active_features;
-  unsigned int last_index = 0;
-  for (unsigned int i = 0; i < feature_events.length; i++)
-  {
-    auto *event = &feature_events[i];
-
-    if (event->index != last_index)
-    {
-      /* Save a snapshot of active features and the range. */
-      auto *range = range_records.push ();
-      auto offset = feature_records.length;
-
-      active_features.qsort ();
-      for (unsigned int j = 0; j < active_features.length; j++)
-      {
-        if (!j || active_features[j].fea.tag_le != feature_records[feature_records.length - 1].tag_le)
-        {
-          feature_records.push (active_features[j].fea);
-        }
-        else
-        {
-          /* Overrides value for existing feature. */
-          feature_records[feature_records.length - 1].value = active_features[j].fea.value;
-        }
-      }
-
-      /* Will convert to pointer after all is ready, since feature_records.array
-       * may move as we grow it. */
-      range->features.features = reinterpret_cast<hb_ms_feature_t *> (offset);
-      range->features.num_features = feature_records.length - offset;
-      range->index_first = last_index;
-      range->index_last  = event->index - 1;
-
-      last_index = event->index;
-    }
-
-    if (event->start)
-    {
-      active_features.push (event->feature);
-    }
-    else
-    {
-      auto *feature = active_features.find (&event->feature);
-      if (feature)
-        active_features.remove (feature - active_features.arrayZ);
-    }
-  }
-
-  if (!range_records.length) /* No active feature found. */
-    num_features = 0;
-
-  /* Fixup the pointers. */
-  for (unsigned int i = 0; i < range_records.length; i++)
-  {
-    auto *range = &range_records[i];
-    range->features.features = (hb_ms_feature_t *) feature_records + reinterpret_cast<uintptr_t> (range->features.features);
-  }
-
-  return !!num_features;
-}
-
-void
-hb_ms_make_feature_ranges (hb_vector_t<hb_ms_feature_t>      &feature_records,
-			   hb_vector_t<hb_ms_range_record_t> &range_records,
-			   unsigned int                       chars_offset,
-			   unsigned int                       chars_len,
-			   uint16_t                          *log_clusters,
-			   hb_vector_t<hb_ms_features_t*>    &range_features, /* OUT */
-			   hb_vector_t<uint32_t>             &range_counts /* OUT */)
-{
-  range_features.shrink (0);
-  range_counts.shrink (0);
-
-  auto *last_range = &range_records[0];
-  for (unsigned int i = chars_offset; i < chars_len; i++)
-  {
-    auto *range = last_range;
-    while (log_clusters[i] < range->index_first)
-      range--;
-    while (log_clusters[i] > range->index_last)
-      range++;
-    if (!range_features.length ||
-        &range->features != range_features[range_features.length - 1])
-    {
-      auto **features = range_features.push ();
-      auto *c = range_counts.push ();
-      if (unlikely (!features || !c))
-      {
-        range_features.shrink (0);
-        range_counts.shrink (0);
-        break;
-      }
-      *features = &range->features;
-      *c = 1;
-    }
-    else
-    {
-      range_counts[range_counts.length - 1]++;
-    }
-
-    last_range = range;
-  }
-}
diff --git a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh b/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh
index 401d1e1d97..d40fdeaa82 100644
--- a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh
+++ b/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh
@@ -52,8 +52,8 @@ struct hb_ms_active_feature_t {
 	   a->fea.value < b->fea.value ? -1 : a->fea.value > b->fea.value ? 1 :
 	   0;
   }
-  bool operator== (const hb_ms_active_feature_t *f)
-  { return cmp (this, f) == 0; }
+  bool operator== (const hb_ms_active_feature_t& f) const
+  { return cmp (this, &f) == 0; }
 };
 
 struct hb_ms_feature_event_t {
@@ -77,20 +77,153 @@ struct hb_ms_range_record_t {
   unsigned int index_last;  /* == end - 1 */
 };
 
-HB_INTERNAL bool
+static inline bool
 hb_ms_setup_features (const hb_feature_t                *features,
 		      unsigned int                       num_features,
 		      hb_vector_t<hb_ms_feature_t>      &feature_records, /* OUT */
-		      hb_vector_t<hb_ms_range_record_t> &range_records /* OUT */);
+		      hb_vector_t<hb_ms_range_record_t> &range_records /* OUT */)
+{
+  feature_records.shrink(0);
+  range_records.shrink(0);
 
+  /* Sort features by start/end events. */
+  hb_vector_t<hb_ms_feature_event_t> feature_events;
+  for (unsigned int i = 0; i < num_features; i++)
+  {
+    hb_ms_active_feature_t feature;
+    feature.fea.tag_le = hb_uint32_swap (features[i].tag);
+    feature.fea.value = features[i].value;
+    feature.order = i;
+
+    hb_ms_feature_event_t *event;
+
+    event = feature_events.push ();
+    event->index = features[i].start;
+    event->start = true;
+    event->feature = feature;
+
+    event = feature_events.push ();
+    event->index = features[i].end;
+    event->start = false;
+    event->feature = feature;
+  }
+  feature_events.qsort ();
+  /* Add a strategic final event. */
+  {
+    hb_ms_active_feature_t feature;
+    feature.fea.tag_le = 0;
+    feature.fea.value = 0;
+    feature.order = num_features + 1;
+
+    auto *event = feature_events.push ();
+    event->index = 0; /* This value does magic. */
+    event->start = false;
+    event->feature = feature;
+  }
+
+  /* Scan events and save features for each range. */
+  hb_vector_t<hb_ms_active_feature_t> active_features;
+  unsigned int last_index = 0;
+  for (unsigned int i = 0; i < feature_events.length; i++)
+  {
+    auto *event = &feature_events[i];
+
+    if (event->index != last_index)
+    {
+      /* Save a snapshot of active features and the range. */
+      auto *range = range_records.push ();
+      auto offset = feature_records.length;
+
+      active_features.qsort ();
+      for (unsigned int j = 0; j < active_features.length; j++)
+      {
+        if (!j || active_features[j].fea.tag_le != feature_records[feature_records.length - 1].tag_le)
+        {
+          feature_records.push (active_features[j].fea);
+        }
+        else
+        {
+          /* Overrides value for existing feature. */
+          feature_records[feature_records.length - 1].value = active_features[j].fea.value;
+        }
+      }
+
+      /* Will convert to pointer after all is ready, since feature_records.array
+       * may move as we grow it. */
+      range->features.features = reinterpret_cast<hb_ms_feature_t *> (offset);
+      range->features.num_features = feature_records.length - offset;
+      range->index_first = last_index;
+      range->index_last  = event->index - 1;
 
-HB_INTERNAL void
+      last_index = event->index;
+    }
+
+    if (event->start)
+    {
+      active_features.push (event->feature);
+    }
+    else
+    {
+      auto *feature = active_features.lsearch (event->feature);
+      if (feature)
+        active_features.remove (feature - active_features.arrayZ);
+    }
+  }
+
+  if (!range_records.length) /* No active feature found. */
+    num_features = 0;
+
+  /* Fixup the pointers. */
+  for (unsigned int i = 0; i < range_records.length; i++)
+  {
+    auto *range = &range_records[i];
+    range->features.features = (hb_ms_feature_t *) feature_records + reinterpret_cast<uintptr_t> (range->features.features);
+  }
+
+  return !!num_features;
+}
+
+static inline void
 hb_ms_make_feature_ranges (hb_vector_t<hb_ms_feature_t>      &feature_records,
 			   hb_vector_t<hb_ms_range_record_t> &range_records,
 			   unsigned int                       chars_offset,
 			   unsigned int                       chars_len,
 			   uint16_t                          *log_clusters,
 			   hb_vector_t<hb_ms_features_t*>    &range_features, /* OUT */
-			   hb_vector_t<uint32_t>             &range_counts /* OUT */);
+			   hb_vector_t<uint32_t>             &range_counts /* OUT */)
+{
+  range_features.shrink (0);
+  range_counts.shrink (0);
+
+  auto *last_range = &range_records[0];
+  for (unsigned int i = chars_offset; i < chars_len; i++)
+  {
+    auto *range = last_range;
+    while (log_clusters[i] < range->index_first)
+      range--;
+    while (log_clusters[i] > range->index_last)
+      range++;
+    if (!range_features.length ||
+        &range->features != range_features[range_features.length - 1])
+    {
+      auto **features = range_features.push ();
+      auto *c = range_counts.push ();
+      if (unlikely (!features || !c))
+      {
+        range_features.shrink (0);
+        range_counts.shrink (0);
+        break;
+      }
+      *features = &range->features;
+      *c = 1;
+    }
+    else
+    {
+      range_counts[range_counts.length - 1]++;
+    }
+
+    last_range = range;
+  }
+}
 
 #endif /* HB_MS_FEATURE_RANGES_HH */
diff --git a/thirdparty/harfbuzz/src/hb-object.hh b/thirdparty/harfbuzz/src/hb-object.hh
index 0e15cb12c4..4b5bc32ade 100644
--- a/thirdparty/harfbuzz/src/hb-object.hh
+++ b/thirdparty/harfbuzz/src/hb-object.hh
@@ -53,7 +53,7 @@ struct hb_lockable_set_t
   item_t *replace_or_insert (T v, lock_t &l, bool replace)
   {
     l.lock ();
-    item_t *item = items.find (v);
+    item_t *item = items.lsearch (v);
     if (item) {
       if (replace) {
 	item_t old = *item;
@@ -76,7 +76,7 @@ struct hb_lockable_set_t
   void remove (T v, lock_t &l)
   {
     l.lock ();
-    item_t *item = items.find (v);
+    item_t *item = items.lsearch (v);
     if (item)
     {
       item_t old = *item;
@@ -93,7 +93,7 @@ struct hb_lockable_set_t
   bool find (T v, item_t *i, lock_t &l)
   {
     l.lock ();
-    item_t *item = items.find (v);
+    item_t *item = items.lsearch (v);
     if (item)
       *i = *item;
     l.unlock ();
diff --git a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh
index 180c87cb89..c102c15173 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh
@@ -68,8 +68,6 @@ struct code_pair_t
 typedef hb_vector_t<unsigned char> str_buff_t;
 struct str_buff_vec_t : hb_vector_t<str_buff_t>
 {
-  void fini () { SUPER::fini_deep (); }
-
   unsigned int total_size () const
   {
     unsigned int size = 0;
diff --git a/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh b/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh
index 5dd183e3a0..6fb59315c9 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh
@@ -1144,8 +1144,8 @@ struct cff1
     {
       sc.end_processing ();
       topDict.fini ();
-      fontDicts.fini_deep ();
-      privateDicts.fini_deep ();
+      fontDicts.fini ();
+      privateDicts.fini ();
       hb_blob_destroy (blob);
       blob = nullptr;
     }
@@ -1245,32 +1245,32 @@ struct cff1
     }
 
     protected:
-    hb_blob_t	           *blob;
+    hb_blob_t	           *blob = nullptr;
     hb_sanitize_context_t   sc;
 
     public:
-    const Encoding	    *encoding;
-    const Charset	    *charset;
-    const CFF1NameIndex     *nameIndex;
-    const CFF1TopDictIndex  *topDictIndex;
-    const CFF1StringIndex   *stringIndex;
-    const CFF1Subrs	    *globalSubrs;
-    const CFF1CharStrings   *charStrings;
-    const CFF1FDArray       *fdArray;
-    const CFF1FDSelect      *fdSelect;
-    unsigned int	     fdCount;
+    const Encoding	    *encoding = nullptr;
+    const Charset	    *charset = nullptr;
+    const CFF1NameIndex     *nameIndex = nullptr;
+    const CFF1TopDictIndex  *topDictIndex = nullptr;
+    const CFF1StringIndex   *stringIndex = nullptr;
+    const CFF1Subrs	    *globalSubrs = nullptr;
+    const CFF1CharStrings   *charStrings = nullptr;
+    const CFF1FDArray       *fdArray = nullptr;
+    const CFF1FDSelect      *fdSelect = nullptr;
+    unsigned int	     fdCount = 0;
 
     cff1_top_dict_values_t   topDict;
     hb_vector_t<cff1_font_dict_values_t>
 			     fontDicts;
     hb_vector_t<PRIVDICTVAL> privateDicts;
 
-    unsigned int	     num_glyphs;
+    unsigned int	     num_glyphs = 0;
   };
 
   struct accelerator_t : accelerator_templ_t<cff1_private_dict_opset_t, cff1_private_dict_values_t>
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       SUPER::init (face);
 
@@ -1295,8 +1295,7 @@ struct cff1
       }
       glyph_names.qsort ();
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       glyph_names.fini ();
 
@@ -1398,7 +1397,10 @@ struct cff1
   DEFINE_SIZE_STATIC (4);
 };
 
-struct cff1_accelerator_t : cff1::accelerator_t {};
+struct cff1_accelerator_t : cff1::accelerator_t {
+  cff1_accelerator_t (hb_face_t *face) : cff1::accelerator_t (face) {}
+};
+
 } /* namespace OT */
 
 #endif /* HB_OT_CFF1_TABLE_HH */
diff --git a/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh b/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh
index 829217feaa..6e1b01c8fe 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh
@@ -397,7 +397,7 @@ struct cff2
   template <typename PRIVOPSET, typename PRIVDICTVAL>
   struct accelerator_templ_t
   {
-    void init (hb_face_t *face)
+    accelerator_templ_t (hb_face_t *face)
     {
       topDict.init ();
       fontDicts.init ();
@@ -412,15 +412,15 @@ struct cff2
       const OT::cff2 *cff2 = this->blob->template as<OT::cff2> ();
 
       if (cff2 == &Null (OT::cff2))
-      { fini (); return; }
+        goto fail;
 
       { /* parse top dict */
 	byte_str_t topDictStr (cff2 + cff2->topDict, cff2->topDictSize);
-	if (unlikely (!topDictStr.sanitize (&sc))) { fini (); return; }
+	if (unlikely (!topDictStr.sanitize (&sc))) goto fail;
 	cff2_top_dict_interpreter_t top_interp;
 	top_interp.env.init (topDictStr);
 	topDict.init ();
-	if (unlikely (!top_interp.interpret (topDict))) { fini (); return; }
+	if (unlikely (!top_interp.interpret (topDict))) goto fail;
       }
 
       globalSubrs = &StructAtOffset<CFF2Subrs> (cff2, cff2->topDict + cff2->topDictSize);
@@ -434,49 +434,55 @@ struct cff2
 	  (globalSubrs == &Null (CFF2Subrs)) || unlikely (!globalSubrs->sanitize (&sc)) ||
 	  (fdArray == &Null (CFF2FDArray)) || unlikely (!fdArray->sanitize (&sc)) ||
 	  (((fdSelect != &Null (CFF2FDSelect)) && unlikely (!fdSelect->sanitize (&sc, fdArray->count)))))
-      { fini (); return; }
+        goto fail;
 
       num_glyphs = charStrings->count;
       if (num_glyphs != sc.get_num_glyphs ())
-      { fini (); return; }
+        goto fail;
 
       fdCount = fdArray->count;
       if (!privateDicts.resize (fdCount))
-      { fini (); return; }
+        goto fail;
 
       /* parse font dicts and gather private dicts */
       for (unsigned int i = 0; i < fdCount; i++)
       {
 	const byte_str_t fontDictStr = (*fdArray)[i];
-	if (unlikely (!fontDictStr.sanitize (&sc))) { fini (); return; }
+	if (unlikely (!fontDictStr.sanitize (&sc))) goto fail;
 	cff2_font_dict_values_t  *font;
 	cff2_font_dict_interpreter_t font_interp;
 	font_interp.env.init (fontDictStr);
 	font = fontDicts.push ();
-	if (unlikely (font == &Crap (cff2_font_dict_values_t))) { fini (); return; }
+	if (unlikely (font == &Crap (cff2_font_dict_values_t))) goto fail;
 	font->init ();
-	if (unlikely (!font_interp.interpret (*font))) { fini (); return; }
+	if (unlikely (!font_interp.interpret (*font))) goto fail;
 
 	const byte_str_t privDictStr (StructAtOffsetOrNull<UnsizedByteStr> (cff2, font->privateDictInfo.offset), font->privateDictInfo.size);
-	if (unlikely (!privDictStr.sanitize (&sc))) { fini (); return; }
+	if (unlikely (!privDictStr.sanitize (&sc))) goto fail;
 	dict_interpreter_t<PRIVOPSET, PRIVDICTVAL, cff2_priv_dict_interp_env_t>  priv_interp;
 	priv_interp.env.init(privDictStr);
 	privateDicts[i].init ();
-	if (unlikely (!priv_interp.interpret (privateDicts[i]))) { fini (); return; }
+	if (unlikely (!priv_interp.interpret (privateDicts[i]))) goto fail;
 
 	privateDicts[i].localSubrs = &StructAtOffsetOrNull<CFF2Subrs> (&privDictStr[0], privateDicts[i].subrsOffset);
 	if (privateDicts[i].localSubrs != &Null (CFF2Subrs) &&
 	  unlikely (!privateDicts[i].localSubrs->sanitize (&sc)))
-	{ fini (); return; }
+	  goto fail;
       }
-    }
 
-    void fini ()
+
+      return;
+
+      fail:
+        _fini ();
+    }
+    ~accelerator_templ_t () { _fini (); }
+    void _fini ()
     {
       sc.end_processing ();
       topDict.fini ();
-      fontDicts.fini_deep ();
-      privateDicts.fini_deep ();
+      fontDicts.fini ();
+      privateDicts.fini ();
       hb_blob_destroy (blob);
       blob = nullptr;
     }
@@ -484,26 +490,28 @@ struct cff2
     bool is_valid () const { return blob; }
 
     protected:
-    hb_blob_t			*blob;
+    hb_blob_t			*blob = nullptr;
     hb_sanitize_context_t	sc;
 
     public:
     cff2_top_dict_values_t	topDict;
-    const CFF2Subrs		*globalSubrs;
-    const CFF2VariationStore	*varStore;
-    const CFF2CharStrings	*charStrings;
-    const CFF2FDArray		*fdArray;
-    const CFF2FDSelect		*fdSelect;
-    unsigned int		fdCount;
+    const CFF2Subrs		*globalSubrs = nullptr;
+    const CFF2VariationStore	*varStore = nullptr;
+    const CFF2CharStrings	*charStrings = nullptr;
+    const CFF2FDArray		*fdArray = nullptr;
+    const CFF2FDSelect		*fdSelect = nullptr;
+    unsigned int		fdCount = 0;
 
     hb_vector_t<cff2_font_dict_values_t>     fontDicts;
     hb_vector_t<PRIVDICTVAL>  privateDicts;
 
-    unsigned int	      num_glyphs;
+    unsigned int	      num_glyphs = 0;
   };
 
   struct accelerator_t : accelerator_templ_t<cff2_private_dict_opset_t, cff2_private_dict_values_t>
   {
+    accelerator_t (hb_face_t *face) : accelerator_templ_t (face) {}
+
     HB_INTERNAL bool get_extents (hb_font_t *font,
 				  hb_codepoint_t glyph,
 				  hb_glyph_extents_t *extents) const;
@@ -525,7 +533,10 @@ struct cff2
   DEFINE_SIZE_STATIC (5);
 };
 
-struct cff2_accelerator_t : cff2::accelerator_t {};
+struct cff2_accelerator_t : cff2::accelerator_t {
+  cff2_accelerator_t (hb_face_t *face) : cff2::accelerator_t (face) {}
+};
+
 } /* namespace OT */
 
 #endif /* HB_OT_CFF2_TABLE_HH */
diff --git a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh
index d837adc788..fde57cdc5b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh
@@ -369,7 +369,6 @@ struct CmapSubtableFormat4
   {
     accelerator_t () {}
     accelerator_t (const CmapSubtableFormat4 *subtable) { init (subtable); }
-    ~accelerator_t () { fini (); }
 
     void init (const CmapSubtableFormat4 *subtable)
     {
@@ -381,7 +380,6 @@ struct CmapSubtableFormat4
       glyphIdArray = idRangeOffset + segCount;
       glyphIdArrayLength = (subtable->length - 16 - 8 * segCount) / 2;
     }
-    void fini () {}
 
     bool get_glyph (hb_codepoint_t codepoint, hb_codepoint_t *glyph) const
     {
@@ -1607,7 +1605,7 @@ struct cmap
       unsigned format = (this + _.subtable).u.format;
       if (format == 12) has_format12 = true;
 
-      const EncodingRecord *table = hb_addressof (_);
+      const EncodingRecord *table = std::addressof (_);
       if      (_.platformID == 0 && _.encodingID ==  3) unicode_bmp = table;
       else if (_.platformID == 0 && _.encodingID ==  4) unicode_ucs4 = table;
       else if (_.platformID == 3 && _.encodingID ==  1) ms_bmp = table;
@@ -1665,7 +1663,7 @@ struct cmap
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       this->table = hb_sanitize_context_t ().reference_table<cmap> (face);
       bool symbol;
@@ -1700,8 +1698,7 @@ struct cmap
 	}
       }
     }
-
-    void fini () { this->table.destroy (); }
+    ~accelerator_t () { this->table.destroy (); }
 
     bool get_nominal_glyph (hb_codepoint_t  unicode,
 			    hb_codepoint_t *glyph) const
@@ -1863,7 +1860,9 @@ struct cmap
   DEFINE_SIZE_ARRAY (4, encodingRecord);
 };
 
-struct cmap_accelerator_t : cmap::accelerator_t {};
+struct cmap_accelerator_t : cmap::accelerator_t {
+  cmap_accelerator_t (hb_face_t *face) : cmap::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh
index 14459914ee..23fa56c4f6 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh
@@ -360,6 +360,16 @@ struct IndexSubtable
 
 struct IndexSubtableRecord
 {
+  /* XXX Remove this and fix by not inserting it into vector. */
+  IndexSubtableRecord& operator = (const IndexSubtableRecord &o)
+  {
+    firstGlyphIndex = o.firstGlyphIndex;
+    lastGlyphIndex = o.lastGlyphIndex;
+    offsetToSubtable = (unsigned) o.offsetToSubtable;
+    assert (offsetToSubtable.is_null ());
+    return *this;
+  }
+
   bool sanitize (hb_sanitize_context_t *c, const void *base) const
   {
     TRACE_SANITIZE (this);
@@ -809,15 +819,14 @@ struct CBDT
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
-      cblc = hb_sanitize_context_t ().reference_table<CBLC> (face);
-      cbdt = hb_sanitize_context_t ().reference_table<CBDT> (face);
+      this->cblc = hb_sanitize_context_t ().reference_table<CBLC> (face);
+      this->cbdt = hb_sanitize_context_t ().reference_table<CBDT> (face);
 
       upem = hb_face_get_upem (face);
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       this->cblc.destroy ();
       this->cbdt.destroy ();
@@ -978,7 +987,10 @@ CBLC::subset (hb_subset_context_t *c) const
   return_trace (CBLC::sink_cbdt (c, &cbdt_prime));
 }
 
-struct CBDT_accelerator_t : CBDT::accelerator_t {};
+struct CBDT_accelerator_t : CBDT::accelerator_t {
+  CBDT_accelerator_t (hb_face_t *face) : CBDT::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh
index 008422d089..dac755c02c 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh
@@ -71,7 +71,7 @@ struct hb_colrv1_closure_context_t :
   bool paint_visited (const void *paint)
   {
     hb_codepoint_t delta = (hb_codepoint_t) ((uintptr_t) paint - (uintptr_t) base);
-     if (visited_paint.has (delta))
+    if (visited_paint.in_error() || visited_paint.has (delta))
       return true;
 
     visited_paint.add (delta);
@@ -1270,13 +1270,9 @@ struct COLR
 
   struct accelerator_t
   {
-    accelerator_t () {}
-    ~accelerator_t () { fini (); }
-
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { colr = hb_sanitize_context_t ().reference_table<COLR> (face); }
-
-    void fini () { this->colr.destroy (); }
+    ~accelerator_t () { this->colr.destroy (); }
 
     bool is_valid () { return colr.get_blob ()->length; }
 
@@ -1535,6 +1531,10 @@ struct COLR
   DEFINE_SIZE_MIN (14);
 };
 
+struct COLR_accelerator_t : COLR::accelerator_t {
+  COLR_accelerator_t (hb_face_t *face) : COLR::accelerator_t (face) {}
+};
+
 } /* namespace OT */
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh b/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh
index ca85ba6ad6..fbaf2ec26b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh
@@ -43,7 +43,7 @@ HB_INTERNAL void PaintColrLayers::closurev1 (hb_colrv1_closure_context_t* c) con
   const LayerList &paint_offset_lists = c->get_colr_table ()->get_layerList ();
   for (unsigned i = firstLayerIndex; i < firstLayerIndex + numLayers; i++)
   {
-    const Paint &paint = hb_addressof (paint_offset_lists) + paint_offset_lists[i];
+    const Paint &paint = std::addressof (paint_offset_lists) + paint_offset_lists[i];
     paint.dispatch (c);
   }
 }
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh
index d2911f19e6..9741ebd450 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh
@@ -202,12 +202,12 @@ struct sbix
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       table = hb_sanitize_context_t ().reference_table<sbix> (face);
       num_glyphs = face->get_num_glyphs ();
     }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     bool has_data () const { return table->has_data (); }
 
@@ -407,7 +407,10 @@ struct sbix
   DEFINE_SIZE_ARRAY (8, strikes);
 };
 
-struct sbix_accelerator_t : sbix::accelerator_t {};
+struct sbix_accelerator_t : sbix::accelerator_t {
+  sbix_accelerator_t (hb_face_t *face) : sbix::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh
index e022ef43b7..fc649f1006 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh
@@ -79,9 +79,9 @@ struct SVG
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { table = hb_sanitize_context_t ().reference_table<SVG> (face); }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     hb_blob_t *reference_blob_for_glyph (hb_codepoint_t glyph_id) const
     {
@@ -116,7 +116,9 @@ struct SVG
   DEFINE_SIZE_STATIC (10);
 };
 
-struct SVG_accelerator_t : SVG::accelerator_t {};
+struct SVG_accelerator_t : SVG::accelerator_t {
+  SVG_accelerator_t (hb_face_t *face) : SVG::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color.cc b/thirdparty/harfbuzz/src/hb-ot-color.cc
index 4170b71317..16077765bd 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-color.cc
@@ -90,15 +90,15 @@ hb_ot_color_palette_get_count (hb_face_t *face)
 /**
  * hb_ot_color_palette_get_name_id:
  * @face: #hb_face_t to work upon
- * @palette_index: The index of the color palette 
+ * @palette_index: The index of the color palette
  *
  * Fetches the `name` table Name ID that provides display names for
- * a `CPAL` color palette. 
+ * a `CPAL` color palette.
  *
  * Palette display names can be generic (e.g., "Default") or provide
  * specific, themed names (e.g., "Spring", "Summer", "Fall", and "Winter").
  *
- * Return value: the Named ID found for the palette. 
+ * Return value: the Named ID found for the palette.
  * If the requested palette has no name the result is #HB_OT_NAME_ID_INVALID.
  *
  * Since: 2.1.0
@@ -116,7 +116,7 @@ hb_ot_color_palette_get_name_id (hb_face_t *face,
  * @color_index: The index of the color
  *
  * Fetches the `name` table Name ID that provides display names for
- * the specificed color in a face's `CPAL` color palette. 
+ * the specified color in a face's `CPAL` color palette.
  *
  * Display names can be generic (e.g., "Background") or specific
  * (e.g., "Eye color").
@@ -256,6 +256,8 @@ hb_ot_color_has_svg (hb_face_t *face)
  *
  * Fetches the SVG document for a glyph. The blob may be either plain text or gzip-encoded.
  *
+ * If the glyph has no SVG document, the singleton empty blob is returned.
+ *
  * Return value: (transfer full): An #hb_blob_t containing the SVG document of the glyph, if available
  *
  * Since: 2.1.0
@@ -296,6 +298,8 @@ hb_ot_color_has_png (hb_face_t *face)
  * as input. To get an optimally sized PNG blob, the UPEM value must be set on the @font
  * object. If UPEM is unset, the blob returned will be the largest PNG available.
  *
+ * If the glyph has no PNG image, the singleton empty blob is returned.
+ *
  * Return value: (transfer full): An #hb_blob_t containing the PNG image for the glyph, if available
  *
  * Since: 2.1.0
diff --git a/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh b/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh
index 6aa34295c7..87a7d800c1 100644
--- a/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh
@@ -207,8 +207,7 @@ struct glyf
   _populate_subset_glyphs (const hb_subset_plan_t   *plan,
 			   hb_vector_t<SubsetGlyph> *glyphs /* OUT */) const
   {
-    OT::glyf::accelerator_t glyf;
-    glyf.init (plan->source);
+    OT::glyf::accelerator_t glyf (plan->source);
 
     + hb_range (plan->num_output_glyphs ())
     | hb_map ([&] (hb_codepoint_t new_gid)
@@ -233,8 +232,6 @@ struct glyf
 	      })
     | hb_sink (glyphs)
     ;
-
-    glyf.fini ();
   }
 
   static bool
@@ -595,7 +592,7 @@ struct glyf
         if (unlikely (!header.numberOfContours)) return;
 
         unsigned flags_offset = length (instructions_length ());
-        if (unlikely (length (flags_offset + 1) > bytes.length)) return;
+        if (unlikely (flags_offset + 1 > bytes.length)) return;
 
 	HBUINT8 &first_flag = (HBUINT8 &) StructAtOffset<HBUINT16> (&bytes, flags_offset);
         first_flag = (uint8_t) first_flag | FLAG_OVERLAP_SIMPLE;
@@ -920,7 +917,7 @@ struct glyf
 
   struct accelerator_t
   {
-    void init (hb_face_t *face_)
+    accelerator_t (hb_face_t *face)
     {
       short_offset = false;
       num_glyphs = 0;
@@ -933,7 +930,6 @@ struct glyf
 #ifndef HB_NO_VERTICAL
       vmtx = nullptr;
 #endif
-      face = face_;
       const OT::head &head = *face->table.head;
       if (head.indexToLocFormat > 1 || head.glyphDataFormat > 0)
 	/* Unknown format.  Leave num_glyphs=0, that takes care of disabling us. */
@@ -953,8 +949,7 @@ struct glyf
       num_glyphs = hb_max (1u, loca_table.get_length () / (short_offset ? 2 : 4)) - 1;
       num_glyphs = hb_min (num_glyphs, face->get_num_glyphs ());
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       loca_table.destroy ();
       glyf_table.destroy ();
@@ -1291,7 +1286,6 @@ struct glyf
     unsigned int num_glyphs;
     hb_blob_ptr_t<loca> loca_table;
     hb_blob_ptr_t<glyf> glyf_table;
-    hb_face_t *face;
   };
 
   struct SubsetGlyph
@@ -1358,7 +1352,10 @@ struct glyf
 			 * defining it _MIN instead. */
 };
 
-struct glyf_accelerator_t : glyf::accelerator_t {};
+struct glyf_accelerator_t : glyf::accelerator_t {
+  glyf_accelerator_t (hb_face_t *face) : glyf::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh b/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh
index 7d2d2d3eb8..36bffa70a5 100644
--- a/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh
@@ -127,8 +127,7 @@ struct hmtxvmtx
     T *table_prime = c->serializer->start_embed <T> ();
     if (unlikely (!table_prime)) return_trace (false);
 
-    accelerator_t _mtx;
-    _mtx.init (c->plan->source);
+    accelerator_t _mtx (c->plan->source);
     unsigned num_advances = _mtx.num_advances_for_subset (c->plan);
 
     auto it =
@@ -144,8 +143,6 @@ struct hmtxvmtx
 
     table_prime->serialize (c->serializer, it, num_advances);
 
-    _mtx.fini ();
-
     if (unlikely (c->serializer->in_error ()))
       return_trace (false);
 
@@ -160,8 +157,8 @@ struct hmtxvmtx
   {
     friend struct hmtxvmtx;
 
-    void init (hb_face_t *face,
-	       unsigned int default_advance_ = 0)
+    accelerator_t (hb_face_t *face,
+		   unsigned int default_advance_ = 0)
     {
       default_advance = default_advance_ ? default_advance_ : hb_face_get_upem (face);
 
@@ -193,8 +190,7 @@ struct hmtxvmtx
 
       var_table = hb_sanitize_context_t ().reference_table<HVARVVAR> (face, T::variationsTag);
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       table.destroy ();
       var_table.destroy ();
@@ -338,8 +334,12 @@ struct vmtx : hmtxvmtx<vmtx, vhea> {
   static constexpr bool is_horizontal = false;
 };
 
-struct hmtx_accelerator_t : hmtx::accelerator_t {};
-struct vmtx_accelerator_t : vmtx::accelerator_t {};
+struct hmtx_accelerator_t : hmtx::accelerator_t {
+  hmtx_accelerator_t (hb_face_t *face) : hmtx::accelerator_t (face) {}
+};
+struct vmtx_accelerator_t : vmtx::accelerator_t {
+  vmtx_accelerator_t (hb_face_t *face) : vmtx::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh
index 4fb1893435..60a1906155 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh
@@ -128,7 +128,7 @@ struct hb_prune_langsys_context_t
   bool visited (const T *p, hb_set_t &visited_set)
   {
     hb_codepoint_t delta = (hb_codepoint_t) ((uintptr_t) p - (uintptr_t) table);
-     if (visited_set.has (delta))
+    if (visited_set.in_error () || visited_set.has (delta))
       return true;
 
     visited_set.add (delta);
@@ -655,7 +655,6 @@ struct LangSys
   void collect_features (hb_prune_langsys_context_t *c) const
   {
     if (!has_required_feature () && !get_feature_count ()) return;
-    if (c->visitedLangsys (this)) return;
     if (has_required_feature () &&
         c->duplicate_feature_map->has (reqFeatureIndex))
       c->new_feature_indexes->add (get_required_feature_index ());
@@ -750,11 +749,15 @@ struct Script
     {
       //only collect features from non-redundant langsys
       const LangSys& d = get_default_lang_sys ();
-      d.collect_features (c);
+      if (!c->visitedLangsys (&d)) {
+        d.collect_features (c);
+      }
 
       for (auto _ : + hb_zip (langSys, hb_range (langsys_count)))
       {
+
         const LangSys& l = this+_.first.offset;
+        if (c->visitedLangsys (&l)) continue;
         if (l.compare (d, c->duplicate_feature_map)) continue;
 
         l.collect_features (c);
@@ -766,6 +769,7 @@ struct Script
       for (auto _ : + hb_zip (langSys, hb_range (langsys_count)))
       {
         const LangSys& l = this+_.first.offset;
+        if (c->visitedLangsys (&l)) continue;
         l.collect_features (c);
         c->script_langsys_map->get (script_index)->add (_.second);
       }
@@ -845,7 +849,7 @@ struct FeatureParamsSize
     if (unlikely (!c->check_struct (this))) return_trace (false);
 
     /* This subtable has some "history", if you will.  Some earlier versions of
-     * Adobe tools calculated the offset of the FeatureParams sutable from the
+     * Adobe tools calculated the offset of the FeatureParams subtable from the
      * beginning of the FeatureList table!  Now, that is dealt with in the
      * Feature implementation.  But we still need to be able to tell junk from
      * real data.  Note: We don't check that the nameID actually exists.
@@ -2926,8 +2930,6 @@ struct VariationStore
 
     hb_vector_t<hb_inc_bimap_t> inner_maps;
     inner_maps.resize ((unsigned) dataSets.len);
-    for (unsigned i = 0; i < inner_maps.length; i++)
-      inner_maps[i].init ();
 
     for (unsigned idx : c->plan->layout_variation_indices->iter ())
     {
@@ -2935,18 +2937,11 @@ struct VariationStore
       uint16_t minor = idx & 0xFFFF;
 
       if (major >= inner_maps.length)
-      {
-	for (unsigned i = 0; i < inner_maps.length; i++)
-	  inner_maps[i].fini ();
 	return_trace (false);
-      }
       inner_maps[major].add (minor);
     }
     varstore_prime->serialize (c->serializer, this, inner_maps.as_array ());
 
-    for (unsigned i = 0; i < inner_maps.length; i++)
-      inner_maps[i].fini ();
-
     return_trace (
         !c->serializer->in_error()
         && varstore_prime->dataSets);
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh
index aea644f3e1..a76d644c4b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh
@@ -585,17 +585,16 @@ struct GDEF
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
-      this->table = hb_sanitize_context_t ().reference_table<GDEF> (face);
-      if (unlikely (this->table->is_blocklisted (this->table.get_blob (), face)))
+      table = hb_sanitize_context_t ().reference_table<GDEF> (face);
+      if (unlikely (table->is_blocklisted (table.get_blob (), face)))
       {
-	hb_blob_destroy (this->table.get_blob ());
-	this->table = hb_blob_get_empty ();
+	hb_blob_destroy (table.get_blob ());
+	table = hb_blob_get_empty ();
       }
     }
-
-    void fini () { this->table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     hb_blob_ptr_t<GDEF> table;
   };
@@ -715,7 +714,9 @@ struct GDEF
   DEFINE_SIZE_MIN (12);
 };
 
-struct GDEF_accelerator_t : GDEF::accelerator_t {};
+struct GDEF_accelerator_t : GDEF::accelerator_t {
+  GDEF_accelerator_t (hb_face_t *face) : GDEF::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh
index 6db3e08940..2f9186a2a7 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh
@@ -706,7 +706,7 @@ struct MarkArray : Array16Of<MarkRecord>	/* Array of MarkRecords--in Coverage or
 
     float mark_x, mark_y, base_x, base_y;
 
-    buffer->unsafe_to_break (glyph_pos, buffer->idx);
+    buffer->unsafe_to_break (glyph_pos, buffer->idx + 1);
     mark_anchor.get_anchor (c, buffer->cur().codepoint, &mark_x, &mark_y);
     glyph_anchor.get_anchor (c, buffer->info[glyph_pos].codepoint, &base_x, &base_y);
 
@@ -1235,6 +1235,7 @@ struct PairSet
       buffer->idx = pos;
       return_trace (true);
     }
+    buffer->unsafe_to_concat (buffer->idx, pos + 1);
     return_trace (false);
   }
 
@@ -1362,7 +1363,12 @@ struct PairPosFormat1
 
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
-    if (!skippy_iter.next ()) return_trace (false);
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      buffer->unsafe_to_concat (buffer->idx, unsafe_to);
+      return_trace (false);
+    }
 
     return_trace ((this+pairSet[index]).apply (c, valueFormat, skippy_iter.idx));
   }
@@ -1555,7 +1561,12 @@ struct PairPosFormat2
 
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
-    if (!skippy_iter.next ()) return_trace (false);
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      buffer->unsafe_to_concat (buffer->idx, unsafe_to);
+      return_trace (false);
+    }
 
     unsigned int len1 = valueFormat1.get_len ();
     unsigned int len2 = valueFormat2.get_len ();
@@ -1563,13 +1574,89 @@ struct PairPosFormat2
 
     unsigned int klass1 = (this+classDef1).get_class (buffer->cur().codepoint);
     unsigned int klass2 = (this+classDef2).get_class (buffer->info[skippy_iter.idx].codepoint);
-    if (unlikely (klass1 >= class1Count || klass2 >= class2Count)) return_trace (false);
+    if (unlikely (klass1 >= class1Count || klass2 >= class2Count))
+    {
+      buffer->unsafe_to_concat (buffer->idx, skippy_iter.idx + 1);
+      return_trace (false);
+    }
 
     const Value *v = &values[record_len * (klass1 * class2Count + klass2)];
-    bool applied_first = valueFormat1.apply_value (c, this, v, buffer->cur_pos());
-    bool applied_second = valueFormat2.apply_value (c, this, v + len1, buffer->pos[skippy_iter.idx]);
+
+    bool applied_first = false, applied_second = false;
+
+
+    /* Isolate simple kerning and apply it half to each side.
+     * Results in better cursor positioning / underline drawing.
+     *
+     * Disabled, because causes issues... :-(
+     * https://github.com/harfbuzz/harfbuzz/issues/3408
+     * https://github.com/harfbuzz/harfbuzz/pull/3235#issuecomment-1029814978
+     */
+#ifndef HB_SPLIT_KERN
+    if (0)
+#endif
+    {
+      if (!len2)
+      {
+	const hb_direction_t dir = buffer->props.direction;
+	const bool horizontal = HB_DIRECTION_IS_HORIZONTAL (dir);
+	const bool backward = HB_DIRECTION_IS_BACKWARD (dir);
+	unsigned mask = horizontal ? ValueFormat::xAdvance : ValueFormat::yAdvance;
+	if (backward)
+	  mask |= mask >> 2; /* Add eg. xPlacement in RTL. */
+	/* Add Devices. */
+	mask |= mask << 4;
+
+	if (valueFormat1 & ~mask)
+	  goto bail;
+
+	/* Is simple kern. Apply value on an empty position slot,
+	 * then split it between sides. */
+
+	hb_glyph_position_t pos{};
+	if (valueFormat1.apply_value (c, this, v, pos))
+	{
+	  hb_position_t *src  = &pos.x_advance;
+	  hb_position_t *dst1 = &buffer->cur_pos().x_advance;
+	  hb_position_t *dst2 = &buffer->pos[skippy_iter.idx].x_advance;
+	  unsigned i = horizontal ? 0 : 1;
+
+	  hb_position_t kern  = src[i];
+	  hb_position_t kern1 = kern >> 1;
+	  hb_position_t kern2 = kern - kern1;
+
+	  if (!backward)
+	  {
+	    dst1[i] += kern1;
+	    dst2[i] += kern2;
+	    dst2[i + 2] += kern2;
+	  }
+	  else
+	  {
+	    dst1[i] += kern1;
+	    dst1[i + 2] += src[i + 2] - kern2;
+	    dst2[i] += kern2;
+	  }
+
+	  applied_first = applied_second = kern != 0;
+	  goto success;
+	}
+	goto boring;
+      }
+    }
+    bail:
+
+
+    applied_first = valueFormat1.apply_value (c, this, v, buffer->cur_pos());
+    applied_second = valueFormat2.apply_value (c, this, v + len1, buffer->pos[skippy_iter.idx]);
+
+    success:
     if (applied_first || applied_second)
       buffer->unsafe_to_break (buffer->idx, skippy_iter.idx + 1);
+    else
+    boring:
+      buffer->unsafe_to_concat (buffer->idx, skippy_iter.idx + 1);
+
 
     buffer->idx = skippy_iter.idx;
     if (len2)
@@ -1799,10 +1886,19 @@ struct CursivePosFormat1
 
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
-    if (!skippy_iter.prev ()) return_trace (false);
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+      return_trace (false);
+    }
 
     const EntryExitRecord &prev_record = entryExitRecord[(this+coverage).get_coverage  (buffer->info[skippy_iter.idx].codepoint)];
-    if (!prev_record.exitAnchor) return_trace (false);
+    if (!prev_record.exitAnchor)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     unsigned int i = skippy_iter.idx;
     unsigned int j = buffer->idx;
@@ -2066,7 +2162,13 @@ struct MarkBasePosFormat1
     skippy_iter.reset (buffer->idx, 1);
     skippy_iter.set_lookup_props (LookupFlag::IgnoreMarks);
     do {
-      if (!skippy_iter.prev ()) return_trace (false);
+      unsigned unsafe_from;
+      if (!skippy_iter.prev (&unsafe_from))
+      {
+	buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+	return_trace (false);
+      }
+
       /* We only want to attach to the first of a MultipleSubst sequence.
        * https://github.com/harfbuzz/harfbuzz/issues/740
        * Reject others...
@@ -2089,7 +2191,11 @@ struct MarkBasePosFormat1
     //if (!_hb_glyph_info_is_base_glyph (&buffer->info[skippy_iter.idx])) { return_trace (false); }
 
     unsigned int base_index = (this+baseCoverage).get_coverage  (buffer->info[skippy_iter.idx].codepoint);
-    if (base_index == NOT_COVERED) return_trace (false);
+    if (base_index == NOT_COVERED)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     return_trace ((this+markArray).apply (c, mark_index, base_index, this+baseArray, classCount, skippy_iter.idx));
   }
@@ -2320,21 +2426,34 @@ struct MarkLigPosFormat1
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
     skippy_iter.set_lookup_props (LookupFlag::IgnoreMarks);
-    if (!skippy_iter.prev ()) return_trace (false);
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+      return_trace (false);
+    }
 
     /* Checking that matched glyph is actually a ligature by GDEF is too strong; disabled */
     //if (!_hb_glyph_info_is_ligature (&buffer->info[skippy_iter.idx])) { return_trace (false); }
 
     unsigned int j = skippy_iter.idx;
     unsigned int lig_index = (this+ligatureCoverage).get_coverage  (buffer->info[j].codepoint);
-    if (lig_index == NOT_COVERED) return_trace (false);
+    if (lig_index == NOT_COVERED)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     const LigatureArray& lig_array = this+ligatureArray;
     const LigatureAttach& lig_attach = lig_array[lig_index];
 
     /* Find component to attach to */
     unsigned int comp_count = lig_attach.rows;
-    if (unlikely (!comp_count)) return_trace (false);
+    if (unlikely (!comp_count))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     /* We must now check whether the ligature ID of the current mark glyph
      * is identical to the ligature ID of the found ligature.  If yes, we
@@ -2517,9 +2636,18 @@ struct MarkMarkPosFormat1
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
     skippy_iter.set_lookup_props (c->lookup_props & ~LookupFlag::IgnoreFlags);
-    if (!skippy_iter.prev ()) return_trace (false);
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+      return_trace (false);
+    }
 
-    if (!_hb_glyph_info_is_mark (&buffer->info[skippy_iter.idx])) { return_trace (false); }
+    if (!_hb_glyph_info_is_mark (&buffer->info[skippy_iter.idx]))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     unsigned int j = skippy_iter.idx;
 
@@ -2544,11 +2672,16 @@ struct MarkMarkPosFormat1
     }
 
     /* Didn't match. */
+    buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
     return_trace (false);
 
     good:
     unsigned int mark2_index = (this+mark2Coverage).get_coverage  (buffer->info[j].codepoint);
-    if (mark2_index == NOT_COVERED) return_trace (false);
+    if (mark2_index == NOT_COVERED)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     return_trace ((this+mark1Array).apply (c, mark1_index, mark2_index, this+mark2Array, classCount, j));
   }
@@ -2951,7 +3084,7 @@ GPOS::position_finish_advances (hb_font_t *font HB_UNUSED, hb_buffer_t *buffer H
 }
 
 void
-GPOS::position_finish_offsets (hb_font_t *font HB_UNUSED, hb_buffer_t *buffer)
+GPOS::position_finish_offsets (hb_font_t *font, hb_buffer_t *buffer)
 {
   _hb_buffer_assert_gsubgpos_vars (buffer);
 
@@ -2961,12 +3094,21 @@ GPOS::position_finish_offsets (hb_font_t *font HB_UNUSED, hb_buffer_t *buffer)
 
   /* Handle attachments */
   if (buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT)
-    for (unsigned int i = 0; i < len; i++)
+    for (unsigned i = 0; i < len; i++)
       propagate_attachment_offsets (pos, len, i, direction);
+
+  if (unlikely (font->slant))
+  {
+    for (unsigned i = 0; i < len; i++)
+      if (unlikely (pos[i].y_offset))
+        pos[i].x_offset += _hb_roundf (font->slant_xy * pos[i].y_offset);
+  }
 }
 
 
-struct GPOS_accelerator_t : GPOS::accelerator_t {};
+struct GPOS_accelerator_t : GPOS::accelerator_t {
+  GPOS_accelerator_t (hb_face_t *face) : GPOS::accelerator_t (face) {}
+};
 
 
 /* Out-of-class implementation for methods recursing */
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh
index b7ce30135e..0b0bc547bd 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh
@@ -826,22 +826,25 @@ struct Ligature
 
     unsigned int total_component_count = 0;
 
-    unsigned int match_length = 0;
+    unsigned int match_end = 0;
     unsigned int match_positions[HB_MAX_CONTEXT_LENGTH];
 
     if (likely (!match_input (c, count,
 			      &component[1],
 			      match_glyph,
 			      nullptr,
-			      &match_length,
+			      &match_end,
 			      match_positions,
 			      &total_component_count)))
+    {
+      c->buffer->unsafe_to_concat (c->buffer->idx, match_end);
       return_trace (false);
+    }
 
     ligate_input (c,
 		  count,
 		  match_positions,
-		  match_length,
+		  match_end,
 		  ligGlyph,
 		  total_component_count);
 
@@ -1296,7 +1299,7 @@ struct ReverseChainSingleSubstFormat1
 	match_lookahead (c,
 			 lookahead.len, (HBUINT16 *) lookahead.arrayZ,
 			 match_coverage, this,
-			 1, &end_index))
+			 c->buffer->idx + 1, &end_index))
     {
       c->buffer->unsafe_to_break_from_outbuffer (start_index, end_index);
       c->replace_glyph_inplace (substitute[index]);
@@ -1305,8 +1308,11 @@ struct ReverseChainSingleSubstFormat1
        * calls us through a Context lookup. */
       return_trace (true);
     }
-
-    return_trace (false);
+    else
+    {
+      c->buffer->unsafe_to_concat_from_outbuffer (start_index, end_index);
+      return_trace (false);
+    }
   }
 
   template<typename Iterator,
@@ -1739,7 +1745,9 @@ struct GSUB : GSUBGPOS
 };
 
 
-struct GSUB_accelerator_t : GSUB::accelerator_t {};
+struct GSUB_accelerator_t : GSUB::accelerator_t {
+  GSUB_accelerator_t (hb_face_t *face) : GSUB::accelerator_t (face) {}
+};
 
 
 /* Out-of-class implementation for methods recursing */
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh
index 191d3bebc5..65de131f85 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh
@@ -125,24 +125,31 @@ struct hb_closure_context_t :
     hb_set_t *covered_glyph_set = done_lookups_glyph_set->get (lookup_index);
     if (unlikely (covered_glyph_set->in_error ()))
       return true;
-    if (parent_active_glyphs ()->is_subset (*covered_glyph_set))
+    if (parent_active_glyphs ().is_subset (*covered_glyph_set))
       return true;
 
-    hb_set_union (covered_glyph_set, parent_active_glyphs ());
+    covered_glyph_set->union_ (parent_active_glyphs ());
     return false;
   }
 
-  hb_set_t* parent_active_glyphs ()
+  const hb_set_t& previous_parent_active_glyphs () {
+    if (active_glyphs_stack.length <= 1)
+      return *glyphs;
+
+    return active_glyphs_stack[active_glyphs_stack.length - 2];
+  }
+
+  const hb_set_t& parent_active_glyphs ()
   {
-    if (active_glyphs_stack.length < 1)
-      return glyphs;
+    if (!active_glyphs_stack)
+      return *glyphs;
 
     return active_glyphs_stack.tail ();
   }
 
-  void push_cur_active_glyphs (hb_set_t* cur_active_glyph_set)
+  hb_set_t& push_cur_active_glyphs ()
   {
-    active_glyphs_stack.push (cur_active_glyph_set);
+    return *active_glyphs_stack.push ();
   }
 
   bool pop_cur_done_glyphs ()
@@ -156,29 +163,24 @@ struct hb_closure_context_t :
 
   hb_face_t *face;
   hb_set_t *glyphs;
-  hb_set_t *cur_intersected_glyphs;
   hb_set_t output[1];
-  hb_vector_t<hb_set_t *> active_glyphs_stack;
+  hb_vector_t<hb_set_t> active_glyphs_stack;
   recurse_func_t recurse_func;
   unsigned int nesting_level_left;
 
   hb_closure_context_t (hb_face_t *face_,
 			hb_set_t *glyphs_,
-			hb_set_t *cur_intersected_glyphs_,
 			hb_map_t *done_lookups_glyph_count_,
 			hb_hashmap_t<unsigned, hb_set_t *> *done_lookups_glyph_set_,
 			unsigned int nesting_level_left_ = HB_MAX_NESTING_LEVEL) :
 			  face (face_),
 			  glyphs (glyphs_),
-			  cur_intersected_glyphs (cur_intersected_glyphs_),
 			  recurse_func (nullptr),
 			  nesting_level_left (nesting_level_left_),
 			  done_lookups_glyph_count (done_lookups_glyph_count_),
 			  done_lookups_glyph_set (done_lookups_glyph_set_),
 			  lookup_count (0)
-  {
-    push_cur_active_glyphs (glyphs_);
-  }
+  {}
 
   ~hb_closure_context_t () { flush (); }
 
@@ -186,11 +188,11 @@ struct hb_closure_context_t :
 
   void flush ()
   {
-    hb_set_del_range (output, face->get_num_glyphs (), HB_SET_VALUE_INVALID);	/* Remove invalid glyphs. */
-    hb_set_union (glyphs, output);
-    hb_set_clear (output);
+    output->del_range (face->get_num_glyphs (), HB_SET_VALUE_INVALID);	/* Remove invalid glyphs. */
+    glyphs->union_ (*output);
+    output->clear ();
     active_glyphs_stack.pop ();
-    active_glyphs_stack.fini ();
+    active_glyphs_stack.reset ();
   }
 
   private:
@@ -520,7 +522,7 @@ struct hb_ot_apply_context_t :
     may_skip (const hb_glyph_info_t &info) const
     { return matcher.may_skip (c, info); }
 
-    bool next ()
+    bool next (unsigned *unsafe_to = nullptr)
     {
       assert (num_items > 0);
       while (idx + num_items < end)
@@ -543,11 +545,17 @@ struct hb_ot_apply_context_t :
 	}
 
 	if (skip == matcher_t::SKIP_NO)
+	{
+	  if (unsafe_to)
+	    *unsafe_to = idx + 1;
 	  return false;
+	}
       }
+      if (unsafe_to)
+        *unsafe_to = end;
       return false;
     }
-    bool prev ()
+    bool prev (unsigned *unsafe_from = nullptr)
     {
       assert (num_items > 0);
       while (idx > num_items - 1)
@@ -570,8 +578,14 @@ struct hb_ot_apply_context_t :
 	}
 
 	if (skip == matcher_t::SKIP_NO)
+	{
+	  if (unsafe_from)
+	    *unsafe_from = hb_max (1u, idx) - 1u;
 	  return false;
+	}
       }
+      if (unsafe_from)
+        *unsafe_from = 0;
       return false;
     }
 
@@ -712,53 +726,60 @@ struct hb_ot_apply_context_t :
     return true;
   }
 
-  void _set_glyph_props (hb_codepoint_t glyph_index,
+  void _set_glyph_class (hb_codepoint_t glyph_index,
 			  unsigned int class_guess = 0,
 			  bool ligature = false,
 			  bool component = false) const
   {
-    unsigned int add_in = _hb_glyph_info_get_glyph_props (&buffer->cur()) &
-			  HB_OT_LAYOUT_GLYPH_PROPS_PRESERVE;
-    add_in |= HB_OT_LAYOUT_GLYPH_PROPS_SUBSTITUTED;
+    unsigned int props = _hb_glyph_info_get_glyph_props (&buffer->cur());
+    props |= HB_OT_LAYOUT_GLYPH_PROPS_SUBSTITUTED;
     if (ligature)
     {
-      add_in |= HB_OT_LAYOUT_GLYPH_PROPS_LIGATED;
+      props |= HB_OT_LAYOUT_GLYPH_PROPS_LIGATED;
       /* In the only place that the MULTIPLIED bit is used, Uniscribe
        * seems to only care about the "last" transformation between
        * Ligature and Multiple substitutions.  Ie. if you ligate, expand,
        * and ligate again, it forgives the multiplication and acts as
        * if only ligation happened.  As such, clear MULTIPLIED bit.
        */
-      add_in &= ~HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
+      props &= ~HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
     }
     if (component)
-      add_in |= HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
+      props |= HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
     if (likely (has_glyph_classes))
-      _hb_glyph_info_set_glyph_props (&buffer->cur(), add_in | gdef.get_glyph_props (glyph_index));
+    {
+      props &= HB_OT_LAYOUT_GLYPH_PROPS_PRESERVE;
+      _hb_glyph_info_set_glyph_props (&buffer->cur(), props | gdef.get_glyph_props (glyph_index));
+    }
     else if (class_guess)
-      _hb_glyph_info_set_glyph_props (&buffer->cur(), add_in | class_guess);
+    {
+      props &= HB_OT_LAYOUT_GLYPH_PROPS_PRESERVE;
+      _hb_glyph_info_set_glyph_props (&buffer->cur(), props | class_guess);
+    }
+    else
+      _hb_glyph_info_set_glyph_props (&buffer->cur(), props);
   }
 
   void replace_glyph (hb_codepoint_t glyph_index) const
   {
-    _set_glyph_props (glyph_index);
+    _set_glyph_class (glyph_index);
     (void) buffer->replace_glyph (glyph_index);
   }
   void replace_glyph_inplace (hb_codepoint_t glyph_index) const
   {
-    _set_glyph_props (glyph_index);
+    _set_glyph_class (glyph_index);
     buffer->cur().codepoint = glyph_index;
   }
   void replace_glyph_with_ligature (hb_codepoint_t glyph_index,
 				    unsigned int class_guess) const
   {
-    _set_glyph_props (glyph_index, class_guess, true);
+    _set_glyph_class (glyph_index, class_guess, true);
     (void) buffer->replace_glyph (glyph_index);
   }
   void output_glyph_for_component (hb_codepoint_t glyph_index,
 				   unsigned int class_guess) const
   {
-    _set_glyph_props (glyph_index, class_guess, false, true);
+    _set_glyph_class (glyph_index, class_guess, false, true);
     (void) buffer->output_glyph (glyph_index);
   }
 };
@@ -948,7 +969,7 @@ static inline bool match_input (hb_ot_apply_context_t *c,
 				const HBUINT16 input[], /* Array of input values--start with second glyph */
 				match_func_t match_func,
 				const void *match_data,
-				unsigned int *end_offset,
+				unsigned int *end_position,
 				unsigned int match_positions[HB_MAX_CONTEXT_LENGTH],
 				unsigned int *p_total_component_count = nullptr)
 {
@@ -1001,7 +1022,12 @@ static inline bool match_input (hb_ot_apply_context_t *c,
   match_positions[0] = buffer->idx;
   for (unsigned int i = 1; i < count; i++)
   {
-    if (!skippy_iter.next ()) return_trace (false);
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      *end_position = unsafe_to;
+      return_trace (false);
+    }
 
     match_positions[i] = skippy_iter.idx;
 
@@ -1055,7 +1081,7 @@ static inline bool match_input (hb_ot_apply_context_t *c,
     total_component_count += _hb_glyph_info_get_lig_num_comps (&buffer->info[skippy_iter.idx]);
   }
 
-  *end_offset = skippy_iter.idx - buffer->idx + 1;
+  *end_position = skippy_iter.idx + 1;
 
   if (p_total_component_count)
     *p_total_component_count = total_component_count;
@@ -1065,7 +1091,7 @@ static inline bool match_input (hb_ot_apply_context_t *c,
 static inline bool ligate_input (hb_ot_apply_context_t *c,
 				 unsigned int count, /* Including the first glyph */
 				 const unsigned int match_positions[HB_MAX_CONTEXT_LENGTH], /* Including the first glyph */
-				 unsigned int match_length,
+				 unsigned int match_end,
 				 hb_codepoint_t lig_glyph,
 				 unsigned int total_component_count)
 {
@@ -1073,7 +1099,7 @@ static inline bool ligate_input (hb_ot_apply_context_t *c,
 
   hb_buffer_t *buffer = c->buffer;
 
-  buffer->merge_clusters (buffer->idx, buffer->idx + match_length);
+  buffer->merge_clusters (buffer->idx, match_end);
 
   /* - If a base and one or more marks ligate, consider that as a base, NOT
    *   ligature, such that all following marks can still attach to it.
@@ -1190,11 +1216,16 @@ static inline bool match_backtrack (hb_ot_apply_context_t *c,
   skippy_iter.set_match_func (match_func, match_data, backtrack);
 
   for (unsigned int i = 0; i < count; i++)
-    if (!skippy_iter.prev ())
+  {
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      *match_start = unsafe_from;
       return_trace (false);
+    }
+  }
 
   *match_start = skippy_iter.idx;
-
   return_trace (true);
 }
 
@@ -1203,21 +1234,26 @@ static inline bool match_lookahead (hb_ot_apply_context_t *c,
 				    const HBUINT16 lookahead[],
 				    match_func_t match_func,
 				    const void *match_data,
-				    unsigned int offset,
+				    unsigned int start_index,
 				    unsigned int *end_index)
 {
   TRACE_APPLY (nullptr);
 
   hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_context;
-  skippy_iter.reset (c->buffer->idx + offset - 1, count);
+  skippy_iter.reset (start_index - 1, count);
   skippy_iter.set_match_func (match_func, match_data, lookahead);
 
   for (unsigned int i = 0; i < count; i++)
-    if (!skippy_iter.next ())
+  {
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      *end_index = unsafe_to;
       return_trace (false);
+    }
+  }
 
   *end_index = skippy_iter.idx + 1;
-
   return_trace (true);
 }
 
@@ -1284,22 +1320,23 @@ static void context_closure_recurse_lookups (hb_closure_context_t *c,
     unsigned seqIndex = lookupRecord[i].sequenceIndex;
     if (seqIndex >= inputCount) continue;
 
-    hb_set_t *pos_glyphs = nullptr;
+    bool has_pos_glyphs = false;
+    hb_set_t pos_glyphs;
 
     if (hb_set_is_empty (covered_seq_indicies) || !hb_set_has (covered_seq_indicies, seqIndex))
     {
-      pos_glyphs = hb_set_create ();
+      has_pos_glyphs = true;
       if (seqIndex == 0)
       {
         switch (context_format) {
         case ContextFormat::SimpleContext:
-          pos_glyphs->add (value);
+          pos_glyphs.add (value);
           break;
         case ContextFormat::ClassBasedContext:
-          intersected_glyphs_func (c->cur_intersected_glyphs, data, value, pos_glyphs);
+          intersected_glyphs_func (&c->parent_active_glyphs (), data, value, &pos_glyphs);
           break;
         case ContextFormat::CoverageBasedContext:
-          hb_set_set (pos_glyphs, c->cur_intersected_glyphs);
+          pos_glyphs.set (c->parent_active_glyphs ());
           break;
         }
       }
@@ -1313,12 +1350,16 @@ static void context_closure_recurse_lookups (hb_closure_context_t *c,
           input_value = input[seqIndex - 1];
         }
 
-        intersected_glyphs_func (c->glyphs, input_data, input_value, pos_glyphs);
+        intersected_glyphs_func (c->glyphs, input_data, input_value, &pos_glyphs);
       }
     }
 
-    hb_set_add (covered_seq_indicies, seqIndex);
-    c->push_cur_active_glyphs (pos_glyphs ? pos_glyphs : c->glyphs);
+    covered_seq_indicies->add (seqIndex);
+    if (has_pos_glyphs) {
+      c->push_cur_active_glyphs () = pos_glyphs;
+    } else {
+      c->push_cur_active_glyphs ().set (*c->glyphs);
+    }
 
     unsigned endIndex = inputCount;
     if (context_format == ContextFormat::CoverageBasedContext)
@@ -1327,8 +1368,6 @@ static void context_closure_recurse_lookups (hb_closure_context_t *c,
     c->recurse (lookupRecord[i].lookupListIndex, covered_seq_indicies, seqIndex, endIndex);
 
     c->pop_cur_done_glyphs ();
-    if (pos_glyphs)
-      hb_set_destroy (pos_glyphs);
   }
 
   hb_set_destroy (covered_seq_indicies);
@@ -1343,15 +1382,13 @@ static inline void recurse_lookups (context_t *c,
     c->recurse (lookupRecord[i].lookupListIndex);
 }
 
-static inline bool apply_lookup (hb_ot_apply_context_t *c,
+static inline void apply_lookup (hb_ot_apply_context_t *c,
 				 unsigned int count, /* Including the first glyph */
 				 unsigned int match_positions[HB_MAX_CONTEXT_LENGTH], /* Including the first glyph */
 				 unsigned int lookupCount,
 				 const LookupRecord lookupRecord[], /* Array of LookupRecords--in design order */
-				 unsigned int match_length)
+				 unsigned int match_end)
 {
-  TRACE_APPLY (nullptr);
-
   hb_buffer_t *buffer = c->buffer;
   int end;
 
@@ -1359,7 +1396,7 @@ static inline bool apply_lookup (hb_ot_apply_context_t *c,
    * Adjust. */
   {
     unsigned int bl = buffer->backtrack_len ();
-    end = bl + match_length;
+    end = bl + match_end - buffer->idx;
 
     int delta = bl - buffer->idx;
     /* Convert positions to new indexing. */
@@ -1461,8 +1498,6 @@ static inline bool apply_lookup (hb_ot_apply_context_t *c,
   }
 
   (void) buffer->move_to (end);
-
-  return_trace (true);
 }
 
 
@@ -1550,17 +1585,25 @@ static inline bool context_apply_lookup (hb_ot_apply_context_t *c,
 					 const LookupRecord lookupRecord[],
 					 ContextApplyLookupContext &lookup_context)
 {
-  unsigned int match_length = 0;
-  unsigned int match_positions[HB_MAX_CONTEXT_LENGTH];
-  return match_input (c,
-		      inputCount, input,
-		      lookup_context.funcs.match, lookup_context.match_data,
-		      &match_length, match_positions)
-      && (c->buffer->unsafe_to_break (c->buffer->idx, c->buffer->idx + match_length),
-	  apply_lookup (c,
-		       inputCount, match_positions,
-		       lookupCount, lookupRecord,
-		       match_length));
+  unsigned match_end = 0;
+  unsigned match_positions[HB_MAX_CONTEXT_LENGTH];
+  if (match_input (c,
+		   inputCount, input,
+		   lookup_context.funcs.match, lookup_context.match_data,
+		   &match_end, match_positions))
+  {
+    c->buffer->unsafe_to_break (c->buffer->idx, match_end);
+    apply_lookup (c,
+		  inputCount, match_positions,
+		  lookupCount, lookupRecord,
+		  match_end);
+    return true;
+  }
+  else
+  {
+    c->buffer->unsafe_to_concat (c->buffer->idx, match_end);
+    return false;
+  }
 }
 
 struct Rule
@@ -1828,8 +1871,9 @@ struct ContextFormat1
 
   void closure (hb_closure_context_t *c) const
   {
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
 
     struct ContextClosureLookupContext lookup_context = {
       {intersects_glyph, intersected_glyph},
@@ -1838,10 +1882,14 @@ struct ContextFormat1
     };
 
     + hb_zip (this+coverage, hb_range ((unsigned) ruleSet.len))
-    | hb_filter (c->parent_active_glyphs (), hb_first)
+    | hb_filter ([&] (hb_codepoint_t _) {
+      return c->previous_parent_active_glyphs ().has (_);
+    }, hb_first)
     | hb_map ([&](const hb_pair_t<hb_codepoint_t, unsigned> _) { return hb_pair_t<unsigned, const RuleSet&> (_.first, this+ruleSet[_.second]); })
     | hb_apply ([&] (const hb_pair_t<unsigned, const RuleSet&>& _) { _.second.closure (c, _.first, lookup_context); })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -1989,8 +2037,9 @@ struct ContextFormat2
     if (!(this+coverage).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
 
     const ClassDef &class_def = this+classDef;
 
@@ -2000,10 +2049,9 @@ struct ContextFormat2
       &class_def
     };
 
-    return
     + hb_enumerate (ruleSet)
     | hb_filter ([&] (unsigned _)
-		 { return class_def.intersects_class (c->cur_intersected_glyphs, _); },
+    { return class_def.intersects_class (&c->parent_active_glyphs (), _); },
 		 hb_first)
     | hb_apply ([&] (const hb_pair_t<unsigned, const Offset16To<RuleSet>&> _)
                 {
@@ -2011,6 +2059,8 @@ struct ContextFormat2
                   rule_set.closure (c, _.first, lookup_context);
                 })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -2183,8 +2233,10 @@ struct ContextFormat3
     if (!(this+coverageZ[0]).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
+
 
     const LookupRecord *lookupRecord = &StructAfter<LookupRecord> (coverageZ.as_array (glyphCount));
     struct ContextClosureLookupContext lookup_context = {
@@ -2196,6 +2248,8 @@ struct ContextFormat3
 			    glyphCount, (const HBUINT16 *) (coverageZ.arrayZ + 1),
 			    lookupCount, lookupRecord,
 			    0, lookup_context);
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -2452,25 +2506,38 @@ static inline bool chain_context_apply_lookup (hb_ot_apply_context_t *c,
 					       const LookupRecord lookupRecord[],
 					       ChainContextApplyLookupContext &lookup_context)
 {
-  unsigned int start_index = 0, match_length = 0, end_index = 0;
-  unsigned int match_positions[HB_MAX_CONTEXT_LENGTH];
-  return match_input (c,
-		      inputCount, input,
-		      lookup_context.funcs.match, lookup_context.match_data[1],
-		      &match_length, match_positions)
-      && match_backtrack (c,
-			  backtrackCount, backtrack,
-			  lookup_context.funcs.match, lookup_context.match_data[0],
-			  &start_index)
-      && match_lookahead (c,
-			  lookaheadCount, lookahead,
-			  lookup_context.funcs.match, lookup_context.match_data[2],
-			  match_length, &end_index)
-      && (c->buffer->unsafe_to_break_from_outbuffer (start_index, end_index),
-	  apply_lookup (c,
-			inputCount, match_positions,
-			lookupCount, lookupRecord,
-			match_length));
+  unsigned end_index = c->buffer->idx;
+  unsigned match_end = 0;
+  unsigned match_positions[HB_MAX_CONTEXT_LENGTH];
+  if (!(match_input (c,
+		     inputCount, input,
+		     lookup_context.funcs.match, lookup_context.match_data[1],
+		     &match_end, match_positions) && (end_index = match_end)
+       && match_lookahead (c,
+			   lookaheadCount, lookahead,
+			   lookup_context.funcs.match, lookup_context.match_data[2],
+			   match_end, &end_index)))
+  {
+    c->buffer->unsafe_to_concat (c->buffer->idx, end_index);
+    return false;
+  }
+
+  unsigned start_index = c->buffer->out_len;
+  if (!match_backtrack (c,
+			backtrackCount, backtrack,
+			lookup_context.funcs.match, lookup_context.match_data[0],
+			&start_index))
+  {
+    c->buffer->unsafe_to_concat_from_outbuffer (start_index, end_index);
+    return false;
+  }
+
+  c->buffer->unsafe_to_break_from_outbuffer (start_index, end_index);
+  apply_lookup (c,
+		inputCount, match_positions,
+		lookupCount, lookupRecord,
+		match_end);
+  return true;
 }
 
 struct ChainRule
@@ -2802,8 +2869,9 @@ struct ChainContextFormat1
 
   void closure (hb_closure_context_t *c) const
   {
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
 
     struct ChainContextClosureLookupContext lookup_context = {
       {intersects_glyph, intersected_glyph},
@@ -2812,10 +2880,14 @@ struct ChainContextFormat1
     };
 
     + hb_zip (this+coverage, hb_range ((unsigned) ruleSet.len))
-    | hb_filter (c->parent_active_glyphs (), hb_first)
+    | hb_filter ([&] (hb_codepoint_t _) {
+      return c->previous_parent_active_glyphs ().has (_);
+    }, hb_first)
     | hb_map ([&](const hb_pair_t<hb_codepoint_t, unsigned> _) { return hb_pair_t<unsigned, const ChainRuleSet&> (_.first, this+ruleSet[_.second]); })
     | hb_apply ([&] (const hb_pair_t<unsigned, const ChainRuleSet&>& _) { _.second.closure (c, _.first, lookup_context); })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -2964,8 +3036,10 @@ struct ChainContextFormat2
     if (!(this+coverage).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
+
 
     const ClassDef &backtrack_class_def = this+backtrackClassDef;
     const ClassDef &input_class_def = this+inputClassDef;
@@ -2979,10 +3053,9 @@ struct ChainContextFormat2
        &lookahead_class_def}
     };
 
-    return
     + hb_enumerate (ruleSet)
     | hb_filter ([&] (unsigned _)
-		 { return input_class_def.intersects_class (c->cur_intersected_glyphs, _); },
+    { return input_class_def.intersects_class (&c->parent_active_glyphs (), _); },
 		 hb_first)
     | hb_apply ([&] (const hb_pair_t<unsigned, const Offset16To<ChainRuleSet>&> _)
                 {
@@ -2990,6 +3063,8 @@ struct ChainContextFormat2
                   chainrule_set.closure (c, _.first, lookup_context);
                 })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -3216,8 +3291,10 @@ struct ChainContextFormat3
     if (!(this+input[0]).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
+
 
     const Array16OfOffset16To<Coverage> &lookahead = StructAfter<Array16OfOffset16To<Coverage>> (input);
     const Array16Of<LookupRecord> &lookup = StructAfter<Array16Of<LookupRecord>> (lookahead);
@@ -3232,6 +3309,8 @@ struct ChainContextFormat3
 				  lookahead.len, (const HBUINT16 *) lookahead.arrayZ,
 				  lookup.len, lookup.arrayZ,
 				  0, lookup_context);
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -3706,7 +3785,7 @@ struct GSUBGPOS
     for (unsigned i : feature_indices->iter ())
     {
       hb_tag_t t = get_feature_tag (i);
-      if (t == unique_features.INVALID_KEY) continue;
+      if (t == HB_MAP_VALUE_INVALID) continue;
       if (!unique_features.has (t))
       {
         hb_set_t* indices = hb_set_create ();
@@ -3839,7 +3918,7 @@ struct GSUBGPOS
   template <typename T>
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       this->table = hb_sanitize_context_t ().reference_table<T> (face);
       if (unlikely (this->table->is_blocklisted (this->table.get_blob (), face)))
@@ -3861,8 +3940,7 @@ struct GSUBGPOS
       for (unsigned int i = 0; i < this->lookup_count; i++)
 	this->accels[i].init (table->get_lookup (i));
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       for (unsigned int i = 0; i < this->lookup_count; i++)
 	this->accels[i].fini ();
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.cc b/thirdparty/harfbuzz/src/hb-ot-layout.cc
index 60733648c1..a599eea6e9 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-layout.cc
@@ -1491,10 +1491,9 @@ hb_ot_layout_lookup_substitute_closure (hb_face_t    *face,
 					unsigned int  lookup_index,
 					hb_set_t     *glyphs /* OUT */)
 {
-  hb_set_t cur_intersected_glyphs;
   hb_map_t done_lookups_glyph_count;
   hb_hashmap_t<unsigned, hb_set_t *> done_lookups_glyph_set;
-  OT::hb_closure_context_t c (face, glyphs, &cur_intersected_glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
+  OT::hb_closure_context_t c (face, glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
 
   const OT::SubstLookup& l = face->table.GSUB->table->get_lookup (lookup_index);
 
@@ -1520,10 +1519,9 @@ hb_ot_layout_lookups_substitute_closure (hb_face_t      *face,
 					 const hb_set_t *lookups,
 					 hb_set_t       *glyphs /* OUT */)
 {
-  hb_set_t cur_intersected_glyphs;
   hb_map_t done_lookups_glyph_count;
   hb_hashmap_t<unsigned, hb_set_t *> done_lookups_glyph_set;
-  OT::hb_closure_context_t c (face, glyphs, &cur_intersected_glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
+  OT::hb_closure_context_t c (face, glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
   const OT::GSUB& gsub = *face->table.GSUB->table;
 
   unsigned int iteration_count = 0;
@@ -1890,7 +1888,7 @@ apply_string (OT::hb_ot_apply_context_t *c,
     apply_forward (c, accel);
 
     if (!Proxy::inplace)
-      buffer->swap_buffers ();
+      buffer->sync ();
   }
   else
   {
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.hh b/thirdparty/harfbuzz/src/hb-ot-layout.hh
index 2c825e0c81..ede8f007db 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout.hh
@@ -482,10 +482,9 @@ _hb_glyph_info_get_lig_num_comps (const hb_glyph_info_t *info)
 }
 
 static inline uint8_t
-_hb_allocate_lig_id (hb_buffer_t *buffer) {
+_hb_allocate_lig_id (hb_buffer_t *buffer)
+{
   uint8_t lig_id = buffer->next_serial () & 0x07;
-  if (unlikely (!lig_id))
-    lig_id = _hb_allocate_lig_id (buffer); /* in case of overflow */
   return lig_id;
 }
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-meta-table.hh b/thirdparty/harfbuzz/src/hb-ot-meta-table.hh
index e31447f8fc..93e64c5327 100644
--- a/thirdparty/harfbuzz/src/hb-ot-meta-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-meta-table.hh
@@ -71,9 +71,9 @@ struct meta
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { table = hb_sanitize_context_t ().reference_table<meta> (face); }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     hb_blob_t *reference_entry (hb_tag_t tag) const
     { return table->dataMaps.lsearch (tag).reference_entry (table.get_blob ()); }
@@ -119,7 +119,9 @@ struct meta
   DEFINE_SIZE_ARRAY (16, dataMaps);
 };
 
-struct meta_accelerator_t : meta::accelerator_t {};
+struct meta_accelerator_t : meta::accelerator_t {
+  meta_accelerator_t (hb_face_t *face) : meta::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-metrics.cc b/thirdparty/harfbuzz/src/hb-ot-metrics.cc
index dbd4a1ffbe..103808cf91 100644
--- a/thirdparty/harfbuzz/src/hb-ot-metrics.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-metrics.cc
@@ -160,9 +160,50 @@ hb_ot_metrics_get_position (hb_font_t           *font,
     (position && (*position = font->em_scalef_y (face->table.TABLE->ATTR + GET_VAR)), true))
   case HB_OT_METRICS_TAG_HORIZONTAL_CLIPPING_ASCENT:  return GET_METRIC_Y (OS2, usWinAscent);
   case HB_OT_METRICS_TAG_HORIZONTAL_CLIPPING_DESCENT: return GET_METRIC_Y (OS2, usWinDescent);
-  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RISE:       return GET_METRIC_Y (hhea, caretSlopeRise);
-  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RUN:        return GET_METRIC_X (hhea, caretSlopeRun);
+
+  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RISE:
+  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RUN:
+  {
+    unsigned mult = 1u;
+
+    if (font->slant)
+    {
+      unsigned rise = face->table.hhea->caretSlopeRise;
+      unsigned upem = face->get_upem ();
+      mult = (rise && rise < upem) ? hb_min (upem / rise, 256u) : 1u;
+    }
+
+    if (metrics_tag == HB_OT_METRICS_TAG_HORIZONTAL_CARET_RISE)
+    {
+      bool ret = GET_METRIC_Y (hhea, caretSlopeRise);
+
+      if (position)
+	*position *= mult;
+
+      return ret;
+    }
+    else
+    {
+      hb_position_t rise = 0;
+
+      if (font->slant && position && GET_METRIC_Y (hhea, caretSlopeRise))
+	rise = *position;
+
+      bool ret = GET_METRIC_X (hhea, caretSlopeRun);
+
+      if (position)
+      {
+	*position *= mult;
+
+	if (font->slant)
+	  *position += _hb_roundf (mult * font->slant_xy * rise);
+      }
+
+      return ret;
+    }
+  }
   case HB_OT_METRICS_TAG_HORIZONTAL_CARET_OFFSET:     return GET_METRIC_X (hhea, caretOffset);
+
 #ifndef HB_NO_VERTICAL
   case HB_OT_METRICS_TAG_VERTICAL_CARET_RISE:         return GET_METRIC_X (vhea, caretSlopeRise);
   case HB_OT_METRICS_TAG_VERTICAL_CARET_RUN:          return GET_METRIC_Y (vhea, caretSlopeRun);
diff --git a/thirdparty/harfbuzz/src/hb-ot-name-table.hh b/thirdparty/harfbuzz/src/hb-ot-name-table.hh
index c17bb4abb8..d52367e9b1 100644
--- a/thirdparty/harfbuzz/src/hb-ot-name-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-name-table.hh
@@ -256,7 +256,7 @@ struct name
     })
     ;
 
-    name_prime->serialize (c->serializer, it, hb_addressof (this + stringOffset));
+    name_prime->serialize (c->serializer, it, std::addressof (this + stringOffset));
     return_trace (name_prime->count);
   }
 
@@ -279,7 +279,7 @@ struct name
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       this->table = hb_sanitize_context_t ().reference_table<name> (face);
       assert (this->table.get_length () >= this->table->stringOffset);
@@ -288,7 +288,6 @@ struct name
       const hb_array_t<const NameRecord> all_names (this->table->nameRecordZ.arrayZ,
 						    this->table->count);
 
-      this->names.init ();
       this->names.alloc (all_names.length);
 
       for (unsigned int i = 0; i < all_names.length; i++)
@@ -318,10 +317,8 @@ struct name
       }
       this->names.resize (j);
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
-      this->names.fini ();
       this->table.destroy ();
     }
 
@@ -373,7 +370,9 @@ struct name
 #undef entry_index
 #undef entry_score
 
-struct name_accelerator_t : name::accelerator_t {};
+struct name_accelerator_t : name::accelerator_t {
+  name_accelerator_t (hb_face_t *face) : name::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh b/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh
index 504de2de74..0f3cd8e24f 100644
--- a/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh
@@ -76,8 +76,7 @@ HB_INTERNAL bool postV2Tail::subset (hb_subset_context_t *c) const
   hb_map_t old_new_index_map, old_gid_new_index_map;
   unsigned i = 0;
 
-  post::accelerator_t _post;
-  _post.init (c->plan->source);
+  post::accelerator_t _post (c->plan->source);
 
   hb_hashmap_t<hb_bytes_t, unsigned, std::nullptr_t, unsigned, nullptr, (unsigned)-1> glyph_name_to_new_index;
   for (hb_codepoint_t new_gid = 0; new_gid < num_glyphs; new_gid++)
@@ -128,9 +127,7 @@ HB_INTERNAL bool postV2Tail::subset (hb_subset_context_t *c) const
                             })
   ;
 
-  bool ret = serialize (c->serializer, index_iter, &_post);
-  _post.fini ();
-  return_trace (ret);
+  return_trace (serialize (c->serializer, index_iter, &_post));
 }
 
 } /* namespace OT */
diff --git a/thirdparty/harfbuzz/src/hb-ot-post-table.hh b/thirdparty/harfbuzz/src/hb-ot-post-table.hh
index 39de671707..a4844e94bc 100644
--- a/thirdparty/harfbuzz/src/hb-ot-post-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-post-table.hh
@@ -111,10 +111,9 @@ struct post
   struct accelerator_t
   {
     friend struct postV2Tail;
-    void init (hb_face_t *face)
-    {
-      index_to_offset.init ();
 
+    accelerator_t (hb_face_t *face)
+    {
       table = hb_sanitize_context_t ().reference_table<post> (face);
       unsigned int table_length = table.get_length ();
 
@@ -132,9 +131,8 @@ struct post
 	   data += 1 + *data)
 	index_to_offset.push (data - pool);
     }
-    void fini ()
+    ~accelerator_t ()
     {
-      index_to_offset.fini ();
       hb_free (gids_sorted_by_name.get ());
       table.destroy ();
     }
@@ -254,9 +252,9 @@ struct post
 
     private:
     uint32_t version;
-    const Array16Of<HBUINT16> *glyphNameIndex;
+    const Array16Of<HBUINT16> *glyphNameIndex = nullptr;
     hb_vector_t<uint32_t> index_to_offset;
-    const uint8_t *pool;
+    const uint8_t *pool = nullptr;
     hb_atomic_ptr_t<uint16_t *> gids_sorted_by_name;
   };
 
@@ -307,7 +305,10 @@ struct post
   DEFINE_SIZE_MIN (32);
 };
 
-struct post_accelerator_t : post::accelerator_t {};
+struct post_accelerator_t : post::accelerator_t {
+  post_accelerator_t (hb_face_t *face) : post::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh
index 41e3dd38ab..429974d05b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh
@@ -87,6 +87,8 @@
 
 #define OT_GLYPHID /* GlyphID */ \
 	OT_UINT16
+/* Shorthand. */
+#define G	OT_GLYPHID
 
 #define OT_UARRAY(Name, Items) \
 	OT_LABEL_START(Name) \
@@ -183,8 +185,6 @@
 	Tag \
 	OT_OFFSET(manifest, Name)
 
-/* Shorthand. */
-#define G	OT_GLYPHID
 
 /*
  * Table Start
@@ -300,14 +300,40 @@ OT_TABLE_END
 /*
  * Clean up
  */
+
+#undef MANIFEST
+#undef MANIFEST_LOOKUP
+
 #undef OT_TABLE_START
 #undef OT_TABLE_END
 #undef OT_LABEL_START
 #undef OT_LABEL_END
 #undef OT_UINT8
 #undef OT_UINT16
-#undef OT_DISTANCE
 #undef OT_COUNT
+#undef OT_DISTANCE
+
+#undef OT_LABEL
+#undef OT_LIST
+
+#undef OT_TAG
+#undef OT_OFFSET
+#undef OT_GLYPHID
+#undef G
+#undef OT_UARRAY
+#undef OT_UHEADLESSARRAY
+
+#undef OT_LOOKUP_FLAG_IGNORE_MARKS
+#undef OT_LOOKUP
+#undef OT_SUBLOOKUP
+#undef OT_COVERAGE1
+#undef OT_LOOKUP_TYPE_SUBST_SINGLE
+#undef OT_LOOKUP_TYPE_SUBST_LIGATURE
+#undef OT_SUBLOOKUP_SINGLE_SUBST_FORMAT2
+#undef OT_SUBLOOKUP_LIGATURE_SUBST_FORMAT1
+#undef OT_LIGATURE_SET
+#undef OT_LIGATURE
+
 
 /*
  * Include a second time to get the table data...
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc
index 222c5d6b71..2298aa92f2 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc
@@ -321,6 +321,20 @@ arabic_joining (hb_buffer_t *buffer)
       info[prev].arabic_shaping_action() = entry->prev_action;
       buffer->unsafe_to_break (prev, i + 1);
     }
+    else
+    {
+      if (prev == UINT_MAX)
+      {
+        if (this_type >= JOINING_TYPE_R)
+	  buffer->unsafe_to_concat_from_outbuffer (0, i + 1);
+      }
+      else
+      {
+	if (this_type >= JOINING_TYPE_R ||
+	    (2 <= state && state <= 5) /* States that have a possible prev_action. */)
+	  buffer->unsafe_to_concat (prev, i + 1);
+      }
+    }
 
     info[i].arabic_shaping_action() = entry->curr_action;
 
@@ -337,7 +351,14 @@ arabic_joining (hb_buffer_t *buffer)
 
     const arabic_state_table_entry *entry = &arabic_state_table[state][this_type];
     if (entry->prev_action != NONE && prev != UINT_MAX)
+    {
       info[prev].arabic_shaping_action() = entry->prev_action;
+      buffer->unsafe_to_break (prev, buffer->len);
+    }
+    else if (2 <= state && state <= 5) /* States that have a possible prev_action. */
+    {
+      buffer->unsafe_to_concat (prev, buffer->len);
+    }
     break;
   }
 }
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc
index 0d84a76b85..3bc9e9b961 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc
@@ -140,7 +140,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED,
    *
    *   - LV can be precomposed, or decomposed.  Lets call those
    *     <LV> and <L,V>,
-   *   - LVT can be fully precomposed, partically precomposed, or
+   *   - LVT can be fully precomposed, partially precomposed, or
    *     fully decomposed.  Ie. <LVT>, <LV,T>, or <L,V,T>.
    *
    * The composition / decomposition is mechanical.  However, not
@@ -392,7 +392,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED,
      */
     (void) buffer->next_glyph ();
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 static void
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc
index 5a08f878dc..76092c7f38 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc
@@ -96,7 +96,7 @@ hb_syllabic_insert_dotted_circles (hb_font_t *font,
     else
       (void) buffer->next_glyph ();
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc
index 4c3068173b..a1e27a83be 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc
@@ -364,7 +364,7 @@ preprocess_text_thai (const hb_ot_shape_plan_t *plan,
 	buffer->merge_out_clusters (start - 1, end);
     }
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 
   /* If font has Thai GSUB, we are done. */
   if (plan->props.script == HB_SCRIPT_THAI && !plan->map.found_script[0])
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc
index 045731dfb4..d2cca105a4 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc
@@ -435,7 +435,7 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,
     default:
       break;
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc b/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc
index eb1bc79768..671f30327f 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc
@@ -446,6 +446,9 @@ _hb_ot_shape_fallback_mark_position (const hb_ot_shape_plan_t *plan,
   return;
 #endif
 
+  if (!buffer->message (font, "start fallback mark"))
+    return;
+
   _hb_buffer_assert_gsubgpos_vars (buffer);
 
   unsigned int start = 0;
@@ -457,6 +460,8 @@ _hb_ot_shape_fallback_mark_position (const hb_ot_shape_plan_t *plan,
       start = i;
     }
   position_cluster (plan, font, buffer, start, count, adjust_offsets_when_zeroing);
+
+  (void) buffer->message (font, "end fallback mark");
 }
 
 
@@ -492,6 +497,9 @@ _hb_ot_shape_fallback_kern (const hb_ot_shape_plan_t *plan,
 #endif
 
 #ifndef HB_DISABLE_DEPRECATED
+  if (!buffer->message (font, "start fallback kern"))
+    return;
+
   if (HB_DIRECTION_IS_HORIZONTAL (buffer->props.direction) ?
       !font->has_glyph_h_kerning_func () :
       !font->has_glyph_v_kerning_func ())
@@ -508,6 +516,8 @@ _hb_ot_shape_fallback_kern (const hb_ot_shape_plan_t *plan,
 
   if (reverse)
     buffer->reverse ();
+
+  (void) buffer->message (font, "end fallback kern");
 #endif
 }
 
@@ -525,6 +535,15 @@ _hb_ot_shape_fallback_spaces (const hb_ot_shape_plan_t *plan HB_UNUSED,
   for (unsigned int i = 0; i < count; i++)
     if (_hb_glyph_info_is_unicode_space (&info[i]) && !_hb_glyph_info_ligated (&info[i]))
     {
+      /* If font had no ASCII space and we used the invisible glyph, give it a 1/4 EM default advance. */
+      if (buffer->invisible && info[i].codepoint == buffer->invisible)
+      {
+        if (horizontal)
+	  pos[i].x_advance = +font->x_scale / 4;
+        else
+	  pos[i].y_advance = -font->y_scale / 4;
+      }
+
       hb_unicode_funcs_t::space_t space_type = _hb_glyph_info_get_unicode_space_fallback_type (&info[i]);
       hb_codepoint_t glyph;
       typedef hb_unicode_funcs_t t;
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc b/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc
index 839cc9122c..aa5a8eeaa3 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc
@@ -193,7 +193,8 @@ decompose_current_character (const hb_ot_shape_normalize_context_t *c, bool shor
   {
     hb_codepoint_t space_glyph;
     hb_unicode_funcs_t::space_t space_type = buffer->unicode->space_fallback_type (u);
-    if (space_type != hb_unicode_funcs_t::NOT_SPACE && c->font->get_nominal_glyph (0x0020u, &space_glyph))
+    if (space_type != hb_unicode_funcs_t::NOT_SPACE &&
+	(c->font->get_nominal_glyph (0x0020, &space_glyph) || (space_glyph = buffer->invisible)))
     {
       _hb_glyph_info_set_unicode_space_fallback_type (&buffer->cur(), space_type);
       next_char (buffer, space_glyph);
@@ -374,7 +375,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
       decompose_multi_char_cluster (&c, end, always_short_circuit);
     }
     while (buffer->idx < count && buffer->successful);
-    buffer->swap_buffers ();
+    buffer->sync ();
   }
 
 
@@ -477,7 +478,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
       if (info_cc (buffer->prev()) == 0)
 	starter = buffer->out_len - 1;
     }
-    buffer->swap_buffers ();
+    buffer->sync ();
   }
 }
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape.cc b/thirdparty/harfbuzz/src/hb-ot-shape.cc
index 4dde3520d8..4bd8aaf03b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape.cc
@@ -566,7 +566,7 @@ hb_insert_dotted_circle (hb_buffer_t *buffer, hb_font_t *font)
   info.mask = buffer->cur().mask;
   (void) buffer->output_info (info);
 
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 static void
@@ -1034,7 +1034,7 @@ hb_ot_position_complex (const hb_ot_shape_context_t *c)
    * hanging over the next glyph after the final reordering.
    *
    * Note: If fallback positinoing happens, we don't care about
-   * this as it will be overriden.
+   * this as it will be overridden.
    */
   bool adjust_offsets_when_zeroing = c->plan->adjust_mark_positioning_when_zeroing &&
 				     HB_DIRECTION_IS_FORWARD (c->buffer->props.direction);
@@ -1120,7 +1120,7 @@ hb_propagate_flags (hb_buffer_t *buffer)
   /* Propagate cluster-level glyph flags to be the same on all cluster glyphs.
    * Simplifies using them. */
 
-  if (!(buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK))
+  if (!(buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS))
     return;
 
   hb_glyph_info_t *info = buffer->info;
@@ -1129,11 +1129,7 @@ hb_propagate_flags (hb_buffer_t *buffer)
   {
     unsigned int mask = 0;
     for (unsigned int i = start; i < end; i++)
-      if (info[i].mask & HB_GLYPH_FLAG_UNSAFE_TO_BREAK)
-      {
-	 mask = HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-	 break;
-      }
+      mask |= info[i].mask & HB_GLYPH_FLAG_DEFINED;
     if (mask)
       for (unsigned int i = start; i < end; i++)
 	info[i].mask |= mask;
@@ -1145,18 +1141,7 @@ hb_propagate_flags (hb_buffer_t *buffer)
 static void
 hb_ot_shape_internal (hb_ot_shape_context_t *c)
 {
-  c->buffer->deallocate_var_all ();
-  c->buffer->scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
-  if (likely (!hb_unsigned_mul_overflows (c->buffer->len, HB_BUFFER_MAX_LEN_FACTOR)))
-  {
-    c->buffer->max_len = hb_max (c->buffer->len * HB_BUFFER_MAX_LEN_FACTOR,
-				 (unsigned) HB_BUFFER_MAX_LEN_MIN);
-  }
-  if (likely (!hb_unsigned_mul_overflows (c->buffer->len, HB_BUFFER_MAX_OPS_FACTOR)))
-  {
-    c->buffer->max_ops = hb_max (c->buffer->len * HB_BUFFER_MAX_OPS_FACTOR,
-				 (unsigned) HB_BUFFER_MAX_OPS_MIN);
-  }
+  c->buffer->enter ();
 
   /* Save the original direction, we use it later. */
   c->target_direction = c->buffer->props.direction;
@@ -1188,9 +1173,7 @@ hb_ot_shape_internal (hb_ot_shape_context_t *c)
 
   c->buffer->props.direction = c->target_direction;
 
-  c->buffer->max_len = HB_BUFFER_MAX_LEN_DEFAULT;
-  c->buffer->max_ops = HB_BUFFER_MAX_OPS_DEFAULT;
-  c->buffer->deallocate_var_all ();
+  c->buffer->leave ();
 }
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh
index 2c6316df4f..61d2814e93 100644
--- a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh
@@ -6,8 +6,8 @@
  *
  * on files with these headers:
  *
- * <meta name="updated_at" content="2021-12-09 12:01 AM" />
- * File-Date: 2021-08-06
+ * <meta name="updated_at" content="2022-01-28 10:00 PM" />
+ * File-Date: 2021-12-29
  */
 
 #ifndef HB_OT_TAG_TABLE_HH
@@ -66,7 +66,7 @@ static const LangTag ot_languages[] = {
   {"an",	HB_TAG('A','R','G',' ')},	/* Aragonese */
 /*{"ang",	HB_TAG('A','N','G',' ')},*/	/* Old English (ca. 450-1100) -> Anglo-Saxon */
   {"aoa",	HB_TAG('C','P','P',' ')},	/* Angolar -> Creoles */
-  {"apa",	HB_TAG('A','T','H',' ')},	/* Apache [family] -> Athapaskan */
+  {"apa",	HB_TAG('A','T','H',' ')},	/* Apache [collection] -> Athapaskan */
   {"apc",	HB_TAG('A','R','A',' ')},	/* North Levantine Arabic -> Arabic */
   {"apd",	HB_TAG('A','R','A',' ')},	/* Sudanese Arabic -> Arabic */
   {"apj",	HB_TAG('A','T','H',' ')},	/* Jicarilla Apache -> Athapaskan */
@@ -86,7 +86,7 @@ static const LangTag ot_languages[] = {
   {"arz",	HB_TAG('A','R','A',' ')},	/* Egyptian Arabic -> Arabic */
   {"as",	HB_TAG('A','S','M',' ')},	/* Assamese */
 /*{"ast",	HB_TAG('A','S','T',' ')},*/	/* Asturian */
-/*{"ath",	HB_TAG('A','T','H',' ')},*/	/* Athapascan [family] -> Athapaskan */
+/*{"ath",	HB_TAG('A','T','H',' ')},*/	/* Athapascan [collection] -> Athapaskan */
   {"atj",	HB_TAG('R','C','R',' ')},	/* Atikamekw -> R-Cree */
   {"atv",	HB_TAG('A','L','T',' ')},	/* Northern Altai -> Altai */
   {"auj",	HB_TAG('B','B','R',' ')},	/* Awjilah -> Berber */
@@ -110,10 +110,10 @@ static const LangTag ot_languages[] = {
   {"azn",	HB_TAG('N','A','H',' ')},	/* Western Durango Nahuatl -> Nahuatl */
   {"azz",	HB_TAG('N','A','H',' ')},	/* Highland Puebla Nahuatl -> Nahuatl */
   {"ba",	HB_TAG('B','S','H',' ')},	/* Bashkir */
-  {"bad",	HB_TAG('B','A','D','0')},	/* Banda [family] */
+  {"bad",	HB_TAG('B','A','D','0')},	/* Banda [collection] */
   {"bag",	HB_TAG_NONE	       },	/* Tuki != Baghelkhandi */
   {"bah",	HB_TAG('C','P','P',' ')},	/* Bahamas Creole English -> Creoles */
-  {"bai",	HB_TAG('B','M','L',' ')},	/* Bamileke [family] */
+  {"bai",	HB_TAG('B','M','L',' ')},	/* Bamileke [collection] */
   {"bal",	HB_TAG('B','L','I',' ')},	/* Baluchi [macrolanguage] */
 /*{"ban",	HB_TAG('B','A','N',' ')},*/	/* Balinese */
 /*{"bar",	HB_TAG('B','A','R',' ')},*/	/* Bavarian */
@@ -135,7 +135,7 @@ static const LangTag ot_languages[] = {
   {"bea",	HB_TAG('A','T','H',' ')},	/* Beaver -> Athapaskan */
   {"beb",	HB_TAG('B','T','I',' ')},	/* Bebele -> Beti */
 /*{"bem",	HB_TAG('B','E','M',' ')},*/	/* Bemba (Zambia) */
-  {"ber",	HB_TAG('B','B','R',' ')},	/* Berber [family] */
+  {"ber",	HB_TAG('B','B','R',' ')},	/* Berber [collection] */
   {"bew",	HB_TAG('C','P','P',' ')},	/* Betawi -> Creoles */
   {"bfl",	HB_TAG('B','A','D','0')},	/* Banda-Ndélé -> Banda */
   {"bfq",	HB_TAG('B','A','D',' ')},	/* Badaga */
@@ -203,7 +203,7 @@ static const LangTag ot_languages[] = {
   {"btd",	HB_TAG('B','T','K',' ')},	/* Batak Dairi -> Batak */
   {"bti",	HB_TAG_NONE	       },	/* Burate != Beti */
   {"btj",	HB_TAG('M','L','Y',' ')},	/* Bacanese Malay -> Malay */
-/*{"btk",	HB_TAG('B','T','K',' ')},*/	/* Batak [family] */
+/*{"btk",	HB_TAG('B','T','K',' ')},*/	/* Batak [collection] */
   {"btm",	HB_TAG('B','T','M',' ')},	/* Batak Mandailing */
   {"btm",	HB_TAG('B','T','K',' ')},	/* Batak Mandailing -> Batak */
   {"bto",	HB_TAG('B','I','K',' ')},	/* Rinconada Bikol -> Bikol */
@@ -256,6 +256,8 @@ static const LangTag ot_languages[] = {
   {"chh",	HB_TAG_NONE	       },	/* Chinook != Chattisgarhi */
   {"chj",	HB_TAG('C','C','H','N')},	/* Ojitlán Chinantec -> Chinantec */
   {"chk",	HB_TAG('C','H','K','0')},	/* Chuukese */
+  {"chm",	HB_TAG('H','M','A',' ')},	/* Mari (Russia) [macrolanguage] -> High Mari */
+  {"chm",	HB_TAG('L','M','A',' ')},	/* Mari (Russia) [macrolanguage] -> Low Mari */
   {"chn",	HB_TAG('C','P','P',' ')},	/* Chinook jargon -> Creoles */
 /*{"cho",	HB_TAG('C','H','O',' ')},*/	/* Choctaw */
   {"chp",	HB_TAG('C','H','P',' ')},	/* Chipewyan */
@@ -297,10 +299,10 @@ static const LangTag ot_languages[] = {
 /*{"cop",	HB_TAG('C','O','P',' ')},*/	/* Coptic */
   {"coq",	HB_TAG('A','T','H',' ')},	/* Coquille -> Athapaskan */
   {"cpa",	HB_TAG('C','C','H','N')},	/* Palantla Chinantec -> Chinantec */
-  {"cpe",	HB_TAG('C','P','P',' ')},	/* English-based creoles and pidgins [family] -> Creoles */
-  {"cpf",	HB_TAG('C','P','P',' ')},	/* French-based creoles and pidgins [family] -> Creoles */
+  {"cpe",	HB_TAG('C','P','P',' ')},	/* English-based creoles and pidgins [collection] -> Creoles */
+  {"cpf",	HB_TAG('C','P','P',' ')},	/* French-based creoles and pidgins [collection] -> Creoles */
   {"cpi",	HB_TAG('C','P','P',' ')},	/* Chinese Pidgin English -> Creoles */
-/*{"cpp",	HB_TAG('C','P','P',' ')},*/	/* Portuguese-based creoles and pidgins [family] -> Creoles */
+/*{"cpp",	HB_TAG('C','P','P',' ')},*/	/* Portuguese-based creoles and pidgins [collection] -> Creoles */
   {"cpx",	HB_TAG('Z','H','S',' ')},	/* Pu-Xian Chinese -> Chinese, Simplified */
   {"cqd",	HB_TAG('H','M','N',' ')},	/* Chuanqiandian Cluster Miao -> Hmong */
   {"cqu",	HB_TAG('Q','U','H',' ')},	/* Chilean Quechua (retired code) -> Quechua (Bolivia) */
@@ -320,7 +322,7 @@ static const LangTag ot_languages[] = {
   {"crm",	HB_TAG('M','C','R',' ')},	/* Moose Cree */
   {"crm",	HB_TAG('L','C','R',' ')},	/* Moose Cree -> L-Cree */
   {"crm",	HB_TAG('C','R','E',' ')},	/* Moose Cree -> Cree */
-  {"crp",	HB_TAG('C','P','P',' ')},	/* Creoles and pidgins [family] -> Creoles */
+  {"crp",	HB_TAG('C','P','P',' ')},	/* Creoles and pidgins [collection] -> Creoles */
   {"crr",	HB_TAG_NONE	       },	/* Carolina Algonquian != Carrier */
   {"crs",	HB_TAG('C','P','P',' ')},	/* Seselwa Creole French -> Creoles */
   {"crt",	HB_TAG_NONE	       },	/* Iyojwa'ja Chorote != Crimean Tatar */
@@ -431,7 +433,7 @@ static const LangTag ot_languages[] = {
   {"et",	HB_TAG('E','T','I',' ')},	/* Estonian [macrolanguage] */
   {"eto",	HB_TAG('B','T','I',' ')},	/* Eton (Cameroon) -> Beti */
   {"eu",	HB_TAG('E','U','Q',' ')},	/* Basque */
-  {"euq",	HB_TAG_NONE	       },	/* Basque [family] != Basque */
+  {"euq",	HB_TAG_NONE	       },	/* Basque [collection] != Basque */
   {"eve",	HB_TAG('E','V','N',' ')},	/* Even */
   {"evn",	HB_TAG('E','V','K',' ')},	/* Evenki */
   {"ewo",	HB_TAG('B','T','I',' ')},	/* Ewondo -> Beti */
@@ -620,10 +622,11 @@ static const LangTag ot_languages[] = {
   {"ijc",	HB_TAG('I','J','O',' ')},	/* Izon -> Ijo */
   {"ije",	HB_TAG('I','J','O',' ')},	/* Biseni -> Ijo */
   {"ijn",	HB_TAG('I','J','O',' ')},	/* Kalabari -> Ijo */
-/*{"ijo",	HB_TAG('I','J','O',' ')},*/	/* Ijo [family] */
+/*{"ijo",	HB_TAG('I','J','O',' ')},*/	/* Ijo [collection] */
   {"ijs",	HB_TAG('I','J','O',' ')},	/* Southeast Ijo -> Ijo */
   {"ik",	HB_TAG('I','P','K',' ')},	/* Inupiaq [macrolanguage] -> Inupiat */
   {"ike",	HB_TAG('I','N','U',' ')},	/* Eastern Canadian Inuktitut -> Inuktitut */
+  {"ike",	HB_TAG('I','N','U','K')},	/* Eastern Canadian Inuktitut -> Nunavik Inuktitut */
   {"ikt",	HB_TAG('I','N','U',' ')},	/* Inuinnaqtun -> Inuktitut */
 /*{"ilo",	HB_TAG('I','L','O',' ')},*/	/* Iloko -> Ilokano */
   {"in",	HB_TAG('I','N','D',' ')},	/* Indonesian (retired code) */
@@ -638,6 +641,7 @@ static const LangTag ot_languages[] = {
   {"it",	HB_TAG('I','T','A',' ')},	/* Italian */
   {"itz",	HB_TAG('M','Y','N',' ')},	/* Itzá -> Mayan */
   {"iu",	HB_TAG('I','N','U',' ')},	/* Inuktitut [macrolanguage] */
+  {"iu",	HB_TAG('I','N','U','K')},	/* Inuktitut [macrolanguage] -> Nunavik Inuktitut */
   {"iw",	HB_TAG('I','W','R',' ')},	/* Hebrew (retired code) */
   {"ixl",	HB_TAG('M','Y','N',' ')},	/* Ixil -> Mayan */
   {"ja",	HB_TAG('J','A','N',' ')},	/* Japanese */
@@ -667,7 +671,7 @@ static const LangTag ot_languages[] = {
   {"kab",	HB_TAG('B','B','R',' ')},	/* Kabyle -> Berber */
   {"kac",	HB_TAG_NONE	       },	/* Kachin != Kachchi */
   {"kam",	HB_TAG('K','M','B',' ')},	/* Kamba (Kenya) */
-  {"kar",	HB_TAG('K','R','N',' ')},	/* Karen [family] */
+  {"kar",	HB_TAG('K','R','N',' ')},	/* Karen [collection] */
 /*{"kaw",	HB_TAG('K','A','W',' ')},*/	/* Kawi (Old Javanese) */
   {"kbd",	HB_TAG('K','A','B',' ')},	/* Kabardian */
   {"kby",	HB_TAG('K','N','R',' ')},	/* Manga Kanuri -> Kanuri */
@@ -876,7 +880,7 @@ static const LangTag ot_languages[] = {
   {"mam",	HB_TAG('M','A','M',' ')},	/* Mam */
   {"mam",	HB_TAG('M','Y','N',' ')},	/* Mam -> Mayan */
   {"man",	HB_TAG('M','N','K',' ')},	/* Mandingo [macrolanguage] -> Maninka */
-  {"map",	HB_TAG_NONE	       },	/* Austronesian [family] != Mapudungun */
+  {"map",	HB_TAG_NONE	       },	/* Austronesian [collection] != Mapudungun */
   {"maw",	HB_TAG_NONE	       },	/* Mampruli != Marwari */
   {"max",	HB_TAG('M','L','Y',' ')},	/* North Moluccan Malay -> Malay */
   {"max",	HB_TAG('C','P','P',' ')},	/* North Moluccan Malay -> Creoles */
@@ -936,6 +940,7 @@ static const LangTag ot_languages[] = {
   {"mnw",	HB_TAG('M','O','N','T')},	/* Mon -> Thailand Mon */
   {"mnx",	HB_TAG_NONE	       },	/* Manikion != Manx */
   {"mo",	HB_TAG('M','O','L',' ')},	/* Moldavian (retired code) */
+  {"mo",	HB_TAG('R','O','M',' ')},	/* Moldavian (retired code) -> Romanian */
   {"mod",	HB_TAG('C','P','P',' ')},	/* Mobilian -> Creoles */
 /*{"moh",	HB_TAG('M','O','H',' ')},*/	/* Mohawk */
   {"mok",	HB_TAG_NONE	       },	/* Morori != Moksha */
@@ -958,7 +963,7 @@ static const LangTag ot_languages[] = {
   {"mts",	HB_TAG_NONE	       },	/* Yora != Maltese */
   {"mud",	HB_TAG('C','P','P',' ')},	/* Mednyj Aleut -> Creoles */
   {"mui",	HB_TAG('M','L','Y',' ')},	/* Musi -> Malay */
-  {"mun",	HB_TAG_NONE	       },	/* Munda [family] != Mundari */
+  {"mun",	HB_TAG_NONE	       },	/* Munda [collection] != Mundari */
   {"mup",	HB_TAG('R','A','J',' ')},	/* Malvi -> Rajasthani */
   {"muq",	HB_TAG('H','M','N',' ')},	/* Eastern Xiangxi Miao -> Hmong */
 /*{"mus",	HB_TAG('M','U','S',' ')},*/	/* Creek -> Muscogee */
@@ -973,7 +978,7 @@ static const LangTag ot_languages[] = {
   {"mww",	HB_TAG('H','M','N',' ')},	/* Hmong Daw -> Hmong */
   {"my",	HB_TAG('B','R','M',' ')},	/* Burmese */
   {"mym",	HB_TAG('M','E','N',' ')},	/* Me’en */
-/*{"myn",	HB_TAG('M','Y','N',' ')},*/	/* Mayan [family] */
+/*{"myn",	HB_TAG('M','Y','N',' ')},*/	/* Mayan [collection] */
   {"myq",	HB_TAG('M','N','K',' ')},	/* Forest Maninka (retired code) -> Maninka */
   {"myv",	HB_TAG('E','R','Z',' ')},	/* Erzya */
   {"mzb",	HB_TAG('B','B','R',' ')},	/* Tumzabt -> Berber */
@@ -982,7 +987,7 @@ static const LangTag ot_languages[] = {
   {"na",	HB_TAG('N','A','U',' ')},	/* Nauru -> Nauruan */
   {"nag",	HB_TAG('N','A','G',' ')},	/* Naga Pidgin -> Naga-Assamese */
   {"nag",	HB_TAG('C','P','P',' ')},	/* Naga Pidgin -> Creoles */
-/*{"nah",	HB_TAG('N','A','H',' ')},*/	/* Nahuatl [family] */
+/*{"nah",	HB_TAG('N','A','H',' ')},*/	/* Nahuatl [collection] */
   {"nan",	HB_TAG('Z','H','S',' ')},	/* Min Nan Chinese -> Chinese, Simplified */
 /*{"nap",	HB_TAG('N','A','P',' ')},*/	/* Neapolitan */
   {"nas",	HB_TAG_NONE	       },	/* Naasioi != Naskapi */
@@ -1039,7 +1044,6 @@ static const LangTag ot_languages[] = {
   {"nln",	HB_TAG('N','A','H',' ')},	/* Durango Nahuatl (retired code) -> Nahuatl */
   {"nlv",	HB_TAG('N','A','H',' ')},	/* Orizaba Nahuatl -> Nahuatl */
   {"nn",	HB_TAG('N','Y','N',' ')},	/* Norwegian Nynorsk (Nynorsk, Norwegian) */
-  {"nn",	HB_TAG('N','O','R',' ')},	/* Norwegian Nynorsk -> Norwegian */
   {"nnh",	HB_TAG('B','M','L',' ')},	/* Ngiemboon -> Bamileke */
   {"nnz",	HB_TAG('B','M','L',' ')},	/* Nda'nda' -> Bamileke */
   {"no",	HB_TAG('N','O','R',' ')},	/* Norwegian [macrolanguage] */
@@ -1093,7 +1097,7 @@ static const LangTag ot_languages[] = {
   {"otw",	HB_TAG('O','J','B',' ')},	/* Ottawa -> Ojibway */
   {"oua",	HB_TAG('B','B','R',' ')},	/* Tagargrent -> Berber */
   {"pa",	HB_TAG('P','A','N',' ')},	/* Punjabi */
-  {"paa",	HB_TAG_NONE	       },	/* Papuan [family] != Palestinian Aramaic */
+  {"paa",	HB_TAG_NONE	       },	/* Papuan [collection] != Palestinian Aramaic */
 /*{"pag",	HB_TAG('P','A','G',' ')},*/	/* Pangasinan */
   {"pal",	HB_TAG_NONE	       },	/* Pahlavi != Pali */
 /*{"pam",	HB_TAG('P','A','M',' ')},*/	/* Pampanga -> Pampangan */
@@ -1308,6 +1312,9 @@ static const LangTag ot_languages[] = {
   {"sgo",	HB_TAG_NONE	       },	/* Songa (retired code) != Sango */
 /*{"sgs",	HB_TAG('S','G','S',' ')},*/	/* Samogitian */
   {"sgw",	HB_TAG('C','H','G',' ')},	/* Sebat Bet Gurage -> Chaha Gurage */
+  {"sh",	HB_TAG('B','O','S',' ')},	/* Serbo-Croatian [macrolanguage] -> Bosnian */
+  {"sh",	HB_TAG('H','R','V',' ')},	/* Serbo-Croatian [macrolanguage] -> Croatian */
+  {"sh",	HB_TAG('S','R','B',' ')},	/* Serbo-Croatian [macrolanguage] -> Serbian */
   {"shi",	HB_TAG('S','H','I',' ')},	/* Tachelhit */
   {"shi",	HB_TAG('B','B','R',' ')},	/* Tachelhit -> Berber */
   {"shl",	HB_TAG('Q','I','N',' ')},	/* Shendu -> Chin */
@@ -1329,7 +1336,7 @@ static const LangTag ot_languages[] = {
   {"skw",	HB_TAG('C','P','P',' ')},	/* Skepi Creole Dutch -> Creoles */
   {"sky",	HB_TAG_NONE	       },	/* Sikaiana != Slovak */
   {"sl",	HB_TAG('S','L','V',' ')},	/* Slovenian */
-  {"sla",	HB_TAG_NONE	       },	/* Slavic [family] != Slavey */
+  {"sla",	HB_TAG_NONE	       },	/* Slavic [collection] != Slavey */
   {"sm",	HB_TAG('S','M','O',' ')},	/* Samoan */
   {"sma",	HB_TAG('S','S','M',' ')},	/* Southern Sami */
   {"smj",	HB_TAG('L','S','M',' ')},	/* Lule Sami */
@@ -1451,7 +1458,7 @@ static const LangTag ot_languages[] = {
   {"tpi",	HB_TAG('C','P','P',' ')},	/* Tok Pisin -> Creoles */
   {"tr",	HB_TAG('T','R','K',' ')},	/* Turkish */
   {"trf",	HB_TAG('C','P','P',' ')},	/* Trinidadian Creole English -> Creoles */
-  {"trk",	HB_TAG_NONE	       },	/* Turkic [family] != Turkish */
+  {"trk",	HB_TAG_NONE	       },	/* Turkic [collection] != Turkish */
   {"tru",	HB_TAG('T','U','A',' ')},	/* Turoyo -> Turoyo Aramaic */
   {"tru",	HB_TAG('S','Y','R',' ')},	/* Turoyo -> Syriac */
   {"ts",	HB_TAG('T','S','G',' ')},	/* Tsonga */
@@ -1593,7 +1600,7 @@ static const LangTag ot_languages[] = {
   {"zlq",	HB_TAG('Z','H','A',' ')},	/* Liuqian Zhuang -> Zhuang */
   {"zmi",	HB_TAG('M','L','Y',' ')},	/* Negeri Sembilan Malay -> Malay */
   {"zmz",	HB_TAG('B','A','D','0')},	/* Mbandja -> Banda */
-  {"znd",	HB_TAG_NONE	       },	/* Zande [family] != Zande */
+  {"znd",	HB_TAG_NONE	       },	/* Zande [collection] != Zande */
   {"zne",	HB_TAG('Z','N','D',' ')},	/* Zande */
   {"zom",	HB_TAG('Q','I','N',' ')},	/* Zou -> Chin */
   {"zqe",	HB_TAG('Z','H','A',' ')},	/* Qiubei Zhuang -> Zhuang */
@@ -2607,14 +2614,8 @@ hb_ot_tags_from_complex_language (const char   *lang_str,
     if (0 == strcmp (&lang_str[1], "o-nyn"))
     {
       /* Norwegian Nynorsk (retired code) */
-      unsigned int i;
-      hb_tag_t possible_tags[] = {
-	HB_TAG('N','Y','N',' '),  /* Norwegian Nynorsk (Nynorsk, Norwegian) */
-	HB_TAG('N','O','R',' '),  /* Norwegian */
-      };
-      for (i = 0; i < 2 && i < *count; i++)
-	tags[i] = possible_tags[i];
-      *count = i;
+      tags[0] = HB_TAG('N','Y','N',' ');  /* Norwegian Nynorsk (Nynorsk, Norwegian) */
+      *count = 1;
       return true;
     }
     break;
@@ -2623,8 +2624,14 @@ hb_ot_tags_from_complex_language (const char   *lang_str,
 	&& subtag_matches (lang_str, limit, "-md"))
     {
       /* Romanian; Moldova */
-      tags[0] = HB_TAG('M','O','L',' ');  /* Moldavian */
-      *count = 1;
+      unsigned int i;
+      hb_tag_t possible_tags[] = {
+	HB_TAG('M','O','L',' '),  /* Moldavian */
+	HB_TAG('R','O','M',' '),  /* Romanian */
+      };
+      for (i = 0; i < 2 && i < *count; i++)
+	tags[i] = possible_tags[i];
+      *count = i;
       return true;
     }
     break;
@@ -2813,15 +2820,15 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('A','R','K',' '):  /* Rakhine */
     return hb_language_from_string ("rki", -1);  /* Rakhine */
   case HB_TAG('A','T','H',' '):  /* Athapaskan */
-    return hb_language_from_string ("ath", -1);  /* Athapascan [family] */
+    return hb_language_from_string ("ath", -1);  /* Athapascan [collection] */
   case HB_TAG('B','B','R',' '):  /* Berber */
-    return hb_language_from_string ("ber", -1);  /* Berber [family] */
+    return hb_language_from_string ("ber", -1);  /* Berber [collection] */
   case HB_TAG('B','I','K',' '):  /* Bikol */
     return hb_language_from_string ("bik", -1);  /* Bikol [macrolanguage] */
   case HB_TAG('B','T','K',' '):  /* Batak */
-    return hb_language_from_string ("btk", -1);  /* Batak [family] */
+    return hb_language_from_string ("btk", -1);  /* Batak [collection] */
   case HB_TAG('C','P','P',' '):  /* Creoles */
-    return hb_language_from_string ("crp", -1);  /* Creoles and pidgins [family] */
+    return hb_language_from_string ("crp", -1);  /* Creoles and pidgins [collection] */
   case HB_TAG('C','R','R',' '):  /* Carrier */
     return hb_language_from_string ("crx", -1);  /* Carrier */
   case HB_TAG('D','G','R',' '):  /* Dogri (macrolanguage) */
@@ -2838,6 +2845,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
     return hb_language_from_string ("fa", -1);  /* Persian [macrolanguage] */
   case HB_TAG('G','O','N',' '):  /* Gondi */
     return hb_language_from_string ("gon", -1);  /* Gondi [macrolanguage] */
+  case HB_TAG('H','M','A',' '):  /* High Mari */
+    return hb_language_from_string ("mrj", -1);  /* Western Mari */
   case HB_TAG('H','M','N',' '):  /* Hmong */
     return hb_language_from_string ("hmn", -1);  /* Hmong [macrolanguage] */
   case HB_TAG('H','N','D',' '):  /* Hindko */
@@ -2847,7 +2856,7 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('I','B','A',' '):  /* Iban */
     return hb_language_from_string ("iba", -1);  /* Iban */
   case HB_TAG('I','J','O',' '):  /* Ijo */
-    return hb_language_from_string ("ijo", -1);  /* Ijo [family] */
+    return hb_language_from_string ("ijo", -1);  /* Ijo [collection] */
   case HB_TAG('I','N','U',' '):  /* Inuktitut */
     return hb_language_from_string ("iu", -1);  /* Inuktitut [macrolanguage] */
   case HB_TAG('I','P','K',' '):  /* Inupiat */
@@ -2873,11 +2882,13 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('K','P','L',' '):  /* Kpelle */
     return hb_language_from_string ("kpe", -1);  /* Kpelle [macrolanguage] */
   case HB_TAG('K','R','N',' '):  /* Karen */
-    return hb_language_from_string ("kar", -1);  /* Karen [family] */
+    return hb_language_from_string ("kar", -1);  /* Karen [collection] */
   case HB_TAG('K','U','I',' '):  /* Kui */
     return hb_language_from_string ("uki", -1);  /* Kui (India) */
   case HB_TAG('K','U','R',' '):  /* Kurdish */
     return hb_language_from_string ("ku", -1);  /* Kurdish [macrolanguage] */
+  case HB_TAG('L','M','A',' '):  /* Low Mari */
+    return hb_language_from_string ("mhr", -1);  /* Eastern Mari */
   case HB_TAG('L','U','H',' '):  /* Luyia */
     return hb_language_from_string ("luy", -1);  /* Luyia [macrolanguage] */
   case HB_TAG('L','V','I',' '):  /* Latvian */
@@ -2897,9 +2908,9 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('M','O','N','T'):  /* Thailand Mon */
     return hb_language_from_string ("mnw-TH", -1);  /* Mon; Thailand */
   case HB_TAG('M','Y','N',' '):  /* Mayan */
-    return hb_language_from_string ("myn", -1);  /* Mayan [family] */
+    return hb_language_from_string ("myn", -1);  /* Mayan [collection] */
   case HB_TAG('N','A','H',' '):  /* Nahuatl */
-    return hb_language_from_string ("nah", -1);  /* Nahuatl [family] */
+    return hb_language_from_string ("nah", -1);  /* Nahuatl [collection] */
   case HB_TAG('N','E','P',' '):  /* Nepali */
     return hb_language_from_string ("ne", -1);  /* Nepali [macrolanguage] */
   case HB_TAG('N','I','S',' '):  /* Nisi */
@@ -2926,6 +2937,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
     return hb_language_from_string ("qwh", -1);  /* Huaylas Ancash Quechua */
   case HB_TAG('R','A','J',' '):  /* Rajasthani */
     return hb_language_from_string ("raj", -1);  /* Rajasthani [macrolanguage] */
+  case HB_TAG('R','O','M',' '):  /* Romanian */
+    return hb_language_from_string ("ro", -1);  /* Romanian */
   case HB_TAG('R','O','Y',' '):  /* Romany */
     return hb_language_from_string ("rom", -1);  /* Romany [macrolanguage] */
   case HB_TAG('S','Q','I',' '):  /* Albanian */
diff --git a/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh
index 05f289db26..e066558683 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh
@@ -263,7 +263,7 @@ struct fvar
     if (coords_length && *coords_length)
     {
       hb_array_t<const HBFixed> instanceCoords = instance->get_coordinates (axisCount)
-							 .sub_array (0, *coords_length);
+							 .sub_array (0, coords_length);
       for (unsigned int i = 0; i < instanceCoords.length; i++)
 	coords[i] = instanceCoords.arrayZ[i].to_float ();
     }
diff --git a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh
index 49b5532d40..539213c339 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh
@@ -399,7 +399,7 @@ struct gvar
 				  get_offset (glyphCount) - get_offset (0)));
   }
 
-  /* GlyphVariationData not sanitized here; must be checked while accessing each glyph varation data */
+  /* GlyphVariationData not sanitized here; must be checked while accessing each glyph variation data */
   bool sanitize (hb_sanitize_context_t *c) const
   { return sanitize_shallow (c); }
 
@@ -498,9 +498,9 @@ struct gvar
   public:
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { table = hb_sanitize_context_t ().reference_table<gvar> (face); }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     private:
     struct x_getter { static float get (const contour_point_t &p) { return p.x; } };
@@ -698,7 +698,9 @@ no_more_gaps:
   DEFINE_SIZE_MIN (20);
 };
 
-struct gvar_accelerator_t : gvar::accelerator_t {};
+struct gvar_accelerator_t : gvar::accelerator_t {
+  gvar_accelerator_t (hb_face_t *face) : gvar::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh
index 074b6a3785..e9d90352f0 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh
@@ -177,9 +177,6 @@ struct hvarvvar_subset_plan_t
 
     inner_maps.resize (var_store->get_sub_table_count ());
 
-    for (unsigned int i = 0; i < inner_maps.length; i++)
-      inner_maps[i].init ();
-
     if (unlikely (!index_map_plans.length || !inner_sets.length || !inner_maps.length)) return;
 
     bool retain_adv_map = false;
@@ -229,8 +226,8 @@ struct hvarvvar_subset_plan_t
     for (unsigned int i = 0; i < inner_sets.length; i++)
       hb_set_destroy (inner_sets[i]);
     hb_set_destroy (adv_set);
-    inner_maps.fini_deep ();
-    index_map_plans.fini_deep ();
+    inner_maps.fini ();
+    index_map_plans.fini ();
   }
 
   hb_inc_bimap_t outer_map;
diff --git a/thirdparty/harfbuzz/src/hb-ot-var.cc b/thirdparty/harfbuzz/src/hb-ot-var.cc
index 6b42b45cd9..0376e26b4a 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-var.cc
@@ -303,6 +303,9 @@ hb_ot_var_normalize_variations (hb_face_t            *face,
  * values for the axis are mapped to the interval [-1,1], with the default
  * axis value mapped to 0.
  *
+ * The normalized values have 14 bits of fixed-point sub-integer precision as per
+ * OpenType specification.
+ *
  * Any additional scaling defined in the face's `avar` table is also
  * applied, as described at https://docs.microsoft.com/en-us/typography/opentype/spec/avar
  *
diff --git a/thirdparty/harfbuzz/src/hb-ot-var.h b/thirdparty/harfbuzz/src/hb-ot-var.h
index ce201d3b4f..05147cc25e 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var.h
+++ b/thirdparty/harfbuzz/src/hb-ot-var.h
@@ -109,7 +109,7 @@ typedef enum { /*< flags >*/
  * @tag: The #hb_tag_t tag identifying the design variation of the axis
  * @name_id: The `name` table Name ID that provides display names for the axis
  * @flags: The #hb_ot_var_axis_flags_t flags for the axis
- * @min_value: The mininum value on the variation axis that the font covers
+ * @min_value: The minimum value on the variation axis that the font covers
  * @default_value: The position on the variation axis corresponding to the font's defaults
  * @max_value: The maximum value on the variation axis that the font covers
  * 
diff --git a/thirdparty/harfbuzz/src/hb-repacker.hh b/thirdparty/harfbuzz/src/hb-repacker.hh
index 5c46b4cccc..b1726d8beb 100644
--- a/thirdparty/harfbuzz/src/hb-repacker.hh
+++ b/thirdparty/harfbuzz/src/hb-repacker.hh
@@ -42,26 +42,13 @@ struct graph_t
 {
   struct vertex_t
   {
-    vertex_t () :
-        distance (0),
-        space (0),
-        parents (),
-        start (0),
-        end (0),
-        priority(0) {}
-
-    void fini () {
-      obj.fini ();
-      parents.fini ();
-    }
-
     hb_serialize_context_t::object_t obj;
-    int64_t distance;
-    int64_t space;
+    int64_t distance = 0 ;
+    int64_t space = 0 ;
     hb_vector_t<unsigned> parents;
-    unsigned start;
-    unsigned end;
-    unsigned priority;
+    unsigned start = 0;
+    unsigned end = 0;
+    unsigned priority = 0;
 
     bool is_shared () const
     {
@@ -186,7 +173,7 @@ struct graph_t
 
   ~graph_t ()
   {
-    vertices_.fini_deep ();
+    vertices_.fini ();
   }
 
   bool in_error () const
@@ -309,7 +296,7 @@ struct graph_t
     remap_all_obj_indices (id_map, &sorted_graph);
 
     hb_swap (vertices_, sorted_graph);
-    sorted_graph.fini_deep ();
+    sorted_graph.fini ();
   }
 
   /*
@@ -369,7 +356,7 @@ struct graph_t
     remap_all_obj_indices (id_map, &sorted_graph);
 
     hb_swap (vertices_, sorted_graph);
-    sorted_graph.fini_deep ();
+    sorted_graph.fini ();
   }
 
   /*
@@ -402,11 +389,15 @@ struct graph_t
     while (roots)
     {
       unsigned next = HB_SET_VALUE_INVALID;
+      if (unlikely (!check_success (!roots.in_error ()))) break;
       if (!roots.next (&next)) break;
 
       hb_set_t connected_roots;
       find_connected_nodes (next, roots, visited, connected_roots);
+      if (unlikely (!check_success (!connected_roots.in_error ()))) break;
+
       isolate_subgraph (connected_roots);
+      if (unlikely (!check_success (!connected_roots.in_error ()))) break;
 
       unsigned next_space = this->next_space ();
       num_roots_for_space_.push (0);
@@ -423,6 +414,8 @@ struct graph_t
       //                into the 32 bit space as needed, instead of using isolation.
     }
 
+
+
     return true;
   }
 
@@ -865,7 +858,7 @@ struct graph_t
     // Redundant ones are filtered out later on by the visited set.
     // According to https://www3.cs.stonybrook.edu/~rezaul/papers/TR-07-54.pdf
     // for practical performance this is faster then using a more advanced queue
-    // (such as a fibonaacci queue) with a fast decrease priority.
+    // (such as a fibonacci queue) with a fast decrease priority.
     for (unsigned i = 0; i < vertices_.length; i++)
     {
       if (i == vertices_.length - 1)
@@ -1074,6 +1067,7 @@ struct graph_t
                              hb_set_t& visited,
                              hb_set_t& connected)
   {
+    if (unlikely (!check_success (!visited.in_error ()))) return;
     if (visited.has (start_idx)) return;
     visited.add (start_idx);
 
diff --git a/thirdparty/harfbuzz/src/hb-serialize.hh b/thirdparty/harfbuzz/src/hb-serialize.hh
index 823c0be8b5..6615f033c5 100644
--- a/thirdparty/harfbuzz/src/hb-serialize.hh
+++ b/thirdparty/harfbuzz/src/hb-serialize.hh
@@ -279,7 +279,7 @@ struct hb_serialize_context_t
     object_pool.release (obj);
   }
 
-  /* Set share to false when an object is unlikely sharable with others
+  /* Set share to false when an object is unlikely shareable with others
    * so not worth an attempt, or a contiguous table is serialized as
    * multiple consecutive objects in the reverse order so can't be shared.
    */
@@ -381,7 +381,7 @@ struct hb_serialize_context_t
   // Adding a virtual link from object a to object b will ensure that object b is always packed after
   // object a in the final serialized order.
   //
-  // This is useful in certain situtations where there needs to be a specific ordering in the
+  // This is useful in certain situations where there needs to be a specific ordering in the
   // final serialization. Such as when platform bugs require certain orderings, or to provide
   //  guidance to the repacker for better offset overflow resolution.
   void add_virtual_link (objidx_t objidx)
@@ -510,7 +510,7 @@ struct hb_serialize_context_t
   { return reinterpret_cast<Type *> (this->head); }
   template <typename Type>
   Type *start_embed (const Type &obj) const
-  { return start_embed (hb_addressof (obj)); }
+  { return start_embed (std::addressof (obj)); }
 
   bool err (hb_serialize_error_t err_type)
   {
@@ -548,7 +548,7 @@ struct hb_serialize_context_t
   }
   template <typename Type>
   Type *embed (const Type &obj)
-  { return embed (hb_addressof (obj)); }
+  { return embed (std::addressof (obj)); }
 
   template <typename Type, typename ...Ts> auto
   _copy (const Type &src, hb_priority<1>, Ts&&... ds) HB_RETURN
@@ -595,19 +595,19 @@ struct hb_serialize_context_t
   }
   template <typename Type>
   Type *extend_size (Type &obj, size_t size)
-  { return extend_size (hb_addressof (obj), size); }
+  { return extend_size (std::addressof (obj), size); }
 
   template <typename Type>
   Type *extend_min (Type *obj) { return extend_size (obj, obj->min_size); }
   template <typename Type>
-  Type *extend_min (Type &obj) { return extend_min (hb_addressof (obj)); }
+  Type *extend_min (Type &obj) { return extend_min (std::addressof (obj)); }
 
   template <typename Type, typename ...Ts>
   Type *extend (Type *obj, Ts&&... ds)
   { return extend_size (obj, obj->get_size (std::forward<Ts> (ds)...)); }
   template <typename Type, typename ...Ts>
   Type *extend (Type &obj, Ts&&... ds)
-  { return extend (hb_addressof (obj), std::forward<Ts> (ds)...); }
+  { return extend (std::addressof (obj), std::forward<Ts> (ds)...); }
 
   /* Output routines. */
   hb_bytes_t copy_bytes () const
diff --git a/thirdparty/harfbuzz/src/hb-style.cc b/thirdparty/harfbuzz/src/hb-style.cc
index f1b44cea53..c0c5c4832c 100644
--- a/thirdparty/harfbuzz/src/hb-style.cc
+++ b/thirdparty/harfbuzz/src/hb-style.cc
@@ -48,13 +48,12 @@ _hb_angle_to_ratio (float a)
 {
   return tanf (a * float (M_PI / 180.));
 }
-#if 0
+
 static inline float
 _hb_ratio_to_angle (float r)
 {
   return atanf (r) * float (180. / M_PI);
 }
-#endif
 
 /**
  * hb_style_get_value:
@@ -73,7 +72,8 @@ float
 hb_style_get_value (hb_font_t *font, hb_style_tag_t style_tag)
 {
   if (unlikely (style_tag == HB_STYLE_TAG_SLANT_RATIO))
-    return _hb_angle_to_ratio (hb_style_get_value (font, HB_STYLE_TAG_SLANT_ANGLE));
+    return _hb_angle_to_ratio (hb_style_get_value (font, HB_STYLE_TAG_SLANT_ANGLE))
+	 + font->slant;
 
   hb_face_t *face = font->face;
 
@@ -109,7 +109,14 @@ hb_style_get_value (hb_font_t *font, hb_style_tag_t style_tag)
 	   : 12.f;
   }
   case HB_STYLE_TAG_SLANT_ANGLE:
-    return face->table.post->table->italicAngle.to_float ();
+  {
+    float angle = face->table.post->table->italicAngle.to_float ();
+
+    if (font->slant)
+      angle = _hb_ratio_to_angle (font->slant + _hb_angle_to_ratio (angle));
+
+    return angle;
+  }
   case HB_STYLE_TAG_WIDTH:
     return face->table.OS2->has_data ()
 	   ? face->table.OS2->get_width ()
diff --git a/thirdparty/harfbuzz/src/hb-subset-cff-common.hh b/thirdparty/harfbuzz/src/hb-subset-cff-common.hh
index 7fd96ca86d..18657705fa 100644
--- a/thirdparty/harfbuzz/src/hb-subset-cff-common.hh
+++ b/thirdparty/harfbuzz/src/hb-subset-cff-common.hh
@@ -275,60 +275,36 @@ struct subr_flattener_t
 
 struct subr_closures_t
 {
-  subr_closures_t () : valid (false), global_closure (nullptr)
-  { local_closures.init (); }
-
-  void init (unsigned int fd_count)
+  subr_closures_t (unsigned int fd_count) : valid (false), global_closure (), local_closures ()
   {
     valid = true;
-    global_closure = hb_set_create ();
-    if (global_closure == hb_set_get_empty ())
-      valid = false;
     if (!local_closures.resize (fd_count))
       valid = false;
-
-    for (unsigned int i = 0; i < local_closures.length; i++)
-    {
-      local_closures[i] = hb_set_create ();
-      if (local_closures[i] == hb_set_get_empty ())
-	valid = false;
-    }
-  }
-
-  void fini ()
-  {
-    hb_set_destroy (global_closure);
-    for (unsigned int i = 0; i < local_closures.length; i++)
-      hb_set_destroy (local_closures[i]);
-    local_closures.fini ();
   }
 
   void reset ()
   {
-    hb_set_clear (global_closure);
+    global_closure.clear();
     for (unsigned int i = 0; i < local_closures.length; i++)
-      hb_set_clear (local_closures[i]);
+      local_closures[i].clear();
   }
 
   bool is_valid () const { return valid; }
   bool  valid;
-  hb_set_t  *global_closure;
-  hb_vector_t<hb_set_t *> local_closures;
+  hb_set_t  global_closure;
+  hb_vector_t<hb_set_t> local_closures;
 };
 
 struct parsed_cs_op_t : op_str_t
 {
   void init (unsigned int subr_num_ = 0)
   {
-    op_str_t::init ();
     subr_num = subr_num_;
     drop_flag = false;
     keep_flag = false;
     skip_flag = false;
   }
 
-  void fini () { op_str_t::fini (); }
-
   bool for_drop () const { return drop_flag; }
   void set_drop ()       { if (!for_keep ()) drop_flag = true; }
 
@@ -416,16 +392,6 @@ struct parsed_cs_str_t : parsed_values_t<parsed_cs_op_t>
 
 struct parsed_cs_str_vec_t : hb_vector_t<parsed_cs_str_t>
 {
-  void init (unsigned int len_ = 0)
-  {
-    SUPER::init ();
-    if (unlikely (!resize (len_)))
-      return;
-    for (unsigned int i = 0; i < length; i++)
-      (*this)[i].init ();
-  }
-  void fini () { SUPER::fini_deep (); }
-
   private:
   typedef hb_vector_t<parsed_cs_str_t> SUPER;
 };
@@ -496,7 +462,7 @@ struct subr_subset_param_t
 
 struct subr_remap_t : hb_inc_bimap_t
 {
-  void create (hb_set_t *closure)
+  void create (const hb_set_t *closure)
   {
     /* create a remapping of subroutine numbers from old to new.
      * no optimization based on usage counts. fonttools doesn't appear doing that either.
@@ -526,19 +492,9 @@ struct subr_remap_t : hb_inc_bimap_t
 
 struct subr_remaps_t
 {
-  subr_remaps_t ()
+  subr_remaps_t (unsigned int fdCount)
   {
-    global_remap.init ();
-    local_remaps.init ();
-  }
-
-  ~subr_remaps_t () { fini (); }
-
-  void init (unsigned int fdCount)
-  {
-    if (unlikely (!local_remaps.resize (fdCount))) return;
-    for (unsigned int i = 0; i < fdCount; i++)
-      local_remaps[i].init ();
+    local_remaps.resize (fdCount);
   }
 
   bool in_error()
@@ -548,15 +504,9 @@ struct subr_remaps_t
 
   void create (subr_closures_t& closures)
   {
-    global_remap.create (closures.global_closure);
+    global_remap.create (&closures.global_closure);
     for (unsigned int i = 0; i < local_remaps.length; i++)
-      local_remaps[i].create (closures.local_closures[i]);
-  }
-
-  void fini ()
-  {
-    global_remap.fini ();
-    local_remaps.fini_deep ();
+      local_remaps[i].create (&closures.local_closures[i]);
   }
 
   subr_remap_t	       global_remap;
@@ -567,21 +517,8 @@ template <typename SUBSETTER, typename SUBRS, typename ACC, typename ENV, typena
 struct subr_subsetter_t
 {
   subr_subsetter_t (ACC &acc_, const hb_subset_plan_t *plan_)
-    : acc (acc_), plan (plan_)
-  {
-    parsed_charstrings.init ();
-    parsed_global_subrs.init ();
-    parsed_local_subrs.init ();
-  }
-
-  ~subr_subsetter_t ()
-  {
-    closures.fini ();
-    remaps.fini ();
-    parsed_charstrings.fini_deep ();
-    parsed_global_subrs.fini_deep ();
-    parsed_local_subrs.fini_deep ();
-  }
+      : acc (acc_), plan (plan_), closures(acc_.fdCount), remaps(acc_.fdCount)
+  {}
 
   /* Subroutine subsetting with --no-desubroutinize runs in phases:
    *
@@ -599,11 +536,8 @@ struct subr_subsetter_t
    */
   bool subset (void)
   {
-    closures.init (acc.fdCount);
-    remaps.init (acc.fdCount);
-
-    parsed_charstrings.init (plan->num_output_glyphs ());
-    parsed_global_subrs.init (acc.globalSubrs->count);
+    parsed_charstrings.resize (plan->num_output_glyphs ());
+    parsed_global_subrs.resize (acc.globalSubrs->count);
 
     if (unlikely (remaps.in_error()
                   || parsed_charstrings.in_error ()
@@ -615,7 +549,7 @@ struct subr_subsetter_t
 
     for (unsigned int i = 0; i < acc.fdCount; i++)
     {
-      parsed_local_subrs[i].init (acc.privateDicts[i].localSubrs->count);
+      parsed_local_subrs[i].resize (acc.privateDicts[i].localSubrs->count);
       if (unlikely (parsed_local_subrs[i].in_error ())) return false;
     }
     if (unlikely (!closures.valid))
@@ -638,7 +572,7 @@ struct subr_subsetter_t
       subr_subset_param_t  param;
       param.init (&parsed_charstrings[i],
 		  &parsed_global_subrs,  &parsed_local_subrs[fd],
-		  closures.global_closure, closures.local_closures[fd],
+		  &closures.global_closure, &closures.local_closures[fd],
 		  plan->flags & HB_SUBSET_FLAGS_NO_HINTING);
 
       if (unlikely (!interp.interpret (param)))
@@ -662,7 +596,7 @@ struct subr_subsetter_t
 	subr_subset_param_t  param;
 	param.init (&parsed_charstrings[i],
 		    &parsed_global_subrs,  &parsed_local_subrs[fd],
-		    closures.global_closure, closures.local_closures[fd],
+		    &closures.global_closure, &closures.local_closures[fd],
                     plan->flags & HB_SUBSET_FLAGS_NO_HINTING);
 
 	drop_hints_param_t  drop;
@@ -687,7 +621,7 @@ struct subr_subsetter_t
 	subr_subset_param_t  param;
 	param.init (&parsed_charstrings[i],
 		    &parsed_global_subrs,  &parsed_local_subrs[fd],
-		    closures.global_closure, closures.local_closures[fd],
+		    &closures.global_closure, &closures.local_closures[fd],
                     plan->flags & HB_SUBSET_FLAGS_NO_HINTING);
 	collect_subr_refs_in_str (parsed_charstrings[i], param);
       }
diff --git a/thirdparty/harfbuzz/src/hb-subset-cff1.cc b/thirdparty/harfbuzz/src/hb-subset-cff1.cc
index b4e24122c9..35fecd67bc 100644
--- a/thirdparty/harfbuzz/src/hb-subset-cff1.cc
+++ b/thirdparty/harfbuzz/src/hb-subset-cff1.cc
@@ -362,43 +362,11 @@ struct cff1_subr_subsetter_t : subr_subsetter_t<cff1_subr_subsetter_t, CFF1Subrs
 
 struct cff_subset_plan {
   cff_subset_plan ()
-    : info (),
-      orig_fdcount (0),
-      subset_fdcount (1),
-      subset_fdselect_format (0),
-      drop_hints (false),
-      desubroutinize(false)
   {
-    topdict_mod.init ();
-    subset_fdselect_ranges.init ();
-    fdmap.init ();
-    subset_charstrings.init ();
-    subset_globalsubrs.init ();
-    subset_localsubrs.init ();
-    fontdicts_mod.init ();
-    subset_enc_code_ranges.init ();
-    subset_enc_supp_codes.init ();
-    subset_charset_ranges.init ();
-    sidmap.init ();
     for (unsigned int i = 0; i < name_dict_values_t::ValCount; i++)
       topDictModSIDs[i] = CFF_UNDEF_SID;
   }
 
-  ~cff_subset_plan ()
-  {
-    topdict_mod.fini ();
-    subset_fdselect_ranges.fini ();
-    fdmap.fini ();
-    subset_charstrings.fini_deep ();
-    subset_globalsubrs.fini_deep ();
-    subset_localsubrs.fini_deep ();
-    fontdicts_mod.fini ();
-    subset_enc_code_ranges.fini ();
-    subset_enc_supp_codes.fini ();
-    subset_charset_ranges.fini ();
-    sidmap.fini ();
-  }
-
   void plan_subset_encoding (const OT::cff1::accelerator_subset_t &acc, hb_subset_plan_t *plan)
   {
     const Encoding *encoding = acc.encoding;
@@ -672,9 +640,9 @@ struct cff_subset_plan {
   cff1_sub_table_info_t		info;
 
   unsigned int    num_glyphs;
-  unsigned int    orig_fdcount;
-  unsigned int    subset_fdcount;
-  unsigned int    subset_fdselect_format;
+  unsigned int    orig_fdcount = 0;
+  unsigned int    subset_fdcount = 1;
+  unsigned int    subset_fdselect_format = 0;
   hb_vector_t<code_pair_t>   subset_fdselect_ranges;
 
   /* font dict index remap table from fullset FDArray to subset FDArray.
@@ -686,7 +654,7 @@ struct cff_subset_plan {
   hb_vector_t<str_buff_vec_t>	subset_localsubrs;
   hb_vector_t<cff1_font_dict_values_mod_t>  fontdicts_mod;
 
-  bool		drop_hints;
+  bool		drop_hints = false;
 
   bool		gid_renum;
   bool		subset_encoding;
@@ -702,7 +670,7 @@ struct cff_subset_plan {
   remap_sid_t	sidmap;
   unsigned int	topDictModSIDs[name_dict_values_t::ValCount];
 
-  bool		desubroutinize;
+  bool		desubroutinize = false;
 };
 
 static bool _serialize_cff1 (hb_serialize_context_t *c,
diff --git a/thirdparty/harfbuzz/src/hb-subset-cff2.cc b/thirdparty/harfbuzz/src/hb-subset-cff2.cc
index 896ae64016..92dd6b1d2c 100644
--- a/thirdparty/harfbuzz/src/hb-subset-cff2.cc
+++ b/thirdparty/harfbuzz/src/hb-subset-cff2.cc
@@ -233,29 +233,6 @@ struct cff2_subr_subsetter_t : subr_subsetter_t<cff2_subr_subsetter_t, CFF2Subrs
 };
 
 struct cff2_subset_plan {
-  cff2_subset_plan ()
-    : orig_fdcount (0),
-      subset_fdcount(1),
-      subset_fdselect_size (0),
-      subset_fdselect_format (0),
-      drop_hints (false),
-      desubroutinize (false)
-  {
-    subset_fdselect_ranges.init ();
-    fdmap.init ();
-    subset_charstrings.init ();
-    subset_globalsubrs.init ();
-    subset_localsubrs.init ();
-  }
-
-  ~cff2_subset_plan ()
-  {
-    subset_fdselect_ranges.fini ();
-    fdmap.fini ();
-    subset_charstrings.fini_deep ();
-    subset_globalsubrs.fini_deep ();
-    subset_localsubrs.fini_deep ();
-  }
 
   bool create (const OT::cff2::accelerator_subset_t &acc,
 	      hb_subset_plan_t *plan)
@@ -320,10 +297,10 @@ struct cff2_subset_plan {
 
   cff2_sub_table_info_t info;
 
-  unsigned int    orig_fdcount;
-  unsigned int    subset_fdcount;
-  unsigned int	  subset_fdselect_size;
-  unsigned int    subset_fdselect_format;
+  unsigned int    orig_fdcount = 0;
+  unsigned int    subset_fdcount = 1;
+  unsigned int	  subset_fdselect_size = 0;
+  unsigned int    subset_fdselect_format = 0;
   hb_vector_t<code_pair_t>   subset_fdselect_ranges;
 
   hb_inc_bimap_t   fdmap;
@@ -332,8 +309,8 @@ struct cff2_subset_plan {
   str_buff_vec_t	    subset_globalsubrs;
   hb_vector_t<str_buff_vec_t> subset_localsubrs;
 
-  bool	    drop_hints;
-  bool	    desubroutinize;
+  bool	    drop_hints = false;
+  bool	    desubroutinize = false;
 };
 
 static bool _serialize_cff2 (hb_serialize_context_t *c,
@@ -473,12 +450,8 @@ _hb_subset_cff2 (const OT::cff2::accelerator_subset_t  &acc,
 bool
 hb_subset_cff2 (hb_subset_context_t *c)
 {
-  OT::cff2::accelerator_subset_t acc;
-  acc.init (c->plan->source);
-  bool result = likely (acc.is_valid ()) && _hb_subset_cff2 (acc, c);
-  acc.fini ();
-
-  return result;
+  OT::cff2::accelerator_subset_t acc (c->plan->source); /* accelerator is built from the plan's source face by its constructor */
+  return acc.is_valid () && _hb_subset_cff2 (acc, c); /* subsetting only runs when the accelerator is valid; no explicit fini () call remains */
 }
 
 #endif
diff --git a/thirdparty/harfbuzz/src/hb-subset-plan.cc b/thirdparty/harfbuzz/src/hb-subset-plan.cc
index 883ab82093..af4fcb8137 100644
--- a/thirdparty/harfbuzz/src/hb-subset-plan.cc
+++ b/thirdparty/harfbuzz/src/hb-subset-plan.cc
@@ -228,10 +228,8 @@ _cmap_closure (hb_face_t	   *face,
 	       const hb_set_t	   *unicodes,
 	       hb_set_t		   *glyphset)
 {
-  OT::cmap::accelerator_t cmap;
-  cmap.init (face);
+  OT::cmap::accelerator_t cmap (face);
   cmap.table->closure_glyphs (unicodes, glyphset);
-  cmap.fini ();
 }
 
 static void _colr_closure (hb_face_t *face,
@@ -239,8 +237,7 @@ static void _colr_closure (hb_face_t *face,
                            hb_map_t *palettes_map,
                            hb_set_t *glyphs_colred)
 {
-  OT::COLR::accelerator_t colr;
-  colr.init (face);
+  OT::COLR::accelerator_t colr (face);
   if (!colr.is_valid ()) return;
 
   unsigned iteration_count = 0;
@@ -263,7 +260,6 @@ static void _colr_closure (hb_face_t *face,
   colr.closure_V0palette_indices (glyphs_colred, &palette_indices);
   _remap_indexes (&layer_indices, layers_map);
   _remap_palette_indexes (&palette_indices, palettes_map);
-  colr.fini ();
 }
 
 static inline void
@@ -294,8 +290,7 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
                               const hb_set_t *glyphs,
                               hb_subset_plan_t *plan)
 {
-  OT::cmap::accelerator_t cmap;
-  cmap.init (plan->source);
+  OT::cmap::accelerator_t cmap (plan->source);
 
   constexpr static const int size_threshold = 4096;
 
@@ -343,8 +338,6 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
 
   + plan->codepoint_to_glyph->keys ()   | hb_sink (plan->unicodes);
   + plan->codepoint_to_glyph->values () | hb_sink (plan->_glyphset_gsub);
-
-  cmap.fini ();
 }
 
 static void
@@ -353,13 +346,9 @@ _populate_gids_to_retain (hb_subset_plan_t* plan,
 			  bool close_over_gpos,
 			  bool close_over_gdef)
 {
-  OT::glyf::accelerator_t glyf;
-#ifndef HB_NO_SUBSET_CFF
-  OT::cff1::accelerator_t cff;
-#endif
-  glyf.init (plan->source);
+  OT::glyf::accelerator_t glyf (plan->source);
 #ifndef HB_NO_SUBSET_CFF
-  cff.init (plan->source);
+  OT::cff1::accelerator_t cff (plan->source);
 #endif
 
   plan->_glyphset_gsub->add (0); // Not-def
@@ -419,11 +408,6 @@ _populate_gids_to_retain (hb_subset_plan_t* plan,
 				       plan->layout_variation_indices,
 				       plan->layout_variation_idx_map);
 #endif
-
-#ifndef HB_NO_SUBSET_CFF
-  cff.fini ();
-#endif
-  glyf.fini ();
 }
 
 static void
diff --git a/thirdparty/harfbuzz/src/hb-uniscribe.cc b/thirdparty/harfbuzz/src/hb-uniscribe.cc
index 0e5a114f7d..50f71ce9ce 100644
--- a/thirdparty/harfbuzz/src/hb-uniscribe.cc
+++ b/thirdparty/harfbuzz/src/hb-uniscribe.cc
@@ -878,7 +878,8 @@ retry:
   if (backward)
     hb_buffer_reverse (buffer);
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
   /* Wow, done! */
   return true;
diff --git a/thirdparty/harfbuzz/src/hb-vector.hh b/thirdparty/harfbuzz/src/hb-vector.hh
index b0a1e5e966..6c7d32e49d 100644
--- a/thirdparty/harfbuzz/src/hb-vector.hh
+++ b/thirdparty/harfbuzz/src/hb-vector.hh
@@ -32,11 +32,14 @@
 #include "hb-null.hh"
 
 
-template <typename Type>
-struct hb_vector_t
+template <typename Type,
+	  bool sorted=false>
+struct hb_vector_t : std::conditional<sorted, hb_vector_t<Type, false>, hb_empty_t>::type
 {
   typedef Type item_t;
   static constexpr unsigned item_size = hb_static_size (Type);
+  using array_t = typename std::conditional<sorted, hb_sorted_array_t<Type>, hb_array_t<Type>>::type;
+  using c_array_t = typename std::conditional<sorted, hb_sorted_array_t<const Type>, hb_array_t<const Type>>::type;
 
   hb_vector_t () = default;
   hb_vector_t (std::initializer_list<Type> lst) : hb_vector_t ()
@@ -82,16 +85,10 @@ struct hb_vector_t
 
   void fini ()
   {
+    shrink_vector (0); /* destroys the elements (only sets length for trivially destructible Type) before the storage is freed */
     hb_free (arrayZ);
     init ();
   }
-  void fini_deep ()
-  {
-    unsigned int count = length;
-    for (unsigned int i = 0; i < count; i++)
-      arrayZ[i].fini ();
-    fini ();
-  }
 
   void reset ()
   {
@@ -152,24 +149,24 @@ struct hb_vector_t
   template <typename T>
   hb_vector_t& operator << (T&& v) { push (std::forward<T> (v)); return *this; }
 
-  hb_array_t<      Type> as_array ()       { return hb_array (arrayZ, length); }
-  hb_array_t<const Type> as_array () const { return hb_array (arrayZ, length); }
+  array_t   as_array ()       { return hb_array (arrayZ, length); }
+  c_array_t as_array () const { return hb_array (arrayZ, length); }
 
   /* Iterator. */
-  typedef hb_array_t<const Type>   iter_t;
-  typedef hb_array_t<      Type> writer_t;
+  typedef c_array_t   iter_t;
+  typedef array_t   writer_t;
     iter_t   iter () const { return as_array (); }
   writer_t writer ()       { return as_array (); }
   operator   iter_t () const { return   iter (); }
   operator writer_t ()       { return writer (); }
 
-  hb_array_t<const Type> sub_array (unsigned int start_offset, unsigned int count) const
+  c_array_t sub_array (unsigned int start_offset, unsigned int count) const
   { return as_array ().sub_array (start_offset, count); }
-  hb_array_t<const Type> sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */) const
+  c_array_t sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */) const
   { return as_array ().sub_array (start_offset, count); }
-  hb_array_t<Type> sub_array (unsigned int start_offset, unsigned int count)
+  array_t sub_array (unsigned int start_offset, unsigned int count)
   { return as_array ().sub_array (start_offset, count); }
-  hb_array_t<Type> sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */)
+  array_t sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */)
   { return as_array ().sub_array (start_offset, count); }
 
   hb_sorted_array_t<Type> as_sorted_array ()
@@ -192,6 +189,7 @@ struct hb_vector_t
   template <typename T>
   Type *push (T&& v)
   {
+    /* TODO Emplace? */
     Type *p = push ();
     if (p == &Crap (Type))
       // If push failed to allocate then don't copy v, since this may cause
@@ -204,6 +202,92 @@ struct hb_vector_t
 
   bool in_error () const { return allocated < 0; }
 
+  template <typename T = Type, /* overload for trivially copy-assignable Type: storage is resized with hb_realloc */
+	    hb_enable_if (std::is_trivially_copy_assignable<T>::value)>
+  Type *
+  realloc_vector (unsigned new_allocated)
+  {
+    return (Type *) hb_realloc (arrayZ, new_allocated * sizeof (Type)); /* the caller in alloc () checks the multiplication for overflow first */
+  }
+  template <typename T = Type, /* overload for other Types: allocate a new buffer and move the elements across */
+	    hb_enable_if (!std::is_trivially_copy_assignable<T>::value)>
+  Type *
+  realloc_vector (unsigned new_allocated)
+  {
+    Type *new_array = (Type *) hb_malloc (new_allocated * sizeof (Type));
+    if (likely (new_array)) /* on allocation failure the old array is left untouched and nullptr is returned */
+    {
+      for (unsigned i = 0; i < length; i++)
+	new (std::addressof (new_array[i])) Type (); /* default-construct each destination slot so it can be move-assigned below */
+      for (unsigned i = 0; i < (unsigned) length; i++)
+	new_array[i] = std::move (arrayZ[i]);
+      unsigned old_length = length; /* shrink_vector (0) resets length, so remember it */
+      shrink_vector (0); /* destroy the moved-from elements in the old buffer */
+      length = old_length;
+      hb_free (arrayZ); /* arrayZ itself is not reassigned here; the new pointer is returned */
+    }
+    return new_array;
+  }
+
+  template <typename T = Type, /* overload used when Type is trivially constructible or has no default constructor */
+	    hb_enable_if (std::is_trivially_constructible<T>::value ||
+			  !std::is_default_constructible<T>::value)>
+  void
+  grow_vector (unsigned size)
+  {
+    memset (arrayZ + length, 0, (size - length) * sizeof (*arrayZ)); /* zero-fill the new tail [length, size) */
+    length = size;
+  }
+  template <typename T = Type, /* overload used when Type has a non-trivial default constructor */
+	    hb_enable_if (!std::is_trivially_constructible<T>::value &&
+			   std::is_default_constructible<T>::value)>
+  void
+  grow_vector (unsigned size)
+  {
+    while (length < size)
+    {
+      length++; /* length is bumped before the slot is constructed */
+      new (std::addressof (arrayZ[length - 1])) Type (); /* placement-new a default-constructed element in the new last slot */
+    }
+  }
+
+  template <typename T = Type, /* overload for trivially destructible Type: only the length changes */
+	    hb_enable_if (std::is_trivially_destructible<T>::value)>
+  void
+  shrink_vector (unsigned size)
+  {
+    length = size; /* unconditional assignment; callers in this file pass size <= length */
+  }
+  template <typename T = Type, /* overload for Types with a non-trivial destructor */
+	    hb_enable_if (!std::is_trivially_destructible<T>::value)>
+  void
+  shrink_vector (unsigned size)
+  {
+    while ((unsigned) length > size)
+    {
+      arrayZ[(unsigned) length - 1].~Type (); /* destroy from the back, one element per iteration */
+      length--;
+    }
+  }
+
+  template <typename T = Type, /* overload for trivially copy-assignable Type: one memmove */
+	    hb_enable_if (std::is_trivially_copy_assignable<T>::value)>
+  void
+  shift_down_vector (unsigned i)
+  {
+    memmove (static_cast<void *> (&arrayZ[i - 1]), /* moves elements [i, length) down by one slot; i - 1 is computed unguarded */
+	     static_cast<void *> (&arrayZ[i]),
+	     (length - i) * sizeof (Type));
+  }
+  template <typename T = Type, /* overload for other Types: element-wise move assignment */
+	    hb_enable_if (!std::is_trivially_copy_assignable<T>::value)>
+  void
+  shift_down_vector (unsigned i)
+  {
+    for (; i < length; i++)
+      arrayZ[i - 1] = std::move (arrayZ[i]); /* the last slot is left moved-from; length is not changed here */
+  }
+
   /* Allocate for size but don't adjust length. */
   bool alloc (unsigned int size)
   {
@@ -225,7 +309,7 @@ struct hb_vector_t
       (new_allocated < (unsigned) allocated) ||
       hb_unsigned_mul_overflows (new_allocated, sizeof (Type));
     if (likely (!overflows))
-      new_array = (Type *) hb_realloc (arrayZ, new_allocated * sizeof (Type));
+      new_array = realloc_vector (new_allocated);
 
     if (unlikely (!new_array))
     {
@@ -246,7 +330,9 @@ struct hb_vector_t
       return false;
 
     if (size > length)
-      memset (arrayZ + length, 0, (size - length) * sizeof (*arrayZ));
+      grow_vector (size);
+    else if (size < length)
+      shrink_vector (size);
 
     length = size;
     return true;
@@ -255,48 +341,38 @@ struct hb_vector_t
   Type pop ()
   {
     if (!length) return Null (Type);
-    return std::move (arrayZ[--length]); /* Does this move actually work? */
+    Type v = std::move (arrayZ[length - 1]); /* move the last element out into a local */
+    arrayZ[length - 1].~Type (); /* then destroy the moved-from slot */
+    length--;
+    return v; /* returned by value */
   }
 
   void remove (unsigned int i)
   {
     if (unlikely (i >= length))
       return;
-    memmove (static_cast<void *> (&arrayZ[i]),
-	     static_cast<void *> (&arrayZ[i + 1]),
-	     (length - i - 1) * sizeof (Type));
+    shift_down_vector (i + 1); /* close the gap first: arrayZ[i] is still alive when the move-assigning overload writes into it */
+    arrayZ[(unsigned) length - 1].~Type (); /* then destroy the leftover last slot, which is never destroyed if arrayZ[i] is destroyed up front */
     length--;
   }
 
   void shrink (int size_)
   {
     unsigned int size = size_ < 0 ? 0u : (unsigned int) size_;
-     if (size < length)
-       length = size;
-  }
+    if (size >= length) /* never grows: a size at or above the current length is a no-op */
+      return;
 
-  template <typename T>
-  Type *find (T v)
-  {
-    for (unsigned int i = 0; i < length; i++)
-      if (arrayZ[i] == v)
-	return &arrayZ[i];
-    return nullptr;
-  }
-  template <typename T>
-  const Type *find (T v) const
-  {
-    for (unsigned int i = 0; i < length; i++)
-      if (arrayZ[i] == v)
-	return &arrayZ[i];
-    return nullptr;
+    shrink_vector (size); /* destroys the dropped tail elements for non-trivially-destructible Type and sets length */
   }
 
+
+  /* Sorting API. */
   void qsort (int (*cmp)(const void*, const void*))
   { as_array ().qsort (cmp); }
   void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
   { as_array ().qsort (start, end); }
 
+  /* Unsorted search API. */
   template <typename T>
   Type *lsearch (const T &x, Type *not_found = nullptr)
   { return as_array ().lsearch (x, not_found); }
@@ -306,47 +382,25 @@ struct hb_vector_t
   template <typename T>
   bool lfind (const T &x, unsigned *pos = nullptr) const
   { return as_array ().lfind (x, pos); }
-};
 
-template <typename Type>
-struct hb_sorted_vector_t : hb_vector_t<Type>
-{
-  hb_sorted_vector_t () = default;
-  ~hb_sorted_vector_t () = default;
-  hb_sorted_vector_t (hb_sorted_vector_t& o) = default;
-  hb_sorted_vector_t (hb_sorted_vector_t &&o) = default;
-  hb_sorted_vector_t (std::initializer_list<Type> lst) : hb_vector_t<Type> (lst) {}
-  template <typename Iterable,
-	    hb_requires (hb_is_iterable (Iterable))>
-  hb_sorted_vector_t (const Iterable &o) : hb_vector_t<Type> (o) {}
-  hb_sorted_vector_t& operator = (const hb_sorted_vector_t &o) = default;
-  hb_sorted_vector_t& operator = (hb_sorted_vector_t &&o) = default;
-  friend void swap (hb_sorted_vector_t& a, hb_sorted_vector_t& b)
-  { hb_swap ((hb_vector_t<Type>&) (a), (hb_vector_t<Type>&) (b)); }
-
-  hb_sorted_array_t<      Type> as_array ()       { return hb_sorted_array (this->arrayZ, this->length); }
-  hb_sorted_array_t<const Type> as_array () const { return hb_sorted_array (this->arrayZ, this->length); }
-
-  /* Iterator. */
-  typedef hb_sorted_array_t<const Type> const_iter_t;
-  typedef hb_sorted_array_t<      Type>       iter_t;
-  const_iter_t  iter () const { return as_array (); }
-  const_iter_t citer () const { return as_array (); }
-	iter_t  iter ()       { return as_array (); }
-  operator       iter_t ()       { return iter (); }
-  operator const_iter_t () const { return iter (); }
-
-  template <typename T>
+  /* Sorted search API. */
+  template <typename T,
+	    bool Sorted=sorted, hb_enable_if (Sorted)>
   Type *bsearch (const T &x, Type *not_found = nullptr)
   { return as_array ().bsearch (x, not_found); }
-  template <typename T>
+  template <typename T,
+	    bool Sorted=sorted, hb_enable_if (Sorted)>
   const Type *bsearch (const T &x, const Type *not_found = nullptr) const
   { return as_array ().bsearch (x, not_found); }
-  template <typename T>
+  template <typename T,
+	    bool Sorted=sorted, hb_enable_if (Sorted)>
   bool bfind (const T &x, unsigned int *i = nullptr,
 	      hb_not_found_t not_found = HB_NOT_FOUND_DONT_STORE,
 	      unsigned int to_store = (unsigned int) -1) const
   { return as_array ().bfind (x, i, not_found, to_store); }
 };
 
+template <typename Type>
+using hb_sorted_vector_t = hb_vector_t<Type, true>;
+
 #endif /* HB_VECTOR_HH */
diff --git a/thirdparty/harfbuzz/src/hb-version.h b/thirdparty/harfbuzz/src/hb-version.h
index 52b124b745..493a09f8cf 100644
--- a/thirdparty/harfbuzz/src/hb-version.h
+++ b/thirdparty/harfbuzz/src/hb-version.h
@@ -47,20 +47,20 @@ HB_BEGIN_DECLS
  *
  * The minor component of the library version available at compile-time.
  */
-#define HB_VERSION_MINOR 2
+#define HB_VERSION_MINOR 3
 /**
  * HB_VERSION_MICRO:
  *
  * The micro component of the library version available at compile-time.
  */
-#define HB_VERSION_MICRO 0
+#define HB_VERSION_MICRO 2
 
 /**
  * HB_VERSION_STRING:
  *
  * A string literal containing the library version available at compile-time.
  */
-#define HB_VERSION_STRING "3.2.0"
+#define HB_VERSION_STRING "3.3.2"
 
 /**
  * HB_VERSION_ATLEAST:
diff --git a/thirdparty/harfbuzz/src/hb.hh b/thirdparty/harfbuzz/src/hb.hh
index 1f14267525..b9f5f71415 100644
--- a/thirdparty/harfbuzz/src/hb.hh
+++ b/thirdparty/harfbuzz/src/hb.hh
@@ -447,6 +447,7 @@ static int HB_UNUSED _hb_errno = 0;
 #ifndef HB_USE_ATEXIT
 #  define HB_USE_ATEXIT 0
 #endif
+#ifndef hb_atexit
 #if !HB_USE_ATEXIT
 #  define hb_atexit(_) HB_STMT_START { if (0) (_) (); } HB_STMT_END
 #else /* HB_USE_ATEXIT */
@@ -457,6 +458,7 @@ static int HB_UNUSED _hb_errno = 0;
 #    define hb_atexit(f) static hb_atexit_t<f> _hb_atexit_##__LINE__;
 #  endif
 #endif
+#endif
 
 /* Lets assert int types.  Saves trouble down the road. */
 static_assert ((sizeof (hb_codepoint_t) == 4), "");
diff --git a/thirdparty/libwebp/AUTHORS b/thirdparty/libwebp/AUTHORS
index 30abde0326..8307c2099d 100644
--- a/thirdparty/libwebp/AUTHORS
+++ b/thirdparty/libwebp/AUTHORS
@@ -32,6 +32,7 @@ Contributors:
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Roberto Alanis (alanisbaez at google dot com)
 - Sam Clegg (sbc at chromium dot org)
 - Scott Hancher (seh at google dot com)
 - Scott LaVarnway (slavarnway at google dot com)
diff --git a/thirdparty/libwebp/src/dec/vp8_dec.c b/thirdparty/libwebp/src/dec/vp8_dec.c
index 5f405e4c2a..2003935ec4 100644
--- a/thirdparty/libwebp/src/dec/vp8_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8_dec.c
@@ -403,7 +403,7 @@ static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+// See section 13-2: https://datatracker.ietf.org/doc/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
   int v;
   if (!VP8GetBit(br, p[3], "coeffs")) {
diff --git a/thirdparty/libwebp/src/dec/vp8i_dec.h b/thirdparty/libwebp/src/dec/vp8i_dec.h
index 20526a87c4..9af22f8cc6 100644
--- a/thirdparty/libwebp/src/dec/vp8i_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8i_dec.h
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 1
 #define DEC_MIN_VERSION 2
-#define DEC_REV_VERSION 1
+#define DEC_REV_VERSION 2
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
diff --git a/thirdparty/libwebp/src/dec/vp8l_dec.c b/thirdparty/libwebp/src/dec/vp8l_dec.c
index 73c3b54fff..78db014030 100644
--- a/thirdparty/libwebp/src/dec/vp8l_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8l_dec.c
@@ -84,7 +84,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // to 256 (green component values) + 24 (length prefix values)
 // + color_cache_size (between 0 and 2048).
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
-// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+// https://github.com/madler/zlib/blob/v1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
 static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 654,
diff --git a/thirdparty/libwebp/src/demux/anim_decode.c b/thirdparty/libwebp/src/demux/anim_decode.c
index 2bf4dcffe0..e077ffb536 100644
--- a/thirdparty/libwebp/src/demux/anim_decode.c
+++ b/thirdparty/libwebp/src/demux/anim_decode.c
@@ -23,6 +23,14 @@
 
 #define NUM_CHANNELS 4
 
+// Channel extraction from a uint32_t representation of a uint8_t RGBA/BGRA
+// buffer.
+#ifdef WORDS_BIGENDIAN
+#define CHANNEL_SHIFT(i) (24 - (i) * 8)
+#else
+#define CHANNEL_SHIFT(i) ((i) * 8)
+#endif
+
 typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
 static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels);
@@ -209,35 +217,35 @@ static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
   const uint8_t dst_channel = (dst >> shift) & 0xff;
   const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
   assert(blend_unscaled < (1ULL << 32) / scale);
-  return (blend_unscaled * scale) >> 24;
+  return (blend_unscaled * scale) >> 24;  // 24 undoes the (1 << 24) fixed-point 'scale'; it is not a channel position, so it must not follow CHANNEL_SHIFT (which is 0 for index 3 on big-endian)
 }
 
 // Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
 static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;  // alpha is byte index 3 of the pixel buffer
 
   if (src_a == 0) {
     return dst;
   } else {
-    const uint8_t dst_a = (dst >> 24) & 0xff;
+    const uint8_t dst_a = (dst >> CHANNEL_SHIFT(3)) & 0xff;  // same byte index for the destination alpha
     // This is the approximate integer arithmetic for the actual formula:
     // dst_factor_a = (dst_a * (255 - src_a)) / 255.
     const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
     const uint8_t blend_a = src_a + dst_factor_a;
     const uint32_t scale = (1UL << 24) / blend_a;
 
-    const uint8_t blend_r =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
-    const uint8_t blend_g =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
-    const uint8_t blend_b =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
+    const uint8_t blend_r = BlendChannelNonPremult(  // channel at byte index 0
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(0));
+    const uint8_t blend_g = BlendChannelNonPremult(  // channel at byte index 1
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(1));
+    const uint8_t blend_b = BlendChannelNonPremult(  // channel at byte index 2
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(2));
     assert(src_a + dst_factor_a < 256);
 
-    return (blend_r << 0) |
-           (blend_g << 8) |
-           (blend_b << 16) |
-           ((uint32_t)blend_a << 24);
+    return ((uint32_t)blend_r << CHANNEL_SHIFT(0)) |  // casts to uint32_t happen before the shift
+           ((uint32_t)blend_g << CHANNEL_SHIFT(1)) |
+           ((uint32_t)blend_b << CHANNEL_SHIFT(2)) |
+           ((uint32_t)blend_a << CHANNEL_SHIFT(3));  // each channel goes back to the byte it was read from
   }
 }
 
@@ -247,7 +255,7 @@ static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelNonPremult(src[i], dst[i]);
     }
@@ -264,7 +272,7 @@ static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
 
 // Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
 static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
   return src + ChannelwiseMultiply(dst, 256 - src_a);
 }
 
@@ -274,7 +282,7 @@ static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
                                  int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelPremult(src[i], dst[i]);
     }
diff --git a/thirdparty/libwebp/src/demux/demux.c b/thirdparty/libwebp/src/demux/demux.c
index 547a7725de..f04a2b8450 100644
--- a/thirdparty/libwebp/src/demux/demux.c
+++ b/thirdparty/libwebp/src/demux/demux.c
@@ -25,7 +25,7 @@
 
 #define DMUX_MAJ_VERSION 1
 #define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 1
+#define DMUX_REV_VERSION 2
 
 typedef struct {
   size_t start_;        // start location of the data
diff --git a/thirdparty/libwebp/src/dsp/dsp.h b/thirdparty/libwebp/src/dsp/dsp.h
index 513e159bb3..c4f57e4d5b 100644
--- a/thirdparty/libwebp/src/dsp/dsp.h
+++ b/thirdparty/libwebp/src/dsp/dsp.h
@@ -119,7 +119,12 @@ extern "C" {
 #define WEBP_USE_NEON
 #endif
 
-#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h.
+#if defined(_MSC_VER) && \
+  ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+   (_MSC_VER >= 1920 && defined(_M_ARM64)))
 #define WEBP_USE_NEON
 #define WEBP_USE_INTRINSICS
 #endif
diff --git a/thirdparty/libwebp/src/dsp/enc_neon.c b/thirdparty/libwebp/src/dsp/enc_neon.c
index 43bf1245c5..601962ba76 100644
--- a/thirdparty/libwebp/src/dsp/enc_neon.c
+++ b/thirdparty/libwebp/src/dsp/enc_neon.c
@@ -9,7 +9,7 @@
 //
 // ARM NEON version of speed-critical encoding functions.
 //
-// adapted from libvpx (http://www.webmproject.org/code/)
+// adapted from libvpx (https://www.webmproject.org/code/)
 
 #include "src/dsp/dsp.h"
 
diff --git a/thirdparty/libwebp/src/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c
index d8bbb02b35..84a54296fd 100644
--- a/thirdparty/libwebp/src/dsp/lossless.c
+++ b/thirdparty/libwebp/src/dsp/lossless.c
@@ -107,63 +107,77 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
 
-uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
-  return left;
+  return *left;
 }
-uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[0];
 }
-uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[1];
 }
-uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[-1];
 }
-uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average3(*left, top[0], top[1]);
   return pred;
 }
-uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[0]);
   return pred;
 }
-uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Average4(*left, top[-1], top[0], top[1]);
   return pred;
 }
-uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], *left, top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(*left, top[0], top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(*left, top[0], top[-1]);
   return pred;
 }
 
diff --git a/thirdparty/libwebp/src/dsp/lossless.h b/thirdparty/libwebp/src/dsp/lossless.h
index ebd316d1ed..c26c6bca07 100644
--- a/thirdparty/libwebp/src/dsp/lossless.h
+++ b/thirdparty/libwebp/src/dsp/lossless.h
@@ -28,23 +28,38 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Decoding
 
-typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
+                                      const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
 
-uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top);
 
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
diff --git a/thirdparty/libwebp/src/dsp/lossless_common.h b/thirdparty/libwebp/src/dsp/lossless_common.h
index 96a106f9ee..6a2f736b5e 100644
--- a/thirdparty/libwebp/src/dsp/lossless_common.h
+++ b/thirdparty/libwebp/src/dsp/lossless_common.h
@@ -179,7 +179,7 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
   int x;                                                             \
   assert(upper != NULL);                                             \
   for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x);        \
+    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);       \
     out[x] = VP8LAddPixels(in[x], pred);                             \
   }                                                                  \
 }
diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c
index c3e8537ade..1580631e38 100644
--- a/thirdparty/libwebp/src/dsp/lossless_enc.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc.c
@@ -745,7 +745,7 @@ static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
   assert(upper != NULL);                                                   \
   for (x = 0; x < num_pixels; ++x) {                                       \
     const uint32_t pred =                                                  \
-        VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x);              \
+        VP8LPredictor##PREDICTOR_I##_C(&in[x - 1], upper + x);             \
     out[x] = VP8LSubPixels(in[x], pred);                                   \
   }                                                                        \
 }
diff --git a/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 9888854d57..bfe5ea6b38 100644
--- a/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -188,46 +188,51 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return Average2(Average2(a0, a1), Average2(a2, a3));
 }
 
-static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average3(left, top[0], top[1]);
+static uint32_t Predictor5_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average3(*left, top[0], top[1]);
 }
 
-static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[-1]);
+static uint32_t Predictor6_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[-1]);
 }
 
-static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[0]);
+static uint32_t Predictor7_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[0]);
 }
 
-static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[-1], top[0]);
 }
 
-static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[0], top[1]);
 }
 
-static uint32_t Predictor10_MIPSdspR2(uint32_t left,
+static uint32_t Predictor10_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Average4(left, top[-1], top[0], top[1]);
+  return Average4(*left, top[-1], top[0], top[1]);
 }
 
-static uint32_t Predictor11_MIPSdspR2(uint32_t left,
+static uint32_t Predictor11_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Select(top[0], left, top[-1]);
+  return Select(top[0], *left, top[-1]);
 }
 
-static uint32_t Predictor12_MIPSdspR2(uint32_t left,
+static uint32_t Predictor12_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractFull(left, top[0], top[-1]);
+  return ClampedAddSubtractFull(*left, top[0], top[-1]);
 }
 
-static uint32_t Predictor13_MIPSdspR2(uint32_t left,
+static uint32_t Predictor13_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractHalf(left, top[0], top[-1]);
+  return ClampedAddSubtractHalf(*left, top[0], top[-1]);
 }
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
diff --git a/thirdparty/libwebp/src/dsp/lossless_neon.c b/thirdparty/libwebp/src/dsp/lossless_neon.c
index 76a1b6f873..89e3e013a0 100644
--- a/thirdparty/libwebp/src/dsp/lossless_neon.c
+++ b/thirdparty/libwebp/src/dsp/lossless_neon.c
@@ -188,17 +188,21 @@ static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
   return avg;
 }
 
-static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
-  return Average3_NEON(left, top[0], top[1]);
+static uint32_t Predictor5_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average3_NEON(*left, top[0], top[1]);
 }
-static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[-1]);
+static uint32_t Predictor6_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[-1]);
 }
-static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[0]);
+static uint32_t Predictor7_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[0]);
 }
-static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
-  return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+static uint32_t Predictor13_NEON(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
 }
 
 // Batch versions of those functions.
diff --git a/thirdparty/libwebp/src/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c
index 3a0eb440db..396cb0bdfc 100644
--- a/thirdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c
@@ -138,42 +138,51 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
   return output;
 }
 
-static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
+static uint32_t Predictor5_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[-1]);
+static uint32_t Predictor6_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[0]);
+static uint32_t Predictor7_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[0]);
   return pred;
 }
-static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
+static uint32_t Predictor10_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
+static uint32_t Predictor11_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor12_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor13_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
   return pred;
 }
 
diff --git a/thirdparty/libwebp/src/dsp/msa_macro.h b/thirdparty/libwebp/src/dsp/msa_macro.h
index de026a1d9e..51f6c643ab 100644
--- a/thirdparty/libwebp/src/dsp/msa_macro.h
+++ b/thirdparty/libwebp/src/dsp/msa_macro.h
@@ -14,6 +14,10 @@
 #ifndef WEBP_DSP_MSA_MACRO_H_
 #define WEBP_DSP_MSA_MACRO_H_
 
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
 #include <stdint.h>
 #include <msa.h>
 
@@ -1389,4 +1393,5 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 } while (0)
 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 
+#endif  // WEBP_USE_MSA
 #endif  // WEBP_DSP_MSA_MACRO_H_
diff --git a/thirdparty/libwebp/src/dsp/neon.h b/thirdparty/libwebp/src/dsp/neon.h
index aa1dea1301..c591f9b9a7 100644
--- a/thirdparty/libwebp/src/dsp/neon.h
+++ b/thirdparty/libwebp/src/dsp/neon.h
@@ -12,10 +12,12 @@
 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_
 
-#include <arm_neon.h>
-
 #include "src/dsp/dsp.h"
 
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
 // Right now, some intrinsics functions seem slower, so we disable them
 // everywhere except newer clang/gcc or aarch64 where the inline assembly is
 // incompatible.
@@ -98,4 +100,5 @@ static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
 } while (0)
 #endif
 
+#endif  // WEBP_USE_NEON
 #endif  // WEBP_DSP_NEON_H_
diff --git a/thirdparty/libwebp/src/dsp/yuv.h b/thirdparty/libwebp/src/dsp/yuv.h
index c12be1d094..66a397d117 100644
--- a/thirdparty/libwebp/src/dsp/yuv.h
+++ b/thirdparty/libwebp/src/dsp/yuv.h
@@ -10,7 +10,7 @@
 // inline YUV<->RGB conversion function
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
+// More information at: https://en.wikipedia.org/wiki/YCbCr
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
diff --git a/thirdparty/libwebp/src/enc/frame_enc.c b/thirdparty/libwebp/src/enc/frame_enc.c
index af538d83ba..b93d9e5b99 100644
--- a/thirdparty/libwebp/src/enc/frame_enc.c
+++ b/thirdparty/libwebp/src/enc/frame_enc.c
@@ -778,6 +778,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
   // Roughly refresh the proba eight times per pass
   int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
   int num_pass_left = enc->config_->pass;
+  int remaining_progress = 40;  // percents
   const int do_search = enc->do_search_;
   VP8EncIterator it;
   VP8EncProba* const proba = &enc->proba_;
@@ -805,6 +806,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     uint64_t size_p0 = 0;
     uint64_t distortion = 0;
     int cnt = max_count;
+    // The final number of passes is not trivial to know in advance.
+    const int pass_progress = remaining_progress / (2 + num_pass_left);
+    remaining_progress -= pass_progress;
     VP8IteratorInit(enc, &it);
     SetLoopParams(enc, stats.q);
     if (is_last_pass) {
@@ -832,7 +836,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
         StoreSideInfo(&it);
         VP8StoreFilterStats(&it);
         VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
+        ok = VP8IteratorProgress(&it, pass_progress);
       }
       VP8IteratorSaveBoundary(&it);
     } while (ok && VP8IteratorNext(&it));
@@ -878,7 +882,8 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
                        (const uint8_t*)proba->coeffs_, 1);
   }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+                                &enc->percent_);
   return PostLoopFinalize(&it, ok);
 }
 
diff --git a/thirdparty/libwebp/src/enc/predictor_enc.c b/thirdparty/libwebp/src/enc/predictor_enc.c
index 2e6762ea0d..2b5c767280 100644
--- a/thirdparty/libwebp/src/enc/predictor_enc.c
+++ b/thirdparty/libwebp/src/enc/predictor_enc.c
@@ -249,7 +249,7 @@ static WEBP_INLINE void GetResidual(
       } else if (x == 0) {
         predict = upper_row[x];  // Top.
       } else {
-        predict = pred_func(current_row[x - 1], upper_row + x);
+        predict = pred_func(&current_row[x - 1], upper_row + x);
       }
 #if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
diff --git a/thirdparty/libwebp/src/enc/quant_enc.c b/thirdparty/libwebp/src/enc/quant_enc.c
index 01eb565c7f..6cede28ab4 100644
--- a/thirdparty/libwebp/src/enc/quant_enc.c
+++ b/thirdparty/libwebp/src/enc/quant_enc.c
@@ -585,6 +585,9 @@ static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
   return rate * lambda + RD_DISTO_MULT * distortion;
 }
 
+// Coefficient type.
+enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
+
 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                 int16_t in[16], int16_t out[16],
                                 int ctx0, int coeff_type,
@@ -593,7 +596,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
   CostArrayPtr const costs =
       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
-  const int first = (coeff_type == 0) ? 1 : 0;
+  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
   Node nodes[16][NUM_NODES];
   ScoreState score_states[2][NUM_NODES];
   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
@@ -657,16 +660,17 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
-      int level = level0 + m;
+      const int level = level0 + m;
       const int ctx = (level > 2) ? 2 : level;
       const int band = VP8EncBands[n + 1];
       score_t base_score;
-      score_t best_cur_score = MAX_COST;
-      int best_prev = 0;   // default, in case
+      score_t best_cur_score;
+      int best_prev;
+      score_t cost, score;
 
-      ss_cur[m].score = MAX_COST;
       ss_cur[m].costs = costs[n + 1][ctx];
       if (level < 0 || level > thresh_level) {
+        ss_cur[m].score = MAX_COST;
         // Node is dead.
         continue;
       }
@@ -682,18 +686,24 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       }
 
       // Inspect all possible non-dead predecessors. Retain only the best one.
-      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
+      // The base_score is added to all scores so it is only added for the final
+      // value after the loop.
+      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
+      best_cur_score =
+          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
+      best_prev = -MIN_DELTA;
+      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
         // eliminated since their score can't be better than the current best.
-        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
+        cost = VP8LevelCost(ss_prev[p].costs, level);
         // Examine node assuming it's a non-terminal one.
-        const score_t score =
-            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
         if (score < best_cur_score) {
           best_cur_score = score;
           best_prev = p;
         }
       }
+      best_cur_score += base_score;
       // Store best finding in current node.
       cur->sign = sign;
       cur->level = level;
@@ -701,11 +711,11 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       ss_cur[m].score = best_cur_score;
 
       // Now, record best terminal node (and thus best entry in the graph).
-      if (level != 0) {
+      if (level != 0 && best_cur_score < best_score) {
         const score_t last_pos_cost =
             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
-        const score_t score = best_cur_score + last_pos_score;
+        score = best_cur_score + last_pos_score;
         if (score < best_score) {
           best_score = score;
           best_path[0] = n;                     // best eob position
@@ -717,10 +727,16 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
   }
 
   // Fresh start
-  memset(in + first, 0, (16 - first) * sizeof(*in));
-  memset(out + first, 0, (16 - first) * sizeof(*out));
+  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
+  if (coeff_type == TYPE_I16_AC) {
+    memset(in + 1, 0, 15 * sizeof(*in));
+    memset(out + 1, 0, 15 * sizeof(*out));
+  } else {
+    memset(in, 0, 16 * sizeof(*in));
+    memset(out, 0, 16 * sizeof(*out));
+  }
   if (best_path[0] == -1) {
-    return 0;   // skip!
+    return 0;  // skip!
   }
 
   {
@@ -775,9 +791,9 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     for (y = 0, n = 0; y < 4; ++y) {
       for (x = 0; x < 4; ++x, ++n) {
         const int ctx = it->top_nz_[x] + it->left_nz_[y];
-        const int non_zero =
-            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
-                                 &dqm->y1_, dqm->lambda_trellis_i16_);
+        const int non_zero = TrellisQuantizeBlock(
+            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
+            dqm->lambda_trellis_i16_);
         it->top_nz_[x] = it->left_nz_[y] = non_zero;
         rd->y_ac_levels[n][0] = 0;
         nz |= non_zero << n;
@@ -818,7 +834,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   if (DO_TRELLIS_I4 && it->do_trellis_) {
     const int x = it->i4_ & 3, y = it->i4_ >> 2;
     const int ctx = it->top_nz_[x] + it->left_nz_[y];
-    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
+    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                               dqm->lambda_trellis_i4_);
   } else {
     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
@@ -927,9 +943,9 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
       for (y = 0; y < 2; ++y) {
         for (x = 0; x < 2; ++x, ++n) {
           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-          const int non_zero =
-              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
-                                   &dqm->uv_, dqm->lambda_trellis_uv_);
+          const int non_zero = TrellisQuantizeBlock(
+              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
+              dqm->lambda_trellis_uv_);
           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
           nz |= non_zero << n;
         }
diff --git a/thirdparty/libwebp/src/enc/vp8i_enc.h b/thirdparty/libwebp/src/enc/vp8i_enc.h
index 67e9509367..b4bba08f27 100644
--- a/thirdparty/libwebp/src/enc/vp8i_enc.h
+++ b/thirdparty/libwebp/src/enc/vp8i_enc.h
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 1
 #define ENC_MIN_VERSION 2
-#define ENC_REV_VERSION 1
+#define ENC_REV_VERSION 2
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
diff --git a/thirdparty/libwebp/src/mux/muxi.h b/thirdparty/libwebp/src/mux/muxi.h
index 330da66754..d9bf9b3770 100644
--- a/thirdparty/libwebp/src/mux/muxi.h
+++ b/thirdparty/libwebp/src/mux/muxi.h
@@ -29,7 +29,7 @@ extern "C" {
 
 #define MUX_MAJ_VERSION 1
 #define MUX_MIN_VERSION 2
-#define MUX_REV_VERSION 1
+#define MUX_REV_VERSION 2
 
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
diff --git a/thirdparty/libwebp/src/utils/huffman_encode_utils.c b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
index fd7a47d8f7..585db91951 100644
--- a/thirdparty/libwebp/src/utils/huffman_encode_utils.c
+++ b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -161,7 +161,7 @@ static void SetBitDepths(const HuffmanTree* const tree,
 // especially when population counts are longer than 2**tree_limit, but
 // we are not planning to use this with extremely long blocks.
 //
-// See http://en.wikipedia.org/wiki/Huffman_coding
+// See https://en.wikipedia.org/wiki/Huffman_coding
 static void GenerateOptimalTree(const uint32_t* const histogram,
                                 int histogram_size,
                                 HuffmanTree* tree, int tree_depth_limit,
diff --git a/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
index f65b6cdbb6..97e7893704 100644
--- a/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
+++ b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -30,7 +30,7 @@
 
 #define DFIX 4           // extra precision for ordered dithering
 #define DSIZE 4          // dithering size (must be a power of two)
-// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+// cf. https://en.wikipedia.org/wiki/Ordered_dithering
 static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
   {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
   { 12,  4, 14,  6 },
diff --git a/thirdparty/libwebp/src/utils/utils.c b/thirdparty/libwebp/src/utils/utils.c
index 9e464c16ce..a7c3a70fef 100644
--- a/thirdparty/libwebp/src/utils/utils.c
+++ b/thirdparty/libwebp/src/utils/utils.c
@@ -23,7 +23,7 @@
 // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
 // and not multi-thread safe!).
 // An interesting alternative is valgrind's 'massif' tool:
-//    http://valgrind.org/docs/manual/ms-manual.html
+//    https://valgrind.org/docs/manual/ms-manual.html
 // Here is an example command line:
 /*    valgrind --tool=massif --massif-out-file=massif.out \
                --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
diff --git a/thirdparty/libwebp/src/webp/decode.h b/thirdparty/libwebp/src/webp/decode.h
index 44fcd64a84..d98247509a 100644
--- a/thirdparty/libwebp/src/webp/decode.h
+++ b/thirdparty/libwebp/src/webp/decode.h
@@ -85,7 +85,7 @@ WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
 // Upon return, the Y buffer has a stride returned as '*stride', while U and V
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
-// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
+// (*) Also named Y'CbCr. See: https://en.wikipedia.org/wiki/YCbCr
 WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
                                    int* width, int* height,
                                    uint8_t** u, uint8_t** v,
diff --git a/thirdparty/misc/patches/polypartition-godot-types.patch b/thirdparty/misc/patches/polypartition-godot-types.patch
index 782f02e8dc..61737f9fd2 100644
--- a/thirdparty/misc/patches/polypartition-godot-types.patch
+++ b/thirdparty/misc/patches/polypartition-godot-types.patch
@@ -1,19 +1,16 @@
 diff --git a/thirdparty/misc/polypartition.cpp b/thirdparty/misc/polypartition.cpp
-index 3a8a6efa83..5e94793b79 100644
+index 3a8a6efa83..8c5409bf24 100644
 --- a/thirdparty/misc/polypartition.cpp
 +++ b/thirdparty/misc/polypartition.cpp
-@@ -23,10 +23,7 @@
- 
- #include "polypartition.h"
- 
--#include <math.h>
--#include <string.h>
+@@ -26,7 +26,6 @@
+ #include <math.h>
+ #include <string.h>
  #include <algorithm>
 -#include <vector>
  
  TPPLPoly::TPPLPoly() {
    hole = false;
-@@ -186,7 +183,7 @@ int TPPLPartition::Intersects(TPPLPoint &p11, TPPLPoint &p12, TPPLPoint &p21, TP
+@@ -186,7 +185,7 @@ int TPPLPartition::Intersects(TPPLPoint &p11, TPPLPoint &p12, TPPLPoint &p21, TP
  // Removes holes from inpolys by merging them with non-holes.
  int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
    TPPLPolyList polys;
@@ -22,7 +19,7 @@ index 3a8a6efa83..5e94793b79 100644
    long i, i2, holepointindex, polypointindex;
    TPPLPoint holepoint, polypoint, bestpolypoint;
    TPPLPoint linep1, linep2;
-@@ -198,15 +195,15 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -198,15 +197,15 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
  
    // Check for the trivial case of no holes.
    hasholes = false;
@@ -42,7 +39,7 @@ index 3a8a6efa83..5e94793b79 100644
      }
      return 1;
    }
-@@ -216,8 +213,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -216,8 +215,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
    while (1) {
      // Find the hole point with the largest x.
      hasholes = false;
@@ -53,7 +50,7 @@ index 3a8a6efa83..5e94793b79 100644
          continue;
        }
  
-@@ -227,8 +224,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -227,8 +226,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
          holepointindex = 0;
        }
  
@@ -64,7 +61,7 @@ index 3a8a6efa83..5e94793b79 100644
            holeiter = iter;
            holepointindex = i;
          }
-@@ -237,24 +234,24 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -237,24 +236,24 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
      if (!hasholes) {
        break;
      }
@@ -98,7 +95,7 @@ index 3a8a6efa83..5e94793b79 100644
          if (pointfound) {
            v1 = Normalize(polypoint - holepoint);
            v2 = Normalize(bestpolypoint - holepoint);
-@@ -263,13 +260,13 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -263,13 +262,13 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
            }
          }
          pointvisible = true;
@@ -117,7 +114,7 @@ index 3a8a6efa83..5e94793b79 100644
              if (Intersects(holepoint, polypoint, linep1, linep2)) {
                pointvisible = false;
                break;
-@@ -292,18 +289,18 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -292,18 +291,18 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
        return 0;
      }
  
@@ -142,7 +139,7 @@ index 3a8a6efa83..5e94793b79 100644
        i2++;
      }
  
-@@ -312,8 +309,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -312,8 +311,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
      polys.push_back(newpoly);
    }
  
@@ -153,7 +150,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    return 1;
-@@ -524,13 +521,13 @@ int TPPLPartition::Triangulate_EC(TPPLPoly *poly, TPPLPolyList *triangles) {
+@@ -524,13 +523,13 @@ int TPPLPartition::Triangulate_EC(TPPLPoly *poly, TPPLPolyList *triangles) {
  
  int TPPLPartition::Triangulate_EC(TPPLPolyList *inpolys, TPPLPolyList *triangles) {
    TPPLPolyList outpolys;
@@ -170,7 +167,7 @@ index 3a8a6efa83..5e94793b79 100644
        return 0;
      }
    }
-@@ -543,7 +540,7 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -543,7 +542,7 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
    }
  
    TPPLPolyList triangles;
@@ -179,7 +176,7 @@ index 3a8a6efa83..5e94793b79 100644
    TPPLPoly *poly1 = NULL, *poly2 = NULL;
    TPPLPoly newpoly;
    TPPLPoint d1, d2, p1, p2, p3;
-@@ -578,19 +575,19 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -578,19 +577,19 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
      return 0;
    }
  
@@ -203,7 +200,7 @@ index 3a8a6efa83..5e94793b79 100644
  
          for (i21 = 0; i21 < poly2->GetNumPoints(); i21++) {
            if ((d2.x != poly2->GetPoint(i21).x) || (d2.y != poly2->GetPoint(i21).y)) {
-@@ -660,16 +657,16 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -660,16 +659,16 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
        }
  
        triangles.erase(iter2);
@@ -224,7 +221,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    return 1;
-@@ -677,13 +674,13 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -677,13 +676,13 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
  
  int TPPLPartition::ConvexPartition_HM(TPPLPolyList *inpolys, TPPLPolyList *parts) {
    TPPLPolyList outpolys;
@@ -241,7 +238,7 @@ index 3a8a6efa83..5e94793b79 100644
        return 0;
      }
    }
-@@ -824,8 +821,8 @@ int TPPLPartition::Triangulate_OPT(TPPLPoly *poly, TPPLPolyList *triangles) {
+@@ -824,8 +823,8 @@ int TPPLPartition::Triangulate_OPT(TPPLPoly *poly, TPPLPolyList *triangles) {
    newdiagonal.index1 = 0;
    newdiagonal.index2 = n - 1;
    diagonals.push_back(newdiagonal);
@@ -252,7 +249,7 @@ index 3a8a6efa83..5e94793b79 100644
      diagonals.pop_front();
      bestvertex = dpstates[diagonal.index2][diagonal.index1].bestvertex;
      if (bestvertex == -1) {
-@@ -873,10 +870,10 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
+@@ -873,10 +872,10 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
      pairs->push_front(newdiagonal);
      dpstates[a][b].weight = w;
    } else {
@@ -265,7 +262,7 @@ index 3a8a6efa83..5e94793b79 100644
        pairs->pop_front();
      }
      pairs->push_front(newdiagonal);
-@@ -885,7 +882,7 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
+@@ -885,7 +884,7 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
  
  void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates) {
    DiagonalList *pairs = NULL;
@@ -274,7 +271,7 @@ index 3a8a6efa83..5e94793b79 100644
    long top;
    long w;
  
-@@ -902,23 +899,23 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
+@@ -902,23 +901,23 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
    }
    if (j - i > 1) {
      pairs = &(dpstates[i][j].pairs);
@@ -305,7 +302,7 @@ index 3a8a6efa83..5e94793b79 100644
        }
      }
    }
-@@ -927,7 +924,7 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
+@@ -927,7 +926,7 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
  
  void TPPLPartition::TypeB(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates) {
    DiagonalList *pairs = NULL;
@@ -314,7 +311,7 @@ index 3a8a6efa83..5e94793b79 100644
    long top;
    long w;
  
-@@ -946,21 +943,21 @@ void TPPLPartition::TypeB(long i, long j, long k, PartitionVertex *vertices, DPS
+@@ -946,21 +945,21 @@ void TPPLPartition::TypeB(long i, long j, long k, PartitionVertex *vertices, DPS
    if (k - j > 1) {
      pairs = &(dpstates[j][k].pairs);
  
@@ -343,7 +340,7 @@ index 3a8a6efa83..5e94793b79 100644
        }
      } else {
        w++;
-@@ -981,11 +978,11 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -981,11 +980,11 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
    DiagonalList diagonals, diagonals2;
    Diagonal diagonal, newdiagonal;
    DiagonalList *pairs = NULL, *pairs2 = NULL;
@@ -358,7 +355,7 @@ index 3a8a6efa83..5e94793b79 100644
    bool ijreal, jkreal;
  
    n = poly->GetNumPoints();
-@@ -1110,35 +1107,35 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1110,35 +1109,35 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
    newdiagonal.index1 = 0;
    newdiagonal.index2 = n - 1;
    diagonals.push_front(newdiagonal);
@@ -403,7 +400,7 @@ index 3a8a6efa83..5e94793b79 100644
                pairs2->pop_back();
              } else {
                break;
-@@ -1153,21 +1150,21 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1153,21 +1152,21 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
          diagonals.push_front(newdiagonal);
        }
      } else {
@@ -431,7 +428,7 @@ index 3a8a6efa83..5e94793b79 100644
                pairs2->pop_front();
              } else {
                break;
-@@ -1197,8 +1194,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1197,8 +1196,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
    newdiagonal.index1 = 0;
    newdiagonal.index2 = n - 1;
    diagonals.push_front(newdiagonal);
@@ -442,7 +439,7 @@ index 3a8a6efa83..5e94793b79 100644
      diagonals.pop_front();
      if ((diagonal.index2 - diagonal.index1) <= 1) {
        continue;
-@@ -1210,8 +1207,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1210,8 +1209,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
      indices.push_back(diagonal.index2);
      diagonals2.push_front(diagonal);
  
@@ -453,7 +450,7 @@ index 3a8a6efa83..5e94793b79 100644
        diagonals2.pop_front();
        if ((diagonal.index2 - diagonal.index1) <= 1) {
          continue;
-@@ -1220,16 +1217,16 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1220,16 +1219,16 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
        jkreal = true;
        pairs = &(dpstates[diagonal.index1][diagonal.index2].pairs);
        if (!vertices[diagonal.index1].isConvex) {
@@ -476,7 +473,7 @@ index 3a8a6efa83..5e94793b79 100644
            jkreal = false;
          }
        }
-@@ -1253,11 +1250,12 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1253,11 +1252,12 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
        indices.push_back(j);
      }
  
@@ -492,7 +489,7 @@ index 3a8a6efa83..5e94793b79 100644
        k++;
      }
      parts->push_back(newpoly);
-@@ -1281,7 +1279,7 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1281,7 +1281,7 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
  // "Computational Geometry: Algorithms and Applications"
  // by Mark de Berg, Otfried Cheong, Marc van Kreveld, and Mark Overmars.
  int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monotonePolys) {
@@ -501,7 +498,7 @@ index 3a8a6efa83..5e94793b79 100644
    MonotoneVertex *vertices = NULL;
    long i, numvertices, vindex, vindex2, newnumvertices, maxnumvertices;
    long polystartindex, polyendindex;
-@@ -1291,11 +1289,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1291,11 +1291,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
    bool error = false;
  
    numvertices = 0;
@@ -515,7 +512,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    maxnumvertices = numvertices * 3;
-@@ -1303,8 +1298,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1303,8 +1300,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
    newnumvertices = numvertices;
  
    polystartindex = 0;
@@ -526,7 +523,7 @@ index 3a8a6efa83..5e94793b79 100644
      polyendindex = polystartindex + poly->GetNumPoints() - 1;
      for (i = 0; i < poly->GetNumPoints(); i++) {
        vertices[i + polystartindex].p = poly->GetPoint(i);
-@@ -1360,14 +1355,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1360,14 +1357,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
    // Note that while set doesn't actually have to be implemented as
    // a tree, complexity requirements for operations are the same as
    // for the balanced binary search tree.
@@ -546,7 +543,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    // For each vertex.
-@@ -1387,13 +1382,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1387,13 +1384,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
          newedge.p1 = v->p;
          newedge.p2 = vertices[v->next].p;
          newedge.index = vindex;
@@ -564,7 +561,7 @@ index 3a8a6efa83..5e94793b79 100644
            error = true;
            break;
          }
-@@ -1412,29 +1408,30 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1412,29 +1410,30 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
          newedge.p1 = v->p;
          newedge.p2 = v->p;
          edgeIter = edgeTree.lower_bound(newedge);
@@ -601,7 +598,7 @@ index 3a8a6efa83..5e94793b79 100644
            error = true;
            break;
          }
-@@ -1452,25 +1449,25 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1452,25 +1451,25 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
          newedge.p1 = v->p;
          newedge.p2 = v->p;
          edgeIter = edgeTree.lower_bound(newedge);
@@ -632,7 +629,7 @@ index 3a8a6efa83..5e94793b79 100644
              error = true;
              break;
            }
-@@ -1488,27 +1485,28 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1488,27 +1487,28 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
            newedge.p1 = v2->p;
            newedge.p2 = vertices[v2->next].p;
            newedge.index = vindex2;
@@ -668,7 +665,7 @@ index 3a8a6efa83..5e94793b79 100644
          }
          break;
      }
-@@ -1569,8 +1567,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1569,8 +1569,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
  
  // Adds a diagonal to the doubly-connected list of vertices.
  void TPPLPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, long index1, long index2,
@@ -679,7 +676,7 @@ index 3a8a6efa83..5e94793b79 100644
    long newindex1, newindex2;
  
    newindex1 = *numvertices;
-@@ -1597,14 +1595,14 @@ void TPPLPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, lon
+@@ -1597,14 +1597,14 @@ void TPPLPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, lon
    vertextypes[newindex1] = vertextypes[index1];
    edgeTreeIterators[newindex1] = edgeTreeIterators[index1];
    helpers[newindex1] = helpers[index1];
@@ -698,7 +695,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  }
  
-@@ -1830,13 +1828,13 @@ int TPPLPartition::TriangulateMonotone(TPPLPoly *inPoly, TPPLPolyList *triangles
+@@ -1830,13 +1830,13 @@ int TPPLPartition::TriangulateMonotone(TPPLPoly *inPoly, TPPLPolyList *triangles
  
  int TPPLPartition::Triangulate_MONO(TPPLPolyList *inpolys, TPPLPolyList *triangles) {
    TPPLPolyList monotone;
diff --git a/thirdparty/misc/polypartition.cpp b/thirdparty/misc/polypartition.cpp
index 5e94793b79..8c5409bf24 100644
--- a/thirdparty/misc/polypartition.cpp
+++ b/thirdparty/misc/polypartition.cpp
@@ -23,6 +23,8 @@
 
 #include "polypartition.h"
 
+#include <math.h>
+#include <string.h>
 #include <algorithm>
 
 TPPLPoly::TPPLPoly() {
diff --git a/thirdparty/msdfgen/core/edge-coloring.cpp b/thirdparty/msdfgen/core/edge-coloring.cpp
index 370f9aa38d..914f1769fd 100644
--- a/thirdparty/msdfgen/core/edge-coloring.cpp
+++ b/thirdparty/msdfgen/core/edge-coloring.cpp
@@ -473,7 +473,7 @@ void edgeColoringByDistance(Shape &shape, double angleThreshold, unsigned long l
         edgeMatrix[i] = &edgeMatrixStorage[i*splineCount];
     int nextEdge = 0;
     for (; nextEdge < graphEdgeCount && !*graphEdgeDistances[nextEdge]; ++nextEdge) {
-        int elem = graphEdgeDistances[nextEdge]-distanceMatrixBase;
+        int elem = (int) (graphEdgeDistances[nextEdge]-distanceMatrixBase);
         int row = elem/splineCount;
         int col = elem%splineCount;
         edgeMatrix[row][col] = 1;
@@ -483,7 +483,7 @@ void edgeColoringByDistance(Shape &shape, double angleThreshold, unsigned long l
     std::vector<int> coloring(2*splineCount);
     colorSecondDegreeGraph(&coloring[0], &edgeMatrix[0], splineCount, seed);
     for (; nextEdge < graphEdgeCount; ++nextEdge) {
-        int elem = graphEdgeDistances[nextEdge]-distanceMatrixBase;
+        int elem = (int) (graphEdgeDistances[nextEdge]-distanceMatrixBase);
         tryAddEdge(&coloring[0], &edgeMatrix[0], splineCount, elem/splineCount, elem%splineCount, &coloring[splineCount]);
     }
 
diff --git a/thirdparty/msdfgen/core/equation-solver.cpp b/thirdparty/msdfgen/core/equation-solver.cpp
index fbe906428b..4144fa3340 100644
--- a/thirdparty/msdfgen/core/equation-solver.cpp
+++ b/thirdparty/msdfgen/core/equation-solver.cpp
@@ -4,17 +4,15 @@
 #define _USE_MATH_DEFINES
 #include <cmath>
 
-#define TOO_LARGE_RATIO 1e12
-
 namespace msdfgen {
 
 int solveQuadratic(double x[2], double a, double b, double c) {
-    // a = 0 -> linear equation
-    if (a == 0 || fabs(b)+fabs(c) > TOO_LARGE_RATIO*fabs(a)) {
-        // a, b = 0 -> no solution
-        if (b == 0 || fabs(c) > TOO_LARGE_RATIO*fabs(b)) {
+    // a == 0 -> linear equation
+    if (a == 0 || fabs(b) > 1e12*fabs(a)) {
+        // a == 0, b == 0 -> no solution
+        if (b == 0) {
             if (c == 0)
-                return -1; // 0 = 0
+                return -1; // 0 == 0
             return 0;
         }
         x[0] = -c/b;
@@ -35,41 +33,38 @@ int solveQuadratic(double x[2], double a, double b, double c) {
 
 static int solveCubicNormed(double x[3], double a, double b, double c) {
     double a2 = a*a;
-    double q  = (a2 - 3*b)/9; 
-    double r  = (a*(2*a2-9*b) + 27*c)/54;
+    double q = 1/9.*(a2-3*b);
+    double r = 1/54.*(a*(2*a2-9*b)+27*c);
     double r2 = r*r;
     double q3 = q*q*q;
-    double A, B;
+    a *= 1/3.;
     if (r2 < q3) {
         double t = r/sqrt(q3);
         if (t < -1) t = -1;
         if (t > 1) t = 1;
         t = acos(t);
-        a /= 3; q = -2*sqrt(q);
-        x[0] = q*cos(t/3)-a;
-        x[1] = q*cos((t+2*M_PI)/3)-a;
-        x[2] = q*cos((t-2*M_PI)/3)-a;
+        q = -2*sqrt(q);
+        x[0] = q*cos(1/3.*t)-a;
+        x[1] = q*cos(1/3.*(t+2*M_PI))-a;
+        x[2] = q*cos(1/3.*(t-2*M_PI))-a;
         return 3;
     } else {
-        A = -pow(fabs(r)+sqrt(r2-q3), 1/3.); 
-        if (r < 0) A = -A;
-        B = A == 0 ? 0 : q/A;
-        a /= 3;
-        x[0] = (A+B)-a;
-        x[1] = -0.5*(A+B)-a;
-        x[2] = 0.5*sqrt(3.)*(A-B);
-        if (fabs(x[2]) < 1e-14)
+        double u = (r < 0 ? 1 : -1)*pow(fabs(r)+sqrt(r2-q3), 1/3.); 
+        double v = u == 0 ? 0 : q/u;
+        x[0] = (u+v)-a;
+        if (u == v || fabs(u-v) < 1e-12*fabs(u+v)) {
+            x[1] = -.5*(u+v)-a;
             return 2;
+        }
         return 1;
     }
 }
 
 int solveCubic(double x[3], double a, double b, double c, double d) {
     if (a != 0) {
-        double bn = b/a, cn = c/a, dn = d/a;
-        // Check that a isn't "almost zero"
-        if (fabs(bn) < TOO_LARGE_RATIO && fabs(cn) < TOO_LARGE_RATIO && fabs(dn) < TOO_LARGE_RATIO)
-            return solveCubicNormed(x, bn, cn, dn);
+        double bn = b/a;
+        if (fabs(bn) < 1e6) // Above this ratio, the numerical error gets larger than if we treated a as zero
+            return solveCubicNormed(x, bn, c/a, d/a);
     }
     return solveQuadratic(x, b, c, d);
 }
diff --git a/thirdparty/thorvg/AUTHORS b/thirdparty/thorvg/AUTHORS
index 66057232b6..ec06c49118 100644
--- a/thirdparty/thorvg/AUTHORS
+++ b/thirdparty/thorvg/AUTHORS
@@ -13,3 +13,5 @@ Pankaj Kumar <pankaj.m1@samsung.com>
 Patryk Kaczmarek <patryk.k@partner.samsung.com>
 Michal Maciola <m.maciola@samsung.com>
 Peter Vullings <peter@projectitis.com>
+K. S. Ernest (iFire) Lee <ernest.lee@chibifire.com>
+Rémi Verschelde <rverschelde@gmail.com>
diff --git a/thirdparty/thorvg/inc/config.h b/thirdparty/thorvg/inc/config.h
index 04a450b1bb..41e8f6dafa 100644
--- a/thirdparty/thorvg/inc/config.h
+++ b/thirdparty/thorvg/inc/config.h
@@ -13,5 +13,5 @@
 
 #define THORVG_JPG_LOADER_SUPPORT 1
 
-#define THORVG_VERSION_STRING "0.7.0"
+#define THORVG_VERSION_STRING "0.7.1"
 #endif
diff --git a/thirdparty/thorvg/patches/thorvg-pr1159-mingw-fix.patch b/thirdparty/thorvg/patches/thorvg-pr1159-mingw-fix.patch
deleted file mode 100644
index a174880306..0000000000
--- a/thirdparty/thorvg/patches/thorvg-pr1159-mingw-fix.patch
+++ /dev/null
@@ -1,73 +0,0 @@
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
-index def8ae169a..cf103774c5 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
-@@ -51,6 +51,7 @@
- 
- #define _USE_MATH_DEFINES       //Math Constants are not defined in Standard C/C++.
- 
-+#include <cstring>
- #include <fstream>
- #include <float.h>
- #include <math.h>
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp
-index 2b62315de8..32685ee620 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp
-@@ -50,6 +50,7 @@
- 
- #define _USE_MATH_DEFINES       //Math Constants are not defined in Standard C/C++.
- 
-+#include <cstring>
- #include <math.h>
- #include <clocale>
- #include <ctype.h>
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
-index 8701fe32b1..ae17634f31 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
-@@ -49,6 +49,7 @@
- */
- 
- 
-+#include <cstring>
- #include <string>
- #include "tvgMath.h"
- #include "tvgSvgLoaderCommon.h"
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp
-index d5b9cdcf7b..9f269b29a2 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp
-@@ -20,6 +20,7 @@
-  * SOFTWARE.
-  */
- 
-+#include <cstring>
- #include <math.h>
- #include <memory.h>
- #include "tvgSvgUtil.h"
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
-index 2e3d5928d9..1571aa4e25 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
-@@ -20,6 +20,7 @@
-  * SOFTWARE.
-  */
- 
-+#include <cstring>
- #include <ctype.h>
- #include <string>
- 
-diff --git a/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp b/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp
-index 9450d80e88..9dd57e5a89 100644
---- a/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp
-+++ b/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp
-@@ -24,6 +24,8 @@
- #include "tvgTvgSaver.h"
- #include "tvgLzw.h"
- 
-+#include <cstring>
-+
- #ifdef _WIN32
-     #include <malloc.h>
- #else
diff --git a/thirdparty/thorvg/patches/thorvg-pr1166-vs2017-minmax.patch b/thirdparty/thorvg/patches/thorvg-pr1166-vs2017-minmax.patch
deleted file mode 100644
index 0b045bd05a..0000000000
--- a/thirdparty/thorvg/patches/thorvg-pr1166-vs2017-minmax.patch
+++ /dev/null
@@ -1,49 +0,0 @@
-diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp
-index 78537e7726..c75e73760e 100644
---- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp
-+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp
-@@ -23,6 +23,7 @@
- #include "tvgSwCommon.h"
- #include "tvgTaskScheduler.h"
- #include "tvgSwRenderer.h"
-+#include "tvgMath.h"
- 
- /************************************************************************/
- /* Internal Class Implementation                                        */
-@@ -594,10 +595,10 @@ void* SwRenderer::prepareCommon(SwTask* task, const RenderTransform* transform,
-     task->surface = surface;
-     task->mpool = mpool;
-     task->flags = flags;
--    task->bbox.min.x = max(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.x));
--    task->bbox.min.y = max(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.y));
--    task->bbox.max.x = min(static_cast<SwCoord>(surface->w), static_cast<SwCoord>(vport.x + vport.w));
--    task->bbox.max.y = min(static_cast<SwCoord>(surface->h), static_cast<SwCoord>(vport.y + vport.h));
-+    task->bbox.min.x = mathMax(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.x));
-+    task->bbox.min.y = mathMax(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.y));
-+    task->bbox.max.x = mathMin(static_cast<SwCoord>(surface->w), static_cast<SwCoord>(vport.x + vport.w));
-+    task->bbox.max.y = mathMin(static_cast<SwCoord>(surface->h), static_cast<SwCoord>(vport.y + vport.h));
- 
-     if (!task->pushed) {
-         task->pushed = true;
-diff --git a/thirdparty/thorvg/src/lib/tvgMath.h b/thirdparty/thorvg/src/lib/tvgMath.h
-index 9e5c915fc3..94b4fe1cf1 100644
---- a/thirdparty/thorvg/src/lib/tvgMath.h
-+++ b/thirdparty/thorvg/src/lib/tvgMath.h
-@@ -29,6 +29,10 @@
- #include "tvgCommon.h"
- 
- 
-+#define mathMin(x, y) (((x) < (y)) ? (x) : (y))
-+#define mathMax(x, y) (((x) > (y)) ? (x) : (y))
-+
-+
- static inline bool mathZero(float a)
- {
-     return (fabsf(a) < FLT_EPSILON) ? true : false;
-@@ -154,4 +158,4 @@ static inline Matrix mathMultiply(const Matrix* lhs, const Matrix* rhs)
- }
- 
- 
--#endif //_TVG_MATH_H_
-\ No newline at end of file
-+#endif //_TVG_MATH_H_
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
index fe22fce017..f9974d9847 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
@@ -84,8 +84,8 @@ bool imagePrepare(SwImage* image, const Matrix* transform, const SwBBox& clipReg
 
     //Fast track: Non-transformed image but just shifted.
     if (image->direct) {
-        image->ox = -static_cast<uint32_t>(round(transform->e13));
-        image->oy = -static_cast<uint32_t>(round(transform->e23));
+        image->ox = -static_cast<int32_t>(round(transform->e13));
+        image->oy = -static_cast<int32_t>(round(transform->e23));
     //Figure out the scale factor by transform matrix
     } else {
         auto scaleX = sqrtf((transform->e11 * transform->e11) + (transform->e21 * transform->e21));
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
index deebed16ee..56bc2f77dc 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
@@ -481,7 +481,10 @@ static bool _rasterScaledRleRGBAImage(SwSurface* surface, const SwImage* image,
 static bool _scaledRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint32_t opacity)
 {
     Matrix itransform;
-    if (transform && !mathInverse(transform, &itransform)) return false;
+
+    if (transform) {
+        if (!mathInverse(transform, &itransform)) return false;
+    } else mathIdentity(&itransform);
 
     auto halfScale = _halfScale(image->scale);
 
@@ -816,7 +819,10 @@ static bool _rasterScaledRGBAImage(SwSurface* surface, const SwImage* image, con
 static bool _scaledRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint32_t opacity)
 {
     Matrix itransform;
-    if (transform && !mathInverse(transform, &itransform)) return false;
+
+    if (transform) {
+        if (!mathInverse(transform, &itransform)) return false;
+    } else mathIdentity(&itransform);
 
     auto halfScale = _halfScale(image->scale);
 
@@ -1113,12 +1119,12 @@ static bool _rasterTranslucentLinearGradientRle(SwSurface* surface, const SwRleD
         auto dst = &surface->buffer[span->y * surface->stride + span->x];
         fillFetchLinear(fill, buffer, span->y, span->x, span->len);
         if (span->coverage == 255) {
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                *dst = buffer[i] + ALPHA_BLEND(*dst, _ialpha(buffer[i]));
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
             }
         } else {
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                auto tmp = ALPHA_BLEND(buffer[i], span->coverage);
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
                 *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
             }
         }
@@ -1142,8 +1148,8 @@ static bool _rasterSolidLinearGradientRle(SwSurface* surface, const SwRleData* r
         } else {
             fillFetchLinear(fill, buf, span->y, span->x, span->len);
             auto dst = &surface->buffer[span->y * surface->stride + span->x];
-            for (uint32_t i = 0; i < span->len; ++i) {
-                dst[i] = INTERPOLATE(span->coverage, buf[i], dst[i]);
+            for (uint32_t x = 0; x < span->len; ++x) {
+                dst[x] = INTERPOLATE(span->coverage, buf[x], dst[x]);
             }
         }
     }
@@ -1302,12 +1308,12 @@ static bool _rasterTranslucentRadialGradientRle(SwSurface* surface, const SwRleD
         auto dst = &surface->buffer[span->y * surface->stride + span->x];
         fillFetchRadial(fill, buffer, span->y, span->x, span->len);
         if (span->coverage == 255) {
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                *dst = buffer[i] + ALPHA_BLEND(*dst, _ialpha(buffer[i]));
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
             }
         } else {
-           for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                auto tmp = ALPHA_BLEND(buffer[i], span->coverage);
+           for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
                 *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
             }
         }
@@ -1332,8 +1338,8 @@ static bool _rasterSolidRadialGradientRle(SwSurface* surface, const SwRleData* r
         } else {
             fillFetchRadial(fill, buf, span->y, span->x, span->len);
             auto ialpha = 255 - span->coverage;
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                *dst = ALPHA_BLEND(buf[i], span->coverage) + ALPHA_BLEND(*dst, ialpha);
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = ALPHA_BLEND(buf[x], span->coverage) + ALPHA_BLEND(*dst, ialpha);
             }
         }
     }
@@ -1487,7 +1493,7 @@ bool rasterStroke(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint
 bool rasterImage(SwSurface* surface, SwImage* image, const Matrix* transform, const SwBBox& bbox, uint32_t opacity)
 {
     //Verify Boundary
-    if (bbox.max.x < 0 || bbox.max.y < 0 || bbox.min.x >= surface->w || bbox.min.y >= surface->h) return false;
+    if (bbox.max.x < 0 || bbox.max.y < 0 || bbox.min.x >= static_cast<SwCoord>(surface->w) || bbox.min.y >= static_cast<SwCoord>(surface->h)) return false;
 
     //TOOD: switch (image->format)
     //TODO: case: _rasterRGBImage()
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
index 4e8d342137..e96307c874 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
@@ -58,8 +58,8 @@
     y = yStart;
 
     while (y < yEnd) {
-        x1 = _xa;
-        x2 = _xb;
+        x1 = (int32_t)_xa;
+        x2 = (int32_t)_xb;
 
         if (!region) {
             minx = INT32_MAX;
@@ -160,4 +160,4 @@ next:
     xb = _xb;
     ua = _ua;
     va = _va;
-}
-\ No newline at end of file
+}
diff --git a/thirdparty/thorvg/src/lib/tvgMath.h b/thirdparty/thorvg/src/lib/tvgMath.h
index 94b4fe1cf1..423fb6eb1b 100644
--- a/thirdparty/thorvg/src/lib/tvgMath.h
+++ b/thirdparty/thorvg/src/lib/tvgMath.h
@@ -47,7 +47,7 @@ static inline bool mathEqual(float a, float b)
 
 static inline bool mathRightAngle(const Matrix* m)
 {
-   auto radian = fabsf(atan2(m->e21, m->e11));
+   auto radian = fabsf(atan2f(m->e21, m->e11));
    if (radian < FLT_EPSILON || mathEqual(radian, float(M_PI_2)) || mathEqual(radian, float(M_PI))) return true;
    return false;
 }
diff --git a/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp b/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp
index 8846613c6b..f27881da42 100644
--- a/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp
+++ b/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp
@@ -47,6 +47,7 @@ JpgLoader::~JpgLoader()
 {
     jpgdDelete(decoder);
     if (freeData) free(data);
+    free(image);
 }
 
 
@@ -128,5 +129,9 @@ unique_ptr<Surface> JpgLoader::bitmap()
 
 void JpgLoader::run(unsigned tid)
 {
+    if (image) {
+        free(image);
+        image = nullptr;
+    }
     image = jpgdDecompress(decoder);
 }
 \ No newline at end of file
diff --git a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp
index fa72734ec4..4ccc5788d5 100644
--- a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp
+++ b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp
@@ -1080,7 +1080,9 @@ namespace DCT_Upsample
 // Unconditionally frees all allocated m_blocks.
 void jpeg_decoder::free_all_blocks()
 {
+    delete(m_pStream);
     m_pStream = nullptr;
+
     for (mem_block *b = m_pMem_blocks; b; ) {
         mem_block *n = b->m_pNext;
         free(b);
@@ -2815,7 +2817,6 @@ int jpeg_decoder::begin_decoding()
 jpeg_decoder::~jpeg_decoder()
 {
     free_all_blocks();
-    delete(m_pStream);
 }
 
 
@@ -3025,4 +3026,4 @@ unsigned char* jpgdDecompress(jpeg_decoder* decoder)
         }
     }
     return pImage_data;
-}
-\ No newline at end of file
+}
diff --git a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h
index d32ffd99d4..ca9cb35c32 100644
--- a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h
+++ b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ * Copyright (c) 2021 - 2022 Samsung Electronics Co., Ltd. All rights reserved.
 
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp b/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp
index c6d95be5ba..3cc08e902b 100644
--- a/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp
+++ b/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp
@@ -72,6 +72,7 @@ PngLoader::PngLoader()
 PngLoader::~PngLoader()
 {
     if (freeData) free(data);
+    free(image);
 }
 
 
@@ -121,7 +122,7 @@ bool PngLoader::open(const char* data, uint32_t size, bool copy)
     clear();
 
     lodepng_state_init(&state);
-    
+
     unsigned int width, height;
     if (lodepng_inspect(&width, &height, &state, (unsigned char*)(data), size) > 0) return false;
 
@@ -180,10 +181,14 @@ unique_ptr<Surface> PngLoader::bitmap()
 
 void PngLoader::run(unsigned tid)
 {
+    if (image) {
+        free(image);
+        image = nullptr;
+    }
     auto width = static_cast<unsigned>(w);
     auto height = static_cast<unsigned>(h);
 
     lodepng_decode(&image, &width, &height, &state, data, size);
 
     _premultiply((uint32_t*)(image), width, height);
-}
-\ No newline at end of file
+}
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
index cf103774c5..08b3308165 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
@@ -541,7 +541,7 @@ static void _toColor(const char* str, uint8_t* r, uint8_t* g, uint8_t* b, char**
                 }
             }
         }
-    } else if (len >= 3 && !strncmp(str, "url", 3)) {
+    } else if (ref && len >= 3 && !strncmp(str, "url", 3)) {
         *ref = _idFromUrl((const char*)(str + 3));
     } else {
         //Handle named color
@@ -789,7 +789,7 @@ static bool _attrParseSvgNode(void* data, const char* key, const char* value)
         return simpleXmlParseW3CAttribute(value, _parseStyleAttr, loader);
     }
 #ifdef THORVG_LOG_ENABLED
-    else if ((!strcmp(key, "x") || !strcmp(key, "y")) && fabsf(svgUtilStrtof(value, nullptr)) > FLT_EPSILON ) {
+    else if ((!strcmp(key, "x") || !strcmp(key, "y")) && fabsf(svgUtilStrtof(value, nullptr)) > FLT_EPSILON) {
         TVGLOG("SVG", "Unsupported attributes used [Elements type: Svg][Attribute: %s][Value: %s]", key, value);
     }
 #endif
@@ -1611,6 +1611,7 @@ static bool _attrParseImageNode(void* data, const char* key, const char* value)
     }
 
     if (!strcmp(key, "href") || !strcmp(key, "xlink:href")) {
+        if (image->href && value) free(image->href);
         image->href = _idFromHref(value);
     } else if (!strcmp(key, "id")) {
         if (node->id && value) free(node->id);
@@ -1728,6 +1729,141 @@ error_grad_alloc:
 }
 
 
+/*
+ * Fills every style property that the child did not set explicitly with the parent's value.
+ * "Explicitly set" means the matching bit is present in the child's fill/stroke flags, or
+ * curColorSet is true for the current color. Paint urls are duplicated with _copyId(); the
+ * url previously held by the child is freed so that overwriting it does not leak.
+ * Does nothing when parent is nullptr.
+ */
+static void _styleInherit(SvgStyleProperty* child, const SvgStyleProperty* parent)
+{
+    if (parent == nullptr) return;
+    //Inherit the property of parent if not present in child.
+    if (!child->curColorSet) {
+        child->color = parent->color;
+        child->curColorSet = parent->curColorSet;
+    }
+    //Fill
+    if (!((int)child->fill.flags & (int)SvgFillFlags::Paint)) {
+        child->fill.paint.color = parent->fill.paint.color;
+        child->fill.paint.none = parent->fill.paint.none;
+        child->fill.paint.curColor = parent->fill.paint.curColor;
+        if (parent->fill.paint.url) {
+            //Copy first, then free: stays correct even if both pointers alias.
+            auto url = _copyId(parent->fill.paint.url);
+            free(child->fill.paint.url);
+            child->fill.paint.url = url;
+        }
+    }
+    if (!((int)child->fill.flags & (int)SvgFillFlags::Opacity)) {
+        child->fill.opacity = parent->fill.opacity;
+    }
+    if (!((int)child->fill.flags & (int)SvgFillFlags::FillRule)) {
+        child->fill.fillRule = parent->fill.fillRule;
+    }
+    //Stroke
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Paint)) {
+        child->stroke.paint.color = parent->stroke.paint.color;
+        child->stroke.paint.none = parent->stroke.paint.none;
+        child->stroke.paint.curColor = parent->stroke.paint.curColor;
+        //A parent without a url resets the child's url to nullptr.
+        auto url = parent->stroke.paint.url ? _copyId(parent->stroke.paint.url) : nullptr;
+        free(child->stroke.paint.url);
+        child->stroke.paint.url = url;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Opacity)) {
+        child->stroke.opacity = parent->stroke.opacity;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Width)) {
+        child->stroke.width = parent->stroke.width;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Dash)) {
+        if (parent->stroke.dash.array.count > 0) {
+            child->stroke.dash.array.clear();
+            child->stroke.dash.array.reserve(parent->stroke.dash.array.count);
+            for (uint32_t i = 0; i < parent->stroke.dash.array.count; ++i) {
+                child->stroke.dash.array.push(parent->stroke.dash.array.data[i]);
+            }
+        }
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Cap)) {
+        child->stroke.cap = parent->stroke.cap;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Join)) {
+        child->stroke.join = parent->stroke.join;
+    }
+}
+
+
+/*
+ * Copies into 'to' only the style properties that 'from' set explicitly, i.e. whose bit is
+ * present in from's fill/stroke flags (or whose curColorSet is true); the flags themselves
+ * are OR-ed into 'to'. Paint urls are duplicated with _copyId(); the url previously held by
+ * 'to' is freed so that overwriting it does not leak. Does nothing when from is nullptr.
+ */
+static void _styleCopy(SvgStyleProperty* to, const SvgStyleProperty* from)
+{
+    if (from == nullptr) return;
+    //Copy the properties of 'from' only if they were explicitly set (not the default ones).
+    if (from->curColorSet) {
+        to->color = from->color;
+        to->curColorSet = true;
+    }
+    //Fill
+    to->fill.flags = (SvgFillFlags)((int)to->fill.flags | (int)from->fill.flags);
+    if (((int)from->fill.flags & (int)SvgFillFlags::Paint)) {
+        to->fill.paint.color = from->fill.paint.color;
+        to->fill.paint.none = from->fill.paint.none;
+        to->fill.paint.curColor = from->fill.paint.curColor;
+        if (from->fill.paint.url) {
+            //Copy first, then free: stays correct even if both pointers alias.
+            auto url = _copyId(from->fill.paint.url);
+            free(to->fill.paint.url);
+            to->fill.paint.url = url;
+        }
+    }
+    if (((int)from->fill.flags & (int)SvgFillFlags::Opacity)) {
+        to->fill.opacity = from->fill.opacity;
+    }
+    if (((int)from->fill.flags & (int)SvgFillFlags::FillRule)) {
+        to->fill.fillRule = from->fill.fillRule;
+    }
+    //Stroke
+    to->stroke.flags = (SvgStrokeFlags)((int)to->stroke.flags | (int)from->stroke.flags);
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Paint)) {
+        to->stroke.paint.color = from->stroke.paint.color;
+        to->stroke.paint.none = from->stroke.paint.none;
+        to->stroke.paint.curColor = from->stroke.paint.curColor;
+        //A source without a url resets the destination url to nullptr.
+        auto url = from->stroke.paint.url ? _copyId(from->stroke.paint.url) : nullptr;
+        free(to->stroke.paint.url);
+        to->stroke.paint.url = url;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Opacity)) {
+        to->stroke.opacity = from->stroke.opacity;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Width)) {
+        to->stroke.width = from->stroke.width;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Dash)) {
+        if (from->stroke.dash.array.count > 0) {
+            to->stroke.dash.array.clear();
+            to->stroke.dash.array.reserve(from->stroke.dash.array.count);
+            for (uint32_t i = 0; i < from->stroke.dash.array.count; ++i) {
+                to->stroke.dash.array.push(from->stroke.dash.array.data[i]);
+            }
+        }
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Cap)) {
+        to->stroke.cap = from->stroke.cap;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Join)) {
+        to->stroke.join = from->stroke.join;
+    }
+}
+
+
 static void _copyAttr(SvgNode* to, const SvgNode* from)
 {
     //Copy matrix attribute
@@ -1736,7 +1843,8 @@ static void _copyAttr(SvgNode* to, const SvgNode* from)
         if (to->transform) *to->transform = *from->transform;
     }
     //Copy style attribute
-    *to->style = *from->style;
+    _styleCopy(to->style, from->style);
+    to->style->flags = (SvgStyleFlags)((int)to->style->flags | (int)from->style->flags);
     if (from->style->fill.paint.url) to->style->fill.paint.url = strdup(from->style->fill.paint.url);
     if (from->style->stroke.paint.url) to->style->stroke.paint.url = strdup(from->style->stroke.paint.url);
     if (from->style->clipPath.url) to->style->clipPath.url = strdup(from->style->clipPath.url);
@@ -1780,15 +1888,17 @@ static void _copyAttr(SvgNode* to, const SvgNode* from)
             break;
         }
         case SvgNodeType::Polygon: {
-            to->node.polygon.pointsCount = from->node.polygon.pointsCount;
-            to->node.polygon.points = (float*)malloc(to->node.polygon.pointsCount * sizeof(float));
-            memcpy(to->node.polygon.points, from->node.polygon.points, to->node.polygon.pointsCount * sizeof(float));
+            if ((to->node.polygon.pointsCount = from->node.polygon.pointsCount)) {
+                to->node.polygon.points = (float*)malloc(to->node.polygon.pointsCount * sizeof(float));
+                memcpy(to->node.polygon.points, from->node.polygon.points, to->node.polygon.pointsCount * sizeof(float));
+            }
             break;
         }
         case SvgNodeType::Polyline: {
-            to->node.polyline.pointsCount = from->node.polyline.pointsCount;
-            to->node.polyline.points = (float*)malloc(to->node.polyline.pointsCount * sizeof(float));
-            memcpy(to->node.polyline.points, from->node.polyline.points, to->node.polyline.pointsCount * sizeof(float));
+            if ((to->node.polyline.pointsCount = from->node.polyline.pointsCount)) {
+                to->node.polyline.points = (float*)malloc(to->node.polyline.pointsCount * sizeof(float));
+                memcpy(to->node.polyline.points, from->node.polyline.points, to->node.polyline.pointsCount * sizeof(float));
+            }
             break;
         }
         case SvgNodeType::Image: {
@@ -1806,35 +1916,45 @@ static void _copyAttr(SvgNode* to, const SvgNode* from)
 }
 
 
-static void _cloneNode(SvgNode* from, SvgNode* parent)
+static void _cloneNode(SvgNode* from, SvgNode* parent, int depth)
 {
+    /* Guard against invalid SVG data that would make this function recurse endlessly.
+       The depth limit is an arbitrary value; a better one should be found experimentally. */
+    if (depth == 8192) {
+        TVGERR("SVG", "Infinite recursive call - stopped after %d calls! Svg file may be incorrectly formatted.", depth);
+        return;
+    }
+
     SvgNode* newNode;
-    if (!from || !parent) return;
+    if (!from || !parent || from == parent) return;
 
     newNode = _createNode(parent, from->type);
-
     if (!newNode) return;
 
+    _styleInherit(newNode->style, parent->style);
     _copyAttr(newNode, from);
 
     auto child = from->child.data;
     for (uint32_t i = 0; i < from->child.count; ++i, ++child) {
-        _cloneNode(*child, newNode);
+        _cloneNode(*child, newNode, depth + 1);
     }
 }
 
 
-static void _postponeCloneNode(SvgLoaderData* loader, SvgNode *node, char* id) {
+static void _postponeCloneNode(SvgLoaderData* loader, SvgNode *node, char* id)
+{
     loader->cloneNodes.push({node, id});
 }
 
 
-static void _clonePostponedNodes(Array<SvgNodeIdPair>* cloneNodes) {
+static void _clonePostponedNodes(Array<SvgNodeIdPair>* cloneNodes, SvgNode* doc)
+{
     for (uint32_t i = 0; i < cloneNodes->count; ++i) {
         auto nodeIdPair = cloneNodes->data[i];
         auto defs = _getDefsNode(nodeIdPair.node);
         auto nodeFrom = _findChildById(defs, nodeIdPair.id);
-        _cloneNode(nodeFrom, nodeIdPair.node);
+        if (!nodeFrom) nodeFrom = _findChildById(doc, nodeIdPair.id);
+        _cloneNode(nodeFrom, nodeIdPair.node, 0);
         free(nodeIdPair.id);
     }
 }
@@ -1875,7 +1995,7 @@ static bool _attrParseUseNode(void* data, const char* key, const char* value)
         defs = _getDefsNode(node);
         nodeFrom = _findChildById(defs, id);
         if (nodeFrom) {
-            _cloneNode(nodeFrom, node);
+            _cloneNode(nodeFrom, node, 0);
             free(id);
         } else {
             //some svg export software include <defs> element at the end of the file
@@ -1883,10 +2003,6 @@ static bool _attrParseUseNode(void* data, const char* key, const char* value)
             //after the whole file is parsed
             _postponeCloneNode(loader, node, id);
         }
-    } else if (!strcmp(key, "clip-path")) {
-        _handleClipPathAttr(loader, node, value);
-    } else if (!strcmp(key, "mask")) {
-        _handleMaskAttr(loader, node, value);
     } else {
         return _attrParseGNode(data, key, value);
     }
@@ -2081,10 +2197,12 @@ static bool _attrParseRadialGradientNode(void* data, const char* key, const char
     }
 
     if (!strcmp(key, "id")) {
+        if (grad->id && value) free(grad->id);
         grad->id = _copyId(value);
     } else if (!strcmp(key, "spreadMethod")) {
         grad->spread = _parseSpreadValue(value);
     } else if (!strcmp(key, "href") || !strcmp(key, "xlink:href")) {
+        if (grad->ref && value) free(grad->ref);
         grad->ref = _idFromHref(value);
     } else if (!strcmp(key, "gradientUnits") && !strcmp(value, "userSpaceOnUse")) {
         grad->userSpace = true;
@@ -2269,10 +2387,12 @@ static bool _attrParseLinearGradientNode(void* data, const char* key, const char
     }
 
     if (!strcmp(key, "id")) {
+        if (grad->id && value) free(grad->id);
         grad->id = _copyId(value);
     } else if (!strcmp(key, "spreadMethod")) {
         grad->spread = _parseSpreadValue(value);
     } else if (!strcmp(key, "href") || !strcmp(key, "xlink:href")) {
+        if (grad->ref && value) free(grad->ref);
         grad->ref = _idFromHref(value);
     } else if (!strcmp(key, "gradientUnits") && !strcmp(value, "userSpaceOnUse")) {
         grad->userSpace = true;
@@ -2408,6 +2528,7 @@ static void _svgLoaderParserXmlOpen(SvgLoaderData* loader, const char* content,
 
     if ((method = _findGroupFactory(tagName))) {
         //Group
+        if (empty) return;
         if (!loader->doc) {
             if (strcmp(tagName, "svg")) return; //Not a valid svg document
             node = method(loader, nullptr, attrs, attrsLength);
@@ -2493,59 +2614,8 @@ static bool _svgLoaderParser(void* data, SimpleXMLType type, const char* content
 }
 
 
-static void _styleInherit(SvgStyleProperty* child, const SvgStyleProperty* parent)
+static void _inefficientNodeCheck(TVG_UNUSED SvgNode* node)
 {
-    if (parent == nullptr) return;
-    //Inherit the property of parent if not present in child.
-    //Fill
-    if (!((int)child->fill.flags & (int)SvgFillFlags::Paint)) {
-        child->fill.paint.color = parent->fill.paint.color;
-        child->fill.paint.none = parent->fill.paint.none;
-        child->fill.paint.curColor = parent->fill.paint.curColor;
-        if (parent->fill.paint.url) child->fill.paint.url = _copyId(parent->fill.paint.url);
-    } else if (child->fill.paint.curColor && !child->curColorSet) {
-        child->color = parent->color;
-    }
-    if (!((int)child->fill.flags & (int)SvgFillFlags::Opacity)) {
-        child->fill.opacity = parent->fill.opacity;
-    }
-    if (!((int)child->fill.flags & (int)SvgFillFlags::FillRule)) {
-        child->fill.fillRule = parent->fill.fillRule;
-    }
-    //Stroke
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Paint)) {
-        child->stroke.paint.color = parent->stroke.paint.color;
-        child->stroke.paint.none = parent->stroke.paint.none;
-        child->stroke.paint.curColor = parent->stroke.paint.curColor;
-        child->stroke.paint.url = parent->stroke.paint.url ? _copyId(parent->stroke.paint.url) : nullptr;
-    } else if (child->stroke.paint.curColor && !child->curColorSet) {
-        child->color = parent->color;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Opacity)) {
-        child->stroke.opacity = parent->stroke.opacity;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Width)) {
-        child->stroke.width = parent->stroke.width;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Dash)) {
-        if (parent->stroke.dash.array.count > 0) {
-            child->stroke.dash.array.clear();
-            child->stroke.dash.array.reserve(parent->stroke.dash.array.count);
-            for (uint32_t i = 0; i < parent->stroke.dash.array.count; ++i) {
-                child->stroke.dash.array.push(parent->stroke.dash.array.data[i]);
-            }
-        }
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Cap)) {
-        child->stroke.cap = parent->stroke.cap;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Join)) {
-        child->stroke.join = parent->stroke.join;
-    }
-}
-
-
-static void _inefficientNodeCheck(TVG_UNUSED SvgNode* node){
 #ifdef THORVG_LOG_ENABLED
     auto type = simpleXmlNodeTypeToString(node->type);
 
@@ -2838,14 +2908,14 @@ void SvgLoader::run(unsigned tid)
     if (loaderData.doc) {
         _updateStyle(loaderData.doc, nullptr);
         auto defs = loaderData.doc->node.doc.defs;
-        if (defs) _updateGradient(loaderData.doc, &defs->node.defs.gradients);
-
-        if (loaderData.gradients.count > 0) _updateGradient(loaderData.doc, &loaderData.gradients);
 
         _updateComposite(loaderData.doc, loaderData.doc);
         if (defs) _updateComposite(loaderData.doc, defs);
 
-        if (loaderData.cloneNodes.count > 0) _clonePostponedNodes(&loaderData.cloneNodes);
+        if (loaderData.cloneNodes.count > 0) _clonePostponedNodes(&loaderData.cloneNodes, loaderData.doc);
+
+        if (loaderData.gradients.count > 0) _updateGradient(loaderData.doc, &loaderData.gradients);
+        if (defs) _updateGradient(loaderData.doc, &defs->node.defs.gradients);
     }
     root = svgSceneBuild(loaderData.doc, vx, vy, vw, vh, w, h, preserveAspect, svgPath);
 }
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
index 1571aa4e25..ee199da231 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
+++ b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
@@ -220,15 +220,15 @@ static SimpleXMLType _getXMLType(const char* itr, const char* itrEnd, size_t &to
         if ((itr + sizeof("<!DOCTYPE>") - 1 < itrEnd) && (!memcmp(itr + 2, "DOCTYPE", sizeof("DOCTYPE") - 1)) && ((itr[2 + sizeof("DOCTYPE") - 1] == '>') || (isspace((unsigned char)itr[2 + sizeof("DOCTYPE") - 1])))) {
             toff = sizeof("!DOCTYPE") - 1;
             return SimpleXMLType::Doctype;
-        } else if (itr + sizeof("<!>") - 1 < itrEnd) {
-            toff = sizeof("!") - 1;
-            return SimpleXMLType::DoctypeChild;
         } else if ((itr + sizeof("<![CDATA[]]>") - 1 < itrEnd) && (!memcmp(itr + 2, "[CDATA[", sizeof("[CDATA[") - 1))) {
             toff = sizeof("![CDATA[") - 1;
             return SimpleXMLType::CData;
         } else if ((itr + sizeof("<!---->") - 1 < itrEnd) && (!memcmp(itr + 2, "--", sizeof("--") - 1))) {
             toff = sizeof("!--") - 1;
             return SimpleXMLType::Comment;
+        } else if (itr + sizeof("<!>") - 1 < itrEnd) {
+            toff = sizeof("!") - 1;
+            return SimpleXMLType::DoctypeChild;
         }
         return SimpleXMLType::Open;
     }
diff --git a/thirdparty/thorvg/update-thorvg.sh b/thirdparty/thorvg/update-thorvg.sh
index c200131eba..ce3d5eed1c 100755
--- a/thirdparty/thorvg/update-thorvg.sh
+++ b/thirdparty/thorvg/update-thorvg.sh
@@ -1,4 +1,4 @@
-VERSION=0.7.0
+VERSION=0.7.1
 rm -rf AUTHORS inc LICENSE src *.zip
 curl -L -O https://github.com/Samsung/thorvg/archive/refs/tags/v$VERSION.zip
 bsdtar --strip-components=1 -xvf *.zip