161 files changed, 16745 insertions, 9145 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 2cb5458952..34c33c3b56 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -52,13 +52,13 @@ Includes some patches in the `patches` folder which have been sent upstream.
 
 ## cvtt
 
-- Upstream: https://github.com/elasota/cvtt
-- Version: 1.0.0-beta4 (cc8472a04ba110fe999c686d07af40f7839051fd, 2018)
+- Upstream: https://github.com/elasota/ConvectionKernels
+- Version: git (dc2dbbe0ae2cf2be06ef56d1021e2222a56c7fe2, 2021)
 - License: MIT
 
 Files extracted from upstream source:
 
-- all .cpp, .h, and .txt files in ConvectionKernels/
+- all .cpp, .h, and .txt files, except those in the `MakeTables` and `etc2packer` folders
 
 
 ## doctest
@@ -138,10 +138,10 @@ Files extracted from upstream source:
   * License: OFL-1.1
   * Comment: Use UI font variant if available, because it has tight vertical metrics and
     good for UI.
-- `Hack_Regular.ttf`:
-  * Upstream: https://github.com/source-foundry/Hack
-  * Version: 3.003 (2018)
-  * License: MIT + Bitstream Vera License
+- `JetBrainsMono_Regular.ttf`:
+  * Upstream: https://github.com/JetBrains/JetBrainsMono
+  * Version: 2.242
+  * License: OFL-1.1
 - `DroidSans*.ttf`:
   * Upstream: https://android.googlesource.com/platform/frameworks/base/+/master/data/fonts/
   * Version: ? (pre-2014 commit when DroidSansJapanese.ttf was obsoleted)
@@ -152,6 +152,7 @@ Files extracted from upstream source:
   * License: Apache 2.0
 
 
+
 ## freetype
 
 - Upstream: https://www.freetype.org
@@ -205,7 +206,7 @@ Files extracted from upstream source:
 ## harfbuzz
 
 - Upstream: https://github.com/harfbuzz/harfbuzz
-- Version: 3.2.0 (be91d2917d9860326cb5fd1d03ffe1042a72f6d3, 2021)
+- Version: 3.3.1 (45df259538c204540819d74456d30ffb40df488a, 2022)
 - License: MIT
 
 Files extracted from upstream source:
@@ -308,7 +309,7 @@ Files extracted from upstream source:
 ## libwebp
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 1.2.1 (9ce5843dbabcfd3f7c39ec7ceba9cbeb213cbfdf, 2021)
+- Version: 1.2.2 (b0a860891dcd4c0c2d7c6149e5cccb6eb881cc21, 2022)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
@@ -316,10 +317,6 @@ Files extracted from upstream source:
 - `src/*` except from: `.am`, `.rc` and `.in` files
 - `AUTHORS`, `COPYING`, `PATENTS`
 
-Important: The files `utils/bit_reader_utils.{c,h}` have Godot-made
-changes to ensure they build for Javascript/HTML5. Those
-changes are marked with `// -- GODOT --` comments.
-
 
 ## mbedtls
 
@@ -605,7 +602,7 @@ instead of `miniz.h` as an external dependency.
 ## thorvg
 
 - Upstream: https://github.com/Samsung/thorvg
-- Version: 0.7.0 (e527f565b770f0a41df821e6618ccaeea94f465e, 2021)
+- Version: 0.7.1 (d53eb2a880002cb770ace1c1ace9c5dfcfc28252, 2022)
 - License: MIT
 
 Files extracted from upstream source:
@@ -613,8 +610,6 @@ Files extracted from upstream source:
 See `thorvg/update-thorvg.sh` for extraction instructions. Set the version
 number and run the script.
 
-Patches in the `patches` directory should be re-applied after updates.
-
 
 ## vhacd
 
diff --git a/thirdparty/cvtt/ConvectionKernels.cpp b/thirdparty/cvtt/ConvectionKernels.cpp
deleted file mode 100644
index 8d379344e1..0000000000
--- a/thirdparty/cvtt/ConvectionKernels.cpp
+++ /dev/null
@@ -1,7586 +0,0 @@
-/*
-Convection Texture Tools
-Copyright (c) 2018 Eric Lasota
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject
-to the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
--------------------------------------------------------------------------------------
-
-Portions based on DirectX Texture Library (DirectXTex)
-
-Copyright (c) Microsoft Corporation. All rights reserved.
-Licensed under the MIT License.
-
-http://go.microsoft.com/fwlink/?LinkId=248926
-*/
-#include "ConvectionKernels.h"
-#include "ConvectionKernels_BC7_SingleColor.h"
-
-#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__)
-#define CVTT_USE_SSE2
-#endif
-
-#ifdef CVTT_USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#include <float.h>
-#include <assert.h>
-#include <string.h>
-#include <algorithm>
-#include <math.h>
-
-#define UNREFERENCED_PARAMETER(n) ((void)n)
-
-namespace cvtt
-{
-#ifdef CVTT_USE_SSE2
-    // SSE2 version
-    struct ParallelMath
-    {
-        typedef uint16_t ScalarUInt16;
-        typedef int16_t ScalarSInt16;
-
-        template<unsigned int TRoundingMode>
-        struct RoundForScope
-        {
-            unsigned int m_oldCSR;
-
-            RoundForScope()
-            {
-                m_oldCSR = _mm_getcsr();
-                _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));
-            }
-
-            ~RoundForScope()
-            {
-                _mm_setcsr(m_oldCSR);
-            }
-        };
-
-        struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
-        {
-        };
-
-        struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
-        {
-        };
-
-        struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
-        {
-        };
-
-        struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
-        {
-        };
-
-        static const int ParallelSize = 8;
-
-        enum Int16Subtype
-        {
-            IntSubtype_Signed,
-            IntSubtype_UnsignedFull,
-            IntSubtype_UnsignedTruncated,
-            IntSubtype_Abstract,
-        };
-
-        template<int TSubtype>
-        struct VInt16
-        {
-            __m128i m_value;
-
-            inline VInt16 operator+(int16_t other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
-                return result;
-            }
-
-            inline VInt16 operator+(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_add_epi16(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator|(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_or_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator&(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_and_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator-(const VInt16 &other) const
-            {
-                VInt16 result;
-                result.m_value = _mm_sub_epi16(m_value, other.m_value);
-                return result;
-            }
-
-            inline VInt16 operator<<(int bits) const
-            {
-                VInt16 result;
-                result.m_value = _mm_slli_epi16(m_value, bits);
-                return result;
-            }
-        };
-
-        typedef VInt16<IntSubtype_Signed> SInt16;
-        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
-        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
-        typedef VInt16<IntSubtype_Abstract> AInt16;
-
-        template<int TSubtype>
-        struct VInt32
-        {
-            __m128i m_values[2];
-
-            inline VInt32 operator+(const VInt32& other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline VInt32 operator-(const VInt32& other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline VInt32 operator<<(const int other) const
-            {
-                VInt32 result;
-                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
-                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
-                return result;
-            }
-        };
-
-        typedef VInt32<IntSubtype_Signed> SInt32;
-        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
-        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
-        typedef VInt32<IntSubtype_Abstract> AInt32;
-
-        template<class TTargetType>
-        struct LosslessCast
-        {
-#ifdef CVTT_PERMIT_ALIASING
-            template<int TSrcSubtype>
-            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
-            {
-                return reinterpret_cast<VInt32<TSubtype>&>(src);
-            }
-
-            template<int TSrcSubtype>
-            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
-            {
-                return reinterpret_cast<VInt16<TSubtype>&>(src);
-            }
-#else
-            template<int TSrcSubtype>
-            static TTargetType Cast(const VInt32<TSrcSubtype> &src)
-            {
-                TTargetType result;
-                result.m_values[0] = src.m_values[0];
-                result.m_values[1] = src.m_values[1];
-                return result;
-            }
-
-            template<int TSrcSubtype>
-            static TTargetType Cast(const VInt16<TSrcSubtype> &src)
-            {
-                TTargetType result;
-                result.m_value = src.m_value;
-                return result;
-            }
-#endif
-        };
-
-        struct Int64
-        {
-            __m128i m_values[4];
-        };
-
-        struct Float
-        {
-            __m128 m_values[2];
-
-            inline Float operator+(const Float &other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator+(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-
-            inline Float operator-(const Float& other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator-() const
-            {
-                Float result;
-                result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
-                result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
-                return result;
-            }
-
-            inline Float operator*(const Float& other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator*(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-
-            inline Float operator/(const Float &other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
-                result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
-                return result;
-            }
-
-            inline Float operator/(float other) const
-            {
-                Float result;
-                result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
-                result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
-                return result;
-            }
-        };
-
-        struct Int16CompFlag
-        {
-            __m128i m_value;
-
-            inline Int16CompFlag operator&(const Int16CompFlag &other) const
-            {
-                Int16CompFlag result;
-                result.m_value = _mm_and_si128(m_value, other.m_value);
-                return result;
-            }
-
-            inline Int16CompFlag operator|(const Int16CompFlag &other) const
-            {
-                Int16CompFlag result;
-                result.m_value = _mm_or_si128(m_value, other.m_value);
-                return result;
-            }
-        };
-
-        struct FloatCompFlag
-        {
-            __m128 m_values[2];
-        };
-
-        template<int TSubtype>
-        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_add_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
-            return result;
-        }
-
-        template<int TSubtype>
-        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
-        {
-            VInt16<TSubtype> result;
-            result.m_value = _mm_and_si128(flag.m_value, a.m_value);
-            return result;
-        }
-
-        template<int TSubtype>
-        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
-        {
-            dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
-        }
-
-        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
-        {
-            SInt16 result;
-            result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
-            return result;
-        }
-
-        template<int TSubtype>
-        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
-        {
-            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
-        }
-
-        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
-        {
-            for (int i = 0; i < 2; i++)
-                dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
-        }
-
-        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
-        {
-            for (int i = 0; i < 2; i++)
-                dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
-        }
-
-        static void MakeSafeDenominator(Float& v)
-        {
-            ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
-        }
-
-        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
-        {
-            int lostBits = 16 - precision;
-            if (lostBits == 0)
-                return v;
-
-            SInt16 result;
-            result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
-            return result;
-        }
-
-        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
-        {
-            int lostBits = 16 - precision;
-            if (lostBits == 0)
-                return v;
-
-            UInt16 result;
-            result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
-            return result;
-        }
-
-        static UInt16 Min(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i bitFlip = _mm_set1_epi16(-32768);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
-            return result;
-        }
-
-        static SInt16 Min(const SInt16 &a, const SInt16 &b)
-        {
-            SInt16 result;
-            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt15 Min(const UInt15 &a, const UInt15 &b)
-        {
-            UInt15 result;
-            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Min(const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static UInt16 Max(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i bitFlip = _mm_set1_epi16(-32768);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
-            return result;
-        }
-
-        static SInt16 Max(const SInt16 &a, const SInt16 &b)
-        {
-            SInt16 result;
-            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt15 Max(const UInt15 &a, const UInt15 &b)
-        {
-            UInt15 result;
-            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Float Max(const Float &a, const Float &b)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static Float Clamp(const Float &v, float min, float max)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
-            return result;
-        }
-
-        static Float Reciprocal(const Float &v)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
-            return result;
-        }
-
-        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
-        {
-            int16_t values[8];
-            for (int i = 0; i < 8; i++)
-                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
-
-            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
-        }
-
-        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
-        {
-            int16_t values[8];
-            for (int i = 0; i < 8; i++)
-                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
-
-            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
-        }
-
-        static Float MakeFloat(float v)
-        {
-            Float f;
-            f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);
-            return f;
-        }
-
-        static Float MakeFloatZero()
-        {
-            Float f;
-            f.m_values[0] = f.m_values[1] = _mm_setzero_ps();
-            return f;
-        }
-
-        static UInt16 MakeUInt16(uint16_t v)
-        {
-            UInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static SInt16 MakeSInt16(int16_t v)
-        {
-            SInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static AInt16 MakeAInt16(int16_t v)
-        {
-            AInt16 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static UInt15 MakeUInt15(uint16_t v)
-        {
-            UInt15 result;
-            result.m_value = _mm_set1_epi16(static_cast<short>(v));
-            return result;
-        }
-
-        static SInt32 MakeSInt32(int32_t v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_set1_epi32(v);
-            result.m_values[1] = _mm_set1_epi32(v);
-            return result;
-        }
-
-        static UInt31 MakeUInt31(uint32_t v)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_set1_epi32(v);
-            result.m_values[1] = _mm_set1_epi32(v);
-            return result;
-        }
-
-        static uint16_t Extract(const UInt16 &v, int offset)
-        {
-            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
-        }
-
-        static int16_t Extract(const SInt16 &v, int offset)
-        {
-            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
-        }
-
-        static uint16_t Extract(const UInt15 &v, int offset)
-        {
-            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
-        }
-
-        static int16_t Extract(const AInt16 &v, int offset)
-        {
-            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
-        }
-
-        static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
-        {
-            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
-        }
-
-        static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
-        {
-            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
-        }
-
-        static void PutSInt16(SInt16 &dest, int offset, int16_t v)
-        {
-            reinterpret_cast<int16_t*>(&dest)[offset] = v;
-        }
-
-        static float ExtractFloat(const Float& v, int offset)
-        {
-            return reinterpret_cast<const float*>(&v)[offset];
-        }
-
-        static void PutFloat(Float &dest, int offset, float v)
-        {
-            reinterpret_cast<float*>(&dest)[offset] = v;
-        }
-
-        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static FloatCompFlag Less(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        template<int TSubtype>
-        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static FloatCompFlag Equal(const Float &a, const Float &b)
-        {
-            FloatCompFlag result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
-            return result;
-        }
-
-        static Float ToFloat(const UInt16 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
-            return result;
-        }
-
-        static UInt31 ToUInt31(const UInt16 &v)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
-            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
-            return result;
-        }
-
-        static SInt32 ToInt32(const UInt16 &v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
-            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
-            return result;
-        }
-
-        static SInt32 ToInt32(const SInt16 &v)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
-            result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
-            return result;
-        }
-
-        static Float ToFloat(const SInt16 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
-            return result;
-        }
-
-        static Float ToFloat(const UInt15 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
-            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
-            return result;
-        }
-
-        static Float ToFloat(const UInt31 &v)
-        {
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);
-            result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);
-            return result;
-        }
-
-        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
-        {
-            __m128i lo = _mm_castps_si128(v.m_values[0]);
-            __m128i hi = _mm_castps_si128(v.m_values[1]);
-
-            Int16CompFlag result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
-        {
-            __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
-            __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
-
-            FloatCompFlag result;
-            result.m_values[0] = _mm_castsi128_ps(lo);
-            result.m_values[1] = _mm_castsi128_ps(hi);
-            return result;
-        }
-
-        static Int16CompFlag MakeBoolInt16(bool b)
-        {
-            Int16CompFlag result;
-            if (b)
-                result.m_value = _mm_set1_epi16(-1);
-            else
-                result.m_value = _mm_setzero_si128();
-            return result;
-        }
-
-        static FloatCompFlag MakeBoolFloat(bool b)
-        {
-            FloatCompFlag result;
-            if (b)
-                result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
-            else
-                result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
-            return result;
-        }
-
-        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
-        {
-            Int16CompFlag result;
-            result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
-            return result;
-        }
-
-        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
-            __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            UInt16 result;
-            result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));
-            return result;
-        }
-
-        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
-            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            UInt15 result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
-        {
-            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
-            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
-
-            __m128i packed = _mm_packs_epi32(lo, hi);
-
-            SInt16 result;
-            result.m_value = _mm_packs_epi32(lo, hi);
-            return result;
-        }
-
-        static Float Sqrt(const Float &f)
-        {
-            Float result;
-            for (int i = 0; i < 2; i++)
-                result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
-            return result;
-        }
-
-        static UInt16 Abs(const SInt16 &a)
-        {
-            __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
-            __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
-
-            UInt16 result;
-            result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
-            return result;
-        }
-
-        static Float Abs(const Float& a)
-        {
-            __m128 invMask = _mm_set1_ps(-0.0f);
-
-            Float result;
-            result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
-            result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
-            return result;
-        }
-
-        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
-        {
-            __m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
-
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(diff, diff);
-            return result;
-        }
-
-        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
-        {
-            __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
-
-            __m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
-            __m128i mulLo = _mm_mullo_epi16(diffU, diffU);
-            __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
-            __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
-
-            Float result;
-            result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);
-            result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
-
-            return result;
-        }
-
-        static Float TwosCLHalfToFloat(const SInt16 &v)
-        {
-            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
-
-            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
-            __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
-            __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
-
-            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
-
-            // Convert exponent to high-bits 
-            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
-
-            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
-
-            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
-            __m128i lowBits = _mm_slli_epi16(mantissa, 13);
-
-            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
-            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
-
-            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
-            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
-
-            Float result;
-            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
-            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
-
-            return result;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-            Float fb = TwosCLHalfToFloat(b);
-
-            Float diff = fa - fb;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a) * aWeight;
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static UInt16 RightShift(const UInt16 &v, int bits)
-        {
-            UInt16 result;
-            result.m_value = _mm_srli_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static UInt31 RightShift(const UInt31 &v, int bits)
-        {
-            UInt31 result;
-            result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);
-            result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);
-            return result;
-        }
-
-        static SInt16 RightShift(const SInt16 &v, int bits)
-        {
-            SInt16 result;
-            result.m_value = _mm_srai_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static UInt15 RightShift(const UInt15 &v, int bits)
-        {
-            UInt15 result;
-            result.m_value = _mm_srli_epi16(v.m_value, bits);
-            return result;
-        }
-
-        static SInt32 RightShift(const SInt32 &v, int bits)
-        {
-            SInt32 result;
-            result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);
-            result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);
-            return result;
-        }
-
-        static SInt16 ToSInt16(const SInt32 &v)
-        {
-            SInt16 result;
-            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
-            return result;
-        }
-
-        static UInt16 ToUInt16(const UInt32 &v)
-        {
-            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
-            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
-
-            UInt16 result;
-            result.m_value = _mm_packs_epi32(low, high);
-            return result;
-        }
-
-        static UInt16 ToUInt16(const UInt31 &v)
-        {
-            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
-            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
-
-            UInt16 result;
-            result.m_value = _mm_packs_epi32(low, high);
-            return result;
-        }
-
-        static UInt15 ToUInt15(const UInt31 &v)
-        {
-            UInt15 result;
-            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
-            return result;
-        }
-
-        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
-        {
-            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            SInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
-        {
-            return XMultiply(b, a);
-        }
-
-        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt32 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
-        {
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
-        {
-            UInt16 result;
-            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
-        {
-            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
-            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
-
-            UInt31 result;
-            result.m_values[0] = _mm_unpacklo_epi16(low, high);
-            result.m_values[1] = _mm_unpackhi_epi16(low, high);
-            return result;
-        }
-
-        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
-        {
-            return XMultiply(b, a);
-        }
-
-        static bool AnySet(const Int16CompFlag &v)
-        {
-            return _mm_movemask_epi8(v.m_value) != 0;
-        }
-
-        static bool AllSet(const Int16CompFlag &v)
-        {
-            return _mm_movemask_epi8(v.m_value) == 0xffff;
-        }
-
-        static bool AnySet(const FloatCompFlag &v)
-        {
-            return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
-        }
-
-        static bool AllSet(const FloatCompFlag &v)
-        {
-            return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
-        }
-    };
-
-#else
-    // Scalar version
-    struct ParallelMath
-    {
-        struct RoundTowardZeroForScope
-        {
-        };
-
-        struct RoundTowardNearestForScope
-        {
-        };
-
-        struct RoundUpForScope
-        {
-        };
-
-        struct RoundDownForScope
-        {
-        };
-
-        static const int ParallelSize = 1;
-
-        enum Int16Subtype
-        {
-            IntSubtype_Signed,
-            IntSubtype_UnsignedFull,
-            IntSubtype_UnsignedTruncated,
-            IntSubtype_Abstract,
-        };
-
-        typedef int32_t SInt16;
-        typedef int32_t UInt15;
-        typedef int32_t UInt16;
-        typedef int32_t AInt16;
-
-        typedef int32_t SInt32;
-        typedef int32_t UInt31;
-        typedef int32_t UInt32;
-        typedef int32_t AInt32;
-
-        typedef int32_t ScalarUInt16;
-        typedef int32_t ScalarSInt16;
-
-        typedef float Float;
-
-        template<class TTargetType>
-        struct LosslessCast
-        {
-            static const int32_t& Cast(const int32_t &src)
-            {
-                return src;
-            }
-        };
-
-        typedef bool Int16CompFlag;
-        typedef bool FloatCompFlag;
-
-        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
-        {
-            return a + b;
-        }
-
-        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
-        {
-            return a - b;
-        }
-
-        static float Select(bool flag, float a, float b)
-        {
-            return flag ? a : b;
-        }
-
-        static int32_t Select(bool flag, int32_t a, int32_t b)
-        {
-            return flag ? a : b;
-        }
-
-        static int32_t SelectOrZero(bool flag, int32_t a)
-        {
-            return flag ? a : 0;
-        }
-
-        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
-        {
-            if (flag)
-                dest = src;
-        }
-
-        static int32_t ConditionalNegate(bool flag, int32_t v)
-        {
-            return (flag) ? -v : v;
-        }
-
-        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
-        {
-            if (!flag)
-                dest = src;
-        }
-
-        static void ConditionalSet(float& dest, bool flag, float src)
-        {
-            if (flag)
-                dest = src;
-        }
-
-        static void NotConditionalSet(float& dest, bool flag, float src)
-        {
-            if (!flag)
-                dest = src;
-        }
-
-        static void MakeSafeDenominator(float& v)
-        {
-            if (v == 0.0f)
-                v = 1.0f;
-        }
-
-        static int32_t SignedRightShift(int32_t v, int bits)
-        {
-            return v >> bits;
-        }
-
-        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
-        {
-            v = (v << (32 - precision)) & 0xffffffff;
-            return SignedRightShift(v, 32 - precision);
-        }
-
-        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
-        {
-            return v & ((1 << precision) - 1);
-        }
-
-        static int32_t Min(int32_t a, int32_t b)
-        {
-            if (a < b)
-                return a;
-            return b;
-        }
-
-        static float Min(float a, float b)
-        {
-            if (a < b)
-                return a;
-            return b;
-        }
-
-        static int32_t Max(int32_t a, int32_t b)
-        {
-            if (a > b)
-                return a;
-            return b;
-        }
-
-        static float Max(float a, float b)
-        {
-            if (a > b)
-                return a;
-            return b;
-        }
-
-        static float Abs(float a)
-        {
-            return fabsf(a);
-        }
-
-        static int32_t Abs(int32_t a)
-        {
-            if (a < 0)
-                return -a;
-            return a;
-        }
-
-        static float Clamp(float v, float min, float max)
-        {
-            if (v < min)
-                return min;
-            if (v > max)
-                return max;
-            return v;
-        }
-
-        static float Reciprocal(float v)
-        {
-            return 1.0f / v;
-        }
-
-        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
-        {
-            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
-        }
-
-        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
-        {
-            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
-        }
-
-        static float MakeFloat(float v)
-        {
-            return v;
-        }
-
-        static float MakeFloatZero()
-        {
-            return 0.0f;
-        }
-
-        static int32_t MakeUInt16(uint16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeSInt16(int16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeAInt16(int16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeUInt15(uint16_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeSInt32(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t MakeUInt31(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t Extract(int32_t v, int offset)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            return v;
-        }
-
-        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static float ExtractFloat(float v, int offset)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            return v;
-        }
-
-        static void PutFloat(float &dest, int offset, float v)
-        {
-            UNREFERENCED_PARAMETER(offset);
-            dest = v;
-        }
-
-        static bool Less(int32_t a, int32_t b)
-        {
-            return a < b;
-        }
-
-        static bool Less(float a, float b)
-        {
-            return a < b;
-        }
-
-        static bool LessOrEqual(int32_t a, int32_t b)
-        {
-            return a < b;
-        }
-
-        static bool LessOrEqual(float a, float b)
-        {
-            return a < b;
-        }
-
-        static bool Equal(int32_t a, int32_t b)
-        {
-            return a == b;
-        }
-
-        static bool Equal(float a, float b)
-        {
-            return a == b;
-        }
-
-        static float ToFloat(int32_t v)
-        {
-            return static_cast<float>(v);
-        }
-
-        static int32_t ToUInt31(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToInt32(int32_t v)
-        {
-            return v;
-        }
-
-        static bool FloatFlagToInt16(bool v)
-        {
-            return v;
-        }
-
-        static bool Int16FlagToFloat(bool v)
-        {
-            return v;
-        }
-
-        static bool MakeBoolInt16(bool b)
-        {
-            return b;
-        }
-
-        static bool MakeBoolFloat(bool b)
-        {
-            return b;
-        }
-
-        static bool AndNot(bool a, bool b)
-        {
-            return a && !b;
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
-        {
-            UNREFERENCED_PARAMETER(rtz);
-            return static_cast<int>(v);
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
-        {
-            UNREFERENCED_PARAMETER(ru);
-            return static_cast<int>(ceilf(v));
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
-        {
-            UNREFERENCED_PARAMETER(rd);
-            return static_cast<int>(floorf(v));
-        }
-
-        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
-        {
-            UNREFERENCED_PARAMETER(rtn);
-            return static_cast<int>(floorf(v + 0.5f));
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        template<class TRoundMode>
-        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
-        {
-            return RoundAndConvertToInt(v, roundingMode);
-        }
-
-        static float Sqrt(float f)
-        {
-            return sqrtf(f);
-        }
-
-        static int32_t SqDiffUInt8(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static int32_t SqDiffInt16(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static int32_t SqDiffSInt16(int32_t a, int32_t b)
-        {
-            int32_t delta = a - b;
-            return delta * delta;
-        }
-
-        static float TwosCLHalfToFloat(int32_t v)
-        {
-            int32_t absV = (v < 0) ? -v : v;
-
-            int32_t signBits = (absV & -32768);
-            int32_t mantissa = (absV & 0x03ff);
-            int32_t exponent = (absV & 0x7c00);
-
-            bool isDenormal = (exponent == 0);
-
-            // Convert exponent to high-bits
-            exponent = (exponent >> 3) + 14336;
-
-            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
-
-            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
-
-            float f, correction;
-            memcpy(&f, &fBits, 4);
-            memcpy(&correction, &denormalCorrection, 4);
-
-            return f - correction;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
-        {
-            Float fa = TwosCLHalfToFloat(a);
-            Float fb = TwosCLHalfToFloat(b);
-
-            Float diff = fa - fb;
-            return diff * diff;
-        }
-
-        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
-        {
-            Float fa = TwosCLHalfToFloat(a) * aWeight;
-
-            Float diff = fa - b;
-            return diff * diff;
-        }
-
-        static int32_t RightShift(int32_t v, int bits)
-        {
-            return SignedRightShift(v, bits);
-        }
-
-        static int32_t ToSInt16(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToUInt16(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t ToUInt15(int32_t v)
-        {
-            return v;
-        }
-
-        static int32_t XMultiply(int32_t a, int32_t b)
-        {
-            return a * b;
-        }
-
-        static int32_t CompactMultiply(int32_t a, int32_t b)
-        {
-            return a * b;
-        }
-
-        static bool AnySet(bool v)
-        {
-            return v;
-        }
-
-        static bool AllSet(bool v)
-        {
-            return v;
-        }
-    };
-
-#endif
-
-    namespace Internal
-    {
-        namespace BC7Data
-        {
-            enum AlphaMode
-            {
-                AlphaMode_Combined,
-                AlphaMode_Separate,
-                AlphaMode_None,
-            };
-
-            enum PBitMode
-            {
-                PBitMode_PerEndpoint,
-                PBitMode_PerSubset,
-                PBitMode_None
-            };
-
-            struct BC7ModeInfo
-            {
-                PBitMode m_pBitMode;
-                AlphaMode m_alphaMode;
-                int m_rgbBits;
-                int m_alphaBits;
-                int m_partitionBits;
-                int m_numSubsets;
-                int m_indexBits;
-                int m_alphaIndexBits;
-                bool m_hasIndexSelector;
-            };
-
-            BC7ModeInfo g_modes[] =
-            {
-                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
-                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
-                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
-                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)
-
-                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
-                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
-                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
-                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
-            };
-
-			const int g_weight2[] = { 0, 21, 43, 64 };
-			const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
-			const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
-
-			const int *g_weightTables[] =
-			{
-				NULL,
-				NULL,
-				g_weight2,
-				g_weight3,
-				g_weight4
-			};
-
-            struct BC6HModeInfo
-            {
-                uint16_t m_modeID;
-                bool m_partitioned;
-                bool m_transformed;
-                int m_aPrec;
-                int m_bPrec[3];
-            };
-
-            // [partitioned][precision]
-            bool g_hdrModesExistForPrecision[2][17] =
-            {
-                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
-                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
-                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
-            };
-
-            BC6HModeInfo g_hdrModes[] =
-            {
-                { 0x00, true,  true,  10,{ 5, 5, 5 } },
-                { 0x01, true,  true,  7,{ 6, 6, 6 } },
-                { 0x02, true,  true,  11,{ 5, 4, 4 } },
-                { 0x06, true,  true,  11,{ 4, 5, 4 } },
-                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
-                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
-                { 0x12, true,  true,  8,{ 6, 5, 5 } },
-                { 0x16, true,  true,  8,{ 5, 6, 5 } },
-                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
-                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
-                { 0x03, false, false, 10,{ 10, 10, 10 } },
-                { 0x07, false, true,  11,{ 9, 9, 9 } },
-                { 0x0b, false, true,  12,{ 8, 8, 8 } },
-                { 0x0f, false, true,  16,{ 4, 4, 4 } },
-            };
-
-            const int g_maxHDRPrecision = 16;
-
-            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);
-
-            static uint16_t g_partitionMap[64] =
-            {
-                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
-                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
-                0xC800, 0xFFEC, 0xFE80, 0xE800,
-                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
-                0xF710, 0x008E, 0x7100, 0x08CE,
-                0x008C, 0x7310, 0x3100, 0x8CCE,
-                0x088C, 0x3110, 0x6666, 0x366C,
-                0x17E8, 0x0FF0, 0x718E, 0x399C,
-                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
-                0x3c3c, 0x55aa, 0x9696, 0xa55a,
-                0x73ce, 0x13c8, 0x324c, 0x3bdc,
-                0x6996, 0xc33c, 0x9966, 0x660,
-                0x272, 0x4e4, 0x4e40, 0x2720,
-                0xc936, 0x936c, 0x39c6, 0x639c,
-                0x9336, 0x9cc6, 0x817e, 0xe718,
-                0xccf0, 0xfcc, 0x7744, 0xee22,
-            };
-
-            static uint32_t g_partitionMap2[64] =
-            {
-                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
-                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
-                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
-                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
-                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
-                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
-                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
-                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
-                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
-                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
-                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
-                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
-                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
-                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
-                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
-                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
-            };
-
-            static int g_fixupIndexes2[64] =
-            {
-                15,15,15,15,
-                15,15,15,15,
-                15,15,15,15,
-                15,15,15,15,
-                15, 2, 8, 2,
-                2, 8, 8,15,
-                2, 8, 2, 2,
-                8, 8, 2, 2,
-
-                15,15, 6, 8,
-                2, 8,15,15,
-                2, 8, 2, 2,
-                2,15,15, 6,
-                6, 2, 6, 8,
-                15,15, 2, 2,
-                15,15,15,15,
-                15, 2, 2,15,
-            };
-
-            static int g_fixupIndexes3[64][2] =
-            {
-                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
-                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
-                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
-                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
-                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
-                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
-                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
-                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
-
-                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
-                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
-                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
-                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
-                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
-                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
-                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
-                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
-            };
-
-            static const unsigned char g_fragments[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
-                0, 1, 2, 3,  // 16, 4
-                0, 1, 4,  // 20, 3
-                0, 1, 2, 4,  // 23, 4
-                2, 3, 7,  // 27, 3
-                1, 2, 3, 7,  // 30, 4
-                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
-                0, 1, 4, 8,  // 42, 4
-                0, 1, 2, 4, 5, 8,  // 46, 6
-                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
-                1, 4, 5, 6, 9,  // 60, 5
-                2, 5, 6, 7, 10,  // 65, 5
-                5, 6, 9, 10,  // 70, 4
-                2, 3, 7, 11,  // 74, 4
-                1, 2, 3, 6, 7, 11,  // 78, 6
-                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
-                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
-                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
-                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
-                0, 4, 8, 12,  // 128, 4
-                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
-                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
-                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
-                3, 6, 7, 8, 9, 12,  // 158, 6
-                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
-                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
-                5, 8, 9, 10, 13,  // 192, 5
-                8, 12, 13,  // 197, 3
-                4, 8, 12, 13,  // 200, 4
-                2, 3, 6, 9, 12, 13,  // 204, 6
-                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
-                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
-                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
-                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
-                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
-                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
-                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
-                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
-                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
-                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
-                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
-                6, 9, 10, 11, 14,  // 317, 5
-                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
-                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
-                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
-                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
-                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
-                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
-                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
-                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
-                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
-                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
-                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
-                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
-                8, 12, 13, 14,  // 418, 4
-                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
-                4, 8, 9, 12, 13, 14,  // 430, 6
-                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
-                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
-                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
-                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
-                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
-                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
-                3, 7, 11, 15,  // 490, 4
-                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
-                0, 4, 5, 10, 11, 15,  // 502, 6
-                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
-                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
-                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
-                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
-                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
-                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
-                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
-                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
-                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
-                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
-                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
-                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
-                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
-                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
-                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
-                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
-                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
-                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
-                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
-                0, 1, 5, 10, 14, 15,  // 669, 6
-                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
-                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
-                11, 14, 15,  // 691, 3
-                7, 11, 14, 15,  // 694, 4
-                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
-                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
-                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
-                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
-                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
-                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
-                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
-                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
-                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
-                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
-                11, 13, 14, 15,  // 792, 4
-                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
-                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
-                7, 10, 11, 13, 14, 15,  // 814, 6
-                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
-                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
-                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
-                12, 13, 14, 15,  // 848, 4
-                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
-                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
-                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
-                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
-                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
-                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
-                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
-                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
-                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
-                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
-                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
-                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
-                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
-                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
-                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
-                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
-                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
-                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
-                0, 2,  // 1040, 2
-                1, 3,  // 1042, 2
-                0, 1, 4, 5,  // 1044, 4
-                0, 1, 2, 4, 5,  // 1048, 5
-                2, 3, 6,  // 1053, 3
-                0, 2, 4, 6,  // 1056, 4
-                1, 2, 5, 6,  // 1060, 4
-                0, 1, 2, 3, 5, 6,  // 1064, 6
-                0, 1, 2, 4, 5, 6,  // 1070, 6
-                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
-                0, 3, 4, 7,  // 1083, 4
-                0, 1, 2, 3, 4, 7,  // 1087, 6
-                1, 3, 5, 7,  // 1093, 4
-                2, 3, 6, 7,  // 1097, 4
-                1, 2, 3, 6, 7,  // 1101, 5
-                1, 2, 3, 5, 6, 7,  // 1106, 6
-                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
-                4, 5, 6, 7,  // 1119, 4
-                0, 8,  // 1123, 2
-                0, 1, 4, 5, 8,  // 1125, 5
-                0, 1, 8, 9,  // 1130, 4
-                4, 5, 8, 9,  // 1134, 4
-                0, 1, 4, 5, 8, 9,  // 1138, 6
-                2, 6, 8, 9,  // 1144, 4
-                6, 7, 8, 9,  // 1148, 4
-                0, 2, 4, 6, 8, 10,  // 1152, 6
-                1, 2, 5, 6, 9, 10,  // 1158, 6
-                0, 3, 4, 7, 9, 10,  // 1164, 6
-                0, 1, 2, 8, 9, 10,  // 1170, 6
-                4, 5, 6, 8, 9, 10,  // 1176, 6
-                3, 11,  // 1182, 2
-                2, 3, 6, 7, 11,  // 1184, 5
-                0, 3, 8, 11,  // 1189, 4
-                0, 3, 4, 7, 8, 11,  // 1193, 6
-                1, 3, 5, 7, 9, 11,  // 1199, 6
-                2, 3, 10, 11,  // 1205, 4
-                1, 5, 10, 11,  // 1209, 4
-                4, 5, 10, 11,  // 1213, 4
-                6, 7, 10, 11,  // 1217, 4
-                2, 3, 6, 7, 10, 11,  // 1221, 6
-                1, 2, 3, 9, 10, 11,  // 1227, 6
-                5, 6, 7, 9, 10, 11,  // 1233, 6
-                8, 9, 10, 11,  // 1239, 4
-                4, 12,  // 1243, 2
-                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
-                8, 9, 12,  // 1253, 3
-                0, 4, 5, 8, 9, 12,  // 1256, 6
-                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
-                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
-                1, 5, 9, 13,  // 1276, 4
-                6, 7, 9, 13,  // 1280, 4
-                1, 4, 7, 10, 13,  // 1284, 5
-                1, 6, 8, 11, 13,  // 1289, 5
-                0, 1, 12, 13,  // 1294, 4
-                4, 5, 12, 13,  // 1298, 4
-                0, 1, 6, 7, 12, 13,  // 1302, 6
-                0, 1, 4, 8, 12, 13,  // 1308, 6
-                8, 9, 12, 13,  // 1314, 4
-                4, 8, 9, 12, 13,  // 1318, 5
-                4, 5, 8, 9, 12, 13,  // 1323, 6
-                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
-                0, 1, 6, 10, 12, 13,  // 1336, 6
-                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
-                0, 1, 10, 11, 12, 13,  // 1349, 6
-                2, 4, 7, 9, 14,  // 1355, 5
-                4, 5, 10, 14,  // 1360, 4
-                2, 6, 10, 14,  // 1364, 4
-                2, 5, 8, 11, 14,  // 1368, 5
-                0, 2, 12, 14,  // 1373, 4
-                8, 10, 12, 14,  // 1377, 4
-                4, 6, 8, 10, 12, 14,  // 1381, 6
-                13, 14,  // 1387, 2
-                9, 10, 13, 14,  // 1389, 4
-                5, 6, 9, 10, 13, 14,  // 1393, 6
-                0, 1, 2, 12, 13, 14,  // 1399, 6
-                4, 5, 6, 12, 13, 14,  // 1405, 6
-                8, 9, 12, 13, 14,  // 1411, 5
-                8, 9, 10, 12, 13, 14,  // 1416, 6
-                7, 15,  // 1422, 2
-                0, 5, 10, 15,  // 1424, 4
-                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
-                10, 11, 15,  // 1436, 3
-                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
-                3, 6, 7, 10, 11, 15,  // 1446, 6
-                12, 15,  // 1452, 2
-                0, 3, 12, 15,  // 1454, 4
-                4, 7, 12, 15,  // 1458, 4
-                0, 3, 6, 9, 12, 15,  // 1462, 6
-                0, 3, 5, 10, 12, 15,  // 1468, 6
-                8, 11, 12, 15,  // 1474, 4
-                5, 6, 8, 11, 12, 15,  // 1478, 6
-                4, 7, 8, 11, 12, 15,  // 1484, 6
-                1, 3, 13, 15,  // 1490, 4
-                9, 11, 13, 15,  // 1494, 4
-                5, 7, 9, 11, 13, 15,  // 1498, 6
-                2, 3, 14, 15,  // 1504, 4
-                2, 3, 4, 5, 14, 15,  // 1508, 6
-                6, 7, 14, 15,  // 1514, 4
-                2, 3, 5, 9, 14, 15,  // 1518, 6
-                2, 3, 8, 9, 14, 15,  // 1524, 6
-                10, 14, 15,  // 1530, 3
-                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
-                2, 3, 7, 11, 14, 15,  // 1540, 6
-                10, 11, 14, 15,  // 1546, 4
-                7, 10, 11, 14, 15,  // 1550, 5
-                6, 7, 10, 11, 14, 15,  // 1555, 6
-                1, 2, 3, 13, 14, 15,  // 1561, 6
-                5, 6, 7, 13, 14, 15,  // 1567, 6
-                10, 11, 13, 14, 15,  // 1573, 5
-                9, 10, 11, 13, 14, 15,  // 1578, 6
-                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
-                9, 10, 12, 13, 14, 15,  // 1592, 6
-                8, 11, 12, 13, 14, 15,  // 1598, 6
-                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
-            };
-            static const int g_shapeRanges[][2] =
-            {
-                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
-                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
-                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
-                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
-                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
-                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
-                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
-                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
-                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
-                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
-                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
-                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
-                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
-                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
-                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
-                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
-                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
-                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
-                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
-                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
-                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
-                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
-                { 1604, 8 },
-            };
-            static const int g_shapes1[][2] =
-            {
-                { 0, 16 }
-            };
-            static const int g_shapes2[64][2] =
-            {
-                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
-                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
-                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
-                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
-                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
-                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
-                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
-                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
-            };
-            static const int g_shapes3[64][3] =
-            {
-                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
-                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
-                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
-                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
-                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
-                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
-                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
-                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
-            };
-
-            static const int g_shapeList1[] =
-            {
-                0,
-            };
-
-            static const int g_shapeList1Collapse[] =
-            {
-                0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-            static const int g_shapeList2[] =
-            {
-                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
-                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
-                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
-                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
-                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
-                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
-                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
-                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
-                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
-                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
-                122, 123, 124, 125, 126, 127, 128,
-            };
-            static const int g_shapeList2Collapse[] =
-            {
-                -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
-                10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-                21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
-                43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
-                54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
-                65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
-                76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
-                87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
-                98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
-                109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
-                120, 121, 122, 123, 124, 125, 126, 127, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-
-            static const int g_shapeList12[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128,
-            };
-
-            static const int g_shapeList12Collapse[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1,
-            };
-
-            static const int g_shapeList3[] =
-            {
-                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
-                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
-                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
-                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
-                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
-                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
-                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
-                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
-                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
-                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
-                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
-                235, 236, 237, 238, 239, 240, 241, 242,
-            };
-
-            static const int g_shapeList3Collapse[] =
-            {
-                -1, 0, 1, -1, 2, -1, 3, -1, 4, -1, -1,
-                -1, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1,
-                -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, -1,
-                11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 12, -1, -1, 13,
-                -1, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1,
-                16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, 17, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, 18, -1, -1, -1, -1, 19, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 20, -1, -1, 21,
-                22, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, 24, -1, -1, -1, -1, 25, 26, 27, 28,
-                29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
-                40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
-                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
-                62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
-                73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
-                84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
-                95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
-                106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
-                117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-                128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
-                139,
-            };
-
-            static const int g_shapeList3Short[] =
-            {
-                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
-                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
-                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
-                233, 237, 240,
-            };
-
-            static const int g_shapeList3ShortCollapse[] =
-            {
-                -1, 0, 1, -1, 2, -1, 3, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1,
-                -1, -1, -1, -1, 8, -1, -1, -1, -1, -1, -1,
-                9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, 10, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 11, -1, -1, -1,
-                12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, 14,
-                15, -1, -1, -1, 16, -1, -1, -1, -1, -1, 17,
-                18, -1, -1, 19, -1, 20, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, 21, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, 23,
-                -1, 24, 25, -1, -1, -1, -1, -1, -1, -1, 26,
-                27, -1, -1, -1, -1, -1, -1, -1, 28, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, 29, -1, -1, -1,
-                -1, -1, 30, 31, -1, -1, -1, -1, -1, -1, -1,
-                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                -1, 32, 33, -1, -1, -1, 34, -1, -1, 35, -1,
-                -1,
-            };
-
-            static const int g_shapeListAll[] =
-            {
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
-                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
-                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
-                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
-                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
-                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
-                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
-                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
-                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
-                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
-                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
-                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
-                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
-                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
-                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
-                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
-                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
-                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
-                242,
-            };
-
-            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
-            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
-            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
-            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
-            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
-            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
-            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);
-
-            static const int g_maxFragmentsPerMode = (g_numShapes2 > g_numShapes3) ? g_numShapes2 : g_numShapes3;
-        }
-
-        namespace BC6HData
-        {
-            enum EField
-            {
-                NA, // N/A
-                M,  // Mode
-                D,  // Shape
-                RW,
-                RX,
-                RY,
-                RZ,
-                GW,
-                GX,
-                GY,
-                GZ,
-                BW,
-                BX,
-                BY,
-                BZ,
-            };
-
-            struct ModeDescriptor
-            {
-                EField m_eField;
-                uint8_t   m_uBit;
-            };
-
-            const ModeDescriptor g_modeDescriptors[14][82] =
-            {
-                {   // Mode 1 (0x00) - 10 5 5 5
-                    { M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 2 (0x01) - 7 6 6 6
-                    { M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 3 (0x02) - 11 5 4 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 4 (0x06) - 11 4 5 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 5 (0x0a) - 11 4 4 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
-                    { BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 6 (0x0e) - 9 5 5 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 7 (0x12) - 8 6 5 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 8 (0x16) - 8 5 6 5
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 9 (0x1a) - 8 5 5 6
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 10 (0x1e) - 6 6 6 6
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
-                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
-                    { D, 3 },{ D, 4 },
-                },
-
-                {   // Mode 11 (0x03) - 10 10
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 12 (0x07) - 11 9
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 13 (0x0b) - 12 8
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
-                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
-                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
-                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-
-                {   // Mode 14 (0x0f) - 16 4
-                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
-                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
-                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
-                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 },
-                    { RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 },
-                    { GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 },
-                    { BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
-                    { NA, 0 },{ NA, 0 },
-                },
-            };
-        }
-
-        struct PackingVector
-        {
-            uint32_t m_vector[4];
-            int m_offset;
-
-            void Init()
-            {
-                for (int i = 0; i < 4; i++)
-                    m_vector[i] = 0;
-
-                m_offset = 0;
-            }
-
-            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
-            {
-                int vOffset = m_offset >> 5;
-                int bitOffset = m_offset & 0x1f;
-
-                m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);
-
-                int overflowBits = bitOffset + bits - 32;
-                if (overflowBits > 0)
-                    m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
-
-                m_offset += bits;
-            }
-
-            inline void Flush(uint8_t* output)
-            {
-                assert(m_offset == 128);
-
-                for (int v = 0; v < 4; v++)
-                {
-                    uint32_t chunk = m_vector[v];
-                    for (int b = 0; b < 4; b++)
-                        output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
-                }
-            }
-        };
-
-
-		struct UnpackingVector
-		{
-			uint32_t m_vector[4];
-
-			void Init(const uint8_t *bytes)
-			{
-				for (int i = 0; i < 4; i++)
-					m_vector[i] = 0;
-
-				for (int b = 0; b < 16; b++)
-					m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
-			}
-
-			inline ParallelMath::ScalarUInt16 Unpack(int bits)
-			{
-				uint32_t bitMask = (1 << bits) - 1;
-
-				ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
-
-				for (int i = 0; i < 4; i++)
-				{
-					m_vector[i] >>= bits;
-					if (i != 3)
-						m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
-				}
-
-				return result;
-			}
-		};
-
-        void ComputeTweakFactors(int tweak, int range, float *outFactors)
-        {
-            int totalUnits = range - 1;
-            int minOutsideUnits = ((tweak >> 1) & 1);
-            int maxOutsideUnits = (tweak & 1);
-            int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits;
-
-            outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits);
-            outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f;
-        }
-
-        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
-        {
-            if (isSigned)
-            {
-                ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
-                return (v * 32.0f + offset) / 31.0f;
-            }
-            else
-                return (v * 64.0f + 30.0f) / 31.0f;
-        }
-
-        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
-        {
-#ifdef CVTT_ENABLE_ASSERTS
-            for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                assert(ParallelMath::Extract(v, i) != -32768)
-#endif
-
-            ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
-            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
-
-            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
-            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
-            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
-            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
-
-            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
-        }
-
-        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
-        {
-            return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
-        }
-
-        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
-        {
-            for (int epi = 0; epi < 2; epi++)
-            {
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    if (isSigned)
-                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
-                    else
-                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
-                }
-            }
-        }
-
-        template<int TVectorSize>
-        class UnfinishedEndpoints
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            UnfinishedEndpoints()
-            {
-            }
-
-            UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_base[ch] = base[ch];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_offset[ch] = offset[ch];
-            }
-
-            UnfinishedEndpoints(const UnfinishedEndpoints& other)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_base[ch] = other.m_base[ch];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_offset[ch] = other.m_offset[ch];
-            }
-
-            void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MUInt15 channelEPs[2];
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f);
-                        channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode);
-                    }
-
-                    outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]);
-                    outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]);
-                }
-            }
-
-            void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
-            {
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MSInt16 channelEPs[2];
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f);
-                        channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode);
-                    }
-
-                    outEP0[ch] = channelEPs[0];
-                    outEP1[ch] = channelEPs[1];
-                }
-            }
-
-            void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
-            {
-                ParallelMath::RoundTowardNearestForScope roundingMode;
-
-                float tweakFactors[2];
-                ComputeTweakFactors(tweak, range, tweakFactors);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f);
-                    MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f);
-                    outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode);
-                    outEP1[ch] = ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode);
-                }
-            }
-
-            template<int TNewVectorSize>
-            UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
-            {
-                MFloat newBase[TNewVectorSize];
-                MFloat newOffset[TNewVectorSize];
-
-                for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++)
-                {
-                    newBase[ch] = m_base[ch];
-                    newOffset[ch] = m_offset[ch];
-                }
-
-                MFloat fillerV = ParallelMath::MakeFloat(filler);
-
-                for (int ch = TVectorSize; ch < TNewVectorSize; ch++)
-                {
-                    newBase[ch] = fillerV;
-                    newOffset[ch] = ParallelMath::MakeFloatZero();
-                }
-
-                return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset);
-            }
-
-        private:
-            MFloat m_base[TVectorSize];
-            MFloat m_offset[TVectorSize];
-        };
-
-        template<int TMatrixSize>
-        class PackedCovarianceMatrix
-        {
-        public:
-            // 0: xx,
-            // 1: xy, yy
-            // 3: xz, yz, zz 
-            // 6: xw, yw, zw, ww
-            // ... etc.
-            static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;
-
-            typedef ParallelMath::Float MFloat;
-
-            PackedCovarianceMatrix()
-            {
-                for (int i = 0; i < PyramidSize; i++)
-                    m_values[i] = ParallelMath::MakeFloatZero();
-            }
-
-            void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight)
-            {
-                int index = 0;
-                for (int row = 0; row < TMatrixSize; row++)
-                {
-                    for (int col = 0; col <= row; col++)
-                    {
-                        m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
-                        index++;
-                    }
-                }
-            }
-
-            void Product(MFloat *outVec, const MFloat *inVec)
-            {
-                for (int row = 0; row < TMatrixSize; row++)
-                {
-                    MFloat sum = ParallelMath::MakeFloatZero();
-
-                    int index = (row * (row + 1)) >> 1;
-                    for (int col = 0; col < TMatrixSize; col++)
-                    {
-                        sum = sum + inVec[col] * m_values[index];
-                        if (col >= row)
-                            index += col + 1;
-                        else
-                            index++;
-                    }
-
-                    outVec[row] = sum;
-                }
-            }
-
-        private:
-            ParallelMath::Float m_values[PyramidSize];
-        };
-
-        static const int NumEndpointSelectorPasses = 3;
-
-        template<int TVectorSize, int TIterationCount>
-        class EndpointSelector
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-
-            EndpointSelector()
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_centroid[ch] = ParallelMath::MakeFloatZero();
-                    m_direction[ch] = ParallelMath::MakeFloatZero();
-                }
-                m_weightTotal = ParallelMath::MakeFloatZero();
-                m_minDist = ParallelMath::MakeFloat(FLT_MAX);
-                m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);
-            }
-
-            void ContributePass(const MFloat *value, int pass, const MFloat &weight)
-            {
-                if (pass == 0)
-                    ContributeCentroid(value, weight);
-                else if (pass == 1)
-                    ContributeDirection(value, weight);
-                else if (pass == 2)
-                    ContributeMinMax(value);
-            }
-
-            void FinishPass(int pass)
-            {
-                if (pass == 0)
-                    FinishCentroid();
-                else if (pass == 1)
-                    FinishDirection();
-            }
-
-            UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
-            {
-                MFloat unweightedBase[TVectorSize];
-                MFloat unweightedOffset[TVectorSize];
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
-                    MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
-
-                    float safeWeight = channelWeights[ch];
-                    if (safeWeight == 0.f)
-                        safeWeight = 1.0f;
-
-                    unweightedBase[ch] = min / channelWeights[ch];
-                    unweightedOffset[ch] = (max - min) / channelWeights[ch];
-                }
-
-                return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
-            }
-
-        private:
-            void ContributeCentroid(const MFloat *value, const MFloat &weight)
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_centroid[ch] = m_centroid[ch] + value[ch] * weight;
-                m_weightTotal = m_weightTotal + weight;
-            }
-
-            void FinishCentroid()
-            {
-                MFloat denom = m_weightTotal;
-                ParallelMath::MakeSafeDenominator(denom);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_centroid[ch] = m_centroid[ch] / denom;
-            }
-
-            void ContributeDirection(const MFloat *value, const MFloat &weight)
-            {
-                MFloat diff[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    diff[ch] = value[ch] - m_centroid[ch];
-
-                m_covarianceMatrix.Add(diff, weight);
-            }
-
-            void FinishDirection()
-            {
-                MFloat approx[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    approx[ch] = ParallelMath::MakeFloat(1.0f);
-
-                for (int i = 0; i < TIterationCount; i++)
-                {
-                    MFloat product[TVectorSize];
-                    m_covarianceMatrix.Product(product, approx);
-
-                    MFloat largestComponent = product[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        largestComponent = ParallelMath::Max(largestComponent, product[ch]);
-
-                    // product = largestComponent*newApprox
-                    ParallelMath::MakeSafeDenominator(largestComponent);
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        approx[ch] = product[ch] / largestComponent;
-                }
-
-                // Normalize
-                MFloat approxLen = ParallelMath::MakeFloatZero();
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    approxLen = approxLen + approx[ch] * approx[ch];
-
-                approxLen = ParallelMath::Sqrt(approxLen);
-
-                ParallelMath::MakeSafeDenominator(approxLen);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_direction[ch] = approx[ch] / approxLen;
-            }
-
-            void ContributeMinMax(const MFloat *value)
-            {
-                MFloat dist = ParallelMath::MakeFloatZero();
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]);
-
-                m_minDist = ParallelMath::Min(m_minDist, dist);
-                m_maxDist = ParallelMath::Max(m_maxDist, dist);
-            }
-
-            ParallelMath::Float m_centroid[TVectorSize];
-            ParallelMath::Float m_direction[TVectorSize];
-            PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix;
-            ParallelMath::Float m_weightTotal;
-
-            ParallelMath::Float m_minDist;
-            ParallelMath::Float m_maxDist;
-        };
-
-        static const ParallelMath::UInt16 g_weightReciprocals[] =
-        {
-            ParallelMath::MakeUInt16(0),        // -1 
-            ParallelMath::MakeUInt16(0),        // 0
-            ParallelMath::MakeUInt16(32768),    // 1
-            ParallelMath::MakeUInt16(16384),    // 2
-            ParallelMath::MakeUInt16(10923),    // 3
-            ParallelMath::MakeUInt16(8192),     // 4
-            ParallelMath::MakeUInt16(6554),     // 5
-            ParallelMath::MakeUInt16(5461),     // 6
-            ParallelMath::MakeUInt16(4681),     // 7
-            ParallelMath::MakeUInt16(4096),     // 8
-            ParallelMath::MakeUInt16(3641),     // 9
-            ParallelMath::MakeUInt16(3277),     // 10
-            ParallelMath::MakeUInt16(2979),     // 11
-            ParallelMath::MakeUInt16(2731),     // 12
-            ParallelMath::MakeUInt16(2521),     // 13
-            ParallelMath::MakeUInt16(2341),     // 14
-            ParallelMath::MakeUInt16(2185),     // 15
-        };
-
-        template<int TVectorSize>
-        class IndexSelector
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::UInt31 MUInt31;
-
-            template<class TInterpolationEPType, class TColorEPType>
-            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
-            {
-                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
-                // We need to select indexes using the color-space endpoints.
-
-                m_isUniform = true;
-                for (int ch = 1; ch < TVectorSize; ch++)
-                {
-                    if (channelWeights[ch] != channelWeights[0])
-                        m_isUniform = false;
-                }
-
-                // To work with channel weights, we need something where:
-                // pxDiff = px - ep[0]
-                // epDiff = ep[1] - ep[0]
-                //
-                // weightedEPDiff = epDiff * channelWeights
-                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
-                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
-                // index = normalizedIndex * maxValue
-                //
-                // Equivalent to:
-                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
-                // index = dot(axis, pxDiff)
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
-
-                m_range = range;
-                m_maxValue = static_cast<float>(range - 1);
-
-                MFloat epDiffWeighted[TVectorSize];
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
-                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
-                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
-                }
-
-                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
-
-                ParallelMath::MakeSafeDenominator(lenSquared);
-
-                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
-            }
-
-            template<bool TSigned>
-            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
-            {
-                MAInt16 converted[2][TVectorSize];
-                for (int epi = 0; epi < 2; epi++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]);
-
-                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
-            }
-
-            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
-                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
-                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
-                }
-            }
-
-            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
-                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
-                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
-                }
-            }
-
-            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
-            {
-                ReconstructLDR_BC7(index, pixel, TVectorSize);
-            }
-
-            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
-            {
-                ReconstructLDRPrecise(index, pixel, TVectorSize);
-            }
-
-            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
-            {
-                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
-
-                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
-            }
-
-        protected:
-            MAInt16 m_endPoint[2][TVectorSize];
-
-        private:
-            MFloat m_origin[TVectorSize];
-            MFloat m_axis[TVectorSize];
-            int m_range;
-            float m_maxValue;
-            bool m_isUniform;
-        };
-
-
-        template<int TVectorSize>
-        class IndexSelectorHDR : public IndexSelector<TVectorSize>
-        {
-        public:
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt31 MUInt31;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::Float MFloat;
-
-        private:
-
-            MUInt15 InvertSingle(const MUInt15& anIndex) const
-            {
-                MUInt15 inverted = m_maxValueMinusOne - anIndex;
-                return ParallelMath::Select(m_isInverted, inverted, anIndex);
-            }
-
-            void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);
-                    MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
-
-                    MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
-
-                    pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6);
-
-                    pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));
-                }
-            }
-
-            void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const
-            {
-                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);
-                    MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
-
-                    MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
-
-                    pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6);
-
-                    pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));
-                }
-            }
-
-            MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const
-            {
-                MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];
-                return diff * diff;
-            }
-
-            MFloat ErrorForInterpolator(int index, const MFloat *pixel) const
-            {
-                MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
-                for (int ch = 1; ch < TVectorSize; ch++)
-                    error = error + ErrorForInterpolatorComponent(index, ch, pixel);
-                return error;
-            }
-
-        public:
-
-            void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights)
-            {
-                assert(range <= 16);
-
-                m_range = range;
-
-                m_isInverted = ParallelMath::MakeBoolInt16(false);
-                m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1));
-
-                if (!fastIndexing)
-                {
-                    for (int i = 0; i < range; i++)
-                    {
-                        MSInt16 recon2CL[TVectorSize];
-
-                        if (isSigned)
-                            ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
-                        else
-                            ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
-
-                        for (int ch = 0; ch < TVectorSize; ch++)
-                            m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch];
-                    }
-                }
-            }
-
-            void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const
-            {
-                ReconstructHDRSignedUninverted(InvertSingle(index), pixel);
-            }
-
-            void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const
-            {
-                ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);
-            }
-
-            void ConditionalInvert(const ParallelMath::Int16CompFlag &invert)
-            {
-                m_isInverted = invert;
-            }
-
-            MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const
-            {
-                MUInt15 index = ParallelMath::MakeUInt15(0);
-
-                MFloat bestError = ErrorForInterpolator(0, pixel);
-                for (int i = 1; i < m_range; i++)
-                {
-                    MFloat error = ErrorForInterpolator(i, pixel);
-                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                    ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
-                    bestError = ParallelMath::Min(bestError, error);
-                }
-
-                return InvertSingle(index);
-            }
-
-            MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
-            {
-                return InvertSingle(this->SelectIndexLDR(pixel, rtn));
-            }
-
-        private:
-            MFloat m_reconstructedInterpolators[16][TVectorSize];
-            ParallelMath::Int16CompFlag m_isInverted;
-            MUInt15 m_maxValueMinusOne;
-            int m_range;
-        };
-
-        // Solve for a, b where v = a*t + b
-        // This allows endpoints to be mapped to where T=0 and T=1
-        // Least squares from totals:
-        // a = (tv - t*v/w)/(tt - t*t/w)
-        // b = (v - a*t)/w
-        template<int TVectorSize>
-        class EndpointRefiner
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            MFloat m_tv[TVectorSize];
-            MFloat m_v[TVectorSize];
-            MFloat m_tt;
-            MFloat m_t;
-            MFloat m_w;
-            int m_wu;
-
-            float m_rcpMaxIndex;
-            float m_channelWeights[TVectorSize];
-            float m_rcpChannelWeights[TVectorSize];
-
-            void Init(int indexRange, const float channelWeights[TVectorSize])
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_tv[ch] = ParallelMath::MakeFloatZero();
-                    m_v[ch] = ParallelMath::MakeFloatZero();
-                }
-                m_tt = ParallelMath::MakeFloatZero();
-                m_t = ParallelMath::MakeFloatZero();
-                m_w = ParallelMath::MakeFloatZero();
-
-                m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1);
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    m_channelWeights[ch] = channelWeights[ch];
-                    m_rcpChannelWeights[ch] = 1.0f;
-                    if (m_channelWeights[ch] != 0.0f)
-                        m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch];
-                }
-
-                m_wu = 0;
-            }
-
-            void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
-            {
-                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    MFloat v = pwFloatPixel[ch] * weight;
-
-                    m_tv[ch] = m_tv[ch] + t * v;
-                    m_v[ch] = m_v[ch] + v;
-                }
-                m_tt = m_tt + weight * t * t;
-                m_t = m_t + weight * t;
-                m_w = m_w + weight;
-            }
-
-            void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
-            {
-                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
-
-                for (int ch = 0; ch < numRealChannels; ch++)
-                {
-                    MFloat v = pwFloatPixel[ch];
-
-                    m_tv[ch] = m_tv[ch] + t * v;
-                    m_v[ch] = m_v[ch] + v;
-                }
-                m_tt = m_tt + t * t;
-                m_t = m_t + t;
-                m_wu++;
-            }
-
-            void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
-            {
-                ContributeUnweightedPW(floatPixel, index, TVectorSize);
-            }
-
-            void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
-            {
-                // a = (tv - t*v/w)/(tt - t*t/w)
-                // b = (v - a*t)/w
-                MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu));
-
-                ParallelMath::MakeSafeDenominator(w);
-                MFloat wRcp = ParallelMath::Reciprocal(w);
-
-                MFloat adenom = (m_tt * w - m_t * m_t) * wRcp;
-
-                ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero());
-                ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f));
-
-                for (int ch = 0; ch < TVectorSize; ch++)
-                {
-                    /*
-                    if (adenom == 0.0)
-                        p1 = p2 = er.v / er.w;
-                    else
-                    {
-                        float4 a = (er.tv - er.t*er.v / er.w) / adenom;
-                        float4 b = (er.v - a * er.t) / er.w;
-                        p1 = b;
-                        p2 = a + b;
-                    }
-                    */
-
-                    MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom;
-                    MFloat b = (m_v[ch] - a * m_t) * wRcp;
-
-                    MFloat p1 = b;
-                    MFloat p2 = a + b;
-
-                    ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp));
-                    ParallelMath::ConditionalSet(p2, adenomZero, p1);
-
-                    // Unweight
-                    float inverseWeight = m_rcpChannelWeights[ch];
-
-                    endPoint[0][ch] = p1 * inverseWeight;
-                    endPoint[1][ch] = p2 * inverseWeight;
-                }
-            }
-
-            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                MFloat floatEndPoint[2][TVectorSize];
-                GetRefinedEndpoints(floatEndPoint);
-
-                for (int epi = 0; epi < 2; epi++)
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode);
-            }
-
-            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode);
-            }
-
-            void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                MFloat floatEndPoint[2][TVectorSize];
-                GetRefinedEndpoints(floatEndPoint);
-
-                for (int epi = 0; epi < 2; epi++)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                    {
-                        MFloat f = floatEndPoint[epi][ch];
-                        if (isSigned)
-                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode));
-                        else
-                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode));
-                    }
-                }
-            }
-        };
-
-        template<int TVectorSize>
-        class AggregatedError
-        {
-        public:
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt31 MUInt31;
-            typedef ParallelMath::Float MFloat;
-
-            AggregatedError()
-            {
-                for (int ch = 0; ch < TVectorSize; ch++)
-                    m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
-            }
-
-            void Add(const MUInt16 &channelErrorUnweighted, int ch)
-            {
-                m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted);
-            }
-
-            MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const
-            {
-                if (flags & cvtt::Flags::Uniform)
-                {
-                    MUInt31 total = m_errorUnweighted[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        total = total + m_errorUnweighted[ch];
-                    return ParallelMath::ToFloat(total);
-                }
-                else
-                {
-                    MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0];
-                    for (int ch = 1; ch < TVectorSize; ch++)
-                        total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
-                    return total;
-                }
-            }
-
-        private:
-            MUInt31 m_errorUnweighted[TVectorSize];
-        };
-
-        class BCCommon
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            static int TweakRoundsForRange(int range)
-            {
-                if (range == 3)
-                    return 3;
-                return 4;
-            }
-
-            template<int TVectorSize>
-            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
-            {
-                for (int ch = 0; ch < numRealChannels; ch++)
-                    aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch);
-            }
-
-            template<int TVectorSize>
-            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
-            {
-                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
-            {
-                AggregatedError<TVectorSize> aggError;
-                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError);
-                return aggError.Finalize(flags, channelWeightsSq);
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
-            {
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & Flags::Uniform)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]);
-                }
-                else
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
-                }
-
-                return error;
-            }
-
-            template<int TVectorSize>
-            static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
-            {
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & Flags::Uniform)
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]);
-                }
-                else
-                {
-                    for (int ch = 0; ch < TVectorSize; ch++)
-                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
-                }
-
-                return error;
-            }
-
-            template<int TChannelCount>
-            static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
-            {
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < TChannelCount; ch++)
-                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
-                }
-            }
-
-            template<int TChannelCount>
-            static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
-            {
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < TChannelCount; ch++)
-                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
-                }
-            }
-        };
-
-        class BC7Computer
-        {
-        public:
-            static const int MaxTweakRounds = 4;
-
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::Float MFloat;
-
-            struct WorkInfo
-            {
-                MUInt15 m_mode;
-                MFloat m_error;
-                MUInt15 m_ep[3][2][4];
-                MUInt15 m_indexes[16];
-                MUInt15 m_indexes2[16];
-
-                union
-                {
-                    MUInt15 m_partition;
-                    struct IndexSelectorAndRotation
-                    {
-                        MUInt15 m_indexSelector;
-                        MUInt15 m_rotation;
-                    } m_isr;
-                } m_u;
-            };
-
-            static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
-            {
-                ParallelMath::RoundTowardNearestForScope roundingMode;
-
-                float tf[2];
-                ComputeTweakFactors(tweak, range, tf);
-
-                MFloat base = ParallelMath::ToFloat(original[0]);
-                MFloat offs = ParallelMath::ToFloat(original[1]) - base;
-
-                result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
-                result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
-            }
-
-            static void Quantize(MUInt15* color, int bits, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                float maxColor = static_cast<float>((1 << bits) - 1);
-
-                for (int i = 0; i < channels; i++)
-                    color[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(ParallelMath::ToFloat(color[i]) * ParallelMath::MakeFloat(1.0f / 255.0f) * maxColor, 0.f, 255.f), roundingMode);
-            }
-
-            static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                uint16_t pShift = static_cast<uint16_t>(1 << (7 - bits));
-                MUInt15 pShiftV = ParallelMath::MakeUInt15(pShift);
-
-                float maxColorF = static_cast<float>(255 - (1 << (7 - bits)));
-
-                float maxQuantized = static_cast<float>((1 << bits) - 1);
-
-                for (int ch = 0; ch < channels; ch++)
-                {
-                    MUInt15 clr = color[ch];
-                    if (p)
-                        clr = ParallelMath::Max(clr, pShiftV) - pShiftV;
-
-                    MFloat rerangedColor = ParallelMath::ToFloat(clr) * maxQuantized / maxColorF;
-
-                    clr = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(rerangedColor, 0.0f, maxQuantized), roundingMode) << 1;
-                    if (p)
-                        clr = clr | ParallelMath::MakeUInt15(1);
-
-                    color[ch] = clr;
-                }
-            }
-
-            static void Unquantize(MUInt15* color, int bits, int channels)
-            {
-                for (int ch = 0; ch < channels; ch++)
-                {
-                    MUInt15 clr = color[ch];
-                    clr = clr << (8 - bits);
-                    color[ch] = clr | ParallelMath::RightShift(clr, bits);
-                }
-            }
-
-            static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 4, p[j], 3, roundingMode);
-                    Unquantize(ep[j], 5, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p, const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 6, p, 3, roundingMode);
-                    Unquantize(ep[j], 7, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints2(MUInt15 ep[2][4], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(ep[j], 5, 3, roundingMode);
-                    Unquantize(ep[j], 5, 3);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 7, p[j], 3, roundingMode);
-                    ep[j][3] = ParallelMath::MakeUInt15(255);
-                }
-            }
-
-            static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(epRGB[j], 5, 3, roundingMode);
-                    Unquantize(epRGB[j], 5, 3);
-
-                    Quantize(epA + j, 6, 1, roundingMode);
-                    Unquantize(epA + j, 6, 1);
-                }
-            }
-
-            static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    Quantize(epRGB[j], 7, 3, roundingMode);
-                    Unquantize(epRGB[j], 7, 3);
-                }
-
-                // Alpha is full precision
-                (void)epA;
-            }
-
-            static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                    QuantizeP(ep[j], 7, p[j], 4, roundingMode);
-            }
-
-            static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
-            {
-                for (int j = 0; j < 2; j++)
-                {
-                    QuantizeP(ep[j], 5, p[j], 4, roundingMode);
-                    Unquantize(ep[j], 6, 4);
-                }
-            }
-
-            struct SinglePlaneTemporaries
-            {
-                UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
-                UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];
-
-                MUInt15 fragmentBestIndexes[BC7Data::g_numFragments];
-                MUInt15 shapeBestEP[BC7Data::g_maxFragmentsPerMode][2][4];
-                MFloat shapeBestError[BC7Data::g_maxFragmentsPerMode];
-            };
-
-            static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
-            { // Evaluates a solid-color encoding of one fragment (both endpoints and one shared index taken from the lookup tables) and keeps it in the lanes where it beats shapeBestError.
-                MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX); // per lane: smallest weighted squared distance between a table color and the average found so far
-                // The float average is rounded to integers because each channel value is used below as an index into table.m_entries.
-                MUInt15 intAverage[4];
-                for (int ch = 0; ch < 4; ch++)
-                    intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
-                // The defaults set below stay in effect for channels >= numRealChannels and for lanes in which no table is accepted.
-                MUInt15 eps[2][4];
-                MUInt15 reconstructed[4];
-                MUInt15 index = ParallelMath::MakeUInt15(0);
-
-                for (int epi = 0; epi < 2; epi++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                        eps[epi][ch] = ParallelMath::MakeUInt15(0);
-                    eps[epi][3] = ParallelMath::MakeUInt15(255);
-                }
-
-                for (int ch = 0; ch < 3; ch++)
-                    reconstructed[ch] = ParallelMath::MakeUInt15(0);
-                reconstructed[3] = ParallelMath::MakeUInt15(255);
-
-                // Depending on the target index and parity bits, there are multiple valid solid colors.
-                // We want to find the one closest to the actual average.
-                MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX); // NOTE(review): never read anywhere in this function
-                for (int t = 0; t < numTables; t++)
-                {
-                    const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
-                    // Lanes that the caller has flagged as invalid for this table's parity-bit setting.
-                    ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
-
-                    MUInt15 candidateReconstructed[4];
-                    MUInt15 candidateEPs[2][4];
-                    // Scalar per-lane, per-channel lookup keyed by the rounded average; only the first numRealChannels channels are filled in.
-                    for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                    {
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                        {
-                            ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
-                            assert(avgValue >= 0 && avgValue <= 255); // NOTE(review): presumably ScalarUInt16 is unsigned, which would make the >= 0 half redundant - confirm
-
-                            const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
-
-                            ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
-                            ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
-                            ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
-                        }
-                    }
-                    // Weighted squared distance from this table's reconstructed color to the unrounded average.
-                    MFloat avgError = ParallelMath::MakeFloatZero();
-                    for (int ch = 0; ch < numRealChannels; ch++)
-                    {
-                        MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
-                        avgError = avgError + delta * delta * channelWeightsSq[ch];
-                    }
-                    // Accept this table only in lanes where it is strictly closer than the best so far.
-                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
-                    better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
-
-                    if (ParallelMath::AnySet(better))
-                    {
-                        ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
-                        // The table's own index becomes the index used for every pixel of the fragment.
-                        MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
-
-                        ParallelMath::ConditionalSet(index, better, candidateIndex);
-
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                            ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
-
-                        for (int epi = 0; epi < 2; epi++)
-                            for (int ch = 0; ch < numRealChannels; ch++)
-                                ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
-                    }
-                }
-                // Measure the selected solid color against every pixel of the fragment.
-                AggregatedError<4> aggError;
-                for (int pxi = 0; pxi < shapeLength; pxi++)
-                {
-                    int px = fragmentStart[pxi]; // fragmentStart[pxi] is the position in pixels[] of the fragment's pxi-th pixel
-
-                    BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
-                }
-                // The caller-supplied staticAlphaError is added on top of the measured error.
-                MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
-                // Write into the caller's best-so-far state only in lanes that strictly improve on it.
-                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
-                if (ParallelMath::AnySet(better))
-                {
-                    shapeBestError = ParallelMath::Min(shapeBestError, error); // the 'better' lanes are the ones where error is the smaller value
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < numRealChannels; ch++)
-                            ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
-                    }
-
-                    for (int pxi = 0; pxi < shapeLength; pxi++)
-                        ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index); // all pixels of the fragment get the same index
-                }
-            }
-
-
-            static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                float channelWeightsSq[4];
-
-                for (int ch = 0; ch < 4; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                SinglePlaneTemporaries temps;
-
-                MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
-                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
-                ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
-                for (int px = 0; px < 16; px++)
-                {
-                    MUInt15 a = pixels[px][3];
-                    maxAlpha = ParallelMath::Max(maxAlpha, a);
-                    minAlpha = ParallelMath::Min(minAlpha, a);
-
-                    isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
-                }
-
-                ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
-                ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
-
-                bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
-
-                // Try RGB modes if any block has a min alpha 251 or higher
-                bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
-
-                // Try mode 7 if any block has alpha.
-                // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
-                // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
-                // situations, and only by at most 1 unit of error per pixel.
-                bool allowMode7 = anyBlockHasAlpha;
-
-                MFloat preWeightedPixels[16][4];
-
-                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
-
-                const int *rgbInitialEPCollapseList = NULL;
-
-                // Get initial RGB endpoints
-                if (allowRGBModes)
-                {
-                    const int *shapeList;
-                    int numShapesToEvaluate;
-
-                    if (flags & Flags::BC7_EnablePartitioning)
-                    {
-                        if (flags & Flags::BC7_Enable3Subsets)
-                        {
-                            shapeList = BC7Data::g_shapeListAll;
-                            rgbInitialEPCollapseList = BC7Data::g_shapeListAll;
-                            numShapesToEvaluate = BC7Data::g_numShapesAll;
-                        }
-                        else
-                        {
-                            shapeList = BC7Data::g_shapeList12;
-                            rgbInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
-                            numShapesToEvaluate = BC7Data::g_numShapes12;
-                        }
-                    }
-                    else
-                    {
-                        shapeList = BC7Data::g_shapeList1;
-                        rgbInitialEPCollapseList = BC7Data::g_shapeList1Collapse;
-                        numShapesToEvaluate = BC7Data::g_numShapes1;
-                    }
-
-                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-
-                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                        int shapeSize = BC7Data::g_shapeRanges[shape][1];
-
-                        EndpointSelector<3, 8> epSelector;
-
-                        for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                        {
-                            for (int spx = 0; spx < shapeSize; spx++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + spx];
-                                epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
-                            }
-                            epSelector.FinishPass(epPass);
-                        }
-                        temps.unfinishedRGB[shapeIter] = epSelector.GetEndpoints(channelWeights);
-                    }
-                }
-
-                const int *rgbaInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
-
-                // Get initial RGBA endpoints
-                {
-                    const int *shapeList = BC7Data::g_shapeList12;
-                    int numShapesToEvaluate = BC7Data::g_numShapes12;
-
-                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-
-                        if (anyBlockHasAlpha || !allowRGBModes)
-                        {
-                            int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                            int shapeSize = BC7Data::g_shapeRanges[shape][1];
-
-                            EndpointSelector<4, 8> epSelector;
-
-                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                            {
-                                for (int spx = 0; spx < shapeSize; spx++)
-                                {
-                                    int px = BC7Data::g_fragments[shapeStart + spx];
-                                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
-                                }
-                                epSelector.FinishPass(epPass);
-                            }
-                            temps.unfinishedRGBA[shapeIter] = epSelector.GetEndpoints(channelWeights);
-                        }
-                        else
-                        {
-                            temps.unfinishedRGBA[shapeIter] = temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].ExpandTo<4>(255);
-                        }
-                    }
-                }
-
-                for (uint16_t mode = 0; mode <= 7; mode++)
-                {
-                    if (!(flags & Flags::BC7_EnablePartitioning) && BC7Data::g_modes[mode].m_numSubsets != 1)
-                        continue;
-
-                    if (!(flags & Flags::BC7_Enable3Subsets) && BC7Data::g_modes[mode].m_numSubsets == 3)
-                        continue;
-
-                    if (mode == 4 || mode == 5)
-                        continue;
-
-                    if (mode < 4 && !allowRGBModes)
-                        continue;
-
-                    if (mode == 7 && !allowMode7)
-                        continue;
-
-                    bool isRGB = (mode < 4);
-
-                    unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
-                    int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
-                    int indexPrec = BC7Data::g_modes[mode].m_indexBits;
-
-                    int parityBitMax = 1;
-                    if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                        parityBitMax = 4;
-                    else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
-                        parityBitMax = 2;
-
-                    int numRealChannels = isRGB ? 3 : 4;
-
-                    int numShapes;
-                    const int *shapeList;
-                    const int *shapeCollapseList;
-
-                    if (numSubsets == 1)
-                    {
-                        numShapes = BC7Data::g_numShapes1;
-                        shapeList = BC7Data::g_shapeList1;
-                        shapeCollapseList = BC7Data::g_shapeList1Collapse;
-                    }
-                    else if (numSubsets == 2)
-                    {
-                        numShapes = BC7Data::g_numShapes2;
-                        shapeList = BC7Data::g_shapeList2;
-                        shapeCollapseList = BC7Data::g_shapeList2Collapse;
-                    }
-                    else
-                    {
-                        assert(numSubsets == 3);
-                        if (numPartitions == 16)
-                        {
-                            numShapes = BC7Data::g_numShapes3Short;
-                            shapeList = BC7Data::g_shapeList3Short;
-                            shapeCollapseList = BC7Data::g_shapeList3ShortCollapse;
-                        }
-                        else
-                        {
-                            assert(numPartitions == 64);
-                            numShapes = BC7Data::g_numShapes3;
-                            shapeList = BC7Data::g_shapeList3;
-                            shapeCollapseList = BC7Data::g_shapeList3Collapse;
-                        }
-                    }
-
-                    for (int slot = 0; slot < BC7Data::g_maxFragmentsPerMode; slot++)
-                        temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
-
-                    for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
-                    {
-                        int shape = shapeList[shapeIter];
-                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                        int shapeLength = BC7Data::g_shapeRanges[shape][1];
-                        int shapeCollapsedEvalIndex = shapeCollapseList[shape];
-
-                        AggregatedError<1> alphaAggError;
-                        if (isRGB && anyBlockHasAlpha)
-                        {
-                            MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
-
-                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + pxi];
-                                MUInt15 original[1] = { pixels[px][3] };
-                                BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
-                            }
-                        }
-
-                        float alphaWeightsSq[1] = { channelWeightsSq[3] };
-                        MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
-
-                        assert(shapeCollapsedEvalIndex >= 0);
-
-                        MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
-
-                        for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                        {
-                            if (isRGB)
-                            {
-                                temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
-                                tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
-                            }
-                            else
-                            {
-                                temps.unfinishedRGBA[rgbaInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
-                            }
-                        }
-
-                        ParallelMath::Int16CompFlag punchThroughInvalid[4];
-                        for (int pIter = 0; pIter < parityBitMax; pIter++)
-                        {
-                            punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
-
-                            if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
-                            {
-                                // Modes 6 and 7 have parity bits that affect alpha
-                                if (pIter == 0)
-                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
-                                else if (pIter == parityBitMax - 1)
-                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
-                                else
-                                    punchThroughInvalid[pIter] = isPunchThrough;
-                            }
-                        }
-
-                        for (int pIter = 0; pIter < parityBitMax; pIter++)
-                        {
-                            if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
-                                continue;
-
-                            bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                uint16_t p[2];
-                                p[0] = (pIter & 1);
-                                p[1] = ((pIter >> 1) & 1);
-
-                                MUInt15 ep[2][4];
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    for (int ch = 0; ch < 4; ch++)
-                                        ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
-
-                                for (int refine = 0; refine < numRefineRounds; refine++)
-                                {
-                                    switch (mode)
-                                    {
-                                    case 0:
-                                        CompressEndpoints0(ep, p, rtn);
-                                        break;
-                                    case 1:
-                                        CompressEndpoints1(ep, p[0], rtn);
-                                        break;
-                                    case 2:
-                                        CompressEndpoints2(ep, rtn);
-                                        break;
-                                    case 3:
-                                        CompressEndpoints3(ep, p, rtn);
-                                        break;
-                                    case 6:
-                                        CompressEndpoints6(ep, p, rtn);
-                                        break;
-                                    case 7:
-                                        CompressEndpoints7(ep, p, rtn);
-                                        break;
-                                    default:
-                                        assert(false);
-                                        break;
-                                    };
-
-                                    MFloat shapeError = ParallelMath::MakeFloatZero();
-
-                                    IndexSelector<4> indexSelector;
-                                    indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
-
-                                    EndpointRefiner<4> epRefiner;
-                                    epRefiner.Init(1 << indexPrec, channelWeights);
-
-                                    MUInt15 indexes[16];
-
-                                    AggregatedError<4> aggError;
-                                    for (int pxi = 0; pxi < shapeLength; pxi++)
-                                    {
-                                        int px = BC7Data::g_fragments[shapeStart + pxi];
-
-                                        MUInt15 index;
-                                        MUInt15 reconstructed[4];
-
-                                        index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
-                                        indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
-
-                                        if (flags & cvtt::Flags::BC7_FastIndexing)
-                                            BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
-                                        else
-                                        {
-                                            MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
-
-                                            MUInt15 altIndexes[2];
-                                            altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
-
-                                            for (int ii = 0; ii < 2; ii++)
-                                            {
-                                                indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
-
-                                                MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
-                                                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
-                                                error = ParallelMath::Min(error, altError);
-                                                ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
-                                            }
-
-                                            shapeError = shapeError + error;
-                                        }
-
-                                        if (refine != numRefineRounds - 1)
-                                            epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
-
-                                        indexes[pxi] = index;
-                                    }
-
-                                    if (flags & cvtt::Flags::BC7_FastIndexing)
-                                        shapeError = aggError.Finalize(flags, channelWeightsSq);
-
-                                    if (isRGB)
-                                        shapeError = shapeError + staticAlphaError;
-
-                                    ParallelMath::FloatCompFlag shapeErrorBetter;
-                                    ParallelMath::Int16CompFlag shapeErrorBetter16;
-
-                                    shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shapeCollapsedEvalIndex]);
-                                    shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
-
-                                    if (ParallelMath::AnySet(shapeErrorBetter16))
-                                    {
-                                        bool punchThroughOK = true;
-                                        if (needPunchThroughCheck)
-                                        {
-                                            shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
-                                            shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
-
-                                            if (!ParallelMath::AnySet(shapeErrorBetter16))
-                                                punchThroughOK = false;
-                                        }
-
-                                        if (punchThroughOK)
-                                        {
-                                            ParallelMath::ConditionalSet(temps.shapeBestError[shapeCollapsedEvalIndex], shapeErrorBetter, shapeError);
-                                            for (int epi = 0; epi < 2; epi++)
-                                                for (int ch = 0; ch < numRealChannels; ch++)
-                                                    ParallelMath::ConditionalSet(temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch], shapeErrorBetter16, ep[epi][ch]);
-
-                                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                                                ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
-                                        }
-                                    }
-
-                                    if (refine != numRefineRounds - 1)
-                                        epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
-                                } // refine
-                            } // tweak
-                        } // p
-
-                        if (flags & cvtt::Flags::BC7_TrySingleColor)
-                        {
-                            MUInt15 total[4];
-                            for (int ch = 0; ch < 4; ch++)
-                                total[ch] = ParallelMath::MakeUInt15(0);
-
-                            for (int pxi = 0; pxi < shapeLength; pxi++)
-                            {
-                                int px = BC7Data::g_fragments[shapeStart + pxi];
-                                for (int ch = 0; ch < 4; ch++)
-                                    total[ch] = total[ch] + pixels[pxi][ch];
-                            }
-
-                            MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
-                            MFloat average[4];
-                            for (int ch = 0; ch < 4; ch++)
-                                average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
-
-                            const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
-                            MFloat &shapeBestError = temps.shapeBestError[shapeCollapsedEvalIndex];
-                            MUInt15(&shapeBestEP)[2][4] = temps.shapeBestEP[shapeCollapsedEvalIndex];
-                            MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
-
-                            const cvtt::Tables::BC7SC::Table **scTables = NULL;
-                            int numSCTables = 0;
-
-                            switch (mode)
-                            {
-                            case 0:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p00_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p01_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p10_i3,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i1,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i2,
-                                        &cvtt::Tables::BC7SC::g_mode0_p11_i3,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 1:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i1,
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i2,
-                                        &cvtt::Tables::BC7SC::g_mode1_p0_i3,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i1,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i2,
-                                        &cvtt::Tables::BC7SC::g_mode1_p1_i3,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 2:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode2,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 3:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode3_p0,
-                                        &cvtt::Tables::BC7SC::g_mode3_p1,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 6:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i1,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i2,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i3,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i4,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i5,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i6,
-                                        &cvtt::Tables::BC7SC::g_mode6_p0_i7,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i1,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i2,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i3,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i4,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i5,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i6,
-                                        &cvtt::Tables::BC7SC::g_mode6_p1_i7,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            case 7:
-                                {
-                                    const cvtt::Tables::BC7SC::Table *tables[] =
-                                    {
-                                        &cvtt::Tables::BC7SC::g_mode7_p00,
-                                        &cvtt::Tables::BC7SC::g_mode7_p01,
-                                        &cvtt::Tables::BC7SC::g_mode7_p10,
-                                        &cvtt::Tables::BC7SC::g_mode7_p11,
-                                    };
-                                    scTables = tables;
-                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
-                                }
-                                break;
-                            default:
-                                assert(false);
-                                break;
-                            }
-
-                            TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
-                        }
-                    } // shapeIter
-
-                    for (uint16_t partition = 0; partition < numPartitions; partition++)
-                    {
-                        const int *partitionShapes;
-                        if (numSubsets == 1)
-                            partitionShapes = BC7Data::g_shapes1[partition];
-                        else if (numSubsets == 2)
-                            partitionShapes = BC7Data::g_shapes2[partition];
-                        else
-                        {
-                            assert(numSubsets == 3);
-                            partitionShapes = BC7Data::g_shapes3[partition];
-                        }
-
-                        MFloat totalError = ParallelMath::MakeFloatZero();
-                        for (int subset = 0; subset < numSubsets; subset++)
-                            totalError = totalError + temps.shapeBestError[shapeCollapseList[partitionShapes[subset]]];
-
-                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
-                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                        if (ParallelMath::AnySet(errorBetter16))
-                        {
-                            for (int subset = 0; subset < numSubsets; subset++)
-                            {
-                                int shape = partitionShapes[subset];
-                                int shapeStart = BC7Data::g_shapeRanges[shape][0];
-                                int shapeLength = BC7Data::g_shapeRanges[shape][1];
-                                int shapeCollapsedEvalIndex = shapeCollapseList[shape];
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    for (int ch = 0; ch < 4; ch++)
-                                        ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch]);
-
-                                for (int pxi = 0; pxi < shapeLength; pxi++)
-                                {
-                                    int px = BC7Data::g_fragments[shapeStart + pxi];
-                                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
-                                }
-                            }
-
-                            work.m_error = ParallelMath::Min(totalError, work.m_error);
-                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
-                            ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
-                        }
-                    }
-                }
-            }
-
-            // Tries the two BC7 modes handled here (4 and 5), in which one channel is encoded with its own
-            // endpoints and index list, and keeps the result in `work` wherever it beats the current best.
-            //
-            // flags           - Option bits. BC7_EnableDualPlane must be set or the routine returns immediately;
-            //                   BC7_FastIndexing selects the cheaper error path. Also forwarded to the error helpers.
-            // pixels          - The 16 block pixels, 4 channels each, as integers.
-            // floatPixels     - The same 16x4 values in float form, as supplied by the caller.
-            // channelWeights  - Per-channel importance weights; squared copies are used when finalizing errors.
-            // numTweakRounds  - Number of endpoint tweak candidates; clamped to [1, MaxTweakRounds].
-            // numRefineRounds - Number of select-indexes/refine-endpoints iterations per tweak; raised to at least 1.
-            // work            - In/out best-so-far result. Updated per lane only where the combined error is lower.
-            // rtn             - Rounding-mode scope object, passed through to the helpers that take it.
-            static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
-                // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
-                // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
-                // - Separate alpha channel, then weighted RGB
-                // - Alpha+2 other channels, then the independent channel
-
-                if (!(flags & Flags::BC7_EnableDualPlane))
-                    return;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                float channelWeightsSq[4];
-                for (int ch = 0; ch < 4; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                for (uint16_t mode = 4; mode <= 5; mode++)
-                {
-                    for (uint16_t rotation = 0; rotation < 4; rotation++)
-                    {
-                        // Rotation 0 keeps channel 3 as the separately encoded ("alpha") channel.
-                        // Rotation r > 0 exchanges channel r-1 with channel 3, so channel r-1 is encoded separately.
-                        int alphaChannel = (rotation + 3) & 3;
-                        int redChannel = (rotation == 1) ? 3 : 0;
-                        int greenChannel = (rotation == 2) ? 3 : 1;
-                        int blueChannel = (rotation == 3) ? 3 : 2;
-
-                        MUInt15 rotatedRGB[16][3];
-                        MFloat floatRotatedRGB[16][3];
-
-                        for (int px = 0; px < 16; px++)
-                        {
-                            rotatedRGB[px][0] = pixels[px][redChannel];
-                            rotatedRGB[px][1] = pixels[px][greenChannel];
-                            rotatedRGB[px][2] = pixels[px][blueChannel];
-
-                            for (int ch = 0; ch < 3; ch++)
-                                floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
-                        }
-
-                        // Only mode 4 has an index selector, so it is tried with both settings; mode 5 with one.
-                        uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
-
-                        float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
-                        float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
-                        float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
-                        float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
-
-                        float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
-
-                        MFloat preWeightedRotatedRGB[16][3];
-                        BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
-
-                        for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
-                        {
-                            EndpointSelector<3, 8> rgbSelector;
-
-                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
-                            {
-                                for (int px = 0; px < 16; px++)
-                                    rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
-
-                                rgbSelector.FinishPass(epPass);
-                            }
-
-                            // The separate channel is scalar, so its starting endpoints are simply its min and max.
-                            MUInt15 alphaRange[2];
-
-                            alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
-                            for (int px = 1; px < 16; px++)
-                            {
-                                alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
-                                alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
-                            }
-
-                            // Index bit counts: mode 4 uses 2 and 3 bits, assigned by the index selector; mode 5 uses 2 and 2.
-                            int rgbPrec = 0;
-                            int alphaPrec = 0;
-
-                            if (mode == 4)
-                            {
-                                rgbPrec = indexSelector ? 3 : 2;
-                                alphaPrec = indexSelector ? 2 : 3;
-                            }
-                            else
-                                rgbPrec = alphaPrec = 2;
-
-                            UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
-
-                            // The RGB plane and the separate plane are optimized independently; the best of each is
-                            // tracked on its own and the two are only combined after all tweak/refine rounds.
-                            MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
-                            MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
-
-                            MUInt15 bestRGBIndexes[16];
-                            MUInt15 bestAlphaIndexes[16];
-                            MUInt15 bestEP[2][4];
-
-                            for (int px = 0; px < 16; px++)
-                                bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                MUInt15 rgbEP[2][3];
-                                MUInt15 alphaEP[2];
-
-                                unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
-
-                                TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
-
-                                for (int refine = 0; refine < numRefineRounds; refine++)
-                                {
-                                    if (mode == 4)
-                                        CompressEndpoints4(rgbEP, alphaEP, rtn);
-                                    else
-                                        CompressEndpoints5(rgbEP, alphaEP, rtn);
-
-                                    IndexSelector<1> alphaIndexSelector;
-                                    IndexSelector<3> rgbIndexSelector;
-
-                                    {
-                                        MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
-                                        alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
-                                    }
-                                    rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
-
-                                    EndpointRefiner<3> rgbRefiner;
-                                    EndpointRefiner<1> alphaRefiner;
-
-                                    rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
-                                    alphaRefiner.Init(1 << alphaPrec, uniformWeight);
-
-                                    MFloat errorRGB = ParallelMath::MakeFloatZero();
-                                    MFloat errorA = ParallelMath::MakeFloatZero();
-
-                                    MUInt15 rgbIndexes[16];
-                                    MUInt15 alphaIndexes[16];
-
-                                    // Block-wide aggregates; only the fast-indexing path feeds and reads them.
-                                    AggregatedError<3> rgbAggError;
-                                    AggregatedError<1> alphaAggError;
-
-                                    for (int px = 0; px < 16; px++)
-                                    {
-                                        MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
-                                        MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
-
-                                        MUInt15 reconstructedRGB[3];
-                                        MUInt15 reconstructedAlpha[1];
-
-                                        rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
-                                        alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
-
-                                        if (flags & Flags::BC7_FastIndexing)
-                                        {
-                                            // Fast path: trust the selected index and fold this pixel into the block-wide aggregates.
-                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
-                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
-                                        }
-                                        else
-                                        {
-                                            // Slow path: also evaluate the neighbouring indexes (clamped to the valid range)
-                                            // and keep whichever gives the lowest finalized error for this pixel.
-                                            AggregatedError<3> baseRGBAggError;
-                                            AggregatedError<1> baseAlphaAggError;
-
-                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
-                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
-
-                                            MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                            MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-
-                                            MUInt15 altRGBIndexes[2];
-                                            MUInt15 altAlphaIndexes[2];
-
-                                            altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
-
-                                            altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
-                                            altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
-
-                                            for (int ii = 0; ii < 2; ii++)
-                                            {
-                                                rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
-                                                alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
-
-                                                AggregatedError<3> altRGBAggError;
-                                                AggregatedError<1> altAlphaAggError;
-
-                                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
-                                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
-
-                                                MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                                MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-
-                                                ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
-                                                ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
-
-                                                rgbError = ParallelMath::Min(altRGBError, rgbError);
-                                                alphaError = ParallelMath::Min(altAlphaError, alphaError);
-
-                                                ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
-                                                ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
-                                            }
-
-                                            errorRGB = errorRGB + rgbError;
-                                            errorA = errorA + alphaError;
-                                        }
-
-                                        // The last refine round has no following round, so nothing needs to be fed to the refiners.
-                                        if (refine != numRefineRounds - 1)
-                                        {
-                                            rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
-                                            alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
-                                        }
-
-                                        if (flags & Flags::BC7_FastIndexing)
-                                        {
-                                            // Each plane's total comes from its own aggregate and its own squared weight(s):
-                                            // the alpha total is finalized from alphaAggError, not from the 3-channel RGB aggregate.
-                                            // These are overwritten on every pixel; the values left after the last pixel
-                                            // are the ones compared against the best errors below.
-                                            errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
-                                            errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
-                                        }
-
-                                        rgbIndexes[px] = rgbIndex;
-                                        alphaIndexes[px] = alphaIndex;
-                                    }
-
-                                    ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
-                                    ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
-
-                                    ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
-                                    ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
-
-                                    if (ParallelMath::AnySet(rgbBetterInt16))
-                                    {
-                                        bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
-
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
-
-                                        for (int ep = 0; ep < 2; ep++)
-                                        {
-                                            for (int ch = 0; ch < 3; ch++)
-                                                ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
-                                        }
-                                    }
-
-                                    if (ParallelMath::AnySet(alphaBetterInt16))
-                                    {
-                                        bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
-
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
-
-                                        for (int ep = 0; ep < 2; ep++)
-                                            ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
-                                    }
-
-                                    // Derive the endpoints for the next refine round from the indexes just chosen.
-                                    if (refine != numRefineRounds - 1)
-                                    {
-                                        rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
-
-                                        MUInt15 alphaEPTemp[2][1];
-                                        alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
-
-                                        for (int i = 0; i < 2; i++)
-                                            alphaEP[i] = alphaEPTemp[i][0];
-                                    }
-                                } // refine
-                            } // tweak
-
-                            MFloat combinedError = bestRGBError + bestAlphaError;
-
-                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
-                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                            work.m_error = ParallelMath::Min(combinedError, work.m_error);
-
-                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
-                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
-                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
-
-                            // With the index selector set, the two index lists trade places in the stored result.
-                            for (int px = 0; px < 16; px++)
-                            {
-                                ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
-                                ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
-                            }
-
-                            for (int ep = 0; ep < 2; ep++)
-                                for (int ch = 0; ch < 4; ch++)
-                                    ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
-                        }
-                    }
-                }
-            }
-
-            // Exchanges the values held by a and b, going through one temporary copy.
-            template<class T>
-            static void Swap(T& a, T& b)
-            {
-                T saved = b;
-                b = a;
-                a = saved;
-            }
-
-            // Encodes ParallelMath::ParallelSize input blocks into BC7, writing 16 bytes per block to packedBlocks.
-            //
-            // flags           - Option bits, forwarded to TrySinglePlane and TryDualPlane.
-            // inputs          - Source pixel blocks; read through ParallelMath::ConvertLDRInputs.
-            // packedBlocks    - Output buffer; advanced by 16 bytes after each block is flushed.
-            // channelWeights  - Per-channel weights, forwarded to the two search routines.
-            // numTweakRounds  - Forwarded to the two search routines.
-            // numRefineRounds - Forwarded to the two search routines.
-            static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds)
-            {
-                MUInt15 pixels[16][4];
-                MFloat floatPixels[16][4];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
-                }
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
-                }
-
-                // Start from an all-zero result with the worst possible error, so the first candidate found wins.
-                WorkInfo work;
-                memset(&work, 0, sizeof(work));
-
-                work.m_error = ParallelMath::MakeFloat(FLT_MAX);
-
-                {
-                    // The rounding-mode object lives only for the duration of the two searches.
-                    ParallelMath::RoundTowardNearestForScope rtn;
-                    TrySinglePlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
-                    TryDualPlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
-                }
-
-                // Serialize each lane of the parallel result as its own 16-byte block.
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    PackingVector pv;
-                    pv.Init();
-
-                    // NOTE(review): m_partition and m_isr are both read from work.m_u regardless of mode; presumably
-                    // m_u is a union and only the member matching the mode is meaningful - confirm against WorkInfo.
-                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
-                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
-                    ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
-
-                    const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
-
-                    ParallelMath::ScalarUInt16 indexes[16];
-                    ParallelMath::ScalarUInt16 indexes2[16];
-                    ParallelMath::ScalarUInt16 endPoints[3][2][4];
-
-                    // indexes2 is only filled (and only used below) for modes with a separate alpha plane.
-                    for (int i = 0; i < 16; i++)
-                    {
-                        indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
-                        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                            indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
-                    }
-
-                    for (int subset = 0; subset < 3; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            for (int ch = 0; ch < 4; ch++)
-                                endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
-                        }
-                    }
-
-                    // Pixel positions whose index is written with one bit fewer (see "Encode indexes" below).
-                    // Entry 0 is always pixel 0; entries 1 and 2 are looked up per partition for multi-subset modes.
-                    int fixups[3] = { 0, 0, 0 };
-
-                    // The index at each fixup position loses its top bit when written, so that bit must be zero.
-                    // Where it is set, every index of that subset/plane is mirrored (highIndex - index) and the
-                    // matching endpoints are exchanged to compensate.
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
-                        bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
-
-                        if (flipRGB)
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                                indexes[px] = highIndex - indexes[px];
-                        }
-
-                        if (flipAlpha)
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                                indexes2[px] = highIndex - indexes2[px];
-                        }
-
-                        // Up to here the flags describe the primary/secondary index lists. With the index selector
-                        // set, TryDualPlane stores the alpha indexes in the primary list, so the flags are exchanged
-                        // before being applied to the RGB and alpha endpoints.
-                        if (indexSelector)
-                            Swap(flipRGB, flipAlpha);
-
-                        if (flipRGB)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
-                        }
-                        if (flipAlpha)
-                            Swap(endPoints[0][0][3], endPoints[0][1][3]);
-
-                    }
-                    else
-                    {
-                        if (modeInfo.m_numSubsets == 2)
-                            fixups[1] = BC7Data::g_fixupIndexes2[partition];
-                        else if (modeInfo.m_numSubsets == 3)
-                        {
-                            fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
-                            fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
-                        }
-
-                        // One flip decision per subset, taken from the index at that subset's fixup position.
-                        bool flip[3] = { false, false, false };
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                            flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
-
-                        if (flip[0] || flip[1] || flip[2])
-                        {
-                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
-                            for (int px = 0; px < 16; px++)
-                            {
-                                // Look up which subset this pixel belongs to: 1 bit per pixel for two subsets,
-                                // 2 bits per pixel for three.
-                                int subset = 0;
-                                if (modeInfo.m_numSubsets == 2)
-                                    subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-                                else if (modeInfo.m_numSubsets == 3)
-                                    subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
-
-                                if (flip[subset])
-                                    indexes[px] = highIndex - indexes[px];
-                            }
-
-                            // Alpha endpoints are exchanged too only when alpha shares the index list with RGB.
-                            int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
-                            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                            {
-                                if (flip[subset])
-                                    for (int ch = 0; ch < maxCH; ch++)
-                                        Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
-                            }
-                        }
-                    }
-
-                    // Mode prefix: the value (1 << mode) written in mode + 1 bits. UnpackOne recovers the mode by
-                    // reading single bits until it sees a 1.
-                    pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
-
-                    if (modeInfo.m_partitionBits)
-                        pv.Pack(partition, modeInfo.m_partitionBits);
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
-                        pv.Pack(rotation, 2);
-                    }
-
-                    if (modeInfo.m_hasIndexSelector)
-                        pv.Pack(indexSelector, 1);
-
-                    // Encode RGB
-                    // Written channel by channel; within a channel, subset by subset, endpoint 0 then 1.
-                    // Only the top m_rgbBits of each 8-bit endpoint value are stored.
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
-                                epPart >>= (8 - modeInfo.m_rgbBits);
-
-                                pv.Pack(epPart, modeInfo.m_rgbBits);
-                            }
-                        }
-                    }
-
-                    // Encode alpha
-                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
-                                epPart >>= (8 - modeInfo.m_alphaBits);
-
-                                pv.Pack(epPart, modeInfo.m_alphaBits);
-                            }
-                        }
-                    }
-
-                    // Encode parity bits
-                    // The parity bit is read from the red endpoint: the bit immediately below the m_rgbBits bits
-                    // stored above. Per-subset modes take it from endpoint 0 only.
-                    if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
-                            epPart >>= (7 - modeInfo.m_rgbBits);
-                            epPart &= 1;
-
-                            pv.Pack(epPart, 1);
-                        }
-                    }
-                    else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                    {
-                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                        {
-                            for (int ep = 0; ep < 2; ep++)
-                            {
-                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
-                                epPart >>= (7 - modeInfo.m_rgbBits);
-                                epPart &= 1;
-
-                                pv.Pack(epPart, 1);
-                            }
-                        }
-                    }
-
-                    // Encode indexes
-                    // Pixel 0 and the per-subset fixup pixels are written with one bit fewer; the flipping above
-                    // guarantees their dropped top bit is zero.
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int bits = modeInfo.m_indexBits;
-                        if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
-                            bits--;
-
-                        pv.Pack(indexes[px], bits);
-                    }
-
-                    // Encode secondary indexes
-                    // The secondary list has a single subset, so only pixel 0 is shortened.
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    {
-                        for (int px = 0; px < 16; px++)
-                        {
-                            int bits = modeInfo.m_alphaIndexBits;
-                            if (px == 0)
-                                bits--;
-
-                            pv.Pack(indexes2[px], bits);
-                        }
-                    }
-
-                    pv.Flush(packedBlocks);
-
-                    packedBlocks += 16;
-                }
-            }
-
-            static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
-            {
-                UnpackingVector pv;
-                pv.Init(packedBlock);
-
-                int mode = 8;
-                for (int i = 0; i < 8; i++)
-                {
-                    if (pv.Unpack(1) == 1)
-                    {
-                        mode = i;
-                        break;
-                    }
-                }
-
-                if (mode > 7)
-                {
-                    for (int px = 0; px < 16; px++)
-                        for (int ch = 0; ch < 4; ch++)
-                            output.m_pixels[px][ch] = 0;
-
-                    return;
-                }
-
-                const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
-
-                int partition = 0;
-                if (modeInfo.m_partitionBits)
-                    partition = pv.Unpack(modeInfo.m_partitionBits);
-
-                int rotation = 0;
-                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                    rotation = pv.Unpack(2);
-
-                int indexSelector = 0;
-                if (modeInfo.m_hasIndexSelector)
-                    indexSelector = pv.Unpack(1);
-
-                // Resolve fixups
-                int fixups[3] = { 0, 0, 0 };
-
-                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
-                {
-                    if (modeInfo.m_numSubsets == 2)
-                        fixups[1] = BC7Data::g_fixupIndexes2[partition];
-                    else if (modeInfo.m_numSubsets == 3)
-                    {
-                        fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
-                        fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
-                    }
-                }
-
-                int endPoints[3][2][4];
-
-                // Decode RGB
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
-                    }
-                }
-
-                // Decode alpha
-                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
-                    }
-                }
-                else
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                            endPoints[subset][ep][3] = 255;
-                    }
-                }
-
-                int parityBits = 0;
-
-                // Decode parity bits
-                if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        int p = pv.Unpack(1);
-
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
-
-                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
-                        }
-                    }
-
-                    parityBits = 1;
-                }
-                else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
-                {
-                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                    {
-                        for (int ep = 0; ep < 2; ep++)
-                        {
-                            int p = pv.Unpack(1);
-
-                            for (int ch = 0; ch < 3; ch++)
-                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
-
-                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
-                        }
-                    }
-
-                    parityBits = 1;
-                }
-
-                // Fill endpoint bits
-                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
-                {
-                    for (int ep = 0; ep < 2; ep++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
-
-                        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                            endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
-                    }
-                }
-
-                int indexes[16];
-                int indexes2[16];
-
-                // Decode indexes
-                for (int px = 0; px < 16; px++)
-                {
-                    int bits = modeInfo.m_indexBits;
-                    if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
-                        bits--;
-
-                    indexes[px] = pv.Unpack(bits);
-                }
-
-                // Decode secondary indexes
-                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int bits = modeInfo.m_alphaIndexBits;
-                        if (px == 0)
-                            bits--;
-
-                        indexes2[px] = pv.Unpack(bits);
-                    }
-                }
-                else
-                {
-                    for (int px = 0; px < 16; px++)
-                        indexes2[px] = 0;
-                }
-
-                const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
-                const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
-
-                // Decode each pixel
-                for (int px = 0; px < 16; px++)
-                {
-                    int rgbWeight = 0;
-                    int alphaWeight = 0;
-
-                    int rgbIndex = indexes[px];
-
-                    rgbWeight = rgbWeights[indexes[px]];
-
-                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
-                        alphaWeight = rgbWeight;
-                    else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
-                        alphaWeight = alphaWeights[indexes2[px]];
-
-                    if (indexSelector == 1)
-                    {
-                        int temp = rgbWeight;
-                        rgbWeight = alphaWeight;
-                        alphaWeight = temp;
-                    }
-
-                    int pixel[4] = { 0, 0, 0, 255 };
-
-                    int subset = 0;
-
-                    if (modeInfo.m_numSubsets == 2)
-                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-                    else if (modeInfo.m_numSubsets == 3)
-                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
-
-                    for (int ch = 0; ch < 3; ch++)
-                        pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
-
-                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
-                        pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
-
-                    if (rotation != 0)
-                    {
-                        int ch = rotation - 1;
-                        int temp = pixel[ch];
-                        pixel[ch] = pixel[3];
-                        pixel[3] = temp;
-                    }
-
-                    for (int ch = 0; ch < 4; ch++)
-                        output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
-                }
-            }
-        };
-
-        class BC6HComputer
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::AInt16 MAInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-            typedef ParallelMath::UInt31 MUInt31;
-
-            static const int MaxTweakRounds = 4;
-            static const int MaxRefineRounds = 3;
-
-            static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
-            {
-                assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
-                assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
-
-                // Expand to full range
-                ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
-                MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
-
-                absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);
-
-                MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);
-
-                return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
-            }
-
-            static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
-            {
-                MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
-                return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
-            }
-
-            static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
-            {
-                MSInt16 zero = ParallelMath::MakeSInt16(0);
-
-                ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
-                MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
-
-                MSInt16 unq;
-                MUInt15 absUnq;
-
-                if (precision >= 16)
-                {
-                    unq = comp;
-                    absUnq = absComp;
-                }
-                else
-                {
-                    MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
-                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
-                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
-
-                    absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
-                    ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
-                    ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
-
-                    unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
-                }
-
-                outUnquantized = unq;
-
-                MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
-
-                outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
-            }
-
-            static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
-            {
-                MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
-                if (precision < 15)
-                {
-                    MUInt15 zero = ParallelMath::MakeUInt15(0);
-                    MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
-
-                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
-                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
-
-                    unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
-
-                    ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
-                    ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
-                }
-
-                outUnquantized = unq;
-                outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
-            }
-
-            static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MSInt16 unquantizedEP[2][3];
-                MSInt16 finishedUnquantizedEP[2][3];
-
-                {
-                    ParallelMath::RoundUpForScope ru;
-
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
-                            UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
-                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
-                        }
-                    }
-                }
-
-                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
-                indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
-
-                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
-
-                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
-
-                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
-
-                if (ParallelMath::AnySet(invert))
-                {
-                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
-
-                    indexSelector.ConditionalInvert(invert);
-
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MAInt16 firstEP = quantizedEndPoints[0][ch];
-                        MAInt16 secondEP = quantizedEndPoints[1][ch];
-
-                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
-                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
-                    }
-                }
-
-                indexes[fixupIndex] = index;
-            }
-
-            static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                MUInt16 unquantizedEP[2][3];
-                MUInt16 finishedUnquantizedEP[2][3];
-
-                {
-                    ParallelMath::RoundUpForScope ru;
-
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
-                            UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
-                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
-                        }
-                    }
-                }
-
-                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
-                indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
-
-                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
-
-                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
-
-                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
-
-                if (ParallelMath::AnySet(invert))
-                {
-                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
-
-                    indexSelector.ConditionalInvert(invert);
-
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MAInt16 firstEP = quantizedEndPoints[0][ch];
-                        MAInt16 secondEP = quantizedEndPoints[1][ch];
-
-                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
-                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
-                    }
-                }
-
-                indexes[fixupIndex] = index;
-            }
-
-            static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
-            {
-                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
-
-                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
-
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    outEncodedEPs[0][0][ch] = ep0[0][ch];
-                    outEncodedEPs[0][1][ch] = ep0[1][ch];
-                    outEncodedEPs[1][0][ch] = ep1[0][ch];
-                    outEncodedEPs[1][1][ch] = ep1[1][ch];
-
-                    if (isTransformed)
-                    {
-                        for (int subset = 0; subset < 2; subset++)
-                        {
-                            for (int epi = 0; epi < 2; epi++)
-                            {
-                                if (epi == 0 && subset == 0)
-                                    continue;
-
-                                MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);
-
-                                MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);
-
-                                outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
-
-                                MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
-                                allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
-                            }
-                        }
-                    }
-
-                    if (!ParallelMath::AnySet(allLegal))
-                        break;
-                }
-
-                outIsLegal = allLegal;
-            }
-
-            static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
-            {
-                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
-
-                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
-
-                for (int ch = 0; ch < 3; ch++)
-                {
-                    outEncodedEPs[0][ch] = ep[0][ch];
-                    outEncodedEPs[1][ch] = ep[1][ch];
-
-                    if (isTransformed)
-                    {
-                        MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);
-
-                        MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);
-
-                        outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
-
-                        MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
-                        allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
-                    }
-                }
-
-                outIsLegal = allLegal;
-            }
-
-            static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
-            {
-                if (numTweakRounds < 1)
-                    numTweakRounds = 1;
-                else if (numTweakRounds > MaxTweakRounds)
-                    numTweakRounds = MaxTweakRounds;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-                else if (numRefineRounds > MaxRefineRounds)
-                    numRefineRounds = MaxRefineRounds;
-
-                bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
-                float channelWeightsSq[3];
-
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                MSInt16 pixels[16][3];
-                MFloat floatPixels2CL[16][3];
-                MFloat floatPixelsLinearWeighted[16][3];
-
-                MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        MSInt16 pixelValue;
-                        ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
-
-                        // Convert from sign+magnitude to 2CL
-                        if (isSigned)
-                        {
-                            ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
-                            MSInt16 magnitude = (pixelValue & low15Bits);
-                            ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
-                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
-                        }
-                        else
-                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
-
-                        pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
-
-                        pixels[px][ch] = pixelValue;
-                        floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
-                        floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
-                    }
-                }
-
-                MFloat preWeightedPixels[16][3];
-
-                BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
-
-                MAInt16 bestEndPoints[2][2][3];
-                MUInt15 bestIndexes[16];
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-                MUInt15 bestMode = ParallelMath::MakeUInt15(0);
-                MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
-
-                for (int px = 0; px < 16; px++)
-                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                for (int subset = 0; subset < 2; subset++)
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
-
-                UnfinishedEndpoints<3> partitionedUFEP[32][2];
-                UnfinishedEndpoints<3> singleUFEP;
-
-                // Generate UFEP for partitions
-                for (int p = 0; p < 32; p++)
-                {
-                    int partitionMask = BC7Data::g_partitionMap[p];
-
-                    EndpointSelector<3, 8> epSelectors[2];
-
-                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                    {
-                        for (int px = 0; px < 16; px++)
-                        {
-                            int subset = (partitionMask >> px) & 1;
-                            epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
-                        }
-
-                        for (int subset = 0; subset < 2; subset++)
-                            epSelectors[subset].FinishPass(pass);
-                    }
-
-                    for (int subset = 0; subset < 2; subset++)
-                        partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
-                }
-
-                // Generate UFEP for single
-                {
-                    EndpointSelector<3, 8> epSelector;
-
-                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                    {
-                        for (int px = 0; px < 16; px++)
-                            epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
-
-                        epSelector.FinishPass(pass);
-                    }
-
-                    singleUFEP = epSelector.GetEndpoints(channelWeights);
-                }
-
-                for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
-                {
-                    bool partitioned = (partitionedInt == 1);
-
-                    for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
-                    {
-                        if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
-                            continue;
-
-                        int numPartitions = partitioned ? 32 : 1;
-                        int numSubsets = partitioned ? 2 : 1;
-                        int indexBits = partitioned ? 3 : 4;
-                        int indexRange = (1 << indexBits);
-
-                        for (int p = 0; p < numPartitions; p++)
-                        {
-                            int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
-
-                            const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
-
-                            MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
-                            MUInt15 metaIndexes[MaxMetaRounds][16];
-                            MFloat metaError[MaxMetaRounds][2];
-
-                            bool roundValid[MaxMetaRounds][2];
-
-                            for (int r = 0; r < MaxMetaRounds; r++)
-                                for (int subset = 0; subset < 2; subset++)
-                                    roundValid[r][subset] = true;
-
-                            for (int subset = 0; subset < numSubsets; subset++)
-                            {
-                                for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
-                                {
-                                    EndpointRefiner<3> refiners[2];
-
-                                    bool abortRemainingRefines = false;
-                                    for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
-                                    {
-                                        int metaRound = tweak * MaxRefineRounds + refinePass;
-
-                                        if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
-                                            abortRemainingRefines = true;
-
-                                        if (abortRemainingRefines)
-                                        {
-                                            roundValid[metaRound][subset] = false;
-                                            continue;
-                                        }
-
-                                        MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
-                                        MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
-
-                                        MSInt16 endPointsColorSpace[2][3];
-
-                                        if (refinePass == 0)
-                                        {
-                                            UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
-
-                                            if (isSigned)
-                                                ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
-                                            else
-                                                ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
-                                        }
-                                        else
-                                            refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
-
-                                        refiners[subset].Init(indexRange, channelWeights);
-
-                                        int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
-
-                                        IndexSelectorHDR<3> indexSelector;
-                                        if (isSigned)
-                                            QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
-                                        else
-                                            QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
-
-                                        if (metaRound > 0)
-                                        {
-                                            ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
-
-                                            for (int prevRound = 0; prevRound < metaRound; prevRound++)
-                                            {
-                                                MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
-
-                                                ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
-
-                                                for (int epi = 0; epi < 2; epi++)
-                                                    for (int ch = 0; ch < 3; ch++)
-                                                        same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
-
-                                                anySame = (anySame | same);
-                                                if (ParallelMath::AllSet(anySame))
-                                                    break;
-                                            }
-
-                                            if (ParallelMath::AllSet(anySame))
-                                            {
-                                                roundValid[metaRound][subset] = false;
-                                                continue;
-                                            }
-                                        }
-
-                                        MFloat subsetError = ParallelMath::MakeFloatZero();
-
-                                        {
-                                            for (int px = 0; px < 16; px++)
-                                            {
-                                                if (subset != ((partitionMask >> px) & 1))
-                                                    continue;
-
-                                                MUInt15 index;
-                                                if (px == fixupIndex)
-                                                    index = mrIndexes[px];
-                                                else
-                                                {
-                                                    index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
-                                                    mrIndexes[px] = index;
-                                                }
-
-                                                MSInt16 reconstructed[3];
-                                                if (isSigned)
-                                                    indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
-                                                else
-                                                    indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
-
-                                                subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
-
-                                                if (refinePass != numRefineRounds - 1)
-                                                    refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
-                                            }
-                                        }
-
-                                        metaError[metaRound][subset] = subsetError;
-                                    }
-                                }
-                            }
-
-                            // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
-                            int numMeta1 = partitioned ? MaxMetaRounds : 1;
-                            for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
-                            {
-                                if (!roundValid[meta0][0])
-                                    continue;
-
-                                for (int meta1 = 0; meta1 < numMeta1; meta1++)
-                                {
-                                    MFloat combinedError = metaError[meta0][0];
-                                    if (partitioned)
-                                    {
-                                        if (!roundValid[meta1][1])
-                                            continue;
-
-                                        combinedError = combinedError + metaError[meta1][1];
-                                    }
-
-                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
-                                    if (!ParallelMath::AnySet(errorBetter))
-                                        continue;
-
-                                    ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                                    // Figure out if this is encodable
-                                    for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
-                                    {
-                                        const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
-
-                                        if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
-                                            continue;
-
-                                        MAInt16 encodedEPs[2][2][3];
-                                        ParallelMath::Int16CompFlag isLegal;
-                                        if (partitioned)
-                                            EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
-                                        else
-                                            EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
-
-                                        ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
-                                        if (!ParallelMath::AnySet(isLegalAndBetter))
-                                            continue;
-
-                                        ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
-
-                                        ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
-                                        ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
-                                        ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
-
-                                        for (int subset = 0; subset < numSubsets; subset++)
-                                        {
-                                            for (int epi = 0; epi < 2; epi++)
-                                            {
-                                                for (int ch = 0; ch < 3; ch++)
-                                                    ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
-                                            }
-                                        }
-
-                                        for (int px = 0; px < 16; px++)
-                                        {
-                                            int subset = ((partitionMask >> px) & 1);
-                                            if (subset == 0)
-                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
-                                            else
-                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
-                                        }
-
-                                        needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
-                                        if (!ParallelMath::AnySet(needsCommit))
-                                            break;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-
-                // At this point, everything should be set
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
-                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
-                    int32_t eps[2][2][3];
-                    ParallelMath::ScalarUInt16 indexes[16];
-
-                    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
-
-                    const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
-
-                    const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
-
-                    for (int subset = 0; subset < 2; subset++)
-                    {
-                        for (int epi = 0; epi < 2; epi++)
-                        {
-                            for (int ch = 0; ch < 3; ch++)
-                                eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
-                        }
-                    }
-
-                    for (int px = 0; px < 16; px++)
-                        indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
-
-                    uint16_t modeID = modeInfo.m_modeID;
-
-                    PackingVector pv;
-                    pv.Init();
-
-                    for (size_t i = 0; i < headerBits; i++)
-                    {
-                        int32_t codedValue = 0;
-                        switch (desc[i].m_eField)
-                        {
-                        case BC6HData::M:  codedValue = modeID; break;
-                        case BC6HData::D:  codedValue = partition; break;
-                        case BC6HData::RW: codedValue = eps[0][0][0]; break;
-                        case BC6HData::RX: codedValue = eps[0][1][0]; break;
-                        case BC6HData::RY: codedValue = eps[1][0][0]; break;
-                        case BC6HData::RZ: codedValue = eps[1][1][0]; break;
-                        case BC6HData::GW: codedValue = eps[0][0][1]; break;
-                        case BC6HData::GX: codedValue = eps[0][1][1]; break;
-                        case BC6HData::GY: codedValue = eps[1][0][1]; break;
-                        case BC6HData::GZ: codedValue = eps[1][1][1]; break;
-                        case BC6HData::BW: codedValue = eps[0][0][2]; break;
-                        case BC6HData::BX: codedValue = eps[0][1][2]; break;
-                        case BC6HData::BY: codedValue = eps[1][0][2]; break;
-                        case BC6HData::BZ: codedValue = eps[1][1][2]; break;
-                        default: assert(false); break;
-                        }
-
-                        pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1);
-                    }
-
-                    int fixupIndex1 = 0;
-                    int indexBits = 4;
-                    if (modeInfo.m_partitioned)
-                    {
-                        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
-                        indexBits = 3;
-                    }
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
-                        if (px == 0 || px == fixupIndex1)
-                            pv.Pack(index, indexBits - 1);
-                        else
-                            pv.Pack(index, indexBits);
-                    }
-
-                    pv.Flush(packedBlocks + 16 * block);
-                }
-            }
-
-            static void SignExtendSingle(int &v, int bits)
-            {
-                if (v & (1 << (bits - 1)))
-                    v |= -(1 << bits);
-            }
-
-            static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
-            {
-                UnpackingVector pv;
-                pv.Init(pBC);
-
-                int numModeBits = 2;
-                int modeBits = pv.Unpack(2);
-                if (modeBits != 0 && modeBits != 1)
-                {
-                    modeBits |= pv.Unpack(3) << 2;
-                    numModeBits += 3;
-                }
-
-                int mode = -1;
-                for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
-                {
-                    if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
-                    {
-                        mode = possibleMode;
-                        break;
-                    }
-                }
-
-                if (mode < 0)
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            output.m_pixels[px][ch] = 0;
-                        output.m_pixels[px][3] = 0x3c00;	// 1.0
-                    }
-                    return;
-                }
-
-                const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
-                const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
-                const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
-
-                int32_t partition = 0;
-                int32_t eps[2][2][3];
-
-                for (int subset = 0; subset < 2; subset++)
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            eps[subset][epi][ch] = 0;
-
-                for (size_t i = numModeBits; i < headerBits; i++)
-                {
-                    int32_t *pCodedValue = NULL;
-
-                    switch (desc[i].m_eField)
-                    {
-                    case BC6HData::D:  pCodedValue = &partition; break;
-                    case BC6HData::RW: pCodedValue = &eps[0][0][0]; break;
-                    case BC6HData::RX: pCodedValue = &eps[0][1][0]; break;
-                    case BC6HData::RY: pCodedValue = &eps[1][0][0]; break;
-                    case BC6HData::RZ: pCodedValue = &eps[1][1][0]; break;
-                    case BC6HData::GW: pCodedValue = &eps[0][0][1]; break;
-                    case BC6HData::GX: pCodedValue = &eps[0][1][1]; break;
-                    case BC6HData::GY: pCodedValue = &eps[1][0][1]; break;
-                    case BC6HData::GZ: pCodedValue = &eps[1][1][1]; break;
-                    case BC6HData::BW: pCodedValue = &eps[0][0][2]; break;
-                    case BC6HData::BX: pCodedValue = &eps[0][1][2]; break;
-                    case BC6HData::BY: pCodedValue = &eps[1][0][2]; break;
-                    case BC6HData::BZ: pCodedValue = &eps[1][1][2]; break;
-                    default: assert(false); break;
-                    }
-
-                    (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit;
-                }
-
-
-                uint16_t modeID = modeInfo.m_modeID;
-
-                int fixupIndex1 = 0;
-                int indexBits = 4;
-                int numSubsets = 1;
-                if (modeInfo.m_partitioned)
-                {
-                    fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
-                    indexBits = 3;
-                    numSubsets = 2;
-                }
-
-                int indexes[16];
-                for (int px = 0; px < 16; px++)
-                {
-                    if (px == 0 || px == fixupIndex1)
-                        indexes[px] = pv.Unpack(indexBits - 1);
-                    else
-                        indexes[px] = pv.Unpack(indexBits);
-                }
-
-                if (modeInfo.m_partitioned)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        if (isSigned)
-                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
-                        if (modeInfo.m_transformed || isSigned)
-                        {
-                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
-                            SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
-                            SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
-                        }
-                    }
-                }
-                else
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        if (isSigned)
-                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
-                        if (modeInfo.m_transformed || isSigned)
-                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
-                    }
-                }
-
-                int aPrec = modeInfo.m_aPrec;
-
-                if (modeInfo.m_transformed)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        int wrapMask = (1 << aPrec) - 1;
-
-                        eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
-                        if (isSigned)
-                            SignExtendSingle(eps[0][1][ch], aPrec);
-
-                        if (modeInfo.m_partitioned)
-                        {
-                            eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
-                            eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
-
-                            if (isSigned)
-                            {
-                                SignExtendSingle(eps[1][0][ch], aPrec);
-                                SignExtendSingle(eps[1][1][ch], aPrec);
-                            }
-                        }
-                    }
-                }
-
-                // Unquantize endpoints
-                for (int subset = 0; subset < numSubsets; subset++)
-                {
-                    for (int epi = 0; epi < 2; epi++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                        {
-                            int &v = eps[subset][epi][ch];
-
-                            if (isSigned)
-                            {
-                                if (aPrec >= 16)
-                                {
-                                    // Nothing
-                                }
-                                else
-                                {
-                                    bool s = false;
-                                    int comp = v;
-                                    if (v < 0)
-                                    {
-                                        s = true;
-                                        comp = -comp;
-                                    }
-
-                                    int unq = 0;
-                                    if (comp == 0)
-                                        unq = 0;
-                                    else if (comp >= ((1 << (aPrec - 1)) - 1))
-                                        unq = 0x7fff;
-                                    else
-                                        unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
-
-                                    if (s)
-                                        unq = -unq;
-
-                                    v = unq;
-                                }
-                            }
-                            else
-                            {
-                                if (aPrec >= 15)
-                                {
-                                    // Nothing
-                                }
-                                else if (v == 0)
-                                {
-                                    // Nothing
-                                }
-                                else if (v == ((1 << aPrec) - 1))
-                                    v = 0xffff;
-                                else
-                                    v = ((v << 16) + 0x8000) >> aPrec;
-                            }
-                        }
-                    }
-                }
-
-                const int *weights = BC7Data::g_weightTables[indexBits];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    int subset = 0;
-                    if (modeInfo.m_partitioned)
-                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
-
-                    int w = weights[indexes[px]];
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
-
-                        if (isSigned)
-                        {
-                            if (comp < 0)
-                                comp = -(((-comp) * 31) >> 5);
-                            else
-                                comp = (comp * 31) >> 5;
-
-                            int s = 0;
-                            if (comp < 0)
-                            {
-                                s = 0x8000;
-                                comp = -comp;
-                            }
-
-                            output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
-                        }
-                        else
-                        {
-                            comp = (comp * 31) >> 6;
-                            output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
-                        }
-                    }
-                    output.m_pixels[px][3] = 0x3c00;	// 1.0
-                }
-            }
-        };
-
-        namespace S3TCSingleColorTables
-        {
-            struct SingleColorTableEntry
-            {
-                uint8_t m_min;
-                uint8_t m_max;
-                uint8_t m_actualColor;
-                uint8_t m_span;
-            };
-
-            SingleColorTableEntry g_singleColor5_3[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
-                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
-                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
-                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
-                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
-                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
-                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
-                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
-                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
-                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
-                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
-                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
-                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
-                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
-                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
-                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
-                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
-                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
-                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
-                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_3[256] =
-            {
-                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 69, 0, 23, 69 },
-                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 65, 8, 27, 57 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 69, 12, 31, 57 },
-                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 65, 20, 35, 45 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 69, 24, 39, 45 },
-                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
-                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
-                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
-                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
-                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
-                { 56, 93, 80, 37 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 60, 97, 84, 37 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
-                { 56, 105, 88, 49 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 60, 109, 92, 49 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
-                { 134, 77, 96, 57 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 130, 85, 100, 45 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
-                { 134, 89, 104, 45 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
-                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
-                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
-                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
-                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
-                { 142, 146, 144, 4 }, { 121, 158, 145, 37 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 125, 162, 149, 37 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
-                { 150, 154, 152, 4 }, { 121, 170, 153, 49 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 125, 174, 157, 49 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
-                { 158, 162, 160, 4 }, { 199, 142, 161, 57 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 195, 150, 165, 45 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
-                { 166, 170, 168, 4 }, { 199, 154, 169, 45 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
-                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
-                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
-                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
-                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
-                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 186, 223, 210, 37 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 190, 227, 214, 37 }, { 215, 215, 215, 0 },
-                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 186, 235, 218, 49 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 190, 239, 222, 49 }, { 223, 223, 223, 0 },
-                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 186, 247, 226, 61 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 190, 251, 230, 61 }, { 231, 231, 231, 0 },
-                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_2[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
-                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
-                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
-                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
-                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
-                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
-                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
-                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
-                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
-                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
-                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
-                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
-                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
-                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
-                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
-                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
-                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
-                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
-                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
-                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_2[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
-                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
-                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
-                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
-                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
-                { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
-                { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
-                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
-                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
-                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
-                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
-                { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
-                { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
-                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
-                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
-                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
-                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
-                { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
-                { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
-                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_3_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
-                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
-                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
-                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
-                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
-                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
-                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
-                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
-                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
-                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
-                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
-                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
-                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
-                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
-                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
-                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
-                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
-                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
-                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
-                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_3_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
-                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
-                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
-                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
-                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
-                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
-                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
-                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
-                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
-                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
-                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
-                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
-                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
-                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
-                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
-                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
-                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
-                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
-                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
-                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor5_2_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
-                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
-                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
-                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
-                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
-                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
-                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
-                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
-                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
-                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
-                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
-                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
-                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
-                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
-                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
-                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
-                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
-                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
-                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
-                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
-                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
-                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
-                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
-                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
-                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
-                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-
-            SingleColorTableEntry g_singleColor6_2_p[256] =
-            {
-                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
-                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
-                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
-                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
-                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
-                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
-                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
-                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
-                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
-                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
-                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
-                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
-                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
-                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
-                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
-                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
-                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
-                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
-                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
-                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
-                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
-                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
-                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
-                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
-                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
-                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
-                { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
-                { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
-                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
-                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
-                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
-                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
-            };
-        }
-
-        class S3TCComputer
-        {
-        public:
-            typedef ParallelMath::Float MFloat;
-            typedef ParallelMath::SInt16 MSInt16;
-            typedef ParallelMath::UInt15 MUInt15;
-            typedef ParallelMath::UInt16 MUInt16;
-            typedef ParallelMath::SInt32 MSInt32;
-
-            static void Init(MFloat& error)
-            {
-                error = ParallelMath::MakeFloat(FLT_MAX);
-            }
-
-            static void QuantizeTo6Bits(MUInt15& v)
-            {
-                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10));
-                v = (reduced << 2) | ParallelMath::RightShift(reduced, 4);
-            }
-
-            static void QuantizeTo5Bits(MUInt15& v)
-            {
-                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11));
-                v = (reduced << 3) | ParallelMath::RightShift(reduced, 2);
-            }
-
-            static void QuantizeTo565(MUInt15 endPoint[3])
-            {
-                QuantizeTo5Bits(endPoint[0]);
-                QuantizeTo6Bits(endPoint[1]);
-                QuantizeTo5Bits(endPoint[2]);
-            }
-
-            static MFloat ParanoidFactorForSpan(const MSInt16& span)
-            {
-                return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f;
-            }
-
-            static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
-            {
-                MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
-                absDiff = absDiff + d;
-                return absDiff * absDiff;
-            }
-
-            static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
-                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                float channelWeightsSq[3];
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                        totals[ch] = totals[ch] + pixels[px][ch];
-                }
-
-                MUInt15 average[3];
-                for (int ch = 0; ch < 3; ch++)
-                    average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
-
-                const S3TCSingleColorTables::SingleColorTableEntry* rbTable = NULL;
-                const S3TCSingleColorTables::SingleColorTableEntry* gTable = NULL;
-                if (flags & cvtt::Flags::S3TC_Paranoid)
-                {
-                    if (range == 4)
-                    {
-                        rbTable = S3TCSingleColorTables::g_singleColor5_3_p;
-                        gTable = S3TCSingleColorTables::g_singleColor6_3_p;
-                    }
-                    else
-                    {
-                        assert(range == 3);
-                        rbTable = S3TCSingleColorTables::g_singleColor5_2_p;
-                        gTable = S3TCSingleColorTables::g_singleColor6_2_p;
-                    }
-                }
-                else
-                {
-                    if (range == 4)
-                    {
-                        rbTable = S3TCSingleColorTables::g_singleColor5_3;
-                        gTable = S3TCSingleColorTables::g_singleColor6_3;
-                    }
-                    else
-                    {
-                        assert(range == 3);
-                        rbTable = S3TCSingleColorTables::g_singleColor5_2;
-                        gTable = S3TCSingleColorTables::g_singleColor6_2;
-                    }
-                }
-
-                MUInt15 interpolated[3];
-                MUInt15 eps[2][3];
-                MSInt16 spans[3];
-                for (int i = 0; i < ParallelMath::ParallelSize; i++)
-                {
-                    for (int ch = 0; ch < 3; ch++)
-                    {
-                        uint16_t avg = ParallelMath::Extract(average[ch], i);
-                        const S3TCSingleColorTables::SingleColorTableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
-                        ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
-                        ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
-                        ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
-                        ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
-                    }
-                }
-
-                MFloat error = ParallelMath::MakeFloatZero();
-                if (flags & cvtt::Flags::S3TC_Paranoid)
-                {
-                    MFloat spanParanoidFactors[3];
-                    for (int ch = 0; ch < 3; ch++)
-                        spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
-                    }
-                }
-                else
-                {
-                    for (int px = 0; px < 16; px++)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
-                    }
-                }
-
-                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
-                ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
-
-                if (ParallelMath::AnySet(better16))
-                {
-                    bestError = ParallelMath::Min(bestError, error);
-                    for (int epi = 0; epi < 2; epi++)
-                        for (int ch = 0; ch < 3; ch++)
-                            ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
-
-                    MUInt15 vindexes = ParallelMath::MakeUInt15(1);
-                    for (int px = 0; px < 16; px++)
-                        ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
-
-                    ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
-                }
-            }
-
-            static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
-                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
-            {
-                float channelWeightsSq[3];
-
-                for (int ch = 0; ch < 3; ch++)
-                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
-
-                MUInt15 endPoints[2][3];
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < 3; ch++)
-                        endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
-
-                QuantizeTo565(endPoints[0]);
-                QuantizeTo565(endPoints[1]);
-
-                IndexSelector<3> selector;
-                selector.Init<false>(channelWeights, endPoints, range);
-
-                MUInt15 indexes[16];
-
-                MFloat paranoidFactors[3];
-                for (int ch = 0; ch < 3; ch++)
-                    paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
-
-                MFloat error = ParallelMath::MakeFloatZero();
-                AggregatedError<3> aggError;
-                for (int px = 0; px < 16; px++)
-                {
-                    MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
-                    indexes[px] = index;
-
-                    if (refiner)
-                        refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
-
-                    MUInt15 reconstructed[3];
-                    selector.ReconstructLDRPrecise(index, reconstructed);
-
-                    if (flags & Flags::S3TC_Paranoid)
-                    {
-                        for (int ch = 0; ch < 3; ch++)
-                            error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
-                    }
-                    else
-                        BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
-                }
-
-                if (!(flags & Flags::S3TC_Paranoid))
-                    error = aggError.Finalize(flags, channelWeightsSq);
-
-                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
-
-                if (ParallelMath::AnySet(better))
-                {
-                    ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
-
-                    ParallelMath::ConditionalSet(bestError, better, error);
-
-                    for (int ep = 0; ep < 2; ep++)
-                        for (int ch = 0; ch < 3; ch++)
-                            ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
-
-                    for (int px = 0; px < 16; px++)
-                        ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
-
-                    ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
-                }
-            }
-
-            static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
-                const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
-                const ParallelMath::RoundTowardNearestForScope* rtn)
-            {
-                UNREFERENCED_PARAMETER(alphaTest);
-                UNREFERENCED_PARAMETER(flags);
-
-                EndpointRefiner<3> refiner;
-
-                refiner.Init(nCounts, channelWeights);
-
-                bool escape = false;
-                int e = 0;
-                for (int i = 0; i < nCounts; i++)
-                {
-                    for (int n = 0; n < counts[i]; n++)
-                    {
-                        ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
-                        if (!ParallelMath::AnySet(valid))
-                        {
-                            escape = true;
-                            break;
-                        }
-
-                        if (ParallelMath::AllSet(valid))
-                            refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
-                        else
-                        {
-                            MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
-                            refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
-                        }
-                    }
-
-                    if (escape)
-                        break;
-                }
-
-                MUInt15 endPoints[2][3];
-                refiner.GetRefinedEndpointsLDR(endPoints, rtn);
-
-                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
-            }
-
-            static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
-            {
-                UNREFERENCED_PARAMETER(flags);
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                float weights[1] = { 1.0f };
-
-                MUInt15 pixels[16];
-                MFloat floatPixels[16];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
-                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
-                }
-
-                MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
-
-                IndexSelector<1> selector;
-                selector.Init<false>(weights, ep, 16);
-
-                MUInt15 indexes[16];
-
-                for (int px = 0; px < 16; px++)
-                    indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    for (int px = 0; px < 16; px += 8)
-                    {
-                        int index0 = ParallelMath::Extract(indexes[px], block);
-                        int index1 = ParallelMath::Extract(indexes[px], block);
-
-                        packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
-                    }
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-
-            static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
-            {
-                if (maxTweakRounds < 1)
-                    maxTweakRounds = 1;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                float oneWeight[1] = { 1.0f };
-
-                MUInt15 pixels[16];
-                MFloat floatPixels[16];
-
-                MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
-                MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
-
-                for (int px = 0; px < 16; px++)
-                {
-                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
-
-                    if (isSigned)
-                        pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
-
-                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
-                }
-
-                MUInt15 sortedPixels[16];
-                for (int px = 0; px < 16; px++)
-                    sortedPixels[px] = pixels[px];
-
-                for (int sortEnd = 15; sortEnd > 0; sortEnd--)
-                {
-                    for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
-                    {
-                        MUInt15 a = sortedPixels[sortOffset];
-                        MUInt15 b = sortedPixels[sortOffset + 1];
-
-                        sortedPixels[sortOffset] = ParallelMath::Min(a, b);
-                        sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
-                    }
-                }
-
-                MUInt15 zero = ParallelMath::MakeUInt15(0);
-                MUInt15 one = ParallelMath::MakeUInt15(1);
-
-                MUInt15 bestIsFullRange = zero;
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-                MUInt15 bestEP[2] = { zero, zero };
-                MUInt15 bestIndexes[16] = {
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero,
-                    zero, zero, zero, zero
-                };
-
-                // Full-precision
-                {
-                    MUInt15 minEP = sortedPixels[0];
-                    MUInt15 maxEP = sortedPixels[15];
-
-                    MFloat base[1] = { ParallelMath::ToFloat(minEP) };
-                    MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
-
-                    UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
-
-                    int numTweakRounds = BCCommon::TweakRoundsForRange(8);
-                    if (numTweakRounds > maxTweakRounds)
-                        numTweakRounds = maxTweakRounds;
-
-                    for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                    {
-                        MUInt15 ep[2][1];
-
-                        ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
-
-                        for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
-                        {
-                            EndpointRefiner<1> refiner;
-                            refiner.Init(8, oneWeight);
-
-                            if (isSigned)
-                                for (int epi = 0; epi < 2; epi++)
-                                    ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
-
-                            IndexSelector<1> indexSelector;
-                            indexSelector.Init<false>(oneWeight, ep, 8);
-
-                            MUInt15 indexes[16];
-
-                            AggregatedError<1> aggError;
-                            for (int px = 0; px < 16; px++)
-                            {
-                                MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                                MUInt15 reconstructedPixel;
-
-                                indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
-                                BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
-
-                                if (refinePass != numRefineRounds - 1)
-                                    refiner.ContributeUnweightedPW(&floatPixels[px], index);
-
-                                indexes[px] = index;
-                            }
-                            MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
-
-                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                            if (ParallelMath::AnySet(errorBetter16))
-                            {
-                                bestError = ParallelMath::Min(error, bestError);
-                                ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
-                                for (int px = 0; px < 16; px++)
-                                    ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
-
-                                for (int epi = 0; epi < 2; epi++)
-                                    ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
-                            }
-
-                            if (refinePass != numRefineRounds - 1)
-                                refiner.GetRefinedEndpointsLDR(ep, &rtn);
-                        }
-                    }
-                }
-
-                // Reduced precision with special endpoints
-                {
-                    MUInt15 bestHeuristicMin = sortedPixels[0];
-                    MUInt15 bestHeuristicMax = sortedPixels[15];
-
-                    ParallelMath::Int16CompFlag canTryClipping;
-
-                    // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
-                    // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
-                    // This will usually not find anything, but it's cheap to check.
-
-                    {
-                        MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
-                        MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
-
-                        MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
-                        canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
-                    }
-
-                    if (ParallelMath::AnySet(canTryClipping))
-                    {
-                        MUInt15 lowClearances[16];
-                        MUInt15 highClearances[16];
-                        MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
-
-                        lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
-
-                        for (int px = 1; px < 16; px++)
-                        {
-                            lowClearances[px] = sortedPixels[px - 1];
-                            highClearances[px] = highTerminal - sortedPixels[16 - px];
-                        }
-
-                        for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
-                        {
-                            uint16_t numSkippedLow = firstIndex;
-
-                            MUInt15 lowClearance = lowClearances[firstIndex];
-
-                            for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
-                            {
-                                uint16_t numSkippedHigh = 15 - lastIndex;
-                                uint16_t numSkipped = numSkippedLow + numSkippedHigh;
-
-                                MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
-
-                                ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
-
-                                if (!ParallelMath::AnySet(areMoreSkipped))
-                                    continue;
-
-                                MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
-                                MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
-
-                                MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
-
-                                ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
-                                ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
-                                ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
-                            }
-                        }
-                    }
-
-                    MUInt15 bestSimpleMin = one;
-                    MUInt15 bestSimpleMax = highTerminalMinusOne;
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
-                        ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
-                    }
-
-                    MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
-                    MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
-
-                    int minEPRange = 2;
-                    if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
-                        minEPRange = 1;
-
-                    int maxEPRange = 2;
-                    if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
-                        maxEPRange = 1;
-
-                    for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
-                    {
-                        for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
-                        {
-                            MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
-                            MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
-
-                            UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
-
-                            int numTweakRounds = BCCommon::TweakRoundsForRange(6);
-                            if (numTweakRounds > maxTweakRounds)
-                                numTweakRounds = maxTweakRounds;
-
-                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
-                            {
-                                MUInt15 ep[2][1];
-
-                                ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
-
-                                for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
-                                {
-                                    EndpointRefiner<1> refiner;
-                                    refiner.Init(6, oneWeight);
-
-                                    if (isSigned)
-                                        for (int epi = 0; epi < 2; epi++)
-                                            ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
-
-                                    IndexSelector<1> indexSelector;
-                                    indexSelector.Init<false>(oneWeight, ep, 6);
-
-                                    MUInt15 indexes[16];
-                                    MFloat error = ParallelMath::MakeFloatZero();
-
-                                    for (int px = 0; px < 16; px++)
-                                    {
-                                        MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
-
-                                        MUInt15 reconstructedPixel;
-
-                                        indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
-
-                                        MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
-                                        MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
-                                        MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
-
-                                        MFloat bestPixelError = zeroError;
-                                        MUInt15 index = ParallelMath::MakeUInt15(6);
-
-                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
-                                        bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
-
-                                        ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
-
-                                        if (ParallelMath::AllSet(selectedIndexBetter))
-                                        {
-                                            if (refinePass != numRefineRounds - 1)
-                                                refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
-                                        }
-                                        else
-                                        {
-                                            MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
-
-                                            if (refinePass != numRefineRounds - 1)
-                                                refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
-                                        }
-
-                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
-                                        bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
-
-                                        error = error + bestPixelError;
-
-                                        indexes[px] = index;
-                                    }
-
-                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
-                                    ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
-
-                                    if (ParallelMath::AnySet(errorBetter16))
-                                    {
-                                        bestError = ParallelMath::Min(error, bestError);
-                                        ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
-                                        for (int px = 0; px < 16; px++)
-                                            ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
-
-                                        for (int epi = 0; epi < 2; epi++)
-                                            ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
-                                    }
-
-                                    if (refinePass != numRefineRounds - 1)
-                                        refiner.GetRefinedEndpointsLDR(ep, &rtn);
-                                }
-                            }
-                        }
-                    }
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    int ep0 = ParallelMath::Extract(bestEP[0], block);
-                    int ep1 = ParallelMath::Extract(bestEP[1], block);
-                    int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
-
-                    if (isSigned)
-                    {
-                        ep0 -= 127;
-                        ep1 -= 127;
-
-                        assert(ep0 >= -127 && ep0 <= 127);
-                        assert(ep1 >= -127 && ep1 <= 127);
-                    }
-
-
-                    bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
-
-                    if (swapEndpoints)
-                        std::swap(ep0, ep1);
-
-                    uint16_t dumpBits = 0;
-                    int dumpBitsOffset = 0;
-                    int dumpByteOffset = 2;
-                    packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
-                    packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
-
-                    int maxValue = (isFullRange != 0) ? 7 : 5;
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        int index = ParallelMath::Extract(bestIndexes[px], block);
-
-                        if (swapEndpoints && index <= maxValue)
-                            index = maxValue - index;
-
-                        if (index != 0)
-                        {
-                            if (index == maxValue)
-                                index = 1;
-                            else if (index < maxValue)
-                                index++;
-                        }
-
-                        assert(index >= 0 && index < 8);
-
-                        dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
-                        dumpBitsOffset += 3;
-
-                        if (dumpBitsOffset >= 8)
-                        {
-                            assert(dumpByteOffset < 8);
-                            packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
-                            dumpBits >>= 8;
-                            dumpBitsOffset -= 8;
-                            dumpByteOffset++;
-                        }
-                    }
-
-                    assert(dumpBitsOffset == 0);
-                    assert(dumpByteOffset == 8);
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-
-            static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
-            {
-                ParallelMath::RoundTowardNearestForScope rtn;
-
-                if (numRefineRounds < 1)
-                    numRefineRounds = 1;
-
-                if (maxTweakRounds < 1)
-                    maxTweakRounds = 1;
-
-                EndpointSelector<3, 8> endpointSelector;
-
-                MUInt15 pixels[16][4];
-                MFloat floatPixels[16][4];
-
-                MFloat preWeightedPixels[16][4];
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
-                }
-
-                for (int px = 0; px < 16; px++)
-                {
-                    for (int ch = 0; ch < 4; ch++)
-                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
-                }
-
-                if (alphaTest)
-                {
-                    MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
-
-                    for (int px = 0; px < 16; px++)
-                    {
-                        ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
-                        pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
-                    }
-                }
-
-                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
-
-                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
-
-                for (int px = 0; px < 16; px++)
-                    minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
-
-                MFloat pixelWeights[16];
-                for (int px = 0; px < 16; px++)
-                {
-                    pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
-                    if (alphaTest)
-                    {
-                        ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
-
-                        ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
-                    }
-                }
-
-                for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
-                {
-                    for (int px = 0; px < 16; px++)
-                        endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
-
-                    endpointSelector.FinishPass(pass);
-                }
-
-                UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
-
-                MUInt15 bestEndpoints[2][3];
-                MUInt15 bestIndexes[16];
-                MUInt15 bestRange = ParallelMath::MakeUInt15(0);
-                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
-
-                for (int px = 0; px < 16; px++)
-                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
-
-                for (int ep = 0; ep < 2; ep++)
-                    for (int ch = 0; ch < 3; ch++)
-                        bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
-
-                if (exhaustive)
-                {
-                    MSInt16 sortBins[16];
-
-                    {
-                        // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
-                        // and pack the original indexes into the low bits.
-
-                        MUInt15 sortEP[2][3];
-                        ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
-
-                        IndexSelector<3> sortSelector;
-                        sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
-
-                        for (int16_t px = 0; px < 16; px++)
-                        {
-                            MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
-
-                            if (alphaTest)
-                            {
-                                ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
-
-                                ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
-                            }
-
-                            sortBin = sortBin + ParallelMath::MakeSInt16(px);
-
-                            sortBins[px] = sortBin;
-                        }
-                    }
-
-                    // Sort bins
-                    for (int sortEnd = 1; sortEnd < 16; sortEnd++)
-                    {
-                        for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
-                        {
-                            MSInt16 a = sortBins[sortLoc];
-                            MSInt16 b = sortBins[sortLoc - 1];
-
-                            sortBins[sortLoc] = ParallelMath::Max(a, b);
-                            sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
-                        }
-                    }
-
-                    MUInt15 firstElement = ParallelMath::MakeUInt15(0);
-                    for (uint16_t e = 0; e < 16; e++)
-                    {
-                        ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
-                        ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
-                        if (!ParallelMath::AnySet(isInvalid))
-                            break;
-                    }
-
-                    MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
-
-                    MUInt15 sortedInputs[16][4];
-                    MFloat floatSortedInputs[16][4];
-                    MFloat pwFloatSortedInputs[16][4];
-
-                    for (int e = 0; e < 16; e++)
-                    {
-                        for (int ch = 0; ch < 4; ch++)
-                            sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
-                    }
-
-                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                    {
-                        for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
-                        {
-                            ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
-                            int originalIndex = (sortBin & 15);
-
-                            for (int ch = 0; ch < 4; ch++)
-                                ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
-                        }
-                    }
-
-                    for (int e = 0; e < 16; e++)
-                    {
-                        for (int ch = 0; ch < 4; ch++)
-                        {
-                            MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
-                            floatSortedInputs[e][ch] = f;
-                            pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
-                        }
-                    }
-
-                    for (int n0 = 0; n0 <= 15; n0++)
-                    {
-                        int remainingFor1 = 16 - n0;
-                        if (remainingFor1 == 16)
-                            remainingFor1 = 15;
-
-                        for (int n1 = 0; n1 <= remainingFor1; n1++)
-                        {
-                            int remainingFor2 = 16 - n1 - n0;
-                            if (remainingFor2 == 16)
-                                remainingFor2 = 15;
-
-                            for (int n2 = 0; n2 <= remainingFor2; n2++)
-                            {
-                                int n3 = 16 - n2 - n1 - n0;
-
-                                if (n3 == 16)
-                                    continue;
-
-                                int counts[4] = { n0, n1, n2, n3 };
-
-                                TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                            }
-                        }
-                    }
-
-                    TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-
-                    if (alphaTest)
-                    {
-                        for (int n0 = 0; n0 <= 15; n0++)
-                        {
-                            int remainingFor1 = 16 - n0;
-                            if (remainingFor1 == 16)
-                                remainingFor1 = 15;
-
-                            for (int n1 = 0; n1 <= remainingFor1; n1++)
-                            {
-                                int n2 = 16 - n1 - n0;
-
-                                if (n2 == 16)
-                                    continue;
-
-                                int counts[3] = { n0, n1, n2 };
-
-                                TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                            }
-                        }
-
-                        TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
-                    }
-                }
-                else
-                {
-                    int minRange = alphaTest ? 3 : 4;
-
-                    for (int range = minRange; range <= 4; range++)
-                    {
-                        int tweakRounds = BCCommon::TweakRoundsForRange(range);
-                        if (tweakRounds > maxTweakRounds)
-                            tweakRounds = maxTweakRounds;
-
-                        for (int tweak = 0; tweak < tweakRounds; tweak++)
-                        {
-                            MUInt15 endPoints[2][3];
-
-                            ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
-
-                            for (int refine = 0; refine < numRefineRounds; refine++)
-                            {
-                                EndpointRefiner<3> refiner;
-                                refiner.Init(range, channelWeights);
-
-                                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
-
-                                if (refine != numRefineRounds - 1)
-                                    refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
-                            }
-                        }
-                    }
-                }
-
-                for (int block = 0; block < ParallelMath::ParallelSize; block++)
-                {
-                    ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
-                    assert(range == 3 || range == 4);
-
-                    ParallelMath::ScalarUInt16 compressedEP[2];
-                    for (int ep = 0; ep < 2; ep++)
-                    {
-                        ParallelMath::ScalarUInt16 endPoint[3];
-                        for (int ch = 0; ch < 3; ch++)
-                            endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
-
-                        int compressed = (endPoint[0] & 0xf8) << 8;
-                        compressed |= (endPoint[1] & 0xfc) << 3;
-                        compressed |= (endPoint[2] & 0xf8) >> 3;
-
-                        compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
-                    }
-
-                    int indexOrder[4];
-
-                    if (range == 4)
-                    {
-                        if (compressedEP[0] == compressedEP[1])
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 0;
-                            indexOrder[2] = 0;
-                            indexOrder[3] = 0;
-                        }
-                        else if (compressedEP[0] < compressedEP[1])
-                        {
-                            std::swap(compressedEP[0], compressedEP[1]);
-                            indexOrder[0] = 1;
-                            indexOrder[1] = 3;
-                            indexOrder[2] = 2;
-                            indexOrder[3] = 0;
-                        }
-                        else
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 3;
-                            indexOrder[3] = 1;
-                        }
-                    }
-                    else
-                    {
-                        assert(range == 3);
-
-                        if (compressedEP[0] > compressedEP[1])
-                        {
-                            std::swap(compressedEP[0], compressedEP[1]);
-                            indexOrder[0] = 1;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 0;
-                        }
-                        else
-                        {
-                            indexOrder[0] = 0;
-                            indexOrder[1] = 2;
-                            indexOrder[2] = 1;
-                        }
-                        indexOrder[3] = 3;
-                    }
-
-                    packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
-                    packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
-                    packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
-                    packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
-
-                    for (int i = 0; i < 16; i += 4)
-                    {
-                        int packedIndexes = 0;
-                        for (int subi = 0; subi < 4; subi++)
-                        {
-                            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
-                            packedIndexes |= (indexOrder[index] << (subi * 2));
-                        }
-
-                        packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
-                    }
-
-                    packedBlocks += packedBlockStride;
-                }
-            }
-        };
-
-        // Signed input blocks are converted into unsigned space, with the maximum value being 254
-        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
-        {
-            for (size_t block = 0; block < ParallelMath::ParallelSize; block++)
-            {
-                const PixelBlockS8& inputSignedBlock = inputSigned[block];
-                PixelBlockU8& inputNormalizedBlock = inputNormalized[block];
-
-                for (size_t px = 0; px < 16; px++)
-                {
-                    for (size_t ch = 0; ch < 4; ch++)
-                        inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127);
-                }
-            }
-        }
-
-        void FillWeights(const Options &options, float channelWeights[4])
-        {
-            if (options.flags & Flags::Uniform)
-                channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f;
-            else
-            {
-                channelWeights[0] = options.redWeight;
-                channelWeights[1] = options.greenWeight;
-                channelWeights[2] = options.blueWeight;
-                channelWeights[3] = options.alphaWeight;
-            }
-        }
-    }
-
-    namespace Kernels
-    {
-        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, options.seedPoints, options.refineRoundsBC7);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
-                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
-
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 8;
-            }
-        }
-
-        void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            float channelWeights[4];
-            Internal::FillWeights(options, channelWeights);
-
-            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
-            {
-                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
-                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
-
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC);
-                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC);
-                pBC += ParallelMath::ParallelSize * 16;
-            }
-        }
-
-        void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC);
-                pBC += 16;
-            }
-        }
-
-        void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false);
-                pBC += 16;
-            }
-        }
-
-        void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
-        {
-            assert(pBlocks);
-            assert(pBC);
-
-            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
-            {
-                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true);
-                pBC += 16;
-            }
-        }
-    }
-}
diff --git a/thirdparty/cvtt/ConvectionKernels.h b/thirdparty/cvtt/ConvectionKernels.h
index fb5ca130f9..3da48405ff 100644
--- a/thirdparty/cvtt/ConvectionKernels.h
+++ b/thirdparty/cvtt/ConvectionKernels.h
@@ -25,21 +25,13 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __CVTT_CONVECTION_KERNELS__
 #define __CVTT_CONVECTION_KERNELS__
 
+#include <stddef.h>
 #include <stdint.h>
 
 namespace cvtt
 {
     namespace Flags
     {
-        // Enable partitioned modes in BC7 encoding (slower, better quality)
-        const uint32_t BC7_EnablePartitioning   = 0x001;
-
-        // Enable 3-partition modes in BC7 encoding (slower, better quality, requires BC7_EnablePartitioning)
-        const uint32_t BC7_Enable3Subsets       = 0x002;
-
-        // Enable dual-plane modes in BC7 encoding (slower, better quality)
-        const uint32_t BC7_EnableDualPlane      = 0x004;
-
         // Use fast indexing in BC7 encoding (about 2x faster, slightly worse quality)
         const uint32_t BC7_FastIndexing         = 0x008;
 
@@ -61,13 +53,19 @@ namespace cvtt
         // Uniform color channel importance
         const uint32_t Uniform                  = 0x200;
 
+        // Use fake BT.709 color space for etc2comp compatibility (slower)
+        const uint32_t ETC_UseFakeBT709         = 0x400;
+
+        // Use accurate quantization functions when quantizing fake BT.709 (much slower, marginal improvement on specific blocks)
+        const uint32_t ETC_FakeBT709Accurate    = 0x800;
+
         // Misc useful default flag combinations
-        const uint32_t Fastest = (BC6H_FastIndexing | S3TC_Paranoid);
-        const uint32_t Faster = (BC7_EnableDualPlane | BC6H_FastIndexing | S3TC_Paranoid);
-        const uint32_t Fast = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_FastIndexing | S3TC_Paranoid);
-        const uint32_t Default = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | BC7_FastIndexing | S3TC_Paranoid);
-        const uint32_t Better = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | S3TC_Paranoid | S3TC_Exhaustive);
-        const uint32_t Ultra = (BC7_EnablePartitioning | BC7_EnableDualPlane | BC7_Enable3Subsets | BC7_TrySingleColor | S3TC_Paranoid | S3TC_Exhaustive);
+        const uint32_t Fastest = (BC6H_FastIndexing | BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Faster = (BC6H_FastIndexing | BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Fast = (BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Default = (BC7_FastIndexing | S3TC_Paranoid);
+        const uint32_t Better = (S3TC_Paranoid | S3TC_Exhaustive);
+        const uint32_t Ultra = (BC7_TrySingleColor | S3TC_Paranoid | S3TC_Exhaustive | ETC_FakeBT709Accurate);
     }
 
     const unsigned int NumParallelBlocks = 8;
@@ -81,7 +79,7 @@ namespace cvtt
         float blueWeight;       // Blue channel importance
         float alphaWeight;      // Alpha channel importance
 
-        int refineRoundsBC7;    // Number of refine rounds for BC7
+        int refineRoundsBC7;   // Number of refine rounds for BC7
         int refineRoundsBC6H;   // Number of refine rounds for BC6H (max 3)
         int refineRoundsIIC;    // Number of refine rounds for independent interpolated channels (BC3 alpha, BC4, BC5)
         int refineRoundsS3TC;   // Number of refine rounds for S3TC RGB
@@ -104,6 +102,102 @@ namespace cvtt
         }
     };
 
+    // Seed point counts accepted by ConfigureBC7EncodingPlanFromFineTuningParams; all default to 4.
+    struct BC7FineTuningParams
+    {
+        // Seed point counts for each mode+configuration combination
+        uint8_t mode0SP[16];
+        uint8_t mode1SP[64];
+        uint8_t mode2SP[64];
+        uint8_t mode3SP[64];
+        uint8_t mode4SP[4][2];
+        uint8_t mode5SP[4];
+        uint8_t mode6SP;
+        uint8_t mode7SP[64];
+
+        // Sets every seed point count above to 4.
+        BC7FineTuningParams()
+            : mode6SP(4)
+        {
+            for (int entry = 0; entry < 16; entry++)
+                mode0SP[entry] = 4;
+
+            for (int entry = 0; entry < 64; entry++)
+            {
+                mode1SP[entry] = 4;
+                mode2SP[entry] = 4;
+                mode3SP[entry] = 4;
+                mode7SP[entry] = 4;
+            }
+
+            for (int entry = 0; entry < 4; entry++)
+            {
+                mode4SP[entry][0] = 4;
+                mode4SP[entry][1] = 4;
+                mode5SP[entry] = 4;
+            }
+        }
+    };
+
+    // Plan handed to EncodeBC7; the default sets every partition mask bit, enables mode 6 and lists every shape.
+    struct BC7EncodingPlan
+    {
+        static const int kNumRGBAShapes = 129;
+        static const int kNumRGBShapes = 243;
+
+        uint64_t mode1PartitionEnabled;
+        uint64_t mode2PartitionEnabled;
+        uint64_t mode3PartitionEnabled;
+        uint16_t mode0PartitionEnabled;
+        uint64_t mode7RGBAPartitionEnabled;
+        uint64_t mode7RGBPartitionEnabled;
+        uint8_t mode4SP[4][2];
+        uint8_t mode5SP[4];
+        bool mode6Enabled;
+
+        uint8_t seedPointsForShapeRGB[kNumRGBShapes];
+        uint8_t seedPointsForShapeRGBA[kNumRGBAShapes];
+
+        uint8_t rgbaShapeList[kNumRGBAShapes];
+        uint8_t rgbaNumShapesToEvaluate;
+
+        uint8_t rgbShapeList[kNumRGBShapes];
+        uint8_t rgbNumShapesToEvaluate;
+
+        // Default plan: all mask bits set, identity shape lists, full shape counts, 4 seed points everywhere.
+        BC7EncodingPlan()
+            : mode1PartitionEnabled(0xffffffffffffffffULL)
+            , mode2PartitionEnabled(0xffffffffffffffffULL)
+            , mode3PartitionEnabled(0xffffffffffffffffULL)
+            , mode0PartitionEnabled(0xffff)
+            , mode7RGBAPartitionEnabled(0xffffffffffffffffULL)
+            , mode7RGBPartitionEnabled(0xffffffffffffffffULL)
+            , mode6Enabled(true)
+            , rgbaNumShapesToEvaluate(kNumRGBAShapes)
+            , rgbNumShapesToEvaluate(kNumRGBShapes)
+        {
+            for (int shape = 0; shape < kNumRGBShapes; shape++)
+            {
+                rgbShapeList[shape] = static_cast<uint8_t>(shape);
+                seedPointsForShapeRGB[shape] = 4;
+            }
+
+            for (int shape = 0; shape < kNumRGBAShapes; shape++)
+            {
+                rgbaShapeList[shape] = static_cast<uint8_t>(shape);
+                seedPointsForShapeRGBA[shape] = 4;
+            }
+
+            // mode4SP / mode5SP hold one seed point count per configuration slot.
+            for (int config = 0; config < 4; config++)
+            {
+                mode4SP[config][0] = 4;
+                mode4SP[config][1] = 4;
+                mode5SP[config] = 4;
+            }
+        }
+    };
+
     // RGBA input block for unsigned 8-bit formats
     struct PixelBlockU8
     {
@@ -116,14 +210,34 @@ namespace cvtt
         int8_t m_pixels[16][4];
     };
 
+    struct PixelBlockScalarS16  // Block of 16 scalar int16_t pixels; input type of Kernels::EncodeETC2Alpha11
+    {
+        int16_t m_pixels[16];
+    };
+
     // RGBA input block for half-precision float formats (bit-cast to int16_t)
     struct PixelBlockF16
     {
         int16_t m_pixels[16][4];
     };
 
+    class ETC2CompressionData  // Handle type returned by Kernels::AllocETC2Data and passed to the ETC2 encoders
+    {
+    protected:
+        ETC2CompressionData() {}  // protected: callers cannot construct this type directly
+    };
+
+    class ETC1CompressionData  // Handle type returned by Kernels::AllocETC1Data and passed to EncodeETC1
+    {
+    protected:
+        ETC1CompressionData() {}  // protected: callers cannot construct this type directly
+    };
+
     namespace Kernels
     {
+        typedef void* allocFunc_t(void *context, size_t size);
+        typedef void freeFunc_t(void *context, void* ptr, size_t size);
+
         // NOTE: All functions accept and output NumParallelBlocks blocks at once
         void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
         void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
@@ -134,7 +248,28 @@ namespace cvtt
         void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options);
         void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options);
         void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const Options &options);
-        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options);
+        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, const BC7EncodingPlan &encodingPlan);
+        void EncodeETC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, ETC1CompressionData *compressionData);
+        void EncodeETC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options, ETC2CompressionData *compressionData);
+        void EncodeETC2RGBA(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData);
+        void EncodeETC2PunchthroughAlpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData);
+
+        void EncodeETC2Alpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options);
+        void EncodeETC2Alpha11(uint8_t *pBC, const PixelBlockScalarS16 *pBlocks, bool isSigned, const cvtt::Options &options);
+
+        // Generates a BC7 encoding plan from a quality parameter that ranges from 1 (fastest) to 100 (best)
+        void ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality);
+
+        // Generates a BC7 encoding plan from fine-tuning parameters.
+        bool ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params);
+
+        // ETC compression requires temporary storage that normally consumes a large amount of stack space.
+        // To allocate and release it, use one of these functions.
+        ETC2CompressionData *AllocETC2Data(allocFunc_t allocFunc, void *context, const cvtt::Options &options);
+        void ReleaseETC2Data(ETC2CompressionData *compressionData, freeFunc_t freeFunc);
+
+        ETC1CompressionData *AllocETC1Data(allocFunc_t allocFunc, void *context);
+        void ReleaseETC1Data(ETC1CompressionData *compressionData, freeFunc_t freeFunc);
 
         void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC);
         void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC);
diff --git a/thirdparty/cvtt/ConvectionKernels_API.cpp b/thirdparty/cvtt/ConvectionKernels_API.cpp
new file mode 100644
index 0000000000..707e71d474
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_API.cpp
@@ -0,0 +1,346 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include <stdint.h>
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_Util.h"
+#include "ConvectionKernels_BC67.h"
+#include "ConvectionKernels_ETC.h"
+#include "ConvectionKernels_S3TC.h"
+
+#include <assert.h>
+
+namespace cvtt
+{
+    namespace Kernels
+    {
+        // Encodes cvtt::NumParallelBlocks RGBA blocks as BC7, writing 16 bytes per block to pBC.
+        // encodingPlan and options.refineRoundsBC7 are forwarded unchanged to BC7Computer::Pack.
+        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, const BC7EncodingPlan &encodingPlan)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            float weights[4];
+            Util::FillWeights(options, weights);
+
+            // Input and output both advance by ParallelMath::ParallelSize blocks per iteration.
+            for (size_t firstBlock = 0; firstBlock < cvtt::NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+                Internal::BC7Computer::Pack(options.flags, pBlocks + firstBlock, pBC + firstBlock * 16, weights, encodingPlan, options.refineRoundsBC7);
+        }
+
+        // Encodes cvtt::NumParallelBlocks half-float blocks as BC6H, writing 16 bytes per block to pBC.
+        // Passes `false` to BC6HComputer::Pack; EncodeBC6HS is the same routine with `true`.
+        void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            float weights[4];
+            Util::FillWeights(options, weights);
+
+            // Input and output both advance by ParallelMath::ParallelSize blocks per iteration.
+            for (size_t firstBlock = 0; firstBlock < cvtt::NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+                Internal::BC6HComputer::Pack(options.flags, pBlocks + firstBlock, pBC + firstBlock * 16, weights, false, options.seedPoints, options.refineRoundsBC6H);
+        }
+
+        // Encodes cvtt::NumParallelBlocks half-float blocks as BC6H, writing 16 bytes per block to pBC.
+        // Passes `true` to BC6HComputer::Pack; EncodeBC6HU is the same routine with `false`.
+        void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            float weights[4];
+            Util::FillWeights(options, weights);
+
+            // Input and output both advance by ParallelMath::ParallelSize blocks per iteration.
+            for (size_t firstBlock = 0; firstBlock < cvtt::NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+                Internal::BC6HComputer::Pack(options.flags, pBlocks + firstBlock, pBC + firstBlock * 16, weights, true, options.seedPoints, options.refineRoundsBC6H);
+        }
+
+        // Encodes cvtt::NumParallelBlocks RGBA blocks as BC1, writing 8 bytes per block to pBC.
+        void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            float weights[4];
+            Util::FillWeights(options, weights);
+            const bool exhaustive = (options.flags & Flags::S3TC_Exhaustive) != 0;
+
+            // Output stride is 8 bytes; options.threshold is forwarded to PackRGB.
+            for (size_t firstBlock = 0; firstBlock < cvtt::NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + firstBlock, pBC + firstBlock * 8, 8, weights, true, options.threshold, exhaustive, options.seedPoints, options.refineRoundsS3TC);
+        }
+
+        // Encodes NumParallelBlocks RGBA blocks as BC2: explicit alpha at byte 0, RGB at byte 8 of each 16-byte block.
+        void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            float weights[4];
+            Util::FillWeights(options, weights);
+
+            for (size_t firstBlock = 0; firstBlock < NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+            {
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + firstBlock, pBC + firstBlock * 16 + 8, 16, weights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
+                Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + firstBlock, 3, pBC + firstBlock * 16, 16);
+            }
+        }
+
+        // Encodes NumParallelBlocks RGBA blocks as BC3: interpolated alpha at byte 0, RGB at byte 8 of each 16-byte block.
+        void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            float weights[4];
+            Util::FillWeights(options, weights);
+
+            for (size_t firstBlock = 0; firstBlock < NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+            {
+                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + firstBlock, pBC + firstBlock * 16 + 8, 16, weights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + firstBlock, 3, pBC + firstBlock * 16, 16, false, options.seedPoints, options.refineRoundsIIC);
+            }
+        }
+
+        // Encodes channel index 0 of cvtt::NumParallelBlocks blocks as BC4, writing 8 bytes per block to pBC.
+        // Passes `false` to PackInterpolatedAlpha; EncodeBC4S passes `true`.
+        // No channel-weight array is built here: PackInterpolatedAlpha is not passed one.
+        void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC);
+                pBC += ParallelMath::ParallelSize * 8;
+            }
+        }
+
+        // Encodes channel index 0 of cvtt::NumParallelBlocks signed blocks as BC4, writing 8 bytes per block to pBC.
+        // Util::BiasSignedInput fills a temporary PixelBlockU8 array from the signed input before packing.
+        // No channel-weight array is built here: PackInterpolatedAlpha is not passed one.
+        void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
+                Util::BiasSignedInput(inputBlocks, pBlocks + blockBase);
+
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC);
+                pBC += ParallelMath::ParallelSize * 8;
+            }
+        }
+
+        // Encodes channel indices 0 and 1 of cvtt::NumParallelBlocks blocks as BC5 (16 bytes per block):
+        // channel 0 is written at byte 0 and channel 1 at byte 8 of each output block.
+        // No channel-weight array is built here: PackInterpolatedAlpha is not passed one.
+        void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC);
+                pBC += ParallelMath::ParallelSize * 16;
+            }
+        }
+
+        // Encodes channel indices 0 and 1 of cvtt::NumParallelBlocks signed blocks as BC5 (16 bytes per block).
+        // Util::BiasSignedInput fills a temporary PixelBlockU8 array from the signed input before packing.
+        // No channel-weight array is built here: PackInterpolatedAlpha is not passed one.
+        void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
+                Util::BiasSignedInput(inputBlocks, pBlocks + blockBase);
+
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC);
+                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC);
+                pBC += ParallelMath::ParallelSize * 16;
+            }
+        }
+
+        // Encodes cvtt::NumParallelBlocks RGBA blocks as ETC1, writing 8 bytes per block to pBC.
+        // compressionData is forwarded to ETCComputer::CompressETC1Block (obtain it from AllocETC1Data).
+        // No channel-weight array is built here: CompressETC1Block receives `options` itself.
+        void EncodeETC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC1CompressionData *compressionData)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressETC1Block(pBC, pBlocks + blockBase, compressionData, options);
+                pBC += ParallelMath::ParallelSize * 8;
+            }
+        }
+
+        // Encodes cvtt::NumParallelBlocks RGBA blocks as ETC2, writing 8 bytes per block to pBC.
+        // Passes `false` as the last CompressETC2Block argument; EncodeETC2PunchthroughAlpha passes `true`.
+        // No channel-weight array is built here: CompressETC2Block receives `options` itself.
+        void EncodeETC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressETC2Block(pBC, pBlocks + blockBase, compressionData, options, false);
+                pBC += ParallelMath::ParallelSize * 8;
+            }
+        }
+
+        // Encodes cvtt::NumParallelBlocks RGBA blocks with CompressETC2Block, writing 8 bytes per block to pBC.
+        // Passes `true` as the last CompressETC2Block argument; EncodeETC2 passes `false`.
+        // No channel-weight array is built here: CompressETC2Block receives `options` itself.
+        void EncodeETC2PunchthroughAlpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
+            {
+                Internal::ETCComputer::CompressETC2Block(pBC, pBlocks + blockBase, compressionData, options, true);
+                pBC += ParallelMath::ParallelSize * 8;
+            }
+        }
+
+        // Encodes cvtt::NumParallelBlocks blocks with ETCComputer::CompressETC2AlphaBlock, 8 bytes per block.
+        void EncodeETC2Alpha(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            // Input and output both advance by ParallelMath::ParallelSize blocks per iteration.
+            for (size_t firstBlock = 0; firstBlock < cvtt::NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+                Internal::ETCComputer::CompressETC2AlphaBlock(pBC + firstBlock * 8, pBlocks + firstBlock, options);
+        }
+
+
+        // Encodes cvtt::NumParallelBlocks scalar blocks with ETCComputer::CompressEACBlock, 8 bytes per block.
+        // isSigned is forwarded unchanged to CompressEACBlock.
+        void EncodeETC2Alpha11(uint8_t *pBC, const PixelBlockScalarS16 *pBlocks, bool isSigned, const cvtt::Options &options)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            // Input and output both advance by ParallelMath::ParallelSize blocks per iteration.
+            for (size_t firstBlock = 0; firstBlock < cvtt::NumParallelBlocks; firstBlock += ParallelMath::ParallelSize)
+                Internal::ETCComputer::CompressEACBlock(pBC + firstBlock * 8, pBlocks + firstBlock, isSigned, options);
+        }
+
+        // Encodes ETC2 RGBA: each 16-byte output block is 8 bytes from EncodeETC2Alpha then 8 bytes from EncodeETC2.
+        void EncodeETC2RGBA(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options, cvtt::ETC2CompressionData *compressionData)
+        {
+            assert(pBlocks);
+            assert(pBC); // the callees only see the local buffers, so pBC must be checked here
+
+            uint8_t alphaBlockData[cvtt::NumParallelBlocks * 8];
+            uint8_t colorBlockData[cvtt::NumParallelBlocks * 8];
+            EncodeETC2(colorBlockData, pBlocks, options, compressionData);
+            EncodeETC2Alpha(alphaBlockData, pBlocks, options);
+
+            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
+            {
+                for (size_t byteIndex = 0; byteIndex < 16; byteIndex++)
+                    pBC[blockBase * 16 + byteIndex] = (byteIndex < 8) ? alphaBlockData[blockBase * 8 + byteIndex] : colorBlockData[blockBase * 8 + byteIndex - 8];
+            }
+        }
+
+        // Decodes cvtt::NumParallelBlocks BC7 blocks from pBC into pBlocks.
+        // Each encoded block occupies 16 consecutive bytes of pBC.
+        void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            // Blocks are unpacked one at a time; block i starts at byte offset i * 16.
+            for (size_t block = 0; block < cvtt::NumParallelBlocks; block++)
+                Internal::BC7Computer::UnpackOne(pBlocks[block], pBC + block * 16);
+        }
+
+        // Decodes cvtt::NumParallelBlocks BC6H blocks (16 bytes each) from pBC into pBlocks.
+        // Passes `false` to BC6HComputer::UnpackOne; DecodeBC6HS passes `true`.
+        void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            // Blocks are unpacked one at a time; block i starts at byte offset i * 16.
+            for (size_t block = 0; block < cvtt::NumParallelBlocks; block++)
+                Internal::BC6HComputer::UnpackOne(pBlocks[block], pBC + block * 16, false);
+        }
+
+        // Decodes cvtt::NumParallelBlocks BC6H blocks (16 bytes each) from pBC into pBlocks.
+        // Passes `true` to BC6HComputer::UnpackOne; DecodeBC6HU passes `false`.
+        void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
+        {
+            assert(pBlocks);
+            assert(pBC);
+
+            // Blocks are unpacked one at a time; block i starts at byte offset i * 16.
+            for (size_t block = 0; block < cvtt::NumParallelBlocks; block++)
+                Internal::BC6HComputer::UnpackOne(pBlocks[block], pBC + block * 16, true);
+        }
+
+        ETC1CompressionData *AllocETC1Data(allocFunc_t allocFunc, void *context)
+        {
+            return Internal::ETCComputer::AllocETC1Data(allocFunc, context); // public forwarder to the ETC computer
+        }
+
+        void ReleaseETC1Data(ETC1CompressionData *compressionData, freeFunc_t freeFunc)
+        {
+            Internal::ETCComputer::ReleaseETC1Data(compressionData, freeFunc); // public forwarder to the ETC computer
+        }
+
+        ETC2CompressionData *AllocETC2Data(allocFunc_t allocFunc, void *context, const cvtt::Options &options)
+        {
+            return Internal::ETCComputer::AllocETC2Data(allocFunc, context, options); // public forwarder to the ETC computer
+        }
+
+        void ReleaseETC2Data(ETC2CompressionData *compressionData, freeFunc_t freeFunc)
+        {
+            Internal::ETCComputer::ReleaseETC2Data(compressionData, freeFunc); // public forwarder to the ETC computer
+        }
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_AggregatedError.h b/thirdparty/cvtt/ConvectionKernels_AggregatedError.h
new file mode 100644
index 0000000000..9f9356a345
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_AggregatedError.h
@@ -0,0 +1,55 @@
+#pragma once
+#ifndef __CVTT_AGGREGATEDERROR_H__
+#define __CVTT_AGGREGATEDERROR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        template<int TVectorSize>
+        class AggregatedError
+        {
+        public:
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt31 MUInt31;
+            typedef ParallelMath::Float MFloat;
+
+            AggregatedError()
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
+            }
+
+            void Add(const MUInt16 &channelErrorUnweighted, int ch)
+            {
+                m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted);
+            }
+
+            MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const
+            {
+                if (flags & cvtt::Flags::Uniform)
+                {
+                    MUInt31 total = m_errorUnweighted[0];
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        total = total + m_errorUnweighted[ch];
+                    return ParallelMath::ToFloat(total);
+                }
+                else
+                {
+                    MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0];
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
+                    return total;
+                }
+            }
+
+        private:
+            MUInt31 m_errorUnweighted[TVectorSize];
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_BC67.cpp b/thirdparty/cvtt/ConvectionKernels_BC67.cpp
new file mode 100644
index 0000000000..791859b232
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC67.cpp
@@ -0,0 +1,3485 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC67.h"
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_BCCommon.h"
+#include "ConvectionKernels_BC7_Prio.h"
+#include "ConvectionKernels_BC7_SingleColor.h"
+#include "ConvectionKernels_BC6H_IO.h"
+#include "ConvectionKernels_EndpointRefiner.h"
+#include "ConvectionKernels_EndpointSelector.h"
+#include "ConvectionKernels_IndexSelectorHDR.h"
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        namespace BC67
+        {
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt15 MUInt15;
+
+            struct WorkInfo // Bundles a mode, an error value, endpoints and two index arrays (ParallelMath types); presumably one candidate block encoding -- confirm.
+            {
+                MUInt15 m_mode;
+                MFloat m_error;
+                MUInt15 m_ep[3][2][4]; // NOTE(review): dimensions look like [subset][endpoint][channel] -- confirm against the users of this struct
+                MUInt15 m_indexes[16]; // 16 entries; presumably one per pixel of a 4x4 block -- confirm
+                MUInt15 m_indexes2[16]; // second 16-entry index array; presumably the separate (alpha) index set -- confirm
+
+                union // m_partition and m_isr share storage; which member is valid is not recorded here (presumably implied by m_mode)
+                {
+                    MUInt15 m_partition;
+                    struct IndexSelectorAndRotation
+                    {
+                        MUInt15 m_indexSelector;
+                        MUInt15 m_rotation;
+                    } m_isr;
+                } m_u;
+            };
+        }
+
+        namespace BC7Data
+        {
+            enum AlphaMode // How a BC7 mode carries alpha; stored in BC7ModeInfo::m_alphaMode.
+            {
+                AlphaMode_Combined, // g_modes rows using this have alpha bits but zero alpha index bits
+                AlphaMode_Separate, // g_modes rows using this have both alpha bits and alpha index bits
+                AlphaMode_None, // g_modes rows using this have zero alpha bits
+            };
+
+            enum PBitMode // P-bit arrangement of a BC7 mode; stored in BC7ModeInfo::m_pBitMode.
+            {
+                PBitMode_PerEndpoint, // by name: one P-bit per endpoint
+                PBitMode_PerSubset, // by name: one P-bit per subset
+                PBitMode_None // by name: the mode has no P-bit
+            };
+
+            struct BC7ModeInfo // Static description of one BC7 block mode; element type of g_modes.
+            {
+                PBitMode m_pBitMode;
+                AlphaMode m_alphaMode;
+                int m_rgbBits; // presumably endpoint bits per color channel -- confirm against the BC7 format reference
+                int m_alphaBits; // 0 in every AlphaMode_None row of g_modes
+                int m_partitionBits; // 0 exactly in the g_modes rows whose m_numSubsets is 1
+                int m_numSubsets; // 1..3 in g_modes
+                int m_indexBits; // 2..4 in g_modes, the same range that has non-NULL entries in g_weightTables
+                int m_alphaIndexBits; // non-zero only in the AlphaMode_Separate rows of g_modes
+                bool m_hasIndexSelector; // true only for mode 4 in g_modes
+            };
+
+            BC7ModeInfo g_modes[] = // One row per BC7 mode (row index = the trailing "// N"); columns follow BC7ModeInfo member order.
+            {
+                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
+                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
+                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
+                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)
+
+                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
+                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
+                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
+                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
+            };
+
+            const int g_weight2[] = { 0, 21, 43, 64 }; // 4 weights on a 0..64 scale
+            const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; // 8 weights on a 0..64 scale
+            const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // 16 weights on a 0..64 scale
+
+            const int *g_weightTables[] = // g_weightTables[n] is g_weightN for n = 2..4; slots 0 and 1 are NULL.
+            {
+                NULL,
+                NULL,
+                g_weight2,
+                g_weight3,
+                g_weight4
+            };
+
+            struct BC6HModeInfo // Static description of one BC6H mode; element type of g_hdrModes.
+            {
+                uint16_t m_modeID; // NOTE(review): presumably the mode bits written to the block -- confirm
+                bool m_partitioned; // first index of g_hdrModesExistForPrecision ([partitioned][precision])
+                bool m_transformed;
+                int m_aPrec; // g_hdrModesExistForPrecision is true exactly at the (m_partitioned, m_aPrec) pairs present in g_hdrModes
+                int m_bPrec[3]; // three precision values; presumably one per color channel -- confirm
+            };
+
+            // [partitioned][precision]: true where g_hdrModes has a row with that m_partitioned and with m_aPrec == precision
+            bool g_hdrModesExistForPrecision[2][17] =
+            {
+                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
+                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
+                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
+            };
+
+            BC6HModeInfo g_hdrModes[] = // Columns: { m_modeID, m_partitioned, m_transformed, m_aPrec, { m_bPrec[0..2] } }.
+            {
+                { 0x00, true,  true,  10,{ 5, 5, 5 } },
+                { 0x01, true,  true,  7,{ 6, 6, 6 } },
+                { 0x02, true,  true,  11,{ 5, 4, 4 } },
+                { 0x06, true,  true,  11,{ 4, 5, 4 } },
+                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
+                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
+                { 0x12, true,  true,  8,{ 6, 5, 5 } },
+                { 0x16, true,  true,  8,{ 5, 6, 5 } },
+                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
+                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
+                { 0x03, false, false, 10,{ 10, 10, 10 } },
+                { 0x07, false, true,  11,{ 9, 9, 9 } },
+                { 0x0b, false, true,  12,{ 8, 8, 8 } },
+                { 0x0f, false, true,  16,{ 4, 4, 4 } },
+            };
+
+            const int g_maxHDRPrecision = 16; // equals the largest m_aPrec in g_hdrModes and the last column index of g_hdrModesExistForPrecision
+
+            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]); // element count of g_hdrModes
+
+            static uint16_t g_partitionMap[64] = // 64 masks of 16 bits. NOTE(review): presumably bit i is the subset (0 or 1) of pixel i for each two-subset partition -- confirm.
+            {
+                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
+                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
+                0xC800, 0xFFEC, 0xFE80, 0xE800,
+                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
+                0xF710, 0x008E, 0x7100, 0x08CE,
+                0x008C, 0x7310, 0x3100, 0x8CCE,
+                0x088C, 0x3110, 0x6666, 0x366C,
+                0x17E8, 0x0FF0, 0x718E, 0x399C,
+                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
+                0x3c3c, 0x55aa, 0x9696, 0xa55a,
+                0x73ce, 0x13c8, 0x324c, 0x3bdc,
+                0x6996, 0xc33c, 0x9966, 0x660,
+                0x272, 0x4e4, 0x4e40, 0x2720,
+                0xc936, 0x936c, 0x39c6, 0x639c,
+                0x9336, 0x9cc6, 0x817e, 0xe718,
+                0xccf0, 0xfcc, 0x7744, 0xee22,
+            };
+
+            static uint32_t g_partitionMap2[64] = // 64 words of 32 bits. NOTE(review): presumably 2 bits per pixel giving the subset (0..2) for each three-subset partition -- confirm.
+            {
+                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
+                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
+                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
+                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
+                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
+                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
+                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
+                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
+                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
+                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
+                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
+                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
+                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
+                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
+                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
+                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
+            };
+
+            static int g_fixupIndexes2[64] = // 64 entries, each 2, 6, 8 or 15. NOTE(review): presumably the fix-up (anchor) pixel of the second subset per two-subset partition -- confirm.
+            {
+                15,15,15,15,
+                15,15,15,15,
+                15,15,15,15,
+                15,15,15,15,
+                15, 2, 8, 2,
+                2, 8, 8,15,
+                2, 8, 2, 2,
+                8, 8, 2, 2,
+
+                15,15, 6, 8,
+                2, 8,15,15,
+                2, 8, 2, 2,
+                2,15,15, 6,
+                6, 2, 6, 8,
+                15,15, 2, 2,
+                15,15,15,15,
+                15, 2, 2,15,
+            };
+
+            static int g_fixupIndexes3[64][2] = // 64 pairs of values no larger than 15. NOTE(review): presumably the fix-up (anchor) pixels of subsets 1 and 2 per three-subset partition -- confirm.
+            {
+                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
+                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
+                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
+                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
+                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
+                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
+                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
+                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
+
+                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
+                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
+                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
+                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
+                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
+                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
+                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
+                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
+            };
+
+            static const unsigned char g_fragments[] = // Back-to-back runs of values in 0..15; each row's trailing "// offset, count" matches one { offset, count } entry of g_shapeRanges.
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
+                0, 1, 2, 3,  // 16, 4
+                0, 1, 4,  // 20, 3
+                0, 1, 2, 4,  // 23, 4
+                2, 3, 7,  // 27, 3
+                1, 2, 3, 7,  // 30, 4
+                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
+                0, 1, 4, 8,  // 42, 4
+                0, 1, 2, 4, 5, 8,  // 46, 6
+                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
+                1, 4, 5, 6, 9,  // 60, 5
+                2, 5, 6, 7, 10,  // 65, 5
+                5, 6, 9, 10,  // 70, 4
+                2, 3, 7, 11,  // 74, 4
+                1, 2, 3, 6, 7, 11,  // 78, 6
+                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
+                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
+                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
+                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
+                0, 4, 8, 12,  // 128, 4
+                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
+                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
+                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
+                3, 6, 7, 8, 9, 12,  // 158, 6
+                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
+                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
+                5, 8, 9, 10, 13,  // 192, 5
+                8, 12, 13,  // 197, 3
+                4, 8, 12, 13,  // 200, 4
+                2, 3, 6, 9, 12, 13,  // 204, 6
+                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
+                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
+                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
+                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
+                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
+                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
+                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
+                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
+                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
+                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
+                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
+                6, 9, 10, 11, 14,  // 317, 5
+                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
+                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
+                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
+                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
+                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
+                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
+                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
+                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
+                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
+                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
+                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
+                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
+                8, 12, 13, 14,  // 418, 4
+                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
+                4, 8, 9, 12, 13, 14,  // 430, 6
+                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
+                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
+                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
+                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
+                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
+                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
+                3, 7, 11, 15,  // 490, 4
+                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
+                0, 4, 5, 10, 11, 15,  // 502, 6
+                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
+                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
+                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
+                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
+                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
+                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
+                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
+                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
+                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
+                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
+                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
+                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
+                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
+                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
+                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
+                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
+                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
+                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
+                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
+                0, 1, 5, 10, 14, 15,  // 669, 6
+                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
+                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
+                11, 14, 15,  // 691, 3
+                7, 11, 14, 15,  // 694, 4
+                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
+                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
+                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
+                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
+                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
+                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
+                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
+                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
+                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
+                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
+                11, 13, 14, 15,  // 792, 4
+                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
+                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
+                7, 10, 11, 13, 14, 15,  // 814, 6
+                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
+                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
+                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
+                12, 13, 14, 15,  // 848, 4
+                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
+                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
+                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
+                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
+                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
+                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
+                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
+                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
+                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
+                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
+                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
+                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
+                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
+                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
+                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
+                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
+                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
+                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
+                0, 2,  // 1040, 2
+                1, 3,  // 1042, 2
+                0, 1, 4, 5,  // 1044, 4
+                0, 1, 2, 4, 5,  // 1048, 5
+                2, 3, 6,  // 1053, 3
+                0, 2, 4, 6,  // 1056, 4
+                1, 2, 5, 6,  // 1060, 4
+                0, 1, 2, 3, 5, 6,  // 1064, 6
+                0, 1, 2, 4, 5, 6,  // 1070, 6
+                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
+                0, 3, 4, 7,  // 1083, 4
+                0, 1, 2, 3, 4, 7,  // 1087, 6
+                1, 3, 5, 7,  // 1093, 4
+                2, 3, 6, 7,  // 1097, 4
+                1, 2, 3, 6, 7,  // 1101, 5
+                1, 2, 3, 5, 6, 7,  // 1106, 6
+                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
+                4, 5, 6, 7,  // 1119, 4
+                0, 8,  // 1123, 2
+                0, 1, 4, 5, 8,  // 1125, 5
+                0, 1, 8, 9,  // 1130, 4
+                4, 5, 8, 9,  // 1134, 4
+                0, 1, 4, 5, 8, 9,  // 1138, 6
+                2, 6, 8, 9,  // 1144, 4
+                6, 7, 8, 9,  // 1148, 4
+                0, 2, 4, 6, 8, 10,  // 1152, 6
+                1, 2, 5, 6, 9, 10,  // 1158, 6
+                0, 3, 4, 7, 9, 10,  // 1164, 6
+                0, 1, 2, 8, 9, 10,  // 1170, 6
+                4, 5, 6, 8, 9, 10,  // 1176, 6
+                3, 11,  // 1182, 2
+                2, 3, 6, 7, 11,  // 1184, 5
+                0, 3, 8, 11,  // 1189, 4
+                0, 3, 4, 7, 8, 11,  // 1193, 6
+                1, 3, 5, 7, 9, 11,  // 1199, 6
+                2, 3, 10, 11,  // 1205, 4
+                1, 5, 10, 11,  // 1209, 4
+                4, 5, 10, 11,  // 1213, 4
+                6, 7, 10, 11,  // 1217, 4
+                2, 3, 6, 7, 10, 11,  // 1221, 6
+                1, 2, 3, 9, 10, 11,  // 1227, 6
+                5, 6, 7, 9, 10, 11,  // 1233, 6
+                8, 9, 10, 11,  // 1239, 4
+                4, 12,  // 1243, 2
+                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
+                8, 9, 12,  // 1253, 3
+                0, 4, 5, 8, 9, 12,  // 1256, 6
+                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
+                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
+                1, 5, 9, 13,  // 1276, 4
+                6, 7, 9, 13,  // 1280, 4
+                1, 4, 7, 10, 13,  // 1284, 5
+                1, 6, 8, 11, 13,  // 1289, 5
+                0, 1, 12, 13,  // 1294, 4
+                4, 5, 12, 13,  // 1298, 4
+                0, 1, 6, 7, 12, 13,  // 1302, 6
+                0, 1, 4, 8, 12, 13,  // 1308, 6
+                8, 9, 12, 13,  // 1314, 4
+                4, 8, 9, 12, 13,  // 1318, 5
+                4, 5, 8, 9, 12, 13,  // 1323, 6
+                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
+                0, 1, 6, 10, 12, 13,  // 1336, 6
+                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
+                0, 1, 10, 11, 12, 13,  // 1349, 6
+                2, 4, 7, 9, 14,  // 1355, 5
+                4, 5, 10, 14,  // 1360, 4
+                2, 6, 10, 14,  // 1364, 4
+                2, 5, 8, 11, 14,  // 1368, 5
+                0, 2, 12, 14,  // 1373, 4
+                8, 10, 12, 14,  // 1377, 4
+                4, 6, 8, 10, 12, 14,  // 1381, 6
+                13, 14,  // 1387, 2
+                9, 10, 13, 14,  // 1389, 4
+                5, 6, 9, 10, 13, 14,  // 1393, 6
+                0, 1, 2, 12, 13, 14,  // 1399, 6
+                4, 5, 6, 12, 13, 14,  // 1405, 6
+                8, 9, 12, 13, 14,  // 1411, 5
+                8, 9, 10, 12, 13, 14,  // 1416, 6
+                7, 15,  // 1422, 2
+                0, 5, 10, 15,  // 1424, 4
+                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
+                10, 11, 15,  // 1436, 3
+                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
+                3, 6, 7, 10, 11, 15,  // 1446, 6
+                12, 15,  // 1452, 2
+                0, 3, 12, 15,  // 1454, 4
+                4, 7, 12, 15,  // 1458, 4
+                0, 3, 6, 9, 12, 15,  // 1462, 6
+                0, 3, 5, 10, 12, 15,  // 1468, 6
+                8, 11, 12, 15,  // 1474, 4
+                5, 6, 8, 11, 12, 15,  // 1478, 6
+                4, 7, 8, 11, 12, 15,  // 1484, 6
+                1, 3, 13, 15,  // 1490, 4
+                9, 11, 13, 15,  // 1494, 4
+                5, 7, 9, 11, 13, 15,  // 1498, 6
+                2, 3, 14, 15,  // 1504, 4
+                2, 3, 4, 5, 14, 15,  // 1508, 6
+                6, 7, 14, 15,  // 1514, 4
+                2, 3, 5, 9, 14, 15,  // 1518, 6
+                2, 3, 8, 9, 14, 15,  // 1524, 6
+                10, 14, 15,  // 1530, 3
+                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
+                2, 3, 7, 11, 14, 15,  // 1540, 6
+                10, 11, 14, 15,  // 1546, 4
+                7, 10, 11, 14, 15,  // 1550, 5
+                6, 7, 10, 11, 14, 15,  // 1555, 6
+                1, 2, 3, 13, 14, 15,  // 1561, 6
+                5, 6, 7, 13, 14, 15,  // 1567, 6
+                10, 11, 13, 14, 15,  // 1573, 5
+                9, 10, 11, 13, 14, 15,  // 1578, 6
+                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
+                9, 10, 12, 13, 14, 15,  // 1592, 6
+                8, 11, 12, 13, 14, 15,  // 1598, 6
+                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
+            };
+            static const int g_shapeRanges[][2] = // { offset, count } of each run in g_fragments; 243 entries, so shape ids run 0..242.
+            {
+                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
+                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
+                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
+                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
+                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
+                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
+                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
+                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
+                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
+                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
+                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
+                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
+                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
+                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
+                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
+                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
+                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
+                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
+                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
+                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
+                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
+                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
+                { 1604, 8 },
+            };
+            static const int g_shapes1[][2] = // NOTE(review): single pair { 0, 16 }, identical to g_shapeRanges[0]; the meaning of the pair is not evident from this table alone.
+            {
+                { 0, 16 }
+            };
+            static const int g_shapes2[64][2] = // 64 rows of two shape ids (indices into g_shapeRanges); e.g. row 0 = { 33, 96 } selects runs {0,1,4,5,8,9,12,13} and {2,3,6,7,10,11,14,15}, together covering 0..15.
+            {
+                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
+                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
+                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
+                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
+                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
+                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
+                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
+                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
+            };
+            static const int g_shapes3[64][3] = // 64 rows of three shape ids (indices into g_shapeRanges); e.g. the three runs of row 0 = { 148, 160, 240 } together cover 0..15 exactly once.
+            {
+                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
+                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
+                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
+                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
+                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
+                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
+                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
+                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
+            };
+
+            static const int g_shapeList1[] = // Only shape id 0, whose g_shapeRanges entry { 0, 16 } is the full 16-value run.
+            {
+                0,
+            };
+
+            static const int g_shapeList2[] = // Shape ids 1..128 in ascending order; every id used in g_shapes2 falls in this range.
+            {
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
+                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
+                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
+                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+                122, 123, 124, 125, 126, 127, 128,
+            };
+
+            static const int g_shapeList12[] = // Shape ids 0..128: the contents of g_shapeList1 followed by those of g_shapeList2.
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+                121, 122, 123, 124, 125, 126, 127, 128,
+            };
+
+            static const int g_shapeList3[] = // Ascending shape ids. NOTE(review): appears to be the de-duplicated set of ids referenced by g_shapes3 -- confirm.
+            {
+                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
+                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
+                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
+                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
+                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
+                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
+                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
+                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
+                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
+                235, 236, 237, 238, 239, 240, 241, 242,
+            };
+
+            static const int g_shapeList3Short[] = // 36 ascending ids; they match the distinct shape ids used by the first 16 rows of g_shapes3.
+            {
+                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
+                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
+                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
+                233, 237, 240,
+            };
+
+            static const int g_shapeListAll[] = // Every shape id, 0..242 in ascending order (one per g_shapeRanges entry).
+            {
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
+                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
+                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
+                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
+                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
+                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
+                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+                242,
+            };
+
+            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]); // Entry counts of the lookup tables, computed from the array sizes.
+            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
+            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]); // also the length of SinglePlaneTemporaries::unfinishedRGBA
+            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
+            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
+            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]); // also the length of the per-shape arrays in SinglePlaneTemporaries
+            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]); // also the length of SinglePlaneTemporaries::fragmentBestIndexes
+        }
+
+        struct PackingVector
+        {
+            uint32_t m_vector[4];   // 128 bits of output, filled from bit 0 of m_vector[0] upwards
+            int m_offset;           // number of bits written so far
+
+            // Clears the buffer and rewinds the write position.
+            void Init()
+            {
+                m_vector[0] = m_vector[1] = m_vector[2] = m_vector[3] = 0;
+                m_offset = 0;
+            }
+
+            // Preloads the buffer from v (copied in whole 32-bit words) and marks `bits` bits as written.
+            void InitPacked(const uint32_t *v, int bits)
+            {
+                for (int word = 0; word * 32 < bits; ++word)
+                    m_vector[word] = v[word];
+
+                m_offset = bits;
+            }
+
+            // ORs value in at the write position and advances by `bits`; value is not masked to `bits` bits here.
+            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
+            {
+                const uint32_t wide = static_cast<uint32_t>(value);
+                const int word = m_offset >> 5;
+                const int shift = m_offset & 0x1f;
+
+                m_vector[word] |= (wide << shift) & static_cast<uint32_t>(0xffffffff);
+
+                // Whatever does not fit in the current word continues at bit 0 of the next one.
+                const int spill = shift + bits - 32;
+                if (spill > 0)
+                    m_vector[word + 1] |= wide >> (bits - spill);
+
+                m_offset += bits;
+            }
+
+            // Emits the 16 bytes, least significant byte of each word first; all 128 bits must have been written.
+            inline void Flush(uint8_t* output)
+            {
+                assert(m_offset == 128);
+
+                for (int byteIndex = 0; byteIndex < 16; ++byteIndex)
+                    output[byteIndex] = static_cast<uint8_t>((m_vector[byteIndex / 4] >> ((byteIndex % 4) * 8)) & 0xff);
+            }
+        };
+
+
+        struct UnpackingVector
+        {
+            uint32_t m_vector[4]; // unread bits; bit 0 of m_vector[0] is the next one returned
+
+            void Init(const uint8_t *bytes) // Loads 16 bytes, assembling each 32-bit word low byte first.
+            {
+                for (int i = 0; i < 4; i++)
+                    m_vector[i] = 0;
+
+                for (int b = 0; b < 16; b++)
+                    m_vector[b / 4] |= static_cast<uint32_t>(bytes[b]) << ((b % 4) * 8); // widen first: a promoted int cannot represent 0x80..0xff << 24
+            }
+
+            inline void UnpackStart(uint32_t *v, int bits) // Copies the words covering the first `bits` bits into v, then discards those bits.
+            {
+                for (int b = 0; b < bits; b += 32)
+                    v[b / 32] = m_vector[b / 32];
+
+                int entriesShifted = bits / 32;
+                int carry = bits % 32;
+
+                for (int i = entriesShifted; i < 4; i++)
+                    m_vector[i - entriesShifted] = m_vector[i];
+
+                // A zero carry is skipped: the words are already aligned and a shift by 32 would be undefined.
+                if (carry)
+                {
+                    uint32_t bitMask = (static_cast<uint32_t>(1) << carry) - 1;
+                    for (int i = 0; i < 4; i++)
+                    {
+                        m_vector[i] >>= carry;
+                        if (i != 3)
+                            m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - carry);
+                    }
+                }
+            }
+
+            inline ParallelMath::ScalarUInt16 Unpack(int bits) // Removes and returns the next `bits` bits.
+            {
+                if (bits == 0)
+                    return static_cast<ParallelMath::ScalarUInt16>(0); // nothing to consume; also keeps the (32 - bits) shift below in range
+
+                uint32_t bitMask = (static_cast<uint32_t>(1) << bits) - 1;
+                ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
+                for (int i = 0; i < 4; i++)
+                {
+                    m_vector[i] >>= bits;
+                    if (i != 3)
+                        m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
+                }
+                return result;
+            }
+        };
+
+        // Returns (v * 64 + 30) / 31 for unsigned data, or (v * 32 + bias) / 31 for signed data, where bias is 30 carrying the sign of v.
+        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
+        {
+            if (!isSigned)
+                return (v * 64.0f + 30.0f) / 31.0f;
+
+            ParallelMath::FloatCompFlag isNegative = ParallelMath::Less(v, ParallelMath::MakeFloatZero());
+            ParallelMath::Float bias = ParallelMath::Select(isNegative, ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
+            return (v * 32.0f + bias) / 31.0f;
+        }
+
+        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v) // Returns (|v| * 31) >> 5 with bit 15 set when v is negative (sign-magnitude).
+        {
+#ifdef CVTT_ENABLE_ASSERTS
+            for (int i = 0; i < ParallelMath::ParallelSize; i++)
+                assert(ParallelMath::Extract(v, i) != -32768);
+#endif
+            // Split v into a sign flag and a 15-bit magnitude.
+            ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
+            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
+
+            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
+            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
+            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
+            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
+
+            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
+        }
+
+        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v) // Returns (v * 31) >> 6, narrowed to UInt15.
+        {
+            return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
+        }
+
+        // Applies UnscaleHDRValueSigned or UnscaleHDRValueUnsigned (chosen by isSigned) to all 2 x 3 endpoint components.
+        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
+        {
+            for (int slot = 0; slot < 6; ++slot)
+            {
+                const ParallelMath::AInt16 &src = inEP[slot / 3][slot % 3];
+                ParallelMath::AInt16 &dst = outEP[slot / 3][slot % 3];
+                if (isSigned)
+                    dst = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(src)));
+                else
+                    dst = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(src)));
+            }
+        }
+
+        struct SinglePlaneTemporaries // Scratch storage for BC7Computer::TrySinglePlane, which declares one instance as a local.
+        {
+            UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll]; // initial RGB endpoints, indexed by shape
+            UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12]; // initial RGBA endpoints, indexed by shape
+
+            ParallelMath::UInt15 fragmentBestIndexes[BC7Data::g_numFragments]; // best index per fragment entry, addressed as shapeStart + pxi
+            ParallelMath::UInt15 shapeBestEP[BC7Data::g_numShapesAll][2][4]; // best endpoints as [shape][endpoint][channel]
+            ParallelMath::Float shapeBestError[BC7Data::g_numShapesAll]; // lowest error found per shape; reset to FLT_MAX for each mode
+        };
+    }
+}
+
+// Re-places both ends along the original[0] -> original[1] line using the two factors from Util::ComputeTweakFactors, clamped to [0, 255].
+void cvtt::Internal::BC7Computer::TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
+{
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    float factors[2];
+    Util::ComputeTweakFactors(tweak, range, factors);
+
+    MFloat start = ParallelMath::ToFloat(original[0]);
+    MFloat span = ParallelMath::ToFloat(original[1]) - start;
+    for (int i = 0; i < 2; ++i)
+        result[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(start + span * factors[i], 0.0f, 255.0f), &rtn);
+}
+
+void cvtt::Internal::BC7Computer::Quantize(MUInt15* color, int bits, int channels) // Per channel: ((c << bits) - c + 127 + 2^(7 - bits)) >> 8.
+{
+    for (int remaining = channels; remaining > 0; --remaining, ++color)
+        *color = ParallelMath::RightShift(((*color << bits) - *color) + ParallelMath::MakeUInt15(127 + (1 << (7 - bits))), 8);
+}
+
+// Quantizes each channel to `bits` bits and appends the parity bit p as the new lowest bit,
+// leaving a (bits + 1)-bit value in color[ch].
+void cvtt::Internal::BC7Computer::QuantizeP(MUInt15* color, int bits, uint16_t p, int channels)
+{
+    // The rounding bias depends on whether the parity bit is set.
+    int16_t addend = p ? static_cast<int16_t>((1 << (8 - bits)) - 1) : static_cast<int16_t>(255);
+
+    for (int ch = 0; ch < channels; ch++)
+    {
+        MUInt16 wide = ParallelMath::LosslessCast<MUInt16>::Cast(color[ch]);
+        // ((c << (bits + 1)) - c + addend) >> 9
+        MUInt16 scaled = ParallelMath::RightShift((wide << (bits + 1)) - wide + addend, 9);
+        MUInt16 withParity = (scaled << 1) | ParallelMath::MakeUInt16(p);
+        color[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(withParity);
+    }
+}
+
+// Expands `bits`-bit channels to 8 bits: left-align the value, then OR in a copy of it shifted right by `bits`.
+void cvtt::Internal::BC7Computer::Unquantize(MUInt15* color, int bits, int channels)
+{
+    for (int ch = 0; ch < channels; ch++)
+    {
+        MUInt15 aligned = color[ch] << (8 - bits);
+        color[ch] = aligned | ParallelMath::RightShift(aligned, bits);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2]) // RGB: 4 bits + parity p[epi], expanded from 5 bits; alpha forced to 255.
+{
+    for (int epi = 0; epi != 2; ++epi)
+    {
+        QuantizeP(ep[epi], 4, p[epi], 3);
+        Unquantize(ep[epi], 5, 3);
+        ep[epi][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints1(MUInt15 ep[2][4], uint16_t p) // RGB: 6 bits + one shared parity bit, expanded from 7 bits; alpha forced to 255.
+{
+    for (int epi = 0; epi != 2; ++epi)
+    {
+        QuantizeP(ep[epi], 6, p, 3);
+        Unquantize(ep[epi], 7, 3);
+        ep[epi][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints2(MUInt15 ep[2][4]) // RGB: 5 bits without parity, expanded back to 8; alpha forced to 255.
+{
+    for (int epi = 0; epi != 2; ++epi)
+    {
+        Quantize(ep[epi], 5, 3);
+        Unquantize(ep[epi], 5, 3);
+        ep[epi][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2]) // RGB: 7 bits + parity p[epi] (already 8 bits wide); alpha forced to 255.
+{
+    for (int epi = 0; epi != 2; ++epi)
+    {
+        QuantizeP(ep[epi], 7, p[epi], 3);
+        ep[epi][3] = ParallelMath::MakeUInt15(255);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2]) // RGB round-trips through 5 bits, alpha through 6 bits.
+{
+    for (int epi = 0; epi != 2; ++epi)
+    {
+        MUInt15 *alpha = epA + epi;
+        Quantize(epRGB[epi], 5, 3);
+        Unquantize(epRGB[epi], 5, 3);
+        Quantize(alpha, 6, 1);
+        Unquantize(alpha, 6, 1);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2]) // RGB round-trips through 7 bits; epA is left untouched.
+{
+    // Alpha is full precision
+    (void)epA;
+
+    for (int epi = 0; epi != 2; ++epi)
+    {
+        Quantize(epRGB[epi], 7, 3);
+        Unquantize(epRGB[epi], 7, 3);
+    }
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2]) // RGBA: 7 bits + parity p[epi] on all four channels.
+{
+    for (int epi = 0; epi != 2; ++epi)
+        QuantizeP(ep[epi], 7, p[epi], 4);
+}
+
+void cvtt::Internal::BC7Computer::CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2]) // RGBA: 5 bits + parity p[epi], expanded from 6 bits.
+{
+    for (int epi = 0; epi != 2; ++epi)
+    {
+        QuantizeP(ep[epi], 5, p[epi], 4);
+        Unquantize(ep[epi], 6, 4);
+    }
+}
+
+void cvtt::Internal::BC7Computer::TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
+{ // Tries to encode one fragment as a solid color and updates shapeBestError/shapeBestEP/fragmentBestIndexes in the lanes where that is better.
+    MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
+
+    MUInt15 intAverage[4];
+    for (int ch = 0; ch < 4; ch++)
+        intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
+
+    MUInt15 eps[2][4];
+    MUInt15 reconstructed[4];
+    MUInt15 index = ParallelMath::MakeUInt15(0);
+    // Defaults are black with alpha 255; channels at or beyond numRealChannels keep these values.
+    for (int epi = 0; epi < 2; epi++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+            eps[epi][ch] = ParallelMath::MakeUInt15(0);
+        eps[epi][3] = ParallelMath::MakeUInt15(255);
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        reconstructed[ch] = ParallelMath::MakeUInt15(0);
+    reconstructed[3] = ParallelMath::MakeUInt15(255);
+
+    // Depending on the target index and parity bits, there are multiple valid solid colors.
+    // We want to find the one closest to the actual average.
+    // Lanes flagged in punchThroughInvalid[table.m_pBits] never adopt that table.
+    for (int t = 0; t < numTables; t++)
+    {
+        const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
+
+        ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
+
+        MUInt15 candidateReconstructed[4];
+        MUInt15 candidateEPs[2][4];
+        // Look up the table entry for the rounded average, one lane and one channel at a time.
+        for (int i = 0; i < ParallelMath::ParallelSize; i++)
+        {
+            for (int ch = 0; ch < numRealChannels; ch++)
+            {
+                ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
+                assert(avgValue >= 0 && avgValue <= 255);
+
+                const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
+
+                ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
+                ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
+                ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
+            }
+        }
+
+        MFloat avgError = ParallelMath::MakeFloatZero();
+        for (int ch = 0; ch < numRealChannels; ch++)
+        {
+            MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
+            avgError = avgError + delta * delta * channelWeightsSq[ch];
+        }
+
+        ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
+        better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
+
+        if (ParallelMath::AnySet(better))
+        {
+            ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
+
+            MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
+
+            ParallelMath::ConditionalSet(index, better, candidateIndex);
+
+            for (int ch = 0; ch < numRealChannels; ch++)
+                ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
+
+            for (int epi = 0; epi < 2; epi++)
+                for (int ch = 0; ch < numRealChannels; ch++)
+                    ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
+        }
+    }
+    // Score the chosen solid color against every pixel of the fragment.
+    AggregatedError<4> aggError;
+    for (int pxi = 0; pxi < shapeLength; pxi++)
+    {
+        int px = fragmentStart[pxi];
+
+        BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
+    }
+
+    MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
+    // Adopt the solid color only in lanes where it beats the best error recorded so far.
+    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
+    if (ParallelMath::AnySet(better))
+    {
+        shapeBestError = ParallelMath::Min(shapeBestError, error);
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < numRealChannels; ch++)
+                ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
+        }
+
+        for (int pxi = 0; pxi < shapeLength; pxi++)
+            ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
+    }
+}
+
+void cvtt::Internal::BC7Computer::TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    float channelWeightsSq[4];
+
+    for (int ch = 0; ch < 4; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    SinglePlaneTemporaries temps;
+
+    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
+    ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
+    for (int px = 0; px < 16; px++)
+    {
+        MUInt15 a = pixels[px][3];
+        maxAlpha = ParallelMath::Max(maxAlpha, a);
+        minAlpha = ParallelMath::Min(minAlpha, a);
+
+        isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
+    }
+
+    ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
+    ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
+
+    bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
+
+    // Try RGB modes if any block has a min alpha 251 or higher
+    bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
+
+    // Try mode 7 if any block has alpha.
+    // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
+    // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
+    // situations, and only by at most 1 unit of error per pixel.
+    bool allowMode7 = anyBlockHasAlpha || (encodingPlan.mode7RGBPartitionEnabled != 0);
+
+    MFloat preWeightedPixels[16][4];
+
+    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
+
+    // Get initial RGB endpoints
+    if (allowRGBModes)
+    {
+        const uint8_t *shapeList = encodingPlan.rgbShapeList;
+        int numShapesToEvaluate = encodingPlan.rgbNumShapesToEvaluate;
+
+        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            int shapeStart = BC7Data::g_shapeRanges[shape][0];
+            int shapeSize = BC7Data::g_shapeRanges[shape][1];
+
+            EndpointSelector<3, 8> epSelector;
+
+            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+            {
+                for (int spx = 0; spx < shapeSize; spx++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + spx];
+                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
+                }
+                epSelector.FinishPass(epPass);
+            }
+            temps.unfinishedRGB[shape] = epSelector.GetEndpoints(channelWeights);
+        }
+    }
+
+    // Get initial RGBA endpoints
+    {
+        const uint8_t *shapeList = encodingPlan.rgbaShapeList;
+        int numShapesToEvaluate = encodingPlan.rgbaNumShapesToEvaluate;
+
+        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            if (anyBlockHasAlpha || !allowRGBModes)
+            {
+                int shapeStart = BC7Data::g_shapeRanges[shape][0];
+                int shapeSize = BC7Data::g_shapeRanges[shape][1];
+
+                EndpointSelector<4, 8> epSelector;
+
+                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+                {
+                    for (int spx = 0; spx < shapeSize; spx++)
+                    {
+                        int px = BC7Data::g_fragments[shapeStart + spx];
+                        epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
+                    }
+                    epSelector.FinishPass(epPass);
+                }
+                temps.unfinishedRGBA[shape] = epSelector.GetEndpoints(channelWeights);
+            }
+            else
+            {
+                temps.unfinishedRGBA[shape] = temps.unfinishedRGB[shape].ExpandTo<4>(255);
+            }
+        }
+    }
+
+    for (uint16_t mode = 0; mode <= 7; mode++)
+    {
+        if (mode == 4 || mode == 5)
+            continue;
+
+        if (mode < 4 && !allowRGBModes)
+            continue;
+
+        if (mode == 7 && !allowMode7)
+            continue;
+
+        uint64_t partitionEnabledBits = 0;
+        switch (mode)
+        {
+        case 0:
+            partitionEnabledBits = encodingPlan.mode0PartitionEnabled;
+            break;
+        case 1:
+            partitionEnabledBits = encodingPlan.mode1PartitionEnabled;
+            break;
+        case 2:
+            partitionEnabledBits = encodingPlan.mode2PartitionEnabled;
+            break;
+        case 3:
+            partitionEnabledBits = encodingPlan.mode3PartitionEnabled;
+            break;
+        case 6:
+            partitionEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
+            break;
+        case 7:
+            if (anyBlockHasAlpha)
+                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
+            else
+                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
+            break;
+        default:
+            break;
+        }
+
+        bool isRGB = (mode < 4);
+
+        unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
+        int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
+        int indexPrec = BC7Data::g_modes[mode].m_indexBits;
+
+        int parityBitMax = 1;
+        if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
+            parityBitMax = 4;
+        else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
+            parityBitMax = 2;
+
+        int numRealChannels = isRGB ? 3 : 4;
+
+        int numShapes;
+        const int *shapeList;
+
+        if (numSubsets == 1)
+        {
+            numShapes = BC7Data::g_numShapes1;
+            shapeList = BC7Data::g_shapeList1;
+        }
+        else if (numSubsets == 2)
+        {
+            numShapes = BC7Data::g_numShapes2;
+            shapeList = BC7Data::g_shapeList2;
+        }
+        else
+        {
+            assert(numSubsets == 3);
+            if (numPartitions == 16)
+            {
+                numShapes = BC7Data::g_numShapes3Short;
+                shapeList = BC7Data::g_shapeList3Short;
+            }
+            else
+            {
+                assert(numPartitions == 64);
+                numShapes = BC7Data::g_numShapes3;
+                shapeList = BC7Data::g_shapeList3;
+            }
+        }
+
+        for (int slot = 0; slot < BC7Data::g_numShapesAll; slot++)
+            temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
+
+        for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
+        {
+            int shape = shapeList[shapeIter];
+
+            int numTweakRounds = 0;
+            if (isRGB)
+                numTweakRounds = encodingPlan.seedPointsForShapeRGB[shape];
+            else
+                numTweakRounds = encodingPlan.seedPointsForShapeRGBA[shape];
+
+            if (numTweakRounds == 0)
+                continue;
+
+            if (numTweakRounds > MaxTweakRounds)
+                numTweakRounds = MaxTweakRounds;
+
+            int shapeStart = BC7Data::g_shapeRanges[shape][0];
+            int shapeLength = BC7Data::g_shapeRanges[shape][1];
+
+            AggregatedError<1> alphaAggError;
+            if (isRGB && anyBlockHasAlpha)
+            {
+                MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
+
+                for (int pxi = 0; pxi < shapeLength; pxi++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + pxi];
+                    MUInt15 original[1] = { pixels[px][3] };
+                    BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
+                }
+            }
+
+            float alphaWeightsSq[1] = { channelWeightsSq[3] };
+            MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
+
+            MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
+
+            for (int tweak = 0; tweak < numTweakRounds; tweak++)
+            {
+                if (isRGB)
+                {
+                    temps.unfinishedRGB[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
+                    tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
+                }
+                else
+                {
+                    temps.unfinishedRGBA[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
+                }
+            }
+
+            ParallelMath::Int16CompFlag punchThroughInvalid[4];
+            for (int pIter = 0; pIter < parityBitMax; pIter++)
+            {
+                punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
+
+                if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
+                {
+                    // Modes 6 and 7 have parity bits that affect alpha
+                    if (pIter == 0)
+                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
+                    else if (pIter == parityBitMax - 1)
+                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
+                    else
+                        punchThroughInvalid[pIter] = isPunchThrough;
+                }
+            }
+
+            for (int pIter = 0; pIter < parityBitMax; pIter++)
+            {
+                if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
+                    continue;
+
+                bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    uint16_t p[2];
+                    p[0] = (pIter & 1);
+                    p[1] = ((pIter >> 1) & 1);
+
+                    MUInt15 ep[2][4];
+
+                    for (int epi = 0; epi < 2; epi++)
+                        for (int ch = 0; ch < 4; ch++)
+                            ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
+
+                    for (int refine = 0; refine < numRefineRounds; refine++)
+                    {
+                        switch (mode)
+                        {
+                        case 0:
+                            CompressEndpoints0(ep, p);
+                            break;
+                        case 1:
+                            CompressEndpoints1(ep, p[0]);
+                            break;
+                        case 2:
+                            CompressEndpoints2(ep);
+                            break;
+                        case 3:
+                            CompressEndpoints3(ep, p);
+                            break;
+                        case 6:
+                            CompressEndpoints6(ep, p);
+                            break;
+                        case 7:
+                            CompressEndpoints7(ep, p);
+                            break;
+                        default:
+                            assert(false);
+                            break;
+                        };
+
+                        MFloat shapeError = ParallelMath::MakeFloatZero();
+
+                        IndexSelector<4> indexSelector;
+                        indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
+
+                        EndpointRefiner<4> epRefiner;
+                        epRefiner.Init(1 << indexPrec, channelWeights);
+
+                        MUInt15 indexes[16];
+
+                        AggregatedError<4> aggError;
+                        for (int pxi = 0; pxi < shapeLength; pxi++)
+                        {
+                            int px = BC7Data::g_fragments[shapeStart + pxi];
+
+                            MUInt15 index;
+                            MUInt15 reconstructed[4];
+
+                            index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
+                            indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
+
+                            if (flags & cvtt::Flags::BC7_FastIndexing)
+                                BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
+                            else
+                            {
+                                MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
+
+                                MUInt15 altIndexes[2];
+                                altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
+
+                                for (int ii = 0; ii < 2; ii++)
+                                {
+                                    indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
+
+                                    MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
+                                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
+                                    error = ParallelMath::Min(error, altError);
+                                    ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
+                                }
+
+                                shapeError = shapeError + error;
+                            }
+
+                            if (refine != numRefineRounds - 1)
+                                epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
+
+                            indexes[pxi] = index;
+                        }
+
+                        if (flags & cvtt::Flags::BC7_FastIndexing)
+                            shapeError = aggError.Finalize(flags, channelWeightsSq);
+
+                        if (isRGB)
+                            shapeError = shapeError + staticAlphaError;
+
+                        ParallelMath::FloatCompFlag shapeErrorBetter;
+                        ParallelMath::Int16CompFlag shapeErrorBetter16;
+
+                        shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shape]);
+                        shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
+
+                        if (ParallelMath::AnySet(shapeErrorBetter16))
+                        {
+                            bool punchThroughOK = true;
+                            if (needPunchThroughCheck)
+                            {
+                                shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
+                                shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
+
+                                if (!ParallelMath::AnySet(shapeErrorBetter16))
+                                    punchThroughOK = false;
+                            }
+
+                            if (punchThroughOK)
+                            {
+                                ParallelMath::ConditionalSet(temps.shapeBestError[shape], shapeErrorBetter, shapeError);
+                                for (int epi = 0; epi < 2; epi++)
+                                    for (int ch = 0; ch < numRealChannels; ch++)
+                                        ParallelMath::ConditionalSet(temps.shapeBestEP[shape][epi][ch], shapeErrorBetter16, ep[epi][ch]);
+
+                                for (int pxi = 0; pxi < shapeLength; pxi++)
+                                    ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
+                            }
+                        }
+
+                        if (refine != numRefineRounds - 1)
+                            epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
+                    } // refine
+                } // tweak
+            } // p
+
+            if (flags & cvtt::Flags::BC7_TrySingleColor)
+            {
+                MUInt15 total[4];
+                for (int ch = 0; ch < 4; ch++)
+                    total[ch] = ParallelMath::MakeUInt15(0);
+
+                for (int pxi = 0; pxi < shapeLength; pxi++)
+                {
+                    int px = BC7Data::g_fragments[shapeStart + pxi];
+                    for (int ch = 0; ch < 4; ch++)
+                        total[ch] = total[ch] + pixels[pxi][ch];
+                }
+
+                MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
+                MFloat average[4];
+                for (int ch = 0; ch < 4; ch++)
+                    average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
+
+                const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
+                MFloat &shapeBestError = temps.shapeBestError[shape];
+                MUInt15 (&shapeBestEP)[2][4] = temps.shapeBestEP[shape];
+                MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
+
+                const cvtt::Tables::BC7SC::Table **scTables = NULL;
+                int numSCTables = 0;
+
+                const cvtt::Tables::BC7SC::Table *tables0[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p00_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p01_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p10_i3,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i1,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i2,
+                    &cvtt::Tables::BC7SC::g_mode0_p11_i3,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables1[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i1,
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i2,
+                    &cvtt::Tables::BC7SC::g_mode1_p0_i3,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i1,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i2,
+                    &cvtt::Tables::BC7SC::g_mode1_p1_i3,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables2[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode2,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables3[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode3_p0,
+                    &cvtt::Tables::BC7SC::g_mode3_p1,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables6[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i1,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i2,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i3,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i4,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i5,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i6,
+                    &cvtt::Tables::BC7SC::g_mode6_p0_i7,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i1,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i2,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i3,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i4,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i5,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i6,
+                    &cvtt::Tables::BC7SC::g_mode6_p1_i7,
+                };
+
+                const cvtt::Tables::BC7SC::Table *tables7[] =
+                {
+                    &cvtt::Tables::BC7SC::g_mode7_p00,
+                    &cvtt::Tables::BC7SC::g_mode7_p01,
+                    &cvtt::Tables::BC7SC::g_mode7_p10,
+                    &cvtt::Tables::BC7SC::g_mode7_p11,
+                };
+
+                switch (mode)
+                {
+                case 0:
+                {
+                    scTables = tables0;
+                    numSCTables = sizeof(tables0) / sizeof(tables0[0]);
+                }
+                break;
+                case 1:
+                {
+                    scTables = tables1;
+                    numSCTables = sizeof(tables1) / sizeof(tables1[0]);
+                }
+                break;
+                case 2:
+                {
+
+                    scTables = tables2;
+                    numSCTables = sizeof(tables2) / sizeof(tables2[0]);
+                }
+                break;
+                case 3:
+                {
+                    scTables = tables3;
+                    numSCTables = sizeof(tables3) / sizeof(tables3[0]);
+                }
+                break;
+                case 6:
+                {
+                    scTables = tables6;
+                    numSCTables = sizeof(tables6) / sizeof(tables6[0]);
+                }
+                break;
+                case 7:
+                {
+                    scTables = tables7;
+                    numSCTables = sizeof(tables7) / sizeof(tables7[0]);
+                }
+                break;
+                default:
+                    assert(false);
+                    break;
+                }
+
+                TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
+            }
+        } // shapeIter
+
+        uint64_t partitionsEnabledBits = 0xffffffffffffffffULL;
+
+        switch (mode)
+        {
+        case 0:
+            partitionsEnabledBits = encodingPlan.mode0PartitionEnabled;
+            break;
+        case 1:
+            partitionsEnabledBits = encodingPlan.mode1PartitionEnabled;
+            break;
+        case 2:
+            partitionsEnabledBits = encodingPlan.mode2PartitionEnabled;
+            break;
+        case 3:
+            partitionsEnabledBits = encodingPlan.mode3PartitionEnabled;
+            break;
+        case 6:
+            partitionsEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
+            break;
+        case 7:
+            if (anyBlockHasAlpha)
+                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
+            else
+                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
+            break;
+        default:
+            break;
+        };
+
+        for (uint16_t partition = 0; partition < numPartitions; partition++)
+        {
+            if (((partitionsEnabledBits >> partition) & 1) == 0)
+                continue;
+
+            const int *partitionShapes;
+            if (numSubsets == 1)
+                partitionShapes = BC7Data::g_shapes1[partition];
+            else if (numSubsets == 2)
+                partitionShapes = BC7Data::g_shapes2[partition];
+            else
+            {
+                assert(numSubsets == 3);
+                partitionShapes = BC7Data::g_shapes3[partition];
+            }
+
+            MFloat totalError = ParallelMath::MakeFloatZero();
+            for (int subset = 0; subset < numSubsets; subset++)
+                totalError = totalError + temps.shapeBestError[partitionShapes[subset]];
+
+            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
+            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+            if (mode == 7 && anyBlockHasAlpha)
+            {
+                // Some lanes could be better, but we filter them out to ensure consistency with scalar
+                bool isRGBAllowedForThisPartition = (((encodingPlan.mode7RGBPartitionEnabled >> partition) & 1) != 0);
+
+                if (!isRGBAllowedForThisPartition)
+                {
+                    errorBetter16 = (errorBetter16 & blockHasNonMaxAlpha);
+                    errorBetter = ParallelMath::Int16FlagToFloat(errorBetter16);
+                }
+            }
+
+            if (ParallelMath::AnySet(errorBetter16))
+            {
+                for (int subset = 0; subset < numSubsets; subset++)
+                {
+                    int shape = partitionShapes[subset];
+                    int shapeStart = BC7Data::g_shapeRanges[shape][0];
+                    int shapeLength = BC7Data::g_shapeRanges[shape][1];
+
+                    for (int epi = 0; epi < 2; epi++)
+                        for (int ch = 0; ch < 4; ch++)
+                            ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shape][epi][ch]);
+
+                    for (int pxi = 0; pxi < shapeLength; pxi++)
+                    {
+                        int px = BC7Data::g_fragments[shapeStart + pxi];
+                        ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
+                    }
+                }
+
+                ParallelMath::ConditionalSet(work.m_error, errorBetter, totalError);
+                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
+                ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
+            }
+        }
+    }
+}
+
+// Searches the dual-plane BC7 modes (4 and 5). For each rotation, three channels form a vector ("RGB")
+// plane and the remaining channel a scalar ("alpha") plane; each plane gets its own endpoints and indexes.
+// Every rotation / index-selector combination with a non-zero tweak count in encodingPlan is tried, and
+// `work` is overwritten for each parallel lane whose summed plane error is lower than work.m_error.
+//   flags           - option bits; BC7_FastIndexing skips the per-pixel neighbouring-index search
+//   pixels          - 16 pixels x 4 channels; floatPixels carries the same data as floats (see Pack)
+//   channelWeights  - per-channel error weights; numRefineRounds is clamped to at least 1
+//   rtn             - rounding scope forwarded to index selection and endpoint refinement
+void cvtt::Internal::BC7Computer::TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
+    // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
+    // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
+    // - Separate alpha channel, then weighted RGB
+    // - Alpha+2 other channels, then the independent channel
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+    // Squared weights are what AggregatedError::Finalize() is given when the plane errors are totalled.
+    float channelWeightsSq[4];
+    for (int ch = 0; ch < 4; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    for (uint16_t mode = 4; mode <= 5; mode++)
+    {
+        int numSP[2] = { 0, 0 };   // tweak rounds allowed for index selector 0 and 1
+
+        for (uint16_t rotation = 0; rotation < 4; rotation++)
+        {
+            if (mode == 4)
+            {
+                numSP[0] = encodingPlan.mode4SP[rotation][0];
+                numSP[1] = encodingPlan.mode4SP[rotation][1];
+            }
+            else
+                numSP[0] = numSP[1] = encodingPlan.mode5SP[rotation];
+
+            if (numSP[0] == 0 && numSP[1] == 0)
+                continue;
+            // Rotation 0 leaves channel 3 as the scalar channel; rotation r (1..3) swaps channel r-1 into that role.
+            int alphaChannel = (rotation + 3) & 3;
+            int redChannel = (rotation == 1) ? 3 : 0;
+            int greenChannel = (rotation == 2) ? 3 : 1;
+            int blueChannel = (rotation == 3) ? 3 : 2;
+
+            MUInt15 rotatedRGB[16][3];
+            MFloat floatRotatedRGB[16][3];
+
+            for (int px = 0; px < 16; px++)
+            {
+                rotatedRGB[px][0] = pixels[px][redChannel];
+                rotatedRGB[px][1] = pixels[px][greenChannel];
+                rotatedRGB[px][2] = pixels[px][blueChannel];
+                for (int ch = 0; ch < 3; ch++)
+                    floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
+            }
+            // Only mode 4 iterates over two index-selector values; mode 5 runs the loop once.
+            uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
+
+            float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
+            float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
+            float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
+            float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
+
+            float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
+
+            MFloat preWeightedRotatedRGB[16][3];
+            BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
+
+            for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
+            {
+                int numTweakRounds = numSP[indexSelector];
+
+                if (numTweakRounds <= 0)
+                    continue;
+
+                if (numTweakRounds > MaxTweakRounds)
+                    numTweakRounds = MaxTweakRounds;
+                // Seed the vector-plane endpoints from the pre-weighted rotated pixels, every pixel contributing with weight 1.
+                EndpointSelector<3, 8> rgbSelector;
+
+                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
+                {
+                    for (int px = 0; px < 16; px++)
+                        rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
+
+                    rgbSelector.FinishPass(epPass);
+                }
+                // The scalar plane starts from the min/max of the scalar channel over all 16 pixels.
+                MUInt15 alphaRange[2];
+
+                alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
+                for (int px = 1; px < 16; px++)
+                {
+                    alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
+                    alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
+                }
+                // Index precision (bits) per plane: mode 4 pairs 2 and 3 bits, swapped by indexSelector; mode 5 uses 2 for both.
+                int rgbPrec = 0;
+                int alphaPrec = 0;
+
+                if (mode == 4)
+                {
+                    rgbPrec = indexSelector ? 3 : 2;
+                    alphaPrec = indexSelector ? 2 : 3;
+                }
+                else
+                    rgbPrec = alphaPrec = 2;
+
+                UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
+
+                MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
+                MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
+
+                MUInt15 bestRGBIndexes[16];
+                MUInt15 bestAlphaIndexes[16];
+                MUInt15 bestEP[2][4];
+
+                for (int px = 0; px < 16; px++)
+                    bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    MUInt15 rgbEP[2][3];
+                    MUInt15 alphaEP[2];
+
+                    unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
+                    TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
+
+                    for (int refine = 0; refine < numRefineRounds; refine++)
+                    {
+                        if (mode == 4)
+                            CompressEndpoints4(rgbEP, alphaEP);
+                        else
+                            CompressEndpoints5(rgbEP, alphaEP);
+
+                        IndexSelector<1> alphaIndexSelector;
+                        IndexSelector<3> rgbIndexSelector;
+
+                        {
+                            MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
+                            alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
+                        }
+                        rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
+
+                        EndpointRefiner<3> rgbRefiner;
+                        EndpointRefiner<1> alphaRefiner;
+
+                        rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
+                        alphaRefiner.Init(1 << alphaPrec, uniformWeight);
+
+                        MFloat errorRGB = ParallelMath::MakeFloatZero();
+                        MFloat errorA = ParallelMath::MakeFloatZero();
+
+                        MUInt15 rgbIndexes[16];
+                        MUInt15 alphaIndexes[16];
+
+                        AggregatedError<3> rgbAggError;
+                        AggregatedError<1> alphaAggError;
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
+                            MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
+
+                            MUInt15 reconstructedRGB[3];
+                            MUInt15 reconstructedAlpha[1];
+                            rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
+                            alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
+                            // Fast indexing keeps the selected indexes as-is and only accumulates error into the block-wide aggregates.
+                            if (flags & cvtt::Flags::BC7_FastIndexing)
+                            {
+                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
+                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
+                            }
+                            else
+                            {
+                                AggregatedError<3> baseRGBAggError;
+                                AggregatedError<1> baseAlphaAggError;
+
+                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
+                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
+
+                                MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
+                                MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+                                // Otherwise also test index-1 and index+1 (clamped to the valid range) per plane and keep the lowest-error choice.
+                                MUInt15 altRGBIndexes[2];
+                                MUInt15 altAlphaIndexes[2];
+
+                                altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
+
+                                altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
+                                altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
+
+                                for (int ii = 0; ii < 2; ii++)
+                                {
+                                    rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
+                                    alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
+
+                                    AggregatedError<3> altRGBAggError;
+                                    AggregatedError<1> altAlphaAggError;
+                                    BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
+                                    BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
+                                    MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
+                                    MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+
+                                    ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
+                                    ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
+
+                                    rgbError = ParallelMath::Min(altRGBError, rgbError);
+                                    alphaError = ParallelMath::Min(altAlphaError, alphaError);
+
+                                    ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
+                                    ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
+                                }
+
+                                errorRGB = errorRGB + rgbError;
+                                errorA = errorA + alphaError;
+                            }
+                            // Refiners are only fed when another refine round will follow.
+                            if (refine != numRefineRounds - 1)
+                            {
+                                rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
+                                alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
+                            }
+
+                            rgbIndexes[px] = rgbIndex;
+                            alphaIndexes[px] = alphaIndex;
+                        }
+                        // Fast indexing: total the aggregated error once, after every pixel has contributed (as TrySinglePlane does).
+                        if (flags & cvtt::Flags::BC7_FastIndexing)
+                        {
+                            errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
+                            errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
+                        }
+
+                        ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
+                        ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
+
+                        ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
+                        ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
+
+                        if (ParallelMath::AnySet(rgbBetterInt16))
+                        {
+                            bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
+
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
+
+                            for (int ep = 0; ep < 2; ep++)
+                                for (int ch = 0; ch < 3; ch++)
+                                    ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
+                        }
+
+                        if (ParallelMath::AnySet(alphaBetterInt16))
+                        {
+                            bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
+
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
+
+                            for (int ep = 0; ep < 2; ep++)
+                                ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
+                        }
+
+                        if (refine != numRefineRounds - 1)
+                        {
+                            rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
+
+                            MUInt15 alphaEPTemp[2][1];
+                            alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
+
+                            for (int i = 0; i < 2; i++)
+                                alphaEP[i] = alphaEPTemp[i][0];
+                        }
+                    } // refine
+                } // tweak
+
+                MFloat combinedError = bestRGBError + bestAlphaError;
+
+                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
+                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                work.m_error = ParallelMath::Min(combinedError, work.m_error);
+
+                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
+                ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
+                ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
+                // With indexSelector set the scalar-plane indexes are stored in m_indexes and the vector-plane ones in m_indexes2; otherwise the reverse.
+                for (int px = 0; px < 16; px++)
+                {
+                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
+                    ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
+                }
+
+                for (int ep = 0; ep < 2; ep++)
+                    for (int ch = 0; ch < 4; ch++)
+                        ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
+            }
+        }
+    }
+}
+
+template<class T>
+void cvtt::Internal::BC7Computer::Swap(T& a, T& b)
+{
+    T previousB = b;    // Exchanges the two values through a single temporary copy of T
+    b = a;
+    a = previousB;
+}
+
+void cvtt::Internal::BC7Computer::Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds)
+{
+    MUInt15 pixels[16][4];
+    MFloat floatPixels[16][4];
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
+    }
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
+    }
+
+    BC67::WorkInfo work;
+    memset(&work, 0, sizeof(work));
+
+    work.m_error = ParallelMath::MakeFloat(FLT_MAX);
+
+    {
+        ParallelMath::RoundTowardNearestForScope rtn;
+        TrySinglePlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
+        TryDualPlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        PackingVector pv;
+        pv.Init();
+
+        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
+        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
+        ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
+
+        const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
+
+        ParallelMath::ScalarUInt16 indexes[16];
+        ParallelMath::ScalarUInt16 indexes2[16];
+        ParallelMath::ScalarUInt16 endPoints[3][2][4];
+
+        for (int i = 0; i < 16; i++)
+        {
+            indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
+            if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+                indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
+        }
+
+        for (int subset = 0; subset < 3; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+            {
+                for (int ch = 0; ch < 4; ch++)
+                    endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
+            }
+        }
+
+        int fixups[3] = { 0, 0, 0 };
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
+            bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
+
+            if (flipRGB)
+            {
+                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                    indexes[px] = highIndex - indexes[px];
+            }
+
+            if (flipAlpha)
+            {
+                uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                    indexes2[px] = highIndex - indexes2[px];
+            }
+
+            if (indexSelector)
+                Swap(flipRGB, flipAlpha);
+
+            if (flipRGB)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                    Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
+            }
+            if (flipAlpha)
+                Swap(endPoints[0][0][3], endPoints[0][1][3]);
+
+        }
+        else
+        {
+            if (modeInfo.m_numSubsets == 2)
+                fixups[1] = BC7Data::g_fixupIndexes2[partition];
+            else if (modeInfo.m_numSubsets == 3)
+            {
+                fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
+                fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
+            }
+
+            bool flip[3] = { false, false, false };
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+                flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
+
+            if (flip[0] || flip[1] || flip[2])
+            {
+                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
+                for (int px = 0; px < 16; px++)
+                {
+                    int subset = 0;
+                    if (modeInfo.m_numSubsets == 2)
+                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+                    else if (modeInfo.m_numSubsets == 3)
+                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
+
+                    if (flip[subset])
+                        indexes[px] = highIndex - indexes[px];
+                }
+
+                int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
+                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+                {
+                    if (flip[subset])
+                        for (int ch = 0; ch < maxCH; ch++)
+                            Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
+                }
+            }
+        }
+
+        pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
+
+        if (modeInfo.m_partitionBits)
+            pv.Pack(partition, modeInfo.m_partitionBits);
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
+            pv.Pack(rotation, 2);
+        }
+
+        if (modeInfo.m_hasIndexSelector)
+            pv.Pack(indexSelector, 1);
+
+        // Encode RGB
+        for (int ch = 0; ch < 3; ch++)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
+                    epPart >>= (8 - modeInfo.m_rgbBits);
+
+                    pv.Pack(epPart, modeInfo.m_rgbBits);
+                }
+            }
+        }
+
+        // Encode alpha
+        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
+                    epPart >>= (8 - modeInfo.m_alphaBits);
+
+                    pv.Pack(epPart, modeInfo.m_alphaBits);
+                }
+            }
+        }
+
+        // Encode parity bits
+        if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
+                epPart >>= (7 - modeInfo.m_rgbBits);
+                epPart &= 1;
+
+                pv.Pack(epPart, 1);
+            }
+        }
+        else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
+        {
+            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+            {
+                for (int ep = 0; ep < 2; ep++)
+                {
+                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
+                    epPart >>= (7 - modeInfo.m_rgbBits);
+                    epPart &= 1;
+
+                    pv.Pack(epPart, 1);
+                }
+            }
+        }
+
+        // Encode indexes
+        for (int px = 0; px < 16; px++)
+        {
+            int bits = modeInfo.m_indexBits;
+            if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
+                bits--;
+
+            pv.Pack(indexes[px], bits);
+        }
+
+        // Encode secondary indexes
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                int bits = modeInfo.m_alphaIndexBits;
+                if (px == 0)
+                    bits--;
+
+                pv.Pack(indexes2[px], bits);
+            }
+        }
+
+        pv.Flush(packedBlocks);
+
+        packedBlocks += 16;
+    }
+}
+
+// Decodes one packed BC7 block into 16 RGBA pixels of 8 bits per channel.
+//
+// Fields are read from the block in this order: mode selector bits, partition,
+// rotation, index selector, RGB endpoints, alpha endpoints, parity bits,
+// primary indexes, secondary indexes.
+//
+// output       Receives the 16 decoded pixels, 4 channels each.
+// packedBlock  The encoded block to read.
+//
+// If none of the first 8 bits is set there is no usable mode, and every output
+// channel is written as 0.
+void cvtt::Internal::BC7Computer::UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
+{
+    UnpackingVector pv;
+    pv.Init(packedBlock);
+
+    // The mode is the position of the first set bit among the leading 8 bits
+    int mode = 8;
+    for (int i = 0; i < 8; i++)
+    {
+        if (pv.Unpack(1) == 1)
+        {
+            mode = i;
+            break;
+        }
+    }
+
+    if (mode > 7)
+    {
+        // No mode bit found: emit an all-zero block
+        for (int px = 0; px < 16; px++)
+            for (int ch = 0; ch < 4; ch++)
+                output.m_pixels[px][ch] = 0;
+
+        return;
+    }
+
+    const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
+
+    int partition = 0;
+    if (modeInfo.m_partitionBits)
+        partition = pv.Unpack(modeInfo.m_partitionBits);
+
+    int rotation = 0;
+    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+        rotation = pv.Unpack(2);
+
+    int indexSelector = 0;
+    if (modeInfo.m_hasIndexSelector)
+        indexSelector = pv.Unpack(1);
+
+    // Resolve fixups: the pixels whose primary index is stored with one bit less.
+    // Pixel 0 is always one of them; the others depend on the partition.
+    int fixups[3] = { 0, 0, 0 };
+
+    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
+    {
+        if (modeInfo.m_numSubsets == 2)
+            fixups[1] = BC7Data::g_fixupIndexes2[partition];
+        else if (modeInfo.m_numSubsets == 3)
+        {
+            fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
+            fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
+        }
+    }
+
+    // Indexed as [subset][endpoint][channel]; only the subsets of this mode are filled in
+    int endPoints[3][2][4];
+
+    // Decode RGB, placing the stored bits at the top of each 8-bit value
+    for (int ch = 0; ch < 3; ch++)
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+                endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
+        }
+    }
+
+    // Decode alpha
+    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+                endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
+        }
+    }
+    else
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            for (int ep = 0; ep < 2; ep++)
+                endPoints[subset][ep][3] = 255;
+        }
+    }
+
+    int parityBits = 0;
+
+    // Decode parity bits. In per-subset mode a single bit is shared by both endpoints
+    // of a subset; in per-endpoint mode every endpoint has its own bit. The parity bit
+    // lands directly below the stored endpoint bits.
+    const bool pBitPerSubset = (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset);
+    const bool pBitPerEndpoint = (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint);
+
+    if (pBitPerSubset || pBitPerEndpoint)
+    {
+        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+        {
+            int p = 0;
+
+            for (int ep = 0; ep < 2; ep++)
+            {
+                // Read a new bit for every endpoint, or only once per subset
+                if (pBitPerEndpoint || ep == 0)
+                    p = pv.Unpack(1);
+
+                for (int ch = 0; ch < 3; ch++)
+                    endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
+
+                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+                    endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
+            }
+        }
+
+        parityBits = 1;
+    }
+
+    // Fill endpoint bits: replicate the high bits into the still-empty low bits
+    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
+    {
+        for (int ep = 0; ep < 2; ep++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
+
+            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+                endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
+        }
+    }
+
+    int indexes[16];
+    int indexes2[16];
+
+    // Decode indexes
+    for (int px = 0; px < 16; px++)
+    {
+        int bits = modeInfo.m_indexBits;
+        if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
+            bits--;
+
+        indexes[px] = pv.Unpack(bits);
+    }
+
+    // Decode secondary indexes
+    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            int bits = modeInfo.m_alphaIndexBits;
+            if (px == 0)
+                bits--;
+
+            indexes2[px] = pv.Unpack(bits);
+        }
+    }
+    else
+    {
+        for (int px = 0; px < 16; px++)
+            indexes2[px] = 0;
+    }
+
+    const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
+    const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
+
+    // Decode each pixel
+    for (int px = 0; px < 16; px++)
+    {
+        int rgbWeight = rgbWeights[indexes[px]];
+        int alphaWeight = 0;
+
+        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
+            alphaWeight = rgbWeight;
+        else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
+            alphaWeight = alphaWeights[indexes2[px]];
+
+        // The index selector exchanges which index set drives color and which drives alpha
+        if (indexSelector == 1)
+        {
+            int temp = rgbWeight;
+            rgbWeight = alphaWeight;
+            alphaWeight = temp;
+        }
+
+        int pixel[4] = { 0, 0, 0, 255 };
+
+        int subset = 0;
+
+        if (modeInfo.m_numSubsets == 2)
+            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
+        else if (modeInfo.m_numSubsets == 3)
+            subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
+
+        // Weights are in 1/64 units; the +32 rounds to nearest before the shift
+        for (int ch = 0; ch < 3; ch++)
+            pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
+
+        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
+            pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
+
+        // A non-zero rotation exchanges alpha with color channel (rotation - 1)
+        if (rotation != 0)
+        {
+            int ch = rotation - 1;
+            int temp = pixel[ch];
+            pixel[ch] = pixel[3];
+            pixel[3] = temp;
+        }
+
+        for (int ch = 0; ch < 4; ch++)
+            output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
+    }
+}
+
+// Quantizes one signed endpoint element to 'precision' bits.
+//
+// elem2CL    Element to quantize; the asserts require -31744 < elem2CL < 31744.
+// precision  Bit count of the result; the expanded magnitude is shifted right by (16 - precision).
+// ru         Rounding scope passed on to RoundAndConvertToU15.
+//
+// Returns the quantized magnitude with the sign of the input re-applied.
+cvtt::ParallelMath::SInt16 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
+{
+    assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
+    assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
+
+    MSInt16 zero = ParallelMath::MakeSInt16(0);
+
+    // Split the input into a sign flag and a magnitude
+    ParallelMath::Int16CompFlag signFlag = ParallelMath::Less(elem2CL, zero);
+    MUInt15 magnitude = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(signFlag, zero - elem2CL, elem2CL));
+
+    // Expand to full range, then keep only the bits that fit the target precision
+    MFloat expanded = ParallelMath::ToFloat(magnitude) * 32.0f / 31.0f;
+    magnitude = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(expanded, ru), 16 - precision);
+
+    // Put the sign back
+    MSInt16 signedMagnitude = ParallelMath::LosslessCast<MSInt16>::Cast(magnitude);
+
+    return ParallelMath::Select(signFlag, zero - signedMagnitude, signedMagnitude);
+}
+
+// Quantizes one unsigned endpoint element to 'precision' bits.
+//
+// elem       Element to quantize.
+// precision  Bit count of the result; the expanded value is shifted right by (16 - precision).
+// ru         Rounding scope passed on to RoundAndConvertToU16.
+//
+// Returns the quantized element.
+cvtt::ParallelMath::UInt15 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
+{
+    // Scale by 64/31 and clamp so the rounded result still fits 16 bits
+    MFloat scaled = ParallelMath::ToFloat(elem) * 64.0f / 31.0f;
+    MFloat clamped = ParallelMath::Min(scaled, ParallelMath::MakeFloat(65535.0f));
+
+    MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(clamped, ru);
+
+    // Keep only the top 'precision' bits
+    return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
+}
+
+// Unquantizes one signed endpoint element.
+//
+// comp                       Quantized element.
+// precision                  Bit count 'comp' was quantized to; 16 or more means it is used as-is.
+// outUnquantized             Receives the unquantized value.
+// outUnquantizedFinished2CL  Receives the unquantized magnitude scaled by 31/32, with the sign re-applied.
+//
+// Below 16 bits the work is done on the magnitude: zero stays zero, the largest
+// magnitude saturates to 0x7fff, and everything else is shifted up with a
+// half-step added. The sign is re-applied afterwards. Saturation is tested on
+// the magnitude so that it also covers negative values, as the BC6H
+// unquantization rule does.
+void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
+{
+    MSInt16 zero = ParallelMath::MakeSInt16(0);
+
+    ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
+    MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
+
+    MSInt16 unq;
+    MUInt15 absUnq;
+
+    if (precision >= 16)
+    {
+        unq = comp;
+        absUnq = absComp;
+    }
+    else
+    {
+        MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
+        ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
+
+        // Compare the magnitude, not the signed value: a negative element of maximum
+        // magnitude has to saturate exactly like its positive counterpart
+        ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, ParallelMath::LosslessCast<MSInt16>::Cast(absComp));
+
+        absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
+        ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
+        ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
+
+        unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
+    }
+
+    outUnquantized = unq;
+
+    // Finish on the magnitude: multiply by 31 and shift right by 5, then restore the sign
+    MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
+
+    outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
+}
+
+// Unquantizes one unsigned endpoint element.
+//
+// comp                    Quantized element.
+// precision               Bit count 'comp' was quantized to; 15 or more means it is used as-is.
+// outUnquantized          Receives the unquantized value.
+// outUnquantizedFinished  Receives the unquantized value multiplied by 31 and shifted right by 6.
+void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
+{
+    MUInt16 result = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
+
+    if (precision < 15)
+    {
+        // Shift up to 16 bits and add a half-step
+        result = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
+
+        // Zero stays exactly zero
+        ParallelMath::Int16CompFlag compIsZero = ParallelMath::Equal(comp, ParallelMath::MakeUInt15(0));
+        ParallelMath::ConditionalSet(result, compIsZero, ParallelMath::MakeUInt16(0));
+
+        // The largest quantized value saturates to 0xffff
+        MUInt15 largestMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
+        ParallelMath::Int16CompFlag compIsMax = ParallelMath::Less(largestMinusOne, comp);
+        ParallelMath::ConditionalSet(result, compIsMax, ParallelMath::MakeUInt16(0xffff));
+    }
+
+    outUnquantized = result;
+    outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(result, ParallelMath::MakeUInt15(31)), 6));
+}
+
+// Quantizes a pair of signed endpoints, sets up the index selector for them, and
+// picks the index of the fixup pixel.
+//
+// endPoints                  Endpoints to quantize, as [endpoint][channel].
+// floatPixelsColorSpace      Pixel values used by the fast index selection.
+// floatPixelsLinearWeighted  Pixel values used by the slow index selection.
+// quantizedEndPoints         Receives the quantized endpoints, as [endpoint][channel].
+// indexes                    Only the entry at fixupIndex is written.
+// indexSelector              Initialized here from the unquantized endpoints.
+// fixupIndex                 Pixel whose index must end up in the lower half of the index range.
+// precision                  Bit count the endpoints are quantized to.
+// indexRange                 Number of index values.
+// channelWeights             Passed to the index selector.
+// fastIndexing               Chooses between SelectIndexHDRFast and SelectIndexHDRSlow.
+// rtn                        Rounding scope passed to the index selection.
+//
+// When the fixup pixel's index falls in the upper half of the range, the index is
+// mirrored, the selector is inverted and the two endpoints are exchanged, lane by lane.
+void cvtt::Internal::BC6HComputer::QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    MSInt16 rawEP[2][3];
+    MSInt16 finishedEP[2][3];
+
+    {
+        // Quantization runs inside its own rounding scope
+        ParallelMath::RoundUpForScope ru;
+
+        for (int slot = 0; slot < 6; slot++)
+        {
+            const int epi = slot / 3;
+            const int ch = slot % 3;
+
+            MSInt16 quantized = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
+            UnquantizeSingleEndpointElementSigned(quantized, precision, rawEP[epi][ch], finishedEP[epi][ch]);
+            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(quantized);
+        }
+    }
+
+    indexSelector.Init(channelWeights, rawEP, finishedEP, indexRange);
+    indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
+
+    MUInt15 index;
+    if (fastIndexing)
+        index = indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn);
+    else
+        index = indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
+
+    // Lanes whose fixup index lies in the upper half of the range need to be flipped
+    MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
+    ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
+
+    if (ParallelMath::AnySet(invert))
+    {
+        MUInt15 mirroredIndex = MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index);
+        ParallelMath::ConditionalSet(index, invert, mirroredIndex);
+
+        indexSelector.ConditionalInvert(invert);
+
+        // Exchange the endpoints in the flipped lanes
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MAInt16 &firstSlot = quantizedEndPoints[0][ch];
+            MAInt16 &secondSlot = quantizedEndPoints[1][ch];
+            MAInt16 originalFirst = firstSlot;
+
+            firstSlot = ParallelMath::Select(invert, secondSlot, originalFirst);
+            secondSlot = ParallelMath::Select(invert, originalFirst, secondSlot);
+        }
+    }
+
+    indexes[fixupIndex] = index;
+}
+
+// Quantizes a pair of unsigned endpoints, sets up the index selector for them, and
+// picks the index of the fixup pixel.
+//
+// endPoints                  Endpoints to quantize, as [endpoint][channel].
+// floatPixelsColorSpace      Pixel values used by the fast index selection.
+// floatPixelsLinearWeighted  Pixel values used by the slow index selection.
+// quantizedEndPoints         Receives the quantized endpoints, as [endpoint][channel].
+// indexes                    Only the entry at fixupIndex is written.
+// indexSelector              Initialized here from the unquantized endpoints.
+// fixupIndex                 Pixel whose index must end up in the lower half of the index range.
+// precision                  Bit count the endpoints are quantized to.
+// indexRange                 Number of index values.
+// channelWeights             Passed to the index selector.
+// fastIndexing               Chooses between SelectIndexHDRFast and SelectIndexHDRSlow.
+// rtn                        Rounding scope passed to the index selection.
+//
+// When the fixup pixel's index falls in the upper half of the range, the index is
+// mirrored, the selector is inverted and the two endpoints are exchanged, lane by lane.
+void cvtt::Internal::BC6HComputer::QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    MUInt16 rawEP[2][3];
+    MUInt16 finishedEP[2][3];
+
+    {
+        // Quantization runs inside its own rounding scope
+        ParallelMath::RoundUpForScope ru;
+
+        for (int slot = 0; slot < 6; slot++)
+        {
+            const int epi = slot / 3;
+            const int ch = slot % 3;
+
+            MUInt15 quantized = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
+            UnquantizeSingleEndpointElementUnsigned(quantized, precision, rawEP[epi][ch], finishedEP[epi][ch]);
+            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(quantized);
+        }
+    }
+
+    indexSelector.Init(channelWeights, rawEP, finishedEP, indexRange);
+    indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
+
+    MUInt15 index;
+    if (fastIndexing)
+        index = indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn);
+    else
+        index = indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
+
+    // Lanes whose fixup index lies in the upper half of the range need to be flipped
+    MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
+    ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
+
+    if (ParallelMath::AnySet(invert))
+    {
+        MUInt15 mirroredIndex = MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index);
+        ParallelMath::ConditionalSet(index, invert, mirroredIndex);
+
+        indexSelector.ConditionalInvert(invert);
+
+        // Exchange the endpoints in the flipped lanes
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MAInt16 &firstSlot = quantizedEndPoints[0][ch];
+            MAInt16 &secondSlot = quantizedEndPoints[1][ch];
+            MAInt16 originalFirst = firstSlot;
+
+            firstSlot = ParallelMath::Select(invert, secondSlot, originalFirst);
+            secondSlot = ParallelMath::Select(invert, originalFirst, secondSlot);
+        }
+    }
+
+    indexes[fixupIndex] = index;
+}
+
+// Produces the encoded form of two endpoint pairs and reports, per lane, whether
+// that encoding reproduces the endpoints.
+//
+// ep0, ep1       Quantized endpoints of subset 0 and subset 1, as [endpoint][channel].
+// aPrec          Number of low bits that have to survive the round trip.
+// bPrec          Per-channel bit count the deltas are truncated to.
+// isTransformed  When true, every endpoint except subset 0 / endpoint 0 is stored as a
+//                delta against that endpoint; when false, endpoints are copied unchanged.
+// outEncodedEPs  Receives the encoded endpoints, as [subset][endpoint][channel]. Channels
+//                after an early exit are left unwritten.
+// outIsLegal     Receives the per-lane legality flag.
+void cvtt::Internal::BC6HComputer::EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
+{
+    ParallelMath::Int16CompFlag legalSoFar = ParallelMath::MakeBoolInt16(true);
+
+    MAInt16 lowBitsMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
+
+    const MAInt16 (*sources[2])[3] = { ep0, ep1 };
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        // Start from the untransformed endpoints
+        for (int subset = 0; subset < 2; subset++)
+            for (int epi = 0; epi < 2; epi++)
+                outEncodedEPs[subset][epi][ch] = sources[subset][epi][ch];
+
+        if (isTransformed)
+        {
+            MAInt16 &base = outEncodedEPs[0][0][ch];
+
+            // Slots 1..3 are (subset 0, endpoint 1), (subset 1, endpoint 0), (subset 1, endpoint 1);
+            // slot 0 is the base endpoint and stays as it is
+            for (int slot = 1; slot < 4; slot++)
+            {
+                MAInt16 &encoded = outEncodedEPs[slot >> 1][slot & 1][ch];
+
+                MAInt16 expectedLowBits = (encoded & lowBitsMask);
+
+                MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(encoded, base)), bPrec[ch]);
+
+                encoded = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
+
+                // The truncated delta is legal only if adding it back restores the low bits
+                MAInt16 roundTrip = (ParallelMath::AbstractAdd(encoded, base) & lowBitsMask);
+                legalSoFar = legalSoFar & ParallelMath::Equal(roundTrip, expectedLowBits);
+            }
+        }
+
+        // Nothing left to check once no lane is legal any more
+        if (!ParallelMath::AnySet(legalSoFar))
+            break;
+    }
+
+    outIsLegal = legalSoFar;
+}
+
+// Produces the encoded form of one endpoint pair and reports, per lane, whether
+// that encoding reproduces the endpoints.
+//
+// ep             Quantized endpoints, as [endpoint][channel].
+// aPrec          Number of low bits that have to survive the round trip.
+// bPrec          Per-channel bit count the delta is truncated to.
+// isTransformed  When true, endpoint 1 is stored as a delta against endpoint 0;
+//                when false, both endpoints are copied unchanged.
+// outEncodedEPs  Receives the encoded endpoints, as [endpoint][channel].
+// outIsLegal     Receives the per-lane legality flag.
+void cvtt::Internal::BC6HComputer::EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
+{
+    ParallelMath::Int16CompFlag legalSoFar = ParallelMath::MakeBoolInt16(true);
+
+    MAInt16 lowBitsMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MAInt16 &base = outEncodedEPs[0][ch];
+        MAInt16 &encoded = outEncodedEPs[1][ch];
+
+        // Start from the untransformed endpoints
+        base = ep[0][ch];
+        encoded = ep[1][ch];
+
+        if (!isTransformed)
+            continue;
+
+        MAInt16 expectedLowBits = (encoded & lowBitsMask);
+
+        MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(encoded, base)), bPrec[ch]);
+
+        encoded = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
+
+        // The truncated delta is legal only if adding it back restores the low bits
+        MAInt16 roundTrip = (ParallelMath::AbstractAdd(encoded, base) & lowBitsMask);
+        legalSoFar = legalSoFar & ParallelMath::Equal(roundTrip, expectedLowBits);
+    }
+
+    outIsLegal = legalSoFar;
+}
+
+void cvtt::Internal::BC6HComputer::Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
+{
+    if (numTweakRounds < 1)
+        numTweakRounds = 1;
+    else if (numTweakRounds > MaxTweakRounds)
+        numTweakRounds = MaxTweakRounds;
+
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+    else if (numRefineRounds > MaxRefineRounds)
+        numRefineRounds = MaxRefineRounds;
+
+    bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
+    float channelWeightsSq[3];
+
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    MSInt16 pixels[16][3];
+    MFloat floatPixels2CL[16][3];
+    MFloat floatPixelsLinearWeighted[16][3];
+
+    MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
+
+    for (int ch = 0; ch < 3; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MSInt16 pixelValue;
+            ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
+
+            // Convert from sign+magnitude to 2CL
+            if (isSigned)
+            {
+                ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
+                MSInt16 magnitude = (pixelValue & low15Bits);
+                ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
+                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
+            }
+            else
+                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
+
+            pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
+
+            pixels[px][ch] = pixelValue;
+            floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
+            floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
+        }
+    }
+
+    MFloat preWeightedPixels[16][3];
+
+    BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
+
+    MAInt16 bestEndPoints[2][2][3];
+    MUInt15 bestIndexes[16];
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestMode = ParallelMath::MakeUInt15(0);
+    MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    for (int subset = 0; subset < 2; subset++)
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
+
+    UnfinishedEndpoints<3> partitionedUFEP[32][2];
+    UnfinishedEndpoints<3> singleUFEP;
+
+    // Generate UFEP for partitions
+    for (int p = 0; p < 32; p++)
+    {
+        int partitionMask = BC7Data::g_partitionMap[p];
+
+        EndpointSelector<3, 8> epSelectors[2];
+
+        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                int subset = (partitionMask >> px) & 1;
+                epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
+            }
+
+            for (int subset = 0; subset < 2; subset++)
+                epSelectors[subset].FinishPass(pass);
+        }
+
+        for (int subset = 0; subset < 2; subset++)
+            partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
+    }
+
+    // Generate UFEP for single
+    {
+        EndpointSelector<3, 8> epSelector;
+
+        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+        {
+            for (int px = 0; px < 16; px++)
+                epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
+
+            epSelector.FinishPass(pass);
+        }
+
+        singleUFEP = epSelector.GetEndpoints(channelWeights);
+    }
+
+    for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
+    {
+        bool partitioned = (partitionedInt == 1);
+
+        for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
+        {
+            if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
+                continue;
+
+            int numPartitions = partitioned ? 32 : 1;
+            int numSubsets = partitioned ? 2 : 1;
+            int indexBits = partitioned ? 3 : 4;
+            int indexRange = (1 << indexBits);
+
+            for (int p = 0; p < numPartitions; p++)
+            {
+                int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
+
+                const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
+
+                MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
+                MUInt15 metaIndexes[MaxMetaRounds][16];
+                MFloat metaError[MaxMetaRounds][2];
+
+                bool roundValid[MaxMetaRounds][2];
+
+                for (int r = 0; r < MaxMetaRounds; r++)
+                    for (int subset = 0; subset < 2; subset++)
+                        roundValid[r][subset] = true;
+
+                for (int subset = 0; subset < numSubsets; subset++)
+                {
+                    for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
+                    {
+                        EndpointRefiner<3> refiners[2];
+
+                        bool abortRemainingRefines = false;
+                        for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
+                        {
+                            int metaRound = tweak * MaxRefineRounds + refinePass;
+
+                            if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
+                                abortRemainingRefines = true;
+
+                            if (abortRemainingRefines)
+                            {
+                                roundValid[metaRound][subset] = false;
+                                continue;
+                            }
+
+                            MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
+                            MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
+
+                            MSInt16 endPointsColorSpace[2][3];
+
+                            if (refinePass == 0)
+                            {
+                                UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
+
+                                if (isSigned)
+                                    ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
+                                else
+                                    ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
+                            }
+                            else
+                                refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
+
+                            refiners[subset].Init(indexRange, channelWeights);
+
+                            int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
+
+                            IndexSelectorHDR<3> indexSelector;
+                            if (isSigned)
+                                QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
+                            else
+                                QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
+
+                            if (metaRound > 0)
+                            {
+                                ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
+
+                                for (int prevRound = 0; prevRound < metaRound; prevRound++)
+                                {
+                                    MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
+
+                                    ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
+
+                                    for (int epi = 0; epi < 2; epi++)
+                                        for (int ch = 0; ch < 3; ch++)
+                                            same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
+
+                                    anySame = (anySame | same);
+                                    if (ParallelMath::AllSet(anySame))
+                                        break;
+                                }
+
+                                if (ParallelMath::AllSet(anySame))
+                                {
+                                    roundValid[metaRound][subset] = false;
+                                    continue;
+                                }
+                            }
+
+                            MFloat subsetError = ParallelMath::MakeFloatZero();
+
+                            {
+                                for (int px = 0; px < 16; px++)
+                                {
+                                    if (subset != ((partitionMask >> px) & 1))
+                                        continue;
+
+                                    MUInt15 index;
+                                    if (px == fixupIndex)
+                                        index = mrIndexes[px];
+                                    else
+                                    {
+                                        index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
+                                        mrIndexes[px] = index;
+                                    }
+
+                                    MSInt16 reconstructed[3];
+                                    if (isSigned)
+                                        indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
+                                    else
+                                        indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
+
+                                    subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
+
+                                    if (refinePass != numRefineRounds - 1)
+                                        refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
+                                }
+                            }
+
+                            metaError[metaRound][subset] = subsetError;
+                        }
+                    }
+                }
+
+                // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
+                int numMeta1 = partitioned ? MaxMetaRounds : 1;
+                for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
+                {
+                    if (!roundValid[meta0][0])
+                        continue;
+
+                    for (int meta1 = 0; meta1 < numMeta1; meta1++)
+                    {
+                        MFloat combinedError = metaError[meta0][0];
+                        if (partitioned)
+                        {
+                            if (!roundValid[meta1][1])
+                                continue;
+
+                            combinedError = combinedError + metaError[meta1][1];
+                        }
+
+                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
+                        if (!ParallelMath::AnySet(errorBetter))
+                            continue;
+
+                        ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                        // Figure out if this is encodable
+                        for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
+                        {
+                            const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
+
+                            if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
+                                continue;
+
+                            MAInt16 encodedEPs[2][2][3];
+                            ParallelMath::Int16CompFlag isLegal;
+                            if (partitioned)
+                                EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
+                            else
+                                EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
+
+                            ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
+                            if (!ParallelMath::AnySet(isLegalAndBetter))
+                                continue;
+
+                            ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
+
+                            ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
+                            ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
+                            ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
+
+                            for (int subset = 0; subset < numSubsets; subset++)
+                            {
+                                for (int epi = 0; epi < 2; epi++)
+                                {
+                                    for (int ch = 0; ch < 3; ch++)
+                                        ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
+                                }
+                            }
+
+                            for (int px = 0; px < 16; px++)
+                            {
+                                int subset = ((partitionMask >> px) & 1);
+                                if (subset == 0)
+                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
+                                else
+                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
+                            }
+
+                            needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
+                            if (!ParallelMath::AnySet(needsCommit))
+                                break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // At this point, everything should be set
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
+        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
+        int32_t eps[2][2][3];
+        ParallelMath::ScalarUInt16 indexes[16];
+
+        const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
+
+        BC6H_IO::WriteFunc_t writeFunc = BC6H_IO::g_writeFuncs[mode];
+
+        const int headerBits = modeInfo.m_partitioned ? 82 : 65;
+
+        for (int subset = 0; subset < 2; subset++)
+        {
+            for (int epi = 0; epi < 2; epi++)
+            {
+                for (int ch = 0; ch < 3; ch++)
+                    eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
+            }
+        }
+
+        for (int px = 0; px < 16; px++)
+            indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
+
+        uint16_t modeID = modeInfo.m_modeID;
+
+        PackingVector pv;
+
+        {
+            uint32_t header[3];
+            writeFunc(header, modeID, partition,
+                eps[0][0][0], eps[0][1][0], eps[1][0][0], eps[1][1][0],
+                eps[0][0][1], eps[0][1][1], eps[1][0][1], eps[1][1][1],
+                eps[0][0][2], eps[0][1][2], eps[1][0][2], eps[1][1][2]
+            );
+
+            pv.InitPacked(header, headerBits);
+        }
+
+        int fixupIndex1 = 0;
+        int indexBits = 4;
+        if (modeInfo.m_partitioned)
+        {
+            fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
+            indexBits = 3;
+        }
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
+            if (px == 0 || px == fixupIndex1)
+                pv.Pack(index, indexBits - 1);
+            else
+                pv.Pack(index, indexBits);
+        }
+
+        pv.Flush(packedBlocks + 16 * block);
+    }
+}
+
+void cvtt::Internal::BC6HComputer::SignExtendSingle(int &v, int bits)
+{
+    const bool isNegative = (v & (1 << (bits - 1))) != 0;    // is the top bit of the `bits`-wide field set?
+    v = isNegative ? (v | -(1 << bits)) : v;                 // if so, set every bit from position `bits` upward
+}
+
+void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
+{   // Decodes one packed BC6H block read from pBC into the 16 pixels of output; isSigned selects the signed endpoint/output paths.
+    UnpackingVector pv;
+    pv.Init(pBC);
+    // Mode field: 2 bits when the value is 0 or 1, otherwise 3 more bits are read to form a 5-bit value.
+    int numModeBits = 2;    // NOTE(review): updated below but never read in this function
+    int modeBits = pv.Unpack(2);
+    if (modeBits != 0 && modeBits != 1)
+    {
+        modeBits |= pv.Unpack(3) << 2;
+        numModeBits += 3;
+    }
+    // Map the mode field to an index into g_hdrModes by matching m_modeID; -1 means no table entry matched.
+    int mode = -1;
+    for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
+    {
+        if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
+        {
+            mode = possibleMode;
+            break;
+        }
+    }
+    // Unmatched mode field: write zero to the three color channels, 0x3c00 to the fourth, and stop.
+    if (mode < 0)
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                output.m_pixels[px][ch] = 0;
+            output.m_pixels[px][3] = 0x3c00;	// 1.0
+        }
+        return;
+    }
+    // Same header sizes as the packer uses: 82 bits for partitioned modes, 65 otherwise.
+    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
+    const int headerBits = modeInfo.m_partitioned ? 82 : 65;
+    const BC6H_IO::ReadFunc_t readFunc = BC6H_IO::g_readFuncs[mode];
+
+    uint16_t partition = 0;
+    int32_t eps[2][2][3];    // [subset][endpoint][channel]
+
+    for (int subset = 0; subset < 2; subset++)
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                eps[subset][epi][ch] = 0;
+    // NOTE(review): the mode bits were already consumed with Unpack() above, yet the packer's header includes them - confirm UnpackStart()/readFunc expect this position.
+    {
+        uint32_t header[3];
+        uint16_t codedEPs[2][2][3];
+        pv.UnpackStart(header, headerBits);
+
+        readFunc(header, partition,
+            codedEPs[0][0][0], codedEPs[0][1][0], codedEPs[1][0][0], codedEPs[1][1][0],
+            codedEPs[0][0][1], codedEPs[0][1][1], codedEPs[1][0][1], codedEPs[1][1][1],
+            codedEPs[0][0][2], codedEPs[0][1][2], codedEPs[1][0][2], codedEPs[1][1][2]
+        );
+
+        for (int subset = 0; subset < 2; subset++)    // widen the coded 16-bit fields to int for the arithmetic below
+            for (int epi = 0; epi < 2; epi++)
+                for (int ch = 0; ch < 3; ch++)
+                    eps[subset][epi][ch] = codedEPs[subset][epi][ch];
+    }
+
+    uint16_t modeID = modeInfo.m_modeID;    // NOTE(review): not used anywhere below
+
+    int fixupIndex1 = 0;
+    int indexBits = 4;
+    int numSubsets = 1;
+    if (modeInfo.m_partitioned)
+    {
+        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
+        indexBits = 3;
+        numSubsets = 2;
+    }
+    // Read one index per pixel; pixel 0 and pixel fixupIndex1 are stored with one bit less than the others.
+    int indexes[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (px == 0 || px == fixupIndex1)
+            indexes[px] = pv.Unpack(indexBits - 1);
+        else
+            indexes[px] = pv.Unpack(indexBits);
+    }
+    // Sign-extend the raw fields: eps[0][0] only in signed mode (aPrec bits), the others when signed or transformed (bPrec[ch] bits).
+    if (modeInfo.m_partitioned)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            if (isSigned)
+                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
+            if (modeInfo.m_transformed || isSigned)
+            {
+                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
+                SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
+                SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
+            }
+        }
+    }
+    else
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            if (isSigned)
+                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
+            if (modeInfo.m_transformed || isSigned)
+                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
+        }
+    }
+
+    int aPrec = modeInfo.m_aPrec;
+    // Transformed modes: the remaining endpoints are offsets from eps[0][0]; add it back and wrap to aPrec bits.
+    if (modeInfo.m_transformed)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            int wrapMask = (1 << aPrec) - 1;
+
+            eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
+            if (isSigned)
+                SignExtendSingle(eps[0][1][ch], aPrec);
+
+            if (modeInfo.m_partitioned)
+            {
+                eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
+                eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
+
+                if (isSigned)
+                {
+                    SignExtendSingle(eps[1][0][ch], aPrec);
+                    SignExtendSingle(eps[1][1][ch], aPrec);
+                }
+            }
+        }
+    }
+
+    // Unquantize endpoints
+    for (int subset = 0; subset < numSubsets; subset++)
+    {
+        for (int epi = 0; epi < 2; epi++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+            {
+                int &v = eps[subset][epi][ch];
+
+                if (isSigned)
+                {
+                    if (aPrec >= 16)
+                    {
+                        // Nothing
+                    }
+                    else
+                    {
+                        bool s = false;    // scale the magnitude, then restore the sign
+                        int comp = v;
+                        if (v < 0)
+                        {
+                            s = true;
+                            comp = -comp;
+                        }
+
+                        int unq = 0;
+                        if (comp == 0)
+                            unq = 0;
+                        else if (comp >= ((1 << (aPrec - 1)) - 1))    // largest magnitude maps to 0x7fff
+                            unq = 0x7fff;
+                        else
+                            unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
+
+                        if (s)
+                            unq = -unq;
+
+                        v = unq;
+                    }
+                }
+                else
+                {
+                    if (aPrec >= 15)
+                    {
+                        // Nothing
+                    }
+                    else if (v == 0)
+                    {
+                        // Nothing
+                    }
+                    else if (v == ((1 << aPrec) - 1))    // all-ones field maps to 0xffff
+                        v = 0xffff;
+                    else
+                        v = ((v << 16) + 0x8000) >> aPrec;
+                }
+            }
+        }
+    }
+    // Interpolate every pixel between its subset's two endpoints using the weight table selected by indexBits.
+    const int *weights = BC7Data::g_weightTables[indexBits];
+
+    for (int px = 0; px < 16; px++)
+    {
+        int subset = 0;
+        if (modeInfo.m_partitioned)
+            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;    // bit px of the partition map selects the subset
+
+        int w = weights[indexes[px]];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;    // blend with weights (64 - w) and w, rounded
+
+            if (isSigned)
+            {
+                if (comp < 0)
+                    comp = -(((-comp) * 31) >> 5);    // scale the magnitude by 31/32
+                else
+                    comp = (comp * 31) >> 5;
+
+                int s = 0;    // output is sign-magnitude: bit 15 carries the sign
+                if (comp < 0)
+                {
+                    s = 0x8000;
+                    comp = -comp;
+                }
+
+                output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
+            }
+            else
+            {
+                comp = (comp * 31) >> 6;    // scale by 31/64
+                output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
+            }
+        }
+        output.m_pixels[px][3] = 0x3c00;	// 1.0
+    }
+}
+
+void cvtt::Kernels::ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality)
+{
+    // Builds an encoding plan from a quality level, which is clamped to [1, kMaxQuality].
+    namespace Prio = cvtt::Tables::BC7Prio;
+    static const int kMaxQuality = 100;
+
+    const int level = (quality < 1) ? 1 : ((quality > kMaxQuality) ? kMaxQuality : quality);
+
+    // Each priority list contributes a prefix whose length is proportional to the quality level.
+    const int rgbCodeCount = Prio::g_bc7NumPrioCodesRGB * level / kMaxQuality;
+    const int rgbaCodeCount = Prio::g_bc7NumPrioCodesRGBA * level / kMaxQuality;
+
+    const uint16_t *codeLists[] = { Prio::g_bc7PrioCodesRGB, Prio::g_bc7PrioCodesRGBA };
+    const int codeCounts[] = { rgbCodeCount, rgbaCodeCount };
+
+    BC7FineTuningParams tuning;
+    memset(&tuning, 0, sizeof(tuning));
+
+    for (int list = 0; list < 2; list++)
+    {
+        const uint16_t *codes = codeLists[list];
+        const int codeCount = codeCounts[list];
+
+        for (int i = 0; i < codeCount; i++)
+        {
+            const uint16_t code = codes[i];
+            const uint8_t seedPoints = static_cast<uint8_t>(Prio::UnpackSeedPointCount(code));
+            const int mode = Prio::UnpackMode(code);
+
+            // Store the seed point count in the slot the packed code addresses; other mode values are ignored.
+            switch (mode)
+            {
+            case 0:
+                tuning.mode0SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 1:
+                tuning.mode1SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 2:
+                tuning.mode2SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 3:
+                tuning.mode3SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            case 4:
+                tuning.mode4SP[Prio::UnpackRotation(code)][Prio::UnpackIndexSelector(code)] = seedPoints;
+                break;
+            case 5:
+                tuning.mode5SP[Prio::UnpackRotation(code)] = seedPoints;
+                break;
+            case 6:
+                tuning.mode6SP = seedPoints;
+                break;
+            case 7:
+                tuning.mode7SP[Prio::UnpackPartition(code)] = seedPoints;
+                break;
+            }
+        }
+    }
+
+    ConfigureBC7EncodingPlanFromFineTuningParams(encodingPlan, tuning);
+}
+
+// Rebuilds encodingPlan from the per-mode seed point counts in params; always returns true.
+bool cvtt::Kernels::ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params)
+{
+    memset(&encodingPlan, 0, sizeof(encodingPlan));
+
+    // Mode 0: 16 partitions of 3 subsets; a partition with a zero seed point count stays disabled.
+    for (int p = 0; p < 16; p++)
+    {
+        const uint8_t seeds = params.mode0SP[p];
+        if (seeds != 0)
+        {
+            encodingPlan.mode0PartitionEnabled |= static_cast<uint16_t>(1) << p;
+            for (int s = 0; s < 3; s++)
+            {
+                const int shape = cvtt::Internal::BC7Data::g_shapes3[p][s];
+                if (encodingPlan.seedPointsForShapeRGB[shape] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shape] = seeds;
+            }
+        }
+    }
+
+    // Mode 1: 64 partitions of 2 subsets.
+    for (int p = 0; p < 64; p++)
+    {
+        const uint8_t seeds = params.mode1SP[p];
+        if (seeds != 0)
+        {
+            encodingPlan.mode1PartitionEnabled |= static_cast<uint64_t>(1) << p;
+            for (int s = 0; s < 2; s++)
+            {
+                const int shape = cvtt::Internal::BC7Data::g_shapes2[p][s];
+                if (encodingPlan.seedPointsForShapeRGB[shape] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shape] = seeds;
+            }
+        }
+    }
+
+    // Mode 2: 64 partitions of 3 subsets.
+    for (int p = 0; p < 64; p++)
+    {
+        const uint8_t seeds = params.mode2SP[p];
+        if (seeds != 0)
+        {
+            encodingPlan.mode2PartitionEnabled |= static_cast<uint64_t>(1) << p;
+            for (int s = 0; s < 3; s++)
+            {
+                const int shape = cvtt::Internal::BC7Data::g_shapes3[p][s];
+                if (encodingPlan.seedPointsForShapeRGB[shape] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shape] = seeds;
+            }
+        }
+    }
+
+    // Mode 3: 64 partitions of 2 subsets.
+    for (int p = 0; p < 64; p++)
+    {
+        const uint8_t seeds = params.mode3SP[p];
+        if (seeds != 0)
+        {
+            encodingPlan.mode3PartitionEnabled |= static_cast<uint64_t>(1) << p;
+            for (int s = 0; s < 2; s++)
+            {
+                const int shape = cvtt::Internal::BC7Data::g_shapes2[p][s];
+                if (encodingPlan.seedPointsForShapeRGB[shape] < seeds)
+                    encodingPlan.seedPointsForShapeRGB[shape] = seeds;
+            }
+        }
+    }
+
+    // Mode 4: seed point counts are copied through per rotation and index mode.
+    for (int rot = 0; rot < 4; rot++)
+    {
+        for (int idxMode = 0; idxMode < 2; idxMode++)
+            encodingPlan.mode4SP[rot][idxMode] = params.mode4SP[rot][idxMode];
+    }
+
+    // Mode 5: seed point counts are copied through per rotation.
+    for (int rot = 0; rot < 4; rot++)
+        encodingPlan.mode5SP[rot] = params.mode5SP[rot];
+
+    // Mode 6: a single shape, tracked in the RGBA seed point table.
+    if (params.mode6SP != 0)
+    {
+        const uint8_t seeds = params.mode6SP;
+        const int shape = cvtt::Internal::BC7Data::g_shapes1[0][0];
+
+        encodingPlan.mode6Enabled = true;
+
+        if (encodingPlan.seedPointsForShapeRGBA[shape] < seeds)
+            encodingPlan.seedPointsForShapeRGBA[shape] = seeds;
+    }
+
+    // Mode 7: 64 partitions of 2 subsets, tracked in the RGBA seed point table.
+    for (int p = 0; p < 64; p++)
+    {
+        const uint8_t seeds = params.mode7SP[p];
+        if (seeds != 0)
+        {
+            encodingPlan.mode7RGBAPartitionEnabled |= static_cast<uint64_t>(1) << p;
+            for (int s = 0; s < 2; s++)
+            {
+                const int shape = cvtt::Internal::BC7Data::g_shapes2[p][s];
+                if (encodingPlan.seedPointsForShapeRGBA[shape] < seeds)
+                    encodingPlan.seedPointsForShapeRGBA[shape] = seeds;
+            }
+        }
+    }
+
+    // List, in ascending order, the RGB shapes that received at least one seed point.
+    for (int shape = 0; shape < BC7EncodingPlan::kNumRGBShapes; shape++)
+    {
+        if (encodingPlan.seedPointsForShapeRGB[shape] == 0)
+            continue;
+
+        encodingPlan.rgbShapeList[encodingPlan.rgbNumShapesToEvaluate++] = shape;
+    }
+
+    // Same for the RGBA shapes.
+    for (int shape = 0; shape < BC7EncodingPlan::kNumRGBAShapes; shape++)
+    {
+        if (encodingPlan.seedPointsForShapeRGBA[shape] == 0)
+            continue;
+
+        encodingPlan.rgbaShapeList[encodingPlan.rgbaNumShapesToEvaluate++] = shape;
+    }
+    // The RGB mode 7 set is the RGBA mode 7 set minus every partition already enabled for mode 3.
+    encodingPlan.mode7RGBPartitionEnabled = encodingPlan.mode7RGBAPartitionEnabled & ~encodingPlan.mode3PartitionEnabled;
+
+    return true;
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC67.h b/thirdparty/cvtt/ConvectionKernels_BC67.h
new file mode 100644
index 0000000000..b929711187
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC67.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "ConvectionKernels_ParallelMath.h"
+
+
+namespace cvtt    // Forward declarations for types the class declarations below only use through pointers and references.
+{
+    namespace Tables
+    {
+        namespace BC7SC
+        {
+            struct Table;
+        }
+    }
+
+    namespace Internal
+    {
+        namespace BC67
+        {
+            struct WorkInfo;
+        }
+
+        template<int TVectorSize>
+        class IndexSelectorHDR;
+    }
+
+    struct PixelBlockU8;    // NOTE(review): PixelBlockF16 and BC7EncodingPlan are also used below but not declared in this header - presumably supplied by an earlier include; confirm.
+}
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        class BC7Computer    // BC7 packing/unpacking routines; every member is static and the definitions live outside this header.
+        {
+        public:
+            static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds);
+            static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock);
+            // Everything below is private implementation detail.
+        private:
+            static const int MaxTweakRounds = 4;
+            // Short aliases for the ParallelMath vector types used in the helper signatures.
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::Float MFloat;
+
+            static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2]);
+            static void Quantize(MUInt15* color, int bits, int channels);
+            static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels);
+            static void Unquantize(MUInt15* color, int bits, int channels);
+            static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p);
+            static void CompressEndpoints2(MUInt15 ep[2][4]);
+            static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2]);
+            static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2]);
+            static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2]);
+            static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2]);
+            static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn);
+
+            template<class T>
+            static void Swap(T& a, T& b);
+        };
+
+
+        class BC6HComputer    // BC6H packing/unpacking routines; every member is static.
+        {
+        public:
+            static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds);    // writes 16 bytes per block for ParallelMath::ParallelSize blocks
+            static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned);    // decodes a single packed block into 16 pixels
+
+        private:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+            static const int MaxTweakRounds = 4;
+            static const int MaxRefineRounds = 3;
+            // Helper routines; the two legality checks are called from Pack and SignExtendSingle from UnpackOne.
+            static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru);
+            static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru);
+            static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL);
+            static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished);
+            static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal);
+            static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal);
+            static void SignExtendSingle(int &v, int bits);    // sign-extends, in place, the value held in the low `bits` bits of v
+        };
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp
new file mode 100644
index 0000000000..753b6f9000
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.cpp
@@ -0,0 +1,881 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC6H_IO.h"
+
+namespace cvtt
+{
+    namespace BC6H_IO
+    {
+        // Packs the field values of the mode-0 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 2 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode0(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x3u) | ((gy32 >> 2) & 0x4u) | ((by32 >> 1) & 0x8u) | (bz32 & 0x10u)
+                       | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u) | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0xf8u) | ((gz32 << 4) & 0x100u)
+                       | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x3e000u) | ((bz32 << 18) & 0x40000u)
+                       | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0xf800000u)
+                       | ((bz32 << 27) & 0x10000000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x3eu) | ((bz32 << 4) & 0x40u)
+                       | ((rz32 << 7) & 0xf80u) | ((bz32 << 9) & 0x1000u) | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-1 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 2 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode1(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x3u) | ((gy32 >> 3) & 0x4u) | ((gz32 >> 1) & 0x18u) | ((rw32 << 5) & 0xfe0u)
+                       | ((bz32 << 12) & 0x3000u) | ((by32 << 10) & 0x4000u) | ((gw32 << 15) & 0x3f8000u)
+                       | ((by32 << 17) & 0x400000u) | ((bz32 << 21) & 0x800000u)
+                       | ((gy32 << 20) & 0x1000000u) | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bz32 >> 3) & 0x1u) | ((bz32 >> 4) & 0x2u) | ((bz32 >> 2) & 0x4u)
+                       | ((rx32 << 3) & 0x1f8u) | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x7e000u)
+                       | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0x1f800000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x7eu) | ((rz32 << 7) & 0x1f80u)
+                       | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-2 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode2(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u)
+                       | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0xf8u) | ((rw32 >> 2) & 0x100u)
+                       | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x1e000u) | ((gw32 << 7) & 0x20000u)
+                       | ((bz32 << 18) & 0x40000u) | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0x7800000u)
+                       | ((bw32 << 17) & 0x8000000u) | ((bz32 << 27) & 0x10000000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x3eu) | ((bz32 << 4) & 0x40u)
+                       | ((rz32 << 7) & 0xf80u) | ((bz32 << 9) & 0x1000u) | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-3 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode3(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u)
+                       | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0x78u) | ((rw32 >> 3) & 0x80u)
+                       | ((gz32 << 4) & 0x100u) | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x3e000u)
+                       | ((gw32 << 8) & 0x40000u) | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0x7800000u)
+                       | ((bw32 << 17) & 0x8000000u) | ((bz32 << 27) & 0x10000000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x1eu) | ((bz32 << 5) & 0x20u)
+                       | ((bz32 << 4) & 0x40u) | ((rz32 << 7) & 0x780u) | ((gy32 << 7) & 0x800u)
+                       | ((bz32 << 9) & 0x1000u) | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-4 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode4(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u)
+                       | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0x78u) | ((rw32 >> 3) & 0x80u)
+                       | ((by32 << 4) & 0x100u) | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x1e000u)
+                       | ((gw32 << 7) & 0x20000u) | ((bz32 << 18) & 0x40000u) | ((gz32 << 19) & 0x780000u)
+                       | ((bx32 << 23) & 0xf800000u) | ((bw32 << 18) & 0x10000000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x1eu) | ((bz32 << 4) & 0x60u)
+                       | ((rz32 << 7) & 0x780u) | ((bz32 << 7) & 0x800u) | ((bz32 << 9) & 0x1000u)
+                       | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-5 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode5(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x3fe0u) | ((by32 << 10) & 0x4000u)
+                       | ((gw32 << 15) & 0xff8000u) | ((gy32 << 20) & 0x1000000u) | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x3u) | ((bz32 >> 2) & 0x4u) | ((rx32 << 3) & 0xf8u)
+                       | ((gz32 << 4) & 0x100u) | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x3e000u)
+                       | ((bz32 << 18) & 0x40000u) | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0xf800000u)
+                       | ((bz32 << 27) & 0x10000000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x3eu) | ((bz32 << 4) & 0x40u)
+                       | ((rz32 << 7) & 0xf80u) | ((bz32 << 9) & 0x1000u) | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-6 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode6(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x1fe0u) | ((gz32 << 9) & 0x2000u)
+                       | ((by32 << 10) & 0x4000u) | ((gw32 << 15) & 0x7f8000u) | ((bz32 << 21) & 0x800000u)
+                       | ((gy32 << 20) & 0x1000000u) | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x1u) | ((bz32 >> 2) & 0x6u) | ((rx32 << 3) & 0x1f8u)
+                       | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x3e000u) | ((bz32 << 18) & 0x40000u)
+                       | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0xf800000u)
+                       | ((bz32 << 27) & 0x10000000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x7eu) | ((rz32 << 7) & 0x1f80u)
+                       | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-7 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode7(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x1fe0u) | ((bz32 << 13) & 0x2000u)
+                       | ((by32 << 10) & 0x4000u) | ((gw32 << 15) & 0x7f8000u) | ((gy32 << 18) & 0x800000u)
+                       | ((gy32 << 20) & 0x1000000u) | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x1u) | ((gz32 >> 4) & 0x2u) | ((bz32 >> 2) & 0x4u)
+                       | ((rx32 << 3) & 0xf8u) | ((gz32 << 4) & 0x100u) | ((gy32 << 9) & 0x1e00u)
+                       | ((gx32 << 13) & 0x7e000u) | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0xf800000u)
+                       | ((bz32 << 27) & 0x10000000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x3eu) | ((bz32 << 4) & 0x40u)
+                       | ((rz32 << 7) & 0xf80u) | ((bz32 << 9) & 0x1000u) | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-8 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode8(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x1fe0u) | ((bz32 << 12) & 0x2000u)
+                       | ((by32 << 10) & 0x4000u) | ((gw32 << 15) & 0x7f8000u) | ((by32 << 18) & 0x800000u)
+                       | ((gy32 << 20) & 0x1000000u) | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x1u) | ((bz32 >> 4) & 0x2u) | ((bz32 >> 2) & 0x4u)
+                       | ((rx32 << 3) & 0xf8u) | ((gz32 << 4) & 0x100u) | ((gy32 << 9) & 0x1e00u)
+                       | ((gx32 << 13) & 0x3e000u) | ((bz32 << 18) & 0x40000u) | ((gz32 << 19) & 0x780000u)
+                       | ((bx32 << 23) & 0x1f800000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x3eu) | ((bz32 << 4) & 0x40u)
+                       | ((rz32 << 7) & 0xf80u) | ((bz32 << 9) & 0x1000u) | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-9 layout into encoded[0..2].
+        //
+        // Every term shifts one field into place and then masks the destination bits, so
+        // input bits that no mask selects are ignored (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with bits 0..17 populated and all higher bits zero. encoded[] is never read.
+        //
+        // NOTE(review): rw..bz look like the r/g/b components of endpoints w/x/y/z and d
+        // like the partition index - confirm against the caller.
+        void WriteMode9(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (gz << 26) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m, d32 = d;
+            const uint32_t rw32 = rw, rx32 = rx, ry32 = ry, rz32 = rz;
+            const uint32_t gw32 = gw, gx32 = gx, gy32 = gy, gz32 = gz;
+            const uint32_t bw32 = bw, bx32 = bx, by32 = by, bz32 = bz;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7e0u) | ((gz32 << 7) & 0x800u)
+                       | ((bz32 << 12) & 0x3000u) | ((by32 << 10) & 0x4000u) | ((gw32 << 15) & 0x1f8000u)
+                       | ((gy32 << 16) & 0x200000u) | ((by32 << 17) & 0x400000u) | ((bz32 << 21) & 0x800000u)
+                       | ((gy32 << 20) & 0x1000000u) | ((bw32 << 25) & 0x7e000000u) | ((gz32 << 26) & 0x80000000u);
+            encoded[1] = ((bz32 >> 3) & 0x1u) | ((bz32 >> 4) & 0x2u) | ((bz32 >> 2) & 0x4u)
+                       | ((rx32 << 3) & 0x1f8u) | ((gy32 << 9) & 0x1e00u) | ((gx32 << 13) & 0x7e000u)
+                       | ((gz32 << 19) & 0x780000u) | ((bx32 << 23) & 0x1f800000u) | ((by32 << 29) & 0xe0000000u);
+            encoded[2] = ((by32 >> 3) & 0x1u) | ((ry32 << 1) & 0x7eu) | ((rz32 << 7) & 0x1f80u)
+                       | ((d32 << 13) & 0x3e000u);
+        }
+
+        // Packs the field values of the mode-10 layout into encoded[0..2].
+        //
+        // Only m, rw, gw, bw, rx, gx and bx are used; d, ry, rz, gy, gz, by and bz are
+        // accepted but ignored by this mode. Every term shifts one field into place and
+        // then masks the destination bits, so input bits that no mask selects are dropped
+        // (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with only bit 0 populated and all higher bits zero. encoded[] is never read.
+        void WriteMode10(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m;
+            const uint32_t rw32 = rw, rx32 = rx;
+            const uint32_t gw32 = gw, gx32 = gx;
+            const uint32_t bw32 = bw, bx32 = bx;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u)
+                       | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0x1ff8u) | ((gx32 << 13) & 0x7fe000u)
+                       | ((bx32 << 23) & 0xff800000u);
+            encoded[2] = ((bx32 >> 9) & 0x1u);
+        }
+
+        // Packs the field values of the mode-11 layout into encoded[0..2].
+        //
+        // Only m, rw, gw, bw, rx, gx and bx are used; d, ry, rz, gy, gz, by and bz are
+        // accepted but ignored by this mode. Every term shifts one field into place and
+        // then masks the destination bits, so input bits that no mask selects are dropped
+        // (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with only bit 0 populated and all higher bits zero. encoded[] is never read.
+        void WriteMode11(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m;
+            const uint32_t rw32 = rw, rx32 = rx;
+            const uint32_t gw32 = gw, gx32 = gx;
+            const uint32_t bw32 = bw, bx32 = bx;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u)
+                       | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0xff8u) | ((rw32 << 2) & 0x1000u)
+                       | ((gx32 << 13) & 0x3fe000u) | ((gw32 << 12) & 0x400000u) | ((bx32 << 23) & 0xff800000u);
+            encoded[2] = ((bw32 >> 10) & 0x1u);
+        }
+
+        // Packs the field values of the mode-12 layout into encoded[0..2].
+        //
+        // Only m, rw, gw, bw, rx, gx and bx are used; d, ry, rz, gy, gz, by and bz are
+        // accepted but ignored by this mode. Every term shifts one field into place and
+        // then masks the destination bits, so input bits that no mask selects are dropped
+        // (m contributes only its low 5 bits).
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with only bit 0 populated and all higher bits zero. encoded[] is never read.
+        void WriteMode12(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 25) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m;
+            const uint32_t rw32 = rw, rx32 = rx;
+            const uint32_t gw32 = gw, gx32 = gx;
+            const uint32_t bw32 = bw, bx32 = bx;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u)
+                       | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0x7f8u) | (rw32 & 0x800u)
+                       | ((rw32 << 2) & 0x1000u) | ((gx32 << 13) & 0x1fe000u) | ((gw32 << 10) & 0x200000u)
+                       | ((gw32 << 12) & 0x400000u) | ((bx32 << 23) & 0x7f800000u) | ((bw32 << 20) & 0x80000000u);
+            encoded[2] = ((bw32 >> 10) & 0x1u);
+        }
+
+        // Packs the field values of the mode-13 layout into encoded[0..2].
+        //
+        // Only m, rw, gw, bw, rx, gx and bx are used; d, ry, rz, gy, gz, by and bz are
+        // accepted but ignored by this mode. Every term shifts one field into place and
+        // then masks the destination bits (m contributes only its low 5 bits). Bits 10..15
+        // of rw, gw and bw are placed one bit at a time, each at its own shift, which
+        // stores them in the opposite order to the low ten bits.
+        // encoded[0] and encoded[1] are overwritten completely; encoded[2] is overwritten
+        // with only bit 0 populated and all higher bits zero. encoded[] is never read.
+        void WriteMode13(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz)
+        {
+            // Widen before shifting: a uint16_t operand is promoted to int, and shifts such
+            // as (bw << 20) can then overflow a 32-bit int, which is undefined before C++20.
+            const uint32_t m32 = m;
+            const uint32_t rw32 = rw, rx32 = rx;
+            const uint32_t gw32 = gw, gx32 = gx;
+            const uint32_t bw32 = bw, bx32 = bx;
+
+            encoded[0] = (m32 & 0x1fu) | ((rw32 << 5) & 0x7fe0u) | ((gw32 << 15) & 0x1ff8000u)
+                       | ((bw32 << 25) & 0xfe000000u);
+            encoded[1] = ((bw32 >> 7) & 0x7u) | ((rx32 << 3) & 0x78u)
+                       | ((rw32 >> 8) & 0x80u) | ((rw32 >> 6) & 0x100u) | ((rw32 >> 4) & 0x200u)
+                       | ((rw32 >> 2) & 0x400u) | (rw32 & 0x800u) | ((rw32 << 2) & 0x1000u)
+                       | ((gx32 << 13) & 0x1e000u)
+                       | ((gw32 << 2) & 0x20000u) | ((gw32 << 4) & 0x40000u) | ((gw32 << 6) & 0x80000u)
+                       | ((gw32 << 8) & 0x100000u) | ((gw32 << 10) & 0x200000u) | ((gw32 << 12) & 0x400000u)
+                       | ((bx32 << 23) & 0x7800000u)
+                       | ((bw32 << 12) & 0x8000000u) | ((bw32 << 14) & 0x10000000u) | ((bw32 << 16) & 0x20000000u)
+                       | ((bw32 << 18) & 0x40000000u) | ((bw32 << 20) & 0x80000000u);
+            encoded[2] = ((bw32 >> 10) & 0x1u);
+        }
+
+        // Unpacks the mode-0 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..1 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode0(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>((w0 >> 5) & 0x3ffu);
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0x1fu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0x1fu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0x1fu);
+            outGW = static_cast<uint16_t>((w0 >> 15) & 0x3ffu);
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0x1fu);
+            outGY = static_cast<uint16_t>(((w0 << 2) & 0x10u) | ((w1 >> 9) & 0xfu));
+            outGZ = static_cast<uint16_t>(((w1 >> 4) & 0x10u) | ((w1 >> 19) & 0xfu));
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x380u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0x1fu);
+            outBY = static_cast<uint16_t>(((w0 << 1) & 0x10u) | ((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>((w0 & 0x10u) | ((w1 >> 18) & 0x1u) | ((w1 >> 27) & 0x2u)
+                                          | ((w2 >> 4) & 0x4u) | ((w2 >> 9) & 0x8u));
+        }
+
+        // Unpacks the mode-1 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..1 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode1(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>((w0 >> 5) & 0x7fu);
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0x3fu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0x3fu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0x3fu);
+            outGW = static_cast<uint16_t>((w0 >> 15) & 0x7fu);
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0x3fu);
+            outGY = static_cast<uint16_t>(((w0 << 3) & 0x20u) | ((w0 >> 20) & 0x10u) | ((w1 >> 9) & 0xfu));
+            outGZ = static_cast<uint16_t>(((w0 << 1) & 0x30u) | ((w1 >> 19) & 0xfu));
+            outBW = static_cast<uint16_t>((w0 >> 25) & 0x7fu);
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0x3fu);
+            outBY = static_cast<uint16_t>(((w0 >> 10) & 0x10u) | ((w0 >> 17) & 0x20u)
+                                          | ((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w0 >> 12) & 0x3u) | ((w0 >> 21) & 0x4u) | ((w1 << 3) & 0x8u)
+                                          | ((w1 << 4) & 0x20u) | ((w1 << 2) & 0x10u));
+        }
+
+        // Unpacks the mode-2 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..4 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode2(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>(((w0 >> 5) & 0x3ffu) | ((w1 << 2) & 0x400u));
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0x1fu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0x1fu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0x1fu);
+            outGW = static_cast<uint16_t>(((w0 >> 15) & 0x3ffu) | ((w1 >> 7) & 0x400u));
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0xfu);
+            outGY = static_cast<uint16_t>((w1 >> 9) & 0xfu);
+            outGZ = static_cast<uint16_t>((w1 >> 19) & 0xfu);
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x380u) | ((w1 >> 17) & 0x400u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0xfu);
+            outBY = static_cast<uint16_t>(((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w1 >> 18) & 0x1u) | ((w1 >> 27) & 0x2u)
+                                          | ((w2 >> 4) & 0x4u) | ((w2 >> 9) & 0x8u));
+        }
+
+        // Unpacks the mode-3 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..4 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode3(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>(((w0 >> 5) & 0x3ffu) | ((w1 << 3) & 0x400u));
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0xfu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0xfu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0xfu);
+            outGW = static_cast<uint16_t>(((w0 >> 15) & 0x3ffu) | ((w1 >> 8) & 0x400u));
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0x1fu);
+            outGY = static_cast<uint16_t>(((w1 >> 9) & 0xfu) | ((w2 >> 7) & 0x10u));
+            outGZ = static_cast<uint16_t>(((w1 >> 4) & 0x10u) | ((w1 >> 19) & 0xfu));
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x380u) | ((w1 >> 17) & 0x400u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0xfu);
+            outBY = static_cast<uint16_t>(((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w1 >> 27) & 0x2u) | ((w2 >> 5) & 0x1u)
+                                          | ((w2 >> 4) & 0x4u) | ((w2 >> 9) & 0x8u));
+        }
+
+        // Unpacks the mode-4 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..4 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode4(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>(((w0 >> 5) & 0x3ffu) | ((w1 << 3) & 0x400u));
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0xfu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0xfu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0xfu);
+            outGW = static_cast<uint16_t>(((w0 >> 15) & 0x3ffu) | ((w1 >> 7) & 0x400u));
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0xfu);
+            outGY = static_cast<uint16_t>((w1 >> 9) & 0xfu);
+            outGZ = static_cast<uint16_t>((w1 >> 19) & 0xfu);
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x380u) | ((w1 >> 18) & 0x400u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0x1fu);
+            outBY = static_cast<uint16_t>(((w1 >> 4) & 0x10u) | ((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w1 >> 18) & 0x1u) | ((w2 >> 4) & 0x6u)
+                                          | ((w2 >> 7) & 0x10u) | ((w2 >> 9) & 0x8u));
+        }
+
+        // Unpacks the mode-5 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..4 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode5(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>((w0 >> 5) & 0x1ffu);
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0x1fu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0x1fu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0x1fu);
+            outGW = static_cast<uint16_t>((w0 >> 15) & 0x1ffu);
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0x1fu);
+            outGY = static_cast<uint16_t>(((w0 >> 20) & 0x10u) | ((w1 >> 9) & 0xfu));
+            outGZ = static_cast<uint16_t>(((w1 >> 4) & 0x10u) | ((w1 >> 19) & 0xfu));
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x180u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0x1fu);
+            outBY = static_cast<uint16_t>(((w0 >> 10) & 0x10u) | ((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w1 << 2) & 0x10u) | ((w1 >> 18) & 0x1u) | ((w1 >> 27) & 0x2u)
+                                          | ((w2 >> 4) & 0x4u) | ((w2 >> 9) & 0x8u));
+        }
+
+        // Unpacks the mode-6 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..4 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode6(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>((w0 >> 5) & 0xffu);
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0x3fu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0x3fu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0x3fu);
+            outGW = static_cast<uint16_t>((w0 >> 15) & 0xffu);
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0x1fu);
+            outGY = static_cast<uint16_t>(((w0 >> 20) & 0x10u) | ((w1 >> 9) & 0xfu));
+            outGZ = static_cast<uint16_t>(((w0 >> 9) & 0x10u) | ((w1 >> 19) & 0xfu));
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x80u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0x1fu);
+            outBY = static_cast<uint16_t>(((w0 >> 10) & 0x10u) | ((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w0 >> 21) & 0x4u) | ((w1 << 2) & 0x18u)
+                                          | ((w1 >> 18) & 0x1u) | ((w1 >> 27) & 0x2u));
+        }
+
+        // Unpacks the mode-7 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..4 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode7(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>((w0 >> 5) & 0xffu);
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0x1fu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0x1fu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0x1fu);
+            outGW = static_cast<uint16_t>((w0 >> 15) & 0xffu);
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0x3fu);
+            outGY = static_cast<uint16_t>(((w0 >> 18) & 0x20u) | ((w0 >> 20) & 0x10u) | ((w1 >> 9) & 0xfu));
+            outGZ = static_cast<uint16_t>(((w1 << 4) & 0x20u) | ((w1 >> 4) & 0x10u) | ((w1 >> 19) & 0xfu));
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x80u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0x1fu);
+            outBY = static_cast<uint16_t>(((w0 >> 10) & 0x10u) | ((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w0 >> 13) & 0x1u) | ((w1 << 2) & 0x10u) | ((w1 >> 27) & 0x2u)
+                                          | ((w2 >> 4) & 0x4u) | ((w2 >> 9) & 0x8u));
+        }
+
+        // Unpacks the mode-8 field values from encoded[0..2].
+        //
+        // Each output is the OR of one or more bit runs; every run is shifted so that it
+        // lands on the field bits selected by the mask written next to it. All thirteen
+        // outputs are always overwritten and encoded[] is left untouched. Bits 0..4 of
+        // encoded[0] and bits 18..31 of encoded[2] are never examined.
+        void ReadMode8(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ)
+        {
+            // Snapshot the three words before any output is written.
+            const uint32_t w0 = encoded[0];
+            const uint32_t w1 = encoded[1];
+            const uint32_t w2 = encoded[2];
+
+            outD = static_cast<uint16_t>((w2 >> 13) & 0x1fu);
+            outRW = static_cast<uint16_t>((w0 >> 5) & 0xffu);
+            outRX = static_cast<uint16_t>((w1 >> 3) & 0x1fu);
+            outRY = static_cast<uint16_t>((w2 >> 1) & 0x1fu);
+            outRZ = static_cast<uint16_t>((w2 >> 7) & 0x1fu);
+            outGW = static_cast<uint16_t>((w0 >> 15) & 0xffu);
+            outGX = static_cast<uint16_t>((w1 >> 13) & 0x1fu);
+            outGY = static_cast<uint16_t>(((w0 >> 20) & 0x10u) | ((w1 >> 9) & 0xfu));
+            outGZ = static_cast<uint16_t>(((w1 >> 4) & 0x10u) | ((w1 >> 19) & 0xfu));
+            outBW = static_cast<uint16_t>(((w0 >> 25) & 0x7fu) | ((w1 << 7) & 0x80u));
+            outBX = static_cast<uint16_t>((w1 >> 23) & 0x3fu);
+            outBY = static_cast<uint16_t>(((w0 >> 10) & 0x10u) | ((w0 >> 18) & 0x20u)
+                                          | ((w1 >> 29) & 0x7u) | ((w2 << 3) & 0x8u));
+            outBZ = static_cast<uint16_t>(((w0 >> 12) & 0x2u) | ((w1 << 4) & 0x20u) | ((w1 << 2) & 0x10u)
+                                          | ((w1 >> 18) & 0x1u) | ((w2 >> 4) & 0x4u) | ((w2 >> 9) & 0x8u));
+        }
+
+        void ReadMode9(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ) // Reader stored at index 9 of g_readFuncs: unpacks 13 fields from encoded[0..2]. In this layout every r*/g*/b* field is 6 bits wide and d is 5 bits.
+        {
+            uint16_t d = 0; // Every field starts at 0 and is built purely by OR-ing masked, shifted input bits, so statement order does not affect the result.
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3fu); // rw: input bits 5-10 of encoded[0]. Bits 0-4 are never read here (presumably the mode selector - confirm against the caller).
+            gz |= ((encoded[0] >> 7) & 0x10u); // Single-bit move: input bit 11 becomes gz bit 4. gy/gz/by/bz are assembled from several such scattered pieces.
+            bz |= ((encoded[0] >> 12) & 0x3u);
+            by |= ((encoded[0] >> 10) & 0x10u);
+            gw |= ((encoded[0] >> 15) & 0x3fu);
+            gy |= ((encoded[0] >> 16) & 0x20u);
+            by |= ((encoded[0] >> 17) & 0x20u);
+            bz |= ((encoded[0] >> 21) & 0x4u);
+            gy |= ((encoded[0] >> 20) & 0x10u);
+            bw |= ((encoded[0] >> 25) & 0x3fu);
+            gz |= ((encoded[0] >> 26) & 0x20u); // Top bit (31) of encoded[0] becomes gz bit 5.
+            bz |= ((encoded[1] << 3) & 0x8u); // The three lowest bits of encoded[1] feed bz bits 3, 5 and 4 respectively (left shifts move them up into place).
+            bz |= ((encoded[1] << 4) & 0x20u);
+            bz |= ((encoded[1] << 2) & 0x10u);
+            rx |= ((encoded[1] >> 3) & 0x3fu);
+            gy |= ((encoded[1] >> 9) & 0xfu);
+            gx |= ((encoded[1] >> 13) & 0x3fu);
+            gz |= ((encoded[1] >> 19) & 0xfu);
+            bx |= ((encoded[1] >> 23) & 0x3fu);
+            by |= ((encoded[1] >> 29) & 0x7u);
+            by |= ((encoded[2] << 3) & 0x8u); // by bit 3 is bit 0 of encoded[2]: the field straddles the word boundary.
+            ry |= ((encoded[2] >> 1) & 0x3fu);
+            rz |= ((encoded[2] >> 7) & 0x3fu);
+            d |= ((encoded[2] >> 13) & 0x1fu); // d: input bits 13-17 of encoded[2]; nothing above bit 17 of encoded[2] is read by this function.
+            outD = d; // Copy the assembled locals out through the caller's references.
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode10(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ) // Reader stored at index 10 of g_readFuncs: unpacks six 10-bit fields (rw, gw, bw, rx, gx, bx) from encoded[0..2]; all other outputs are set to 0.
+        {
+            uint16_t d = 0; // d, ry, rz, gy, gz, by and bz are never assigned from the input in this layout and are returned as 0.
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // rw: input bits 5-14 of encoded[0]. Bits 0-4 are never read here (presumably the mode selector - confirm against the caller).
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu); // Low 7 bits of bw come from the top of encoded[0]...
+            bw |= ((encoded[1] << 7) & 0x380u); // ...and bw bits 7-9 are bits 0-2 of encoded[1]: the field straddles the word boundary.
+            rx |= ((encoded[1] >> 3) & 0x3ffu);
+            gx |= ((encoded[1] >> 13) & 0x3ffu);
+            bx |= ((encoded[1] >> 23) & 0x1ffu); // Low 9 bits of bx fill the rest of encoded[1]...
+            bx |= ((encoded[2] << 9) & 0x200u); // ...and bx bit 9 is bit 0 of encoded[2], the only bit of encoded[2] this function reads.
+            outD = d; // Copy the assembled locals out through the caller's references.
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode11(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ) // Reader stored at index 11 of g_readFuncs: unpacks 11-bit rw/gw/bw and 9-bit rx/gx/bx from encoded[0..2]; all other outputs are set to 0.
+        {
+            uint16_t d = 0; // d, ry, rz, gy, gz, by and bz are never assigned from the input in this layout and are returned as 0.
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // Low 10 bits of rw/gw/bw sit at the same input positions as in ReadMode10. Bits 0-4 of encoded[0] are never read here.
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu);
+            bw |= ((encoded[1] << 7) & 0x380u); // bw bits 7-9 are bits 0-2 of encoded[1]: the field straddles the word boundary.
+            rx |= ((encoded[1] >> 3) & 0x1ffu); // Each x field is 9 bits; the input bit directly above it carries bit 10 of the matching w field.
+            rw |= ((encoded[1] >> 2) & 0x400u); // Input bit 12 of encoded[1] becomes rw bit 10.
+            gx |= ((encoded[1] >> 13) & 0x1ffu);
+            gw |= ((encoded[1] >> 12) & 0x400u); // Input bit 22 of encoded[1] becomes gw bit 10.
+            bx |= ((encoded[1] >> 23) & 0x1ffu);
+            bw |= ((encoded[2] << 10) & 0x400u); // bw bit 10 is bit 0 of encoded[2], the only bit of encoded[2] this function reads.
+            outD = d; // Copy the assembled locals out through the caller's references.
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode12(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ) // Reader stored at index 12 of g_readFuncs: unpacks 12-bit rw/gw/bw and 8-bit rx/gx/bx from encoded[0..2]; all other outputs are set to 0.
+        {
+            uint16_t d = 0; // d, ry, rz, gy, gz, by and bz are never assigned from the input in this layout and are returned as 0.
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // Low 10 bits of rw/gw/bw sit at the same input positions as in ReadMode10. Bits 0-4 of encoded[0] are never read here.
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu);
+            bw |= ((encoded[1] << 7) & 0x380u); // bw bits 7-9 are bits 0-2 of encoded[1]: the field straddles the word boundary.
+            rx |= ((encoded[1] >> 3) & 0xffu); // Each x field is 8 bits; the two input bits above it carry bits 11 and 10 of the matching w field, in that order (bit-reversed).
+            rw |= (encoded[1] & 0x800u); // Input bit 11 is already at position 11, so no shift is needed.
+            rw |= ((encoded[1] >> 2) & 0x400u); // Input bit 12 becomes rw bit 10.
+            gx |= ((encoded[1] >> 13) & 0xffu);
+            gw |= ((encoded[1] >> 10) & 0x800u); // Input bit 21 becomes gw bit 11.
+            gw |= ((encoded[1] >> 12) & 0x400u); // Input bit 22 becomes gw bit 10.
+            bx |= ((encoded[1] >> 23) & 0xffu);
+            bw |= ((encoded[1] >> 20) & 0x800u); // Input bit 31 becomes bw bit 11.
+            bw |= ((encoded[2] << 10) & 0x400u); // bw bit 10 is bit 0 of encoded[2], the only bit of encoded[2] this function reads.
+            outD = d; // Copy the assembled locals out through the caller's references.
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        void ReadMode13(const uint32_t *encoded, uint16_t &outD, uint16_t &outRW, uint16_t &outRX, uint16_t &outRY, uint16_t &outRZ, uint16_t &outGW, uint16_t &outGX, uint16_t &outGY, uint16_t &outGZ, uint16_t &outBW, uint16_t &outBX, uint16_t &outBY, uint16_t &outBZ) // Reader stored at index 13 of g_readFuncs: unpacks 16-bit rw/gw/bw and 4-bit rx/gx/bx from encoded[0..2]; all other outputs are set to 0.
+        {
+            uint16_t d = 0; // d, ry, rz, gy, gz, by and bz are never assigned from the input in this layout and are returned as 0.
+            uint16_t rw = 0;
+            uint16_t rx = 0;
+            uint16_t ry = 0;
+            uint16_t rz = 0;
+            uint16_t gw = 0;
+            uint16_t gx = 0;
+            uint16_t gy = 0;
+            uint16_t gz = 0;
+            uint16_t bw = 0;
+            uint16_t bx = 0;
+            uint16_t by = 0;
+            uint16_t bz = 0;
+            rw |= ((encoded[0] >> 5) & 0x3ffu); // Low 10 bits of rw/gw/bw sit at the same input positions as in ReadMode10. Bits 0-4 of encoded[0] are never read here.
+            gw |= ((encoded[0] >> 15) & 0x3ffu);
+            bw |= ((encoded[0] >> 25) & 0x7fu);
+            bw |= ((encoded[1] << 7) & 0x380u); // bw bits 7-9 are bits 0-2 of encoded[1]: the field straddles the word boundary.
+            rx |= ((encoded[1] >> 3) & 0xfu); // Each x field is 4 bits; the six input bits above it carry bits 15..10 of the matching w field in bit-reversed order.
+            rw |= ((encoded[1] << 8) & 0x8000u); // Input bits 7,8,9,10,11,12 of encoded[1] become rw bits 15,14,13,12,11,10; each line moves exactly one bit.
+            rw |= ((encoded[1] << 6) & 0x4000u);
+            rw |= ((encoded[1] << 4) & 0x2000u);
+            rw |= ((encoded[1] << 2) & 0x1000u);
+            rw |= (encoded[1] & 0x800u);
+            rw |= ((encoded[1] >> 2) & 0x400u);
+            gx |= ((encoded[1] >> 13) & 0xfu);
+            gw |= ((encoded[1] >> 2) & 0x8000u); // Input bits 17..22 of encoded[1] become gw bits 15..10, again reversed.
+            gw |= ((encoded[1] >> 4) & 0x4000u);
+            gw |= ((encoded[1] >> 6) & 0x2000u);
+            gw |= ((encoded[1] >> 8) & 0x1000u);
+            gw |= ((encoded[1] >> 10) & 0x800u);
+            gw |= ((encoded[1] >> 12) & 0x400u);
+            bx |= ((encoded[1] >> 23) & 0xfu);
+            bw |= ((encoded[1] >> 12) & 0x8000u); // Input bits 27..31 of encoded[1] become bw bits 15..11, reversed...
+            bw |= ((encoded[1] >> 14) & 0x4000u);
+            bw |= ((encoded[1] >> 16) & 0x2000u);
+            bw |= ((encoded[1] >> 18) & 0x1000u);
+            bw |= ((encoded[1] >> 20) & 0x800u);
+            bw |= ((encoded[2] << 10) & 0x400u); // ...and bw bit 10 is bit 0 of encoded[2], the only bit of encoded[2] this function reads.
+            outD = d; // Copy the assembled locals out through the caller's references.
+            outRW = rw;
+            outRX = rx;
+            outRY = ry;
+            outRZ = rz;
+            outGW = gw;
+            outGX = gx;
+            outGY = gy;
+            outGZ = gz;
+            outBW = bw;
+            outBX = bx;
+            outBY = by;
+            outBZ = bz;
+        }
+
+        const ReadFunc_t g_readFuncs[14] = // Reader dispatch table: entry i is ReadMode<i>, so the index must match the numbering used by the ReadModeN functions.
+        {
+            ReadMode0,
+            ReadMode1,
+            ReadMode2,
+            ReadMode3,
+            ReadMode4,
+            ReadMode5,
+            ReadMode6,
+            ReadMode7,
+            ReadMode8,
+            ReadMode9,
+            ReadMode10,
+            ReadMode11,
+            ReadMode12,
+            ReadMode13 // Last of the 14 entries; keep the order in step with g_writeFuncs (presumably callers index both with the same mode number - confirm).
+        };
+
+        const WriteFunc_t g_writeFuncs[14] = // Writer dispatch table: entry i is WriteMode<i>, mirroring the layout of g_readFuncs.
+        {
+            WriteMode0,
+            WriteMode1,
+            WriteMode2,
+            WriteMode3,
+            WriteMode4,
+            WriteMode5,
+            WriteMode6,
+            WriteMode7,
+            WriteMode8,
+            WriteMode9,
+            WriteMode10,
+            WriteMode11,
+            WriteMode12,
+            WriteMode13 // Last of the 14 entries; the WriteModeN definitions are outside this excerpt.
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h
new file mode 100644
index 0000000000..a7bb517b54
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC6H_IO.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <stdint.h>
+#include "ConvectionKernels_BC6H_IO.h"
+
+namespace cvtt
+{
+    namespace BC6H_IO // Declarations for the per-layout BC6H bit readers/writers; the tables themselves are defined in a separate translation unit.
+    {
+        typedef void (*ReadFunc_t)(const uint32_t *encoded, uint16_t &d, uint16_t &rw, uint16_t &rx, uint16_t &ry, uint16_t &rz, uint16_t &gw, uint16_t &gx, uint16_t &gy, uint16_t &gz, uint16_t &bw, uint16_t &bx, uint16_t &by, uint16_t &bz); // Reader: takes the encoded words read-only and returns 13 fields through uint16_t references.
+        typedef void (*WriteFunc_t)(uint32_t *encoded, uint16_t m, uint16_t d, uint16_t rw, uint16_t rx, uint16_t ry, uint16_t rz, uint16_t gw, uint16_t gx, uint16_t gy, uint16_t gz, uint16_t bw, uint16_t bx, uint16_t by, uint16_t bz); // Writer: takes a mutable encoded buffer plus the same 13 fields by value and an extra leading value m (presumably the mode bits - confirm against the WriteModeN definitions).
+
+        extern const ReadFunc_t g_readFuncs[14]; // One reader per layout, 14 in total.
+        extern const WriteFunc_t g_writeFuncs[14]; // One writer per layout, same length as g_readFuncs.
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h b/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h
new file mode 100644
index 0000000000..1880e22d0f
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_Prio.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <stdint.h>
+
+namespace cvtt { namespace Tables { namespace BC7Prio {
+    extern const uint16_t *g_bc7PrioCodesRGB; // Pointer to read-only 16-bit packed codes for RGB; note the pointer variable itself is not declared const.
+    extern const int g_bc7NumPrioCodesRGB; // presumably the number of entries reachable through g_bc7PrioCodesRGB - definition not visible here, confirm.
+
+    extern const uint16_t *g_bc7PrioCodesRGBA; // Same kind of pointer for the RGBA code list.
+    extern const int g_bc7NumPrioCodesRGBA; // presumably the entry count for g_bc7PrioCodesRGBA - confirm.
+
+    int UnpackMode(uint16_t packed); // The Unpack* functions each take one packed 16-bit code and return an int; NOTE(review): their definitions are outside this excerpt, presumably each extracts the like-named field.
+    int UnpackSeedPointCount(uint16_t packed);
+    int UnpackPartition(uint16_t packed);
+    int UnpackRotation(uint16_t packed);
+    int UnpackIndexSelector(uint16_t packed);
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp b/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp
new file mode 100644
index 0000000000..5b3134f860
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_PrioData.cpp
@@ -0,0 +1,1301 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BC7_Prio.h"
+
+#define BC7_PARTITION_BITS  6 // Bit layout of one packed priority code. Partition field: declared width 6, offset 0.
+#define BC7_PARTITION_OFFSET_BITS  0
+// Rotation: declared width 2, also at offset 0. It shares its position with the partition field; BC7_MODE_PRIO_CODE emits one or the other depending on mode.
+#define BC7_ROTATION_BITS   2
+#define BC7_ROTATION_OFFSET_BITS    0
+// Index mode: declared width 1, placed directly above the rotation field (offset 0 + 2 = 2).
+#define BC7_INDEX_MODE_BITS 1
+#define BC7_INDEX_MODE_OFFSET_BITS (BC7_ROTATION_OFFSET_BITS + BC7_ROTATION_BITS)
+// Mode: declared width 3, placed above the partition field (offset 6). Seed point count: declared width 2, placed above the mode (offset 9).
+#define BC7_MODE_BITS 3
+#define BC7_MODE_OFFSET_BITS (BC7_PARTITION_OFFSET_BITS + BC7_PARTITION_BITS)
+#define BC7_SEED_POINT_COUNT_BITS  2
+#define BC7_SEED_POINT_COUNT_OFFSET_BITS  (BC7_MODE_BITS + BC7_MODE_OFFSET_BITS)
+// The *_BITS widths only feed the offset computations above; the packing macros below shift values into place but do not mask them to these widths.
+// BC7_MODE_PRIO_DUAL_PLANE: subData is split as a decimal number - subData / 10 goes to the rotation position, subData % 10 to the index-mode position.
+// NOTE(review): macro parameters are not parenthesized in the expansions; this is fine for the integer-literal arguments used by the table that follows.
+#define BC7_MODE_PRIO_DUAL_PLANE(subData)   \
+    ( \
+        ((subData / 10) << BC7_ROTATION_OFFSET_BITS) | \
+        ((subData % 10) << BC7_INDEX_MODE_OFFSET_BITS) \
+    )
+// BC7_MODE_PRIO_CODE: stores seedPointCount - 1 and mode in their fields; for mode 4 or 5 subData uses the dual-plane split, otherwise it is stored whole at the partition offset.
+#define BC7_MODE_PRIO_CODE(seedPointCount, mode, subData)   \
+    (\
+        ((seedPointCount - 1) << BC7_SEED_POINT_COUNT_OFFSET_BITS) |  \
+        (mode << BC7_MODE_OFFSET_BITS) |   \
+        ((mode == 4 || mode == 5) ? BC7_MODE_PRIO_DUAL_PLANE(subData) : (subData << BC7_PARTITION_OFFSET_BITS)) \
+    )
+
+namespace cvtt { namespace Tables { namespace BC7Prio {
+    const uint16_t g_bc7PrioCodesRGBData[] =
+    {
+        BC7_MODE_PRIO_CODE(1, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 0, 1),
+        BC7_MODE_PRIO_CODE(1, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 9),
+        BC7_MODE_PRIO_CODE(1, 1, 6),
+        BC7_MODE_PRIO_CODE(1, 1, 1),
+        BC7_MODE_PRIO_CODE(1, 1, 2),
+        BC7_MODE_PRIO_CODE(1, 0, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 7),
+        BC7_MODE_PRIO_CODE(1, 1, 16),
+        BC7_MODE_PRIO_CODE(1, 1, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 14),
+        BC7_MODE_PRIO_CODE(1, 0, 13),
+        BC7_MODE_PRIO_CODE(1, 0, 14),
+        BC7_MODE_PRIO_CODE(1, 0, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 22),
+        BC7_MODE_PRIO_CODE(1, 0, 8),
+        BC7_MODE_PRIO_CODE(1, 0, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 8),
+        BC7_MODE_PRIO_CODE(1, 3, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 19),
+        BC7_MODE_PRIO_CODE(1, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 23),
+        BC7_MODE_PRIO_CODE(1, 1, 3),
+        BC7_MODE_PRIO_CODE(2, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 1, 21),
+        BC7_MODE_PRIO_CODE(1, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 29),
+        BC7_MODE_PRIO_CODE(1, 1, 26),
+        BC7_MODE_PRIO_CODE(1, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 0, 4),
+        BC7_MODE_PRIO_CODE(2, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 0),
+        BC7_MODE_PRIO_CODE(2, 0, 10),
+        BC7_MODE_PRIO_CODE(3, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 11),
+        BC7_MODE_PRIO_CODE(1, 4, 10),
+        BC7_MODE_PRIO_CODE(2, 0, 8),
+        BC7_MODE_PRIO_CODE(2, 0, 11),
+        BC7_MODE_PRIO_CODE(2, 0, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 4),
+        BC7_MODE_PRIO_CODE(3, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 12),
+        BC7_MODE_PRIO_CODE(1, 1, 18),
+        BC7_MODE_PRIO_CODE(1, 3, 0),
+        BC7_MODE_PRIO_CODE(1, 0, 5),
+        BC7_MODE_PRIO_CODE(1, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 1, 25),
+        BC7_MODE_PRIO_CODE(1, 0, 7),
+        BC7_MODE_PRIO_CODE(3, 0, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 24),
+        BC7_MODE_PRIO_CODE(3, 0, 8),
+        BC7_MODE_PRIO_CODE(3, 1, 0),
+        BC7_MODE_PRIO_CODE(2, 1, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 14),
+        BC7_MODE_PRIO_CODE(3, 0, 13),
+        BC7_MODE_PRIO_CODE(3, 0, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 16),
+        BC7_MODE_PRIO_CODE(2, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 3),
+        BC7_MODE_PRIO_CODE(4, 0, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 1),
+        BC7_MODE_PRIO_CODE(1, 0, 2),
+        BC7_MODE_PRIO_CODE(2, 1, 2),
+        BC7_MODE_PRIO_CODE(4, 0, 8),
+        BC7_MODE_PRIO_CODE(1, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 0, 15),
+        BC7_MODE_PRIO_CODE(1, 0, 6),
+        BC7_MODE_PRIO_CODE(1, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 1, 23),
+        BC7_MODE_PRIO_CODE(4, 0, 13),
+        BC7_MODE_PRIO_CODE(4, 0, 11),
+        BC7_MODE_PRIO_CODE(1, 2, 17),
+        BC7_MODE_PRIO_CODE(2, 1, 6),
+        BC7_MODE_PRIO_CODE(2, 1, 7),
+        BC7_MODE_PRIO_CODE(4, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 1, 19),
+        BC7_MODE_PRIO_CODE(1, 1, 30),
+        BC7_MODE_PRIO_CODE(2, 3, 13),
+        BC7_MODE_PRIO_CODE(3, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 21),
+        BC7_MODE_PRIO_CODE(4, 1, 0),
+        BC7_MODE_PRIO_CODE(3, 0, 15),
+        BC7_MODE_PRIO_CODE(2, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 1, 28),
+        BC7_MODE_PRIO_CODE(1, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 0, 4),
+        BC7_MODE_PRIO_CODE(1, 2, 63),
+        BC7_MODE_PRIO_CODE(4, 0, 14),
+        BC7_MODE_PRIO_CODE(2, 1, 26),
+        BC7_MODE_PRIO_CODE(2, 0, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 1, 61),
+        BC7_MODE_PRIO_CODE(2, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 0, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 10),
+        BC7_MODE_PRIO_CODE(2, 4, 31),
+        BC7_MODE_PRIO_CODE(2, 0, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 11),
+        BC7_MODE_PRIO_CODE(4, 0, 15),
+        BC7_MODE_PRIO_CODE(3, 1, 14),
+        BC7_MODE_PRIO_CODE(2, 0, 0),
+        BC7_MODE_PRIO_CODE(3, 1, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 0),
+        BC7_MODE_PRIO_CODE(3, 0, 1),
+        BC7_MODE_PRIO_CODE(1, 1, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 5),
+        BC7_MODE_PRIO_CODE(1, 1, 27),
+        BC7_MODE_PRIO_CODE(2, 1, 18),
+        BC7_MODE_PRIO_CODE(3, 0, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 3),
+        BC7_MODE_PRIO_CODE(2, 0, 2),
+        BC7_MODE_PRIO_CODE(3, 1, 16),
+        BC7_MODE_PRIO_CODE(3, 1, 2),
+        BC7_MODE_PRIO_CODE(1, 1, 31),
+        BC7_MODE_PRIO_CODE(3, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 1, 4),
+        BC7_MODE_PRIO_CODE(1, 1, 62),
+        BC7_MODE_PRIO_CODE(2, 0, 12),
+        BC7_MODE_PRIO_CODE(3, 0, 4),
+        BC7_MODE_PRIO_CODE(4, 0, 4),
+        BC7_MODE_PRIO_CODE(1, 1, 33),
+        BC7_MODE_PRIO_CODE(3, 1, 23),
+        BC7_MODE_PRIO_CODE(2, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 0, 6),
+        BC7_MODE_PRIO_CODE(2, 1, 24),
+        BC7_MODE_PRIO_CODE(1, 1, 59),
+        BC7_MODE_PRIO_CODE(1, 1, 63),
+        BC7_MODE_PRIO_CODE(3, 0, 0),
+        BC7_MODE_PRIO_CODE(1, 1, 52),
+        BC7_MODE_PRIO_CODE(4, 0, 7),
+        BC7_MODE_PRIO_CODE(2, 1, 22),
+        BC7_MODE_PRIO_CODE(4, 0, 3),
+        BC7_MODE_PRIO_CODE(1, 2, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 7),
+        BC7_MODE_PRIO_CODE(4, 0, 9),
+        BC7_MODE_PRIO_CODE(2, 1, 8),
+        BC7_MODE_PRIO_CODE(4, 0, 1),
+        BC7_MODE_PRIO_CODE(3, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 0, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 6),
+        BC7_MODE_PRIO_CODE(4, 1, 14),
+        BC7_MODE_PRIO_CODE(1, 3, 15),
+        BC7_MODE_PRIO_CODE(1, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 0, 6),
+        BC7_MODE_PRIO_CODE(3, 0, 2),
+        BC7_MODE_PRIO_CODE(1, 1, 32),
+        BC7_MODE_PRIO_CODE(4, 1, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 8),
+        BC7_MODE_PRIO_CODE(2, 1, 9),
+        BC7_MODE_PRIO_CODE(1, 2, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 15),
+        BC7_MODE_PRIO_CODE(4, 0, 6),
+        BC7_MODE_PRIO_CODE(3, 1, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 31),
+        BC7_MODE_PRIO_CODE(3, 3, 13),
+        BC7_MODE_PRIO_CODE(4, 0, 0),
+        BC7_MODE_PRIO_CODE(3, 1, 19),
+        BC7_MODE_PRIO_CODE(4, 0, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 1),
+        BC7_MODE_PRIO_CODE(4, 0, 2),
+        BC7_MODE_PRIO_CODE(1, 3, 2),
+        BC7_MODE_PRIO_CODE(1, 2, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 58),
+        BC7_MODE_PRIO_CODE(1, 3, 14),
+        BC7_MODE_PRIO_CODE(4, 1, 3),
+        BC7_MODE_PRIO_CODE(3, 1, 21),
+        BC7_MODE_PRIO_CODE(2, 2, 8),
+        BC7_MODE_PRIO_CODE(1, 2, 19),
+        BC7_MODE_PRIO_CODE(4, 1, 16),
+        BC7_MODE_PRIO_CODE(4, 1, 2),
+        BC7_MODE_PRIO_CODE(2, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 2, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 1, 47),
+        BC7_MODE_PRIO_CODE(1, 3, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 21),
+        BC7_MODE_PRIO_CODE(1, 2, 62),
+        BC7_MODE_PRIO_CODE(2, 2, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 26),
+        BC7_MODE_PRIO_CODE(1, 1, 53),
+        BC7_MODE_PRIO_CODE(2, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 2, 13),
+        BC7_MODE_PRIO_CODE(4, 1, 23),
+        BC7_MODE_PRIO_CODE(4, 1, 6),
+        BC7_MODE_PRIO_CODE(4, 1, 7),
+        BC7_MODE_PRIO_CODE(1, 2, 25),
+        BC7_MODE_PRIO_CODE(1, 1, 57),
+        BC7_MODE_PRIO_CODE(2, 1, 60),
+        BC7_MODE_PRIO_CODE(1, 2, 20),
+        BC7_MODE_PRIO_CODE(3, 1, 8),
+        BC7_MODE_PRIO_CODE(4, 1, 29),
+        BC7_MODE_PRIO_CODE(4, 1, 19),
+        BC7_MODE_PRIO_CODE(3, 2, 8),
+        BC7_MODE_PRIO_CODE(2, 4, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 61),
+        BC7_MODE_PRIO_CODE(2, 1, 30),
+        BC7_MODE_PRIO_CODE(3, 1, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 63),
+        BC7_MODE_PRIO_CODE(2, 3, 1),
+        BC7_MODE_PRIO_CODE(2, 1, 28),
+        BC7_MODE_PRIO_CODE(2, 1, 62),
+        BC7_MODE_PRIO_CODE(3, 2, 13),
+        BC7_MODE_PRIO_CODE(2, 2, 63),
+        BC7_MODE_PRIO_CODE(2, 1, 33),
+        BC7_MODE_PRIO_CODE(2, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 18),
+        BC7_MODE_PRIO_CODE(2, 5, 30),
+        BC7_MODE_PRIO_CODE(3, 1, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 1, 55),
+        BC7_MODE_PRIO_CODE(3, 1, 17),
+        BC7_MODE_PRIO_CODE(2, 3, 2),
+        BC7_MODE_PRIO_CODE(1, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 11),
+        BC7_MODE_PRIO_CODE(2, 1, 27),
+        BC7_MODE_PRIO_CODE(1, 2, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 26),
+        BC7_MODE_PRIO_CODE(3, 1, 9),
+        BC7_MODE_PRIO_CODE(2, 3, 14),
+        BC7_MODE_PRIO_CODE(3, 1, 4),
+        BC7_MODE_PRIO_CODE(3, 1, 24),
+        BC7_MODE_PRIO_CODE(3, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 3, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 11),
+        BC7_MODE_PRIO_CODE(4, 1, 12),
+        BC7_MODE_PRIO_CODE(2, 1, 32),
+        BC7_MODE_PRIO_CODE(2, 3, 15),
+        BC7_MODE_PRIO_CODE(4, 2, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 60),
+        BC7_MODE_PRIO_CODE(1, 2, 32),
+        BC7_MODE_PRIO_CODE(1, 1, 40),
+        BC7_MODE_PRIO_CODE(4, 1, 18),
+        BC7_MODE_PRIO_CODE(2, 1, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 22),
+        BC7_MODE_PRIO_CODE(3, 2, 16),
+        BC7_MODE_PRIO_CODE(3, 1, 20),
+        BC7_MODE_PRIO_CODE(4, 1, 4),
+        BC7_MODE_PRIO_CODE(2, 1, 31),
+        BC7_MODE_PRIO_CODE(4, 1, 17),
+        BC7_MODE_PRIO_CODE(1, 2, 24),
+        BC7_MODE_PRIO_CODE(4, 1, 24),
+        BC7_MODE_PRIO_CODE(2, 1, 58),
+        BC7_MODE_PRIO_CODE(4, 2, 8),
+        BC7_MODE_PRIO_CODE(1, 2, 22),
+        BC7_MODE_PRIO_CODE(1, 2, 23),
+        BC7_MODE_PRIO_CODE(1, 3, 10),
+        BC7_MODE_PRIO_CODE(1, 1, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 25),
+        BC7_MODE_PRIO_CODE(3, 1, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 29),
+        BC7_MODE_PRIO_CODE(1, 2, 57),
+        BC7_MODE_PRIO_CODE(2, 2, 19),
+        BC7_MODE_PRIO_CODE(1, 2, 53),
+        BC7_MODE_PRIO_CODE(1, 2, 55),
+        BC7_MODE_PRIO_CODE(3, 2, 63),
+        BC7_MODE_PRIO_CODE(3, 1, 60),
+        BC7_MODE_PRIO_CODE(4, 1, 8),
+        BC7_MODE_PRIO_CODE(2, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 1, 35),
+        BC7_MODE_PRIO_CODE(4, 4, 31),
+        BC7_MODE_PRIO_CODE(4, 1, 9),
+        BC7_MODE_PRIO_CODE(1, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 58),
+        BC7_MODE_PRIO_CODE(2, 3, 29),
+        BC7_MODE_PRIO_CODE(1, 1, 45),
+        BC7_MODE_PRIO_CODE(4, 2, 13),
+        BC7_MODE_PRIO_CODE(1, 1, 42),
+        BC7_MODE_PRIO_CODE(1, 3, 3),
+        BC7_MODE_PRIO_CODE(4, 2, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 63),
+        BC7_MODE_PRIO_CODE(3, 1, 30),
+        BC7_MODE_PRIO_CODE(1, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 1, 62),
+        BC7_MODE_PRIO_CODE(1, 1, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 21),
+        BC7_MODE_PRIO_CODE(3, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 2, 14),
+        BC7_MODE_PRIO_CODE(1, 1, 48),
+        BC7_MODE_PRIO_CODE(2, 1, 57),
+        BC7_MODE_PRIO_CODE(2, 1, 52),
+        BC7_MODE_PRIO_CODE(1, 2, 61),
+        BC7_MODE_PRIO_CODE(3, 1, 33),
+        BC7_MODE_PRIO_CODE(1, 1, 51),
+        BC7_MODE_PRIO_CODE(4, 1, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 1, 22),
+        BC7_MODE_PRIO_CODE(1, 3, 19),
+        BC7_MODE_PRIO_CODE(1, 2, 36),
+        BC7_MODE_PRIO_CODE(2, 5, 10),
+        BC7_MODE_PRIO_CODE(3, 1, 28),
+        BC7_MODE_PRIO_CODE(2, 2, 14),
+        BC7_MODE_PRIO_CODE(1, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 26),
+        BC7_MODE_PRIO_CODE(2, 1, 53),
+        BC7_MODE_PRIO_CODE(4, 3, 13),
+        BC7_MODE_PRIO_CODE(2, 2, 21),
+        BC7_MODE_PRIO_CODE(3, 4, 10),
+        BC7_MODE_PRIO_CODE(4, 1, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 2, 29),
+        BC7_MODE_PRIO_CODE(2, 1, 47),
+        BC7_MODE_PRIO_CODE(1, 2, 52),
+        BC7_MODE_PRIO_CODE(3, 1, 32),
+        BC7_MODE_PRIO_CODE(1, 2, 40),
+        BC7_MODE_PRIO_CODE(1, 2, 31),
+        BC7_MODE_PRIO_CODE(3, 1, 27),
+        BC7_MODE_PRIO_CODE(3, 2, 18),
+        BC7_MODE_PRIO_CODE(2, 3, 10),
+        BC7_MODE_PRIO_CODE(2, 1, 55),
+        BC7_MODE_PRIO_CODE(4, 1, 61),
+        BC7_MODE_PRIO_CODE(3, 2, 14),
+        BC7_MODE_PRIO_CODE(3, 1, 31),
+        BC7_MODE_PRIO_CODE(1, 2, 34),
+        BC7_MODE_PRIO_CODE(3, 2, 19),
+        BC7_MODE_PRIO_CODE(2, 3, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 30),
+        BC7_MODE_PRIO_CODE(1, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 2, 28),
+        BC7_MODE_PRIO_CODE(4, 2, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 40),
+        BC7_MODE_PRIO_CODE(2, 2, 22),
+        BC7_MODE_PRIO_CODE(4, 1, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 7),
+        BC7_MODE_PRIO_CODE(1, 1, 50),
+        BC7_MODE_PRIO_CODE(2, 1, 41),
+        BC7_MODE_PRIO_CODE(1, 2, 9),
+        BC7_MODE_PRIO_CODE(1, 2, 39),
+        BC7_MODE_PRIO_CODE(2, 2, 25),
+        BC7_MODE_PRIO_CODE(1, 3, 6),
+        BC7_MODE_PRIO_CODE(3, 2, 21),
+        BC7_MODE_PRIO_CODE(1, 1, 37),
+        BC7_MODE_PRIO_CODE(2, 2, 58),
+        BC7_MODE_PRIO_CODE(3, 3, 29),
+        BC7_MODE_PRIO_CODE(4, 1, 62),
+        BC7_MODE_PRIO_CODE(1, 2, 35),
+        BC7_MODE_PRIO_CODE(3, 1, 59),
+        BC7_MODE_PRIO_CODE(4, 1, 28),
+        BC7_MODE_PRIO_CODE(1, 3, 23),
+        BC7_MODE_PRIO_CODE(4, 1, 30),
+        BC7_MODE_PRIO_CODE(2, 1, 45),
+        BC7_MODE_PRIO_CODE(1, 3, 16),
+        BC7_MODE_PRIO_CODE(4, 1, 35),
+        BC7_MODE_PRIO_CODE(2, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 38),
+        BC7_MODE_PRIO_CODE(4, 1, 63),
+        BC7_MODE_PRIO_CODE(1, 3, 22),
+        BC7_MODE_PRIO_CODE(1, 2, 30),
+        BC7_MODE_PRIO_CODE(2, 2, 31),
+        BC7_MODE_PRIO_CODE(1, 3, 20),
+        BC7_MODE_PRIO_CODE(2, 2, 9),
+        BC7_MODE_PRIO_CODE(2, 3, 3),
+        BC7_MODE_PRIO_CODE(3, 2, 22),
+        BC7_MODE_PRIO_CODE(2, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 2, 62),
+        BC7_MODE_PRIO_CODE(3, 2, 20),
+        BC7_MODE_PRIO_CODE(4, 1, 32),
+        BC7_MODE_PRIO_CODE(2, 1, 43),
+        BC7_MODE_PRIO_CODE(3, 1, 58),
+        BC7_MODE_PRIO_CODE(2, 3, 19),
+        BC7_MODE_PRIO_CODE(2, 2, 32),
+        BC7_MODE_PRIO_CODE(2, 2, 57),
+        BC7_MODE_PRIO_CODE(4, 1, 27),
+        BC7_MODE_PRIO_CODE(2, 2, 34),
+        BC7_MODE_PRIO_CODE(4, 1, 58),
+        BC7_MODE_PRIO_CODE(1, 2, 12),
+        BC7_MODE_PRIO_CODE(2, 2, 12),
+        BC7_MODE_PRIO_CODE(1, 4, 20),
+        BC7_MODE_PRIO_CODE(1, 2, 56),
+        BC7_MODE_PRIO_CODE(2, 1, 48),
+        BC7_MODE_PRIO_CODE(2, 1, 36),
+        BC7_MODE_PRIO_CODE(4, 3, 0),
+        BC7_MODE_PRIO_CODE(2, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 1, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 2, 15),
+        BC7_MODE_PRIO_CODE(2, 3, 7),
+        BC7_MODE_PRIO_CODE(1, 2, 37),
+        BC7_MODE_PRIO_CODE(2, 2, 35),
+        BC7_MODE_PRIO_CODE(3, 1, 52),
+        BC7_MODE_PRIO_CODE(2, 3, 6),
+        BC7_MODE_PRIO_CODE(3, 1, 57),
+        BC7_MODE_PRIO_CODE(4, 1, 31),
+        BC7_MODE_PRIO_CODE(4, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 1, 44),
+        BC7_MODE_PRIO_CODE(3, 3, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 54),
+        BC7_MODE_PRIO_CODE(2, 1, 50),
+        BC7_MODE_PRIO_CODE(3, 3, 15),
+        BC7_MODE_PRIO_CODE(2, 1, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 27),
+        BC7_MODE_PRIO_CODE(3, 4, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 14),
+        BC7_MODE_PRIO_CODE(3, 2, 25),
+        BC7_MODE_PRIO_CODE(2, 3, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 2, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 23),
+        BC7_MODE_PRIO_CODE(3, 2, 12),
+        BC7_MODE_PRIO_CODE(3, 3, 2),
+        BC7_MODE_PRIO_CODE(4, 2, 14),
+        BC7_MODE_PRIO_CODE(2, 3, 16),
+        BC7_MODE_PRIO_CODE(1, 2, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 11),
+        BC7_MODE_PRIO_CODE(1, 2, 4),
+        BC7_MODE_PRIO_CODE(4, 2, 17),
+        BC7_MODE_PRIO_CODE(1, 3, 12),
+        BC7_MODE_PRIO_CODE(3, 1, 43),
+        BC7_MODE_PRIO_CODE(2, 4, 21),
+        BC7_MODE_PRIO_CODE(4, 1, 56),
+        BC7_MODE_PRIO_CODE(3, 1, 53),
+        BC7_MODE_PRIO_CODE(3, 1, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 61),
+        BC7_MODE_PRIO_CODE(2, 2, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 23),
+        BC7_MODE_PRIO_CODE(3, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 3, 8),
+        BC7_MODE_PRIO_CODE(3, 1, 55),
+        BC7_MODE_PRIO_CODE(4, 1, 59),
+        BC7_MODE_PRIO_CODE(3, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 3, 20),
+        BC7_MODE_PRIO_CODE(3, 2, 57),
+        BC7_MODE_PRIO_CODE(3, 1, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 35),
+        BC7_MODE_PRIO_CODE(1, 1, 38),
+        BC7_MODE_PRIO_CODE(1, 2, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 5),
+        BC7_MODE_PRIO_CODE(2, 2, 6),
+        BC7_MODE_PRIO_CODE(3, 2, 23),
+        BC7_MODE_PRIO_CODE(2, 2, 59),
+        BC7_MODE_PRIO_CODE(3, 2, 5),
+        BC7_MODE_PRIO_CODE(4, 1, 42),
+        BC7_MODE_PRIO_CODE(2, 1, 37),
+        BC7_MODE_PRIO_CODE(3, 2, 59),
+        BC7_MODE_PRIO_CODE(4, 2, 9),
+        BC7_MODE_PRIO_CODE(2, 2, 4),
+        BC7_MODE_PRIO_CODE(2, 2, 56),
+        BC7_MODE_PRIO_CODE(1, 3, 33),
+        BC7_MODE_PRIO_CODE(2, 3, 33),
+        BC7_MODE_PRIO_CODE(2, 3, 22),
+        BC7_MODE_PRIO_CODE(2, 3, 12),
+        BC7_MODE_PRIO_CODE(4, 1, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 34),
+        BC7_MODE_PRIO_CODE(3, 2, 56),
+        BC7_MODE_PRIO_CODE(3, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 2, 7),
+        BC7_MODE_PRIO_CODE(2, 2, 7),
+        BC7_MODE_PRIO_CODE(3, 2, 7),
+        BC7_MODE_PRIO_CODE(2, 2, 36),
+        BC7_MODE_PRIO_CODE(3, 2, 36),
+        BC7_MODE_PRIO_CODE(4, 1, 52),
+        BC7_MODE_PRIO_CODE(2, 2, 33),
+        BC7_MODE_PRIO_CODE(3, 1, 45),
+        BC7_MODE_PRIO_CODE(1, 3, 4),
+        BC7_MODE_PRIO_CODE(4, 2, 15),
+        BC7_MODE_PRIO_CODE(3, 1, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 4),
+        BC7_MODE_PRIO_CODE(2, 5, 20),
+        BC7_MODE_PRIO_CODE(3, 2, 62),
+        BC7_MODE_PRIO_CODE(1, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 1, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 52),
+        BC7_MODE_PRIO_CODE(3, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 1, 39),
+        BC7_MODE_PRIO_CODE(3, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 48),
+        BC7_MODE_PRIO_CODE(3, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 2, 32),
+        BC7_MODE_PRIO_CODE(3, 3, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 17),
+        BC7_MODE_PRIO_CODE(4, 1, 57),
+        BC7_MODE_PRIO_CODE(1, 3, 25),
+        BC7_MODE_PRIO_CODE(2, 3, 11),
+        BC7_MODE_PRIO_CODE(1, 3, 61),
+        BC7_MODE_PRIO_CODE(4, 1, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 60),
+        BC7_MODE_PRIO_CODE(2, 3, 60),
+        BC7_MODE_PRIO_CODE(2, 2, 28),
+        BC7_MODE_PRIO_CODE(3, 2, 28),
+        BC7_MODE_PRIO_CODE(4, 1, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 1, 51),
+        BC7_MODE_PRIO_CODE(4, 1, 53),
+        BC7_MODE_PRIO_CODE(4, 1, 54),
+        BC7_MODE_PRIO_CODE(1, 3, 32),
+        BC7_MODE_PRIO_CODE(1, 3, 24),
+        BC7_MODE_PRIO_CODE(4, 1, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 51),
+        BC7_MODE_PRIO_CODE(4, 2, 12),
+        BC7_MODE_PRIO_CODE(2, 3, 61),
+        BC7_MODE_PRIO_CODE(3, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 3, 32),
+        BC7_MODE_PRIO_CODE(3, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 1, 49),
+        BC7_MODE_PRIO_CODE(1, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 3, 29),
+        BC7_MODE_PRIO_CODE(4, 2, 63),
+        BC7_MODE_PRIO_CODE(2, 2, 27),
+        BC7_MODE_PRIO_CODE(2, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 1, 50),
+        BC7_MODE_PRIO_CODE(3, 2, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 63),
+        BC7_MODE_PRIO_CODE(2, 3, 63),
+        BC7_MODE_PRIO_CODE(3, 2, 27),
+        BC7_MODE_PRIO_CODE(4, 1, 46),
+        BC7_MODE_PRIO_CODE(1, 2, 26),
+        BC7_MODE_PRIO_CODE(2, 3, 4),
+        BC7_MODE_PRIO_CODE(2, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 1, 45),
+        BC7_MODE_PRIO_CODE(4, 1, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 1),
+        BC7_MODE_PRIO_CODE(4, 2, 6),
+        BC7_MODE_PRIO_CODE(1, 3, 62),
+        BC7_MODE_PRIO_CODE(2, 3, 62),
+        BC7_MODE_PRIO_CODE(2, 1, 44),
+        BC7_MODE_PRIO_CODE(4, 1, 49),
+        BC7_MODE_PRIO_CODE(3, 5, 30),
+        BC7_MODE_PRIO_CODE(2, 3, 25),
+        BC7_MODE_PRIO_CODE(1, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 1, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 3),
+        BC7_MODE_PRIO_CODE(3, 1, 37),
+        BC7_MODE_PRIO_CODE(1, 2, 0),
+        BC7_MODE_PRIO_CODE(2, 2, 0),
+        BC7_MODE_PRIO_CODE(2, 3, 35),
+        BC7_MODE_PRIO_CODE(2, 3, 24),
+        BC7_MODE_PRIO_CODE(2, 2, 53),
+        BC7_MODE_PRIO_CODE(3, 2, 53),
+        BC7_MODE_PRIO_CODE(4, 2, 59),
+        BC7_MODE_PRIO_CODE(3, 3, 10),
+        BC7_MODE_PRIO_CODE(1, 2, 3),
+        BC7_MODE_PRIO_CODE(2, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 3, 32),
+        BC7_MODE_PRIO_CODE(1, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 2, 62),
+        BC7_MODE_PRIO_CODE(4, 2, 60),
+        BC7_MODE_PRIO_CODE(2, 2, 30),
+        BC7_MODE_PRIO_CODE(1, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 2, 36),
+        BC7_MODE_PRIO_CODE(2, 2, 1),
+        BC7_MODE_PRIO_CODE(3, 2, 1),
+        BC7_MODE_PRIO_CODE(3, 2, 58),
+        BC7_MODE_PRIO_CODE(4, 1, 36),
+        BC7_MODE_PRIO_CODE(3, 3, 16),
+        BC7_MODE_PRIO_CODE(2, 3, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 39),
+        BC7_MODE_PRIO_CODE(4, 1, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 21),
+        BC7_MODE_PRIO_CODE(2, 1, 38),
+        BC7_MODE_PRIO_CODE(4, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 3, 23),
+        BC7_MODE_PRIO_CODE(1, 2, 43),
+        BC7_MODE_PRIO_CODE(1, 2, 41),
+        BC7_MODE_PRIO_CODE(2, 2, 41),
+        BC7_MODE_PRIO_CODE(1, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 2, 35),
+        BC7_MODE_PRIO_CODE(4, 3, 26),
+        BC7_MODE_PRIO_CODE(1, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 1, 34),
+        BC7_MODE_PRIO_CODE(2, 2, 29),
+        BC7_MODE_PRIO_CODE(3, 2, 29),
+        BC7_MODE_PRIO_CODE(3, 2, 52),
+        BC7_MODE_PRIO_CODE(1, 3, 58),
+        BC7_MODE_PRIO_CODE(4, 5, 30),
+        BC7_MODE_PRIO_CODE(4, 3, 33),
+        BC7_MODE_PRIO_CODE(3, 2, 30),
+        BC7_MODE_PRIO_CODE(1, 2, 44),
+        BC7_MODE_PRIO_CODE(1, 2, 2),
+        BC7_MODE_PRIO_CODE(2, 2, 2),
+        BC7_MODE_PRIO_CODE(3, 2, 2),
+        BC7_MODE_PRIO_CODE(1, 2, 47),
+        BC7_MODE_PRIO_CODE(2, 2, 47),
+        BC7_MODE_PRIO_CODE(3, 3, 7),
+        BC7_MODE_PRIO_CODE(2, 3, 58),
+        BC7_MODE_PRIO_CODE(3, 2, 55),
+        BC7_MODE_PRIO_CODE(4, 2, 4),
+        BC7_MODE_PRIO_CODE(3, 2, 0),
+        BC7_MODE_PRIO_CODE(1, 3, 31),
+        BC7_MODE_PRIO_CODE(3, 2, 31),
+        BC7_MODE_PRIO_CODE(3, 3, 12),
+        BC7_MODE_PRIO_CODE(3, 2, 51),
+        BC7_MODE_PRIO_CODE(2, 1, 39),
+        BC7_MODE_PRIO_CODE(1, 3, 48),
+        BC7_MODE_PRIO_CODE(1, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 2, 25),
+        BC7_MODE_PRIO_CODE(4, 2, 22),
+        BC7_MODE_PRIO_CODE(4, 2, 18),
+        BC7_MODE_PRIO_CODE(2, 2, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 28),
+        BC7_MODE_PRIO_CODE(3, 1, 44),
+        BC7_MODE_PRIO_CODE(2, 1, 34),
+        BC7_MODE_PRIO_CODE(3, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 2, 54),
+        BC7_MODE_PRIO_CODE(4, 2, 7),
+        BC7_MODE_PRIO_CODE(4, 2, 20),
+        BC7_MODE_PRIO_CODE(2, 2, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 6),
+        BC7_MODE_PRIO_CODE(2, 2, 43),
+        BC7_MODE_PRIO_CODE(2, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 3, 30),
+        BC7_MODE_PRIO_CODE(4, 2, 5),
+        BC7_MODE_PRIO_CODE(4, 2, 61),
+        BC7_MODE_PRIO_CODE(4, 2, 19),
+        BC7_MODE_PRIO_CODE(4, 2, 23),
+        BC7_MODE_PRIO_CODE(3, 2, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 27),
+        BC7_MODE_PRIO_CODE(1, 3, 57),
+        BC7_MODE_PRIO_CODE(2, 3, 57),
+        BC7_MODE_PRIO_CODE(3, 3, 21),
+        BC7_MODE_PRIO_CODE(3, 3, 11),
+        BC7_MODE_PRIO_CODE(3, 1, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 48),
+        BC7_MODE_PRIO_CODE(4, 1, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 19),
+        BC7_MODE_PRIO_CODE(3, 1, 38),
+        BC7_MODE_PRIO_CODE(2, 2, 38),
+        BC7_MODE_PRIO_CODE(2, 3, 31),
+        BC7_MODE_PRIO_CODE(2, 2, 40),
+        BC7_MODE_PRIO_CODE(3, 2, 40),
+        BC7_MODE_PRIO_CODE(1, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 1, 38),
+        BC7_MODE_PRIO_CODE(1, 3, 41),
+        BC7_MODE_PRIO_CODE(1, 3, 50),
+        BC7_MODE_PRIO_CODE(2, 3, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 2, 24),
+        BC7_MODE_PRIO_CODE(3, 3, 9),
+        BC7_MODE_PRIO_CODE(3, 1, 34),
+        BC7_MODE_PRIO_CODE(4, 1, 34),
+        BC7_MODE_PRIO_CODE(2, 3, 50),
+        BC7_MODE_PRIO_CODE(1, 3, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 40),
+        BC7_MODE_PRIO_CODE(1, 3, 51),
+        BC7_MODE_PRIO_CODE(2, 3, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 45),
+        BC7_MODE_PRIO_CODE(2, 3, 45),
+        BC7_MODE_PRIO_CODE(2, 3, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 20),
+        BC7_MODE_PRIO_CODE(2, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 2, 57),
+        BC7_MODE_PRIO_CODE(2, 4, 20),
+        BC7_MODE_PRIO_CODE(3, 3, 4),
+        BC7_MODE_PRIO_CODE(3, 3, 61),
+        BC7_MODE_PRIO_CODE(1, 3, 46),
+        BC7_MODE_PRIO_CODE(2, 3, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 1),
+        BC7_MODE_PRIO_CODE(3, 3, 22),
+        BC7_MODE_PRIO_CODE(1, 3, 49),
+        BC7_MODE_PRIO_CODE(2, 3, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 15),
+        BC7_MODE_PRIO_CODE(3, 3, 5),
+        BC7_MODE_PRIO_CODE(4, 1, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 14),
+        BC7_MODE_PRIO_CODE(4, 3, 2),
+        BC7_MODE_PRIO_CODE(3, 3, 60),
+        BC7_MODE_PRIO_CODE(1, 3, 53),
+        BC7_MODE_PRIO_CODE(2, 3, 53),
+        BC7_MODE_PRIO_CODE(4, 3, 32),
+        BC7_MODE_PRIO_CODE(3, 3, 24),
+        BC7_MODE_PRIO_CODE(3, 3, 63),
+        BC7_MODE_PRIO_CODE(3, 2, 37),
+        BC7_MODE_PRIO_CODE(1, 3, 52),
+        BC7_MODE_PRIO_CODE(2, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 4, 30),
+        BC7_MODE_PRIO_CODE(4, 2, 34),
+        BC7_MODE_PRIO_CODE(1, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 3, 62),
+        BC7_MODE_PRIO_CODE(3, 3, 18),
+        BC7_MODE_PRIO_CODE(3, 2, 41),
+        BC7_MODE_PRIO_CODE(4, 2, 58),
+        BC7_MODE_PRIO_CODE(1, 3, 42),
+        BC7_MODE_PRIO_CODE(2, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 2, 0),
+        BC7_MODE_PRIO_CODE(4, 2, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 2, 47),
+        BC7_MODE_PRIO_CODE(4, 2, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 2, 33),
+        BC7_MODE_PRIO_CODE(1, 3, 55),
+        BC7_MODE_PRIO_CODE(2, 3, 55),
+        BC7_MODE_PRIO_CODE(4, 2, 32),
+        BC7_MODE_PRIO_CODE(3, 2, 43),
+        BC7_MODE_PRIO_CODE(3, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 5, 20),
+        BC7_MODE_PRIO_CODE(4, 5, 20),
+        BC7_MODE_PRIO_CODE(1, 3, 36),
+        BC7_MODE_PRIO_CODE(2, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 2, 54),
+        BC7_MODE_PRIO_CODE(2, 2, 49),
+        BC7_MODE_PRIO_CODE(3, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 1, 39),
+        BC7_MODE_PRIO_CODE(4, 2, 3),
+        BC7_MODE_PRIO_CODE(3, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 2, 52),
+        BC7_MODE_PRIO_CODE(4, 2, 1),
+        BC7_MODE_PRIO_CODE(1, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 16),
+        BC7_MODE_PRIO_CODE(2, 2, 50),
+        BC7_MODE_PRIO_CODE(3, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 31),
+        BC7_MODE_PRIO_CODE(4, 3, 3),
+        BC7_MODE_PRIO_CODE(1, 2, 48),
+        BC7_MODE_PRIO_CODE(2, 2, 48),
+        BC7_MODE_PRIO_CODE(3, 2, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 3, 9),
+        BC7_MODE_PRIO_CODE(1, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 10),
+        BC7_MODE_PRIO_CODE(3, 3, 31),
+        BC7_MODE_PRIO_CODE(4, 2, 51),
+        BC7_MODE_PRIO_CODE(1, 3, 37),
+        BC7_MODE_PRIO_CODE(2, 3, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 50),
+        BC7_MODE_PRIO_CODE(2, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 20),
+        BC7_MODE_PRIO_CODE(3, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 3, 6),
+        BC7_MODE_PRIO_CODE(4, 3, 8),
+        BC7_MODE_PRIO_CODE(4, 2, 37),
+        BC7_MODE_PRIO_CODE(3, 3, 58),
+        BC7_MODE_PRIO_CODE(3, 3, 59),
+        BC7_MODE_PRIO_CODE(4, 2, 56),
+        BC7_MODE_PRIO_CODE(1, 3, 39),
+        BC7_MODE_PRIO_CODE(2, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 2, 43),
+        BC7_MODE_PRIO_CODE(1, 3, 44),
+        BC7_MODE_PRIO_CODE(2, 3, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 7),
+        BC7_MODE_PRIO_CODE(3, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 3, 23),
+        BC7_MODE_PRIO_CODE(3, 3, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 22),
+        BC7_MODE_PRIO_CODE(3, 3, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 48),
+        BC7_MODE_PRIO_CODE(3, 3, 51),
+        BC7_MODE_PRIO_CODE(1, 2, 42),
+        BC7_MODE_PRIO_CODE(2, 2, 42),
+        BC7_MODE_PRIO_CODE(3, 2, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 19),
+        BC7_MODE_PRIO_CODE(4, 3, 21),
+        BC7_MODE_PRIO_CODE(2, 2, 46),
+        BC7_MODE_PRIO_CODE(3, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 2, 28),
+        BC7_MODE_PRIO_CODE(3, 3, 49),
+        BC7_MODE_PRIO_CODE(3, 3, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 55),
+        BC7_MODE_PRIO_CODE(2, 2, 26),
+        BC7_MODE_PRIO_CODE(3, 2, 26),
+        BC7_MODE_PRIO_CODE(4, 2, 30),
+        BC7_MODE_PRIO_CODE(3, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 2, 41),
+        BC7_MODE_PRIO_CODE(4, 2, 29),
+        BC7_MODE_PRIO_CODE(1, 3, 34),
+        BC7_MODE_PRIO_CODE(2, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 2, 44),
+        BC7_MODE_PRIO_CODE(3, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 2, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 18),
+        BC7_MODE_PRIO_CODE(4, 3, 17),
+        BC7_MODE_PRIO_CODE(3, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 11),
+        BC7_MODE_PRIO_CODE(3, 3, 57),
+        BC7_MODE_PRIO_CODE(3, 2, 38),
+        BC7_MODE_PRIO_CODE(3, 3, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 25),
+        BC7_MODE_PRIO_CODE(4, 3, 4),
+        BC7_MODE_PRIO_CODE(3, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 61),
+        BC7_MODE_PRIO_CODE(4, 2, 48),
+        BC7_MODE_PRIO_CODE(4, 3, 5),
+        BC7_MODE_PRIO_CODE(3, 3, 54),
+        BC7_MODE_PRIO_CODE(4, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 3, 24),
+        BC7_MODE_PRIO_CODE(4, 3, 12),
+        BC7_MODE_PRIO_CODE(4, 2, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 40),
+        BC7_MODE_PRIO_CODE(3, 3, 44),
+        BC7_MODE_PRIO_CODE(4, 3, 63),
+        BC7_MODE_PRIO_CODE(4, 3, 50),
+        BC7_MODE_PRIO_CODE(4, 2, 50),
+        BC7_MODE_PRIO_CODE(4, 3, 60),
+        BC7_MODE_PRIO_CODE(4, 2, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 62),
+        BC7_MODE_PRIO_CODE(4, 3, 49),
+        BC7_MODE_PRIO_CODE(4, 3, 58),
+        BC7_MODE_PRIO_CODE(4, 3, 47),
+        BC7_MODE_PRIO_CODE(4, 3, 56),
+        BC7_MODE_PRIO_CODE(4, 2, 26),
+        BC7_MODE_PRIO_CODE(4, 2, 27),
+        BC7_MODE_PRIO_CODE(3, 3, 37),
+        BC7_MODE_PRIO_CODE(4, 3, 57),
+        BC7_MODE_PRIO_CODE(4, 3, 48),
+        BC7_MODE_PRIO_CODE(4, 3, 31),
+        BC7_MODE_PRIO_CODE(4, 3, 51),
+        BC7_MODE_PRIO_CODE(4, 3, 28),
+        BC7_MODE_PRIO_CODE(4, 3, 53),
+        BC7_MODE_PRIO_CODE(3, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 40),
+        BC7_MODE_PRIO_CODE(4, 3, 27),
+        BC7_MODE_PRIO_CODE(4, 2, 2),
+        BC7_MODE_PRIO_CODE(3, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 2, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 54),
+        BC7_MODE_PRIO_CODE(3, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 52),
+        BC7_MODE_PRIO_CODE(4, 3, 30),
+        BC7_MODE_PRIO_CODE(4, 3, 59),
+        BC7_MODE_PRIO_CODE(1, 2, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 45),
+        BC7_MODE_PRIO_CODE(4, 2, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 35),
+        BC7_MODE_PRIO_CODE(4, 3, 41),
+        BC7_MODE_PRIO_CODE(3, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 2, 46),
+        BC7_MODE_PRIO_CODE(4, 3, 46),
+        BC7_MODE_PRIO_CODE(2, 2, 45),
+        BC7_MODE_PRIO_CODE(4, 3, 43),
+        BC7_MODE_PRIO_CODE(4, 3, 37),
+        BC7_MODE_PRIO_CODE(4, 3, 38),
+        BC7_MODE_PRIO_CODE(4, 3, 36),
+        BC7_MODE_PRIO_CODE(4, 3, 42),
+        BC7_MODE_PRIO_CODE(4, 3, 34),
+        BC7_MODE_PRIO_CODE(4, 3, 39),
+        BC7_MODE_PRIO_CODE(4, 3, 55),
+        BC7_MODE_PRIO_CODE(4, 3, 44),
+        BC7_MODE_PRIO_CODE(3, 2, 45),
+        BC7_MODE_PRIO_CODE(1, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 5, 0),
+        BC7_MODE_PRIO_CODE(4, 2, 45),
+        BC7_MODE_PRIO_CODE(2, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 4, 1),
+        BC7_MODE_PRIO_CODE(2, 5, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 0),
+        BC7_MODE_PRIO_CODE(3, 4, 1),
+        BC7_MODE_PRIO_CODE(3, 5, 0),
+        BC7_MODE_PRIO_CODE(4, 4, 0),
+        BC7_MODE_PRIO_CODE(4, 4, 1),
+        BC7_MODE_PRIO_CODE(4, 5, 0),
+    };
+
+    const uint16_t *g_bc7PrioCodesRGB = &g_bc7PrioCodesRGBData[0]; // Pointer alias to the first entry of the RGB code table.
+    const int g_bc7NumPrioCodesRGB = static_cast<int>(sizeof(g_bc7PrioCodesRGBData) / sizeof(*g_bc7PrioCodesRGBData)); // Entry count: total array bytes divided by the size of one element.
+
+    const uint16_t g_bc7PrioCodesRGBAData[] = // Table of 16-bit codes, one BC7_MODE_PRIO_CODE(...) per entry; exposed below through g_bc7PrioCodesRGBA / g_bc7NumPrioCodesRGBA.
+    { // NOTE(review): the meaning of the three macro arguments and the reason for this entry order are defined outside this view (presumably a priority ranking, per the name) - confirm before hand-editing.
+        BC7_MODE_PRIO_CODE(1, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 0),
+        BC7_MODE_PRIO_CODE(2, 4, 1),
+        BC7_MODE_PRIO_CODE(3, 4, 1),
+        BC7_MODE_PRIO_CODE(2, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 6),
+        BC7_MODE_PRIO_CODE(1, 4, 10),
+        BC7_MODE_PRIO_CODE(1, 7, 15),
+        BC7_MODE_PRIO_CODE(1, 7, 14),
+        BC7_MODE_PRIO_CODE(1, 4, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 7),
+        BC7_MODE_PRIO_CODE(3, 6, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 19),
+        BC7_MODE_PRIO_CODE(3, 4, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 7, 1),
+        BC7_MODE_PRIO_CODE(1, 7, 21),
+        BC7_MODE_PRIO_CODE(4, 4, 1),
+        BC7_MODE_PRIO_CODE(1, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 7, 10),
+        BC7_MODE_PRIO_CODE(1, 7, 3),
+        BC7_MODE_PRIO_CODE(4, 6, 0),
+        BC7_MODE_PRIO_CODE(3, 7, 13),
+        BC7_MODE_PRIO_CODE(1, 7, 16),
+        BC7_MODE_PRIO_CODE(1, 7, 8),
+        BC7_MODE_PRIO_CODE(2, 5, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 9),
+        BC7_MODE_PRIO_CODE(2, 4, 11),
+        BC7_MODE_PRIO_CODE(3, 4, 31),
+        BC7_MODE_PRIO_CODE(1, 7, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 22),
+        BC7_MODE_PRIO_CODE(4, 4, 0),
+        BC7_MODE_PRIO_CODE(1, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 13),
+        BC7_MODE_PRIO_CODE(3, 7, 0),
+        BC7_MODE_PRIO_CODE(1, 7, 12),
+        BC7_MODE_PRIO_CODE(1, 7, 29),
+        BC7_MODE_PRIO_CODE(3, 4, 11),
+        BC7_MODE_PRIO_CODE(1, 7, 11),
+        BC7_MODE_PRIO_CODE(1, 7, 18),
+        BC7_MODE_PRIO_CODE(1, 7, 4),
+        BC7_MODE_PRIO_CODE(2, 7, 15),
+        BC7_MODE_PRIO_CODE(2, 7, 14),
+        BC7_MODE_PRIO_CODE(1, 7, 5),
+        BC7_MODE_PRIO_CODE(1, 7, 25),
+        BC7_MODE_PRIO_CODE(1, 7, 17),
+        BC7_MODE_PRIO_CODE(1, 7, 24),
+        BC7_MODE_PRIO_CODE(1, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 5, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 1),
+        BC7_MODE_PRIO_CODE(2, 7, 29),
+        BC7_MODE_PRIO_CODE(2, 4, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 0),
+        BC7_MODE_PRIO_CODE(2, 7, 6),
+        BC7_MODE_PRIO_CODE(2, 7, 7),
+        BC7_MODE_PRIO_CODE(3, 7, 14),
+        BC7_MODE_PRIO_CODE(3, 7, 15),
+        BC7_MODE_PRIO_CODE(4, 4, 31),
+        BC7_MODE_PRIO_CODE(2, 7, 21),
+        BC7_MODE_PRIO_CODE(2, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 4, 21),
+        BC7_MODE_PRIO_CODE(3, 7, 29),
+        BC7_MODE_PRIO_CODE(2, 7, 19),
+        BC7_MODE_PRIO_CODE(2, 7, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 1),
+        BC7_MODE_PRIO_CODE(4, 7, 29),
+        BC7_MODE_PRIO_CODE(3, 7, 7),
+        BC7_MODE_PRIO_CODE(1, 4, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 2),
+        BC7_MODE_PRIO_CODE(2, 7, 16),
+        BC7_MODE_PRIO_CODE(2, 7, 3),
+        BC7_MODE_PRIO_CODE(2, 5, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 23),
+        BC7_MODE_PRIO_CODE(3, 7, 6),
+        BC7_MODE_PRIO_CODE(2, 7, 12),
+        BC7_MODE_PRIO_CODE(1, 7, 61),
+        BC7_MODE_PRIO_CODE(4, 4, 11),
+        BC7_MODE_PRIO_CODE(3, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 10),
+        BC7_MODE_PRIO_CODE(2, 7, 8),
+        BC7_MODE_PRIO_CODE(2, 7, 22),
+        BC7_MODE_PRIO_CODE(2, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 9),
+        BC7_MODE_PRIO_CODE(3, 7, 19),
+        BC7_MODE_PRIO_CODE(2, 7, 25),
+        BC7_MODE_PRIO_CODE(3, 4, 21),
+        BC7_MODE_PRIO_CODE(2, 7, 24),
+        BC7_MODE_PRIO_CODE(1, 7, 60),
+        BC7_MODE_PRIO_CODE(2, 7, 11),
+        BC7_MODE_PRIO_CODE(2, 7, 18),
+        BC7_MODE_PRIO_CODE(2, 7, 17),
+        BC7_MODE_PRIO_CODE(2, 7, 4),
+        BC7_MODE_PRIO_CODE(2, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 3),
+        BC7_MODE_PRIO_CODE(3, 7, 16),
+        BC7_MODE_PRIO_CODE(3, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 7, 21),
+        BC7_MODE_PRIO_CODE(1, 7, 62),
+        BC7_MODE_PRIO_CODE(2, 7, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 33),
+        BC7_MODE_PRIO_CODE(2, 7, 33),
+        BC7_MODE_PRIO_CODE(3, 7, 33),
+        BC7_MODE_PRIO_CODE(4, 7, 33),
+        BC7_MODE_PRIO_CODE(3, 7, 11),
+        BC7_MODE_PRIO_CODE(3, 7, 12),
+        BC7_MODE_PRIO_CODE(4, 7, 26),
+        BC7_MODE_PRIO_CODE(3, 7, 25),
+        BC7_MODE_PRIO_CODE(1, 7, 63),
+        BC7_MODE_PRIO_CODE(2, 5, 10),
+        BC7_MODE_PRIO_CODE(3, 7, 8),
+        BC7_MODE_PRIO_CODE(4, 5, 0),
+        BC7_MODE_PRIO_CODE(3, 7, 24),
+        BC7_MODE_PRIO_CODE(3, 7, 22),
+        BC7_MODE_PRIO_CODE(3, 7, 9),
+        BC7_MODE_PRIO_CODE(1, 7, 32),
+        BC7_MODE_PRIO_CODE(2, 7, 61),
+        BC7_MODE_PRIO_CODE(3, 7, 4),
+        BC7_MODE_PRIO_CODE(3, 5, 30),
+        BC7_MODE_PRIO_CODE(3, 7, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 14),
+        BC7_MODE_PRIO_CODE(3, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 18),
+        BC7_MODE_PRIO_CODE(1, 7, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 4, 21),
+        BC7_MODE_PRIO_CODE(4, 7, 15),
+        BC7_MODE_PRIO_CODE(3, 7, 17),
+        BC7_MODE_PRIO_CODE(2, 7, 32),
+        BC7_MODE_PRIO_CODE(3, 7, 32),
+        BC7_MODE_PRIO_CODE(2, 5, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 1),
+        BC7_MODE_PRIO_CODE(4, 7, 2),
+        BC7_MODE_PRIO_CODE(1, 7, 28),
+        BC7_MODE_PRIO_CODE(1, 7, 54),
+        BC7_MODE_PRIO_CODE(4, 7, 32),
+        BC7_MODE_PRIO_CODE(1, 7, 27),
+        BC7_MODE_PRIO_CODE(4, 4, 10),
+        BC7_MODE_PRIO_CODE(3, 5, 10),
+        BC7_MODE_PRIO_CODE(2, 7, 60),
+        BC7_MODE_PRIO_CODE(2, 4, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 63),
+        BC7_MODE_PRIO_CODE(4, 4, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 62),
+        BC7_MODE_PRIO_CODE(1, 7, 41),
+        BC7_MODE_PRIO_CODE(1, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 60),
+        BC7_MODE_PRIO_CODE(1, 7, 40),
+        BC7_MODE_PRIO_CODE(1, 7, 55),
+        BC7_MODE_PRIO_CODE(2, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 8),
+        BC7_MODE_PRIO_CODE(4, 7, 6),
+        BC7_MODE_PRIO_CODE(1, 7, 53),
+        BC7_MODE_PRIO_CODE(4, 7, 9),
+        BC7_MODE_PRIO_CODE(3, 7, 61),
+        BC7_MODE_PRIO_CODE(3, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 22),
+        BC7_MODE_PRIO_CODE(4, 7, 20),
+        BC7_MODE_PRIO_CODE(3, 7, 62),
+        BC7_MODE_PRIO_CODE(4, 7, 7),
+        BC7_MODE_PRIO_CODE(1, 7, 42),
+        BC7_MODE_PRIO_CODE(1, 7, 52),
+        BC7_MODE_PRIO_CODE(4, 5, 30),
+        BC7_MODE_PRIO_CODE(1, 7, 56),
+        BC7_MODE_PRIO_CODE(1, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 5, 20),
+        BC7_MODE_PRIO_CODE(1, 7, 48),
+        BC7_MODE_PRIO_CODE(2, 7, 28),
+        BC7_MODE_PRIO_CODE(3, 7, 28),
+        BC7_MODE_PRIO_CODE(4, 7, 19),
+        BC7_MODE_PRIO_CODE(3, 7, 35),
+        BC7_MODE_PRIO_CODE(1, 7, 59),
+        BC7_MODE_PRIO_CODE(2, 7, 30),
+        BC7_MODE_PRIO_CODE(3, 7, 63),
+        BC7_MODE_PRIO_CODE(4, 7, 21),
+        BC7_MODE_PRIO_CODE(4, 7, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 3),
+        BC7_MODE_PRIO_CODE(1, 7, 47),
+        BC7_MODE_PRIO_CODE(1, 7, 37),
+        BC7_MODE_PRIO_CODE(4, 5, 10),
+        BC7_MODE_PRIO_CODE(4, 7, 23),
+        BC7_MODE_PRIO_CODE(1, 7, 57),
+        BC7_MODE_PRIO_CODE(4, 7, 17),
+        BC7_MODE_PRIO_CODE(1, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 24),
+        BC7_MODE_PRIO_CODE(4, 7, 60),
+        BC7_MODE_PRIO_CODE(1, 7, 50),
+        BC7_MODE_PRIO_CODE(2, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 7, 25),
+        BC7_MODE_PRIO_CODE(3, 7, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 59),
+        BC7_MODE_PRIO_CODE(2, 7, 55),
+        BC7_MODE_PRIO_CODE(4, 7, 18),
+        BC7_MODE_PRIO_CODE(4, 7, 12),
+        BC7_MODE_PRIO_CODE(4, 7, 5),
+        BC7_MODE_PRIO_CODE(3, 7, 59),
+        BC7_MODE_PRIO_CODE(1, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 16),
+        BC7_MODE_PRIO_CODE(4, 7, 11),
+        BC7_MODE_PRIO_CODE(2, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 4, 20),
+        BC7_MODE_PRIO_CODE(4, 7, 4),
+        BC7_MODE_PRIO_CODE(1, 7, 49),
+        BC7_MODE_PRIO_CODE(2, 7, 27),
+        BC7_MODE_PRIO_CODE(3, 7, 27),
+        BC7_MODE_PRIO_CODE(4, 7, 62),
+        BC7_MODE_PRIO_CODE(3, 7, 58),
+        BC7_MODE_PRIO_CODE(4, 5, 20),
+        BC7_MODE_PRIO_CODE(2, 7, 53),
+        BC7_MODE_PRIO_CODE(3, 7, 53),
+        BC7_MODE_PRIO_CODE(2, 7, 40),
+        BC7_MODE_PRIO_CODE(3, 7, 40),
+        BC7_MODE_PRIO_CODE(2, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 7, 31),
+        BC7_MODE_PRIO_CODE(4, 7, 61),
+        BC7_MODE_PRIO_CODE(1, 7, 36),
+        BC7_MODE_PRIO_CODE(4, 7, 63),
+        BC7_MODE_PRIO_CODE(1, 7, 46),
+        BC7_MODE_PRIO_CODE(3, 7, 55),
+        BC7_MODE_PRIO_CODE(2, 7, 52),
+        BC7_MODE_PRIO_CODE(2, 7, 56),
+        BC7_MODE_PRIO_CODE(2, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 37),
+        BC7_MODE_PRIO_CODE(2, 7, 57),
+        BC7_MODE_PRIO_CODE(3, 7, 57),
+        BC7_MODE_PRIO_CODE(2, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 57),
+        BC7_MODE_PRIO_CODE(2, 7, 49),
+        BC7_MODE_PRIO_CODE(3, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 43),
+        BC7_MODE_PRIO_CODE(3, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 7, 28),
+        BC7_MODE_PRIO_CODE(2, 7, 48),
+        BC7_MODE_PRIO_CODE(3, 7, 52),
+        BC7_MODE_PRIO_CODE(3, 7, 49),
+        BC7_MODE_PRIO_CODE(4, 7, 59),
+        BC7_MODE_PRIO_CODE(4, 7, 40),
+        BC7_MODE_PRIO_CODE(4, 7, 27),
+        BC7_MODE_PRIO_CODE(3, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 55),
+        BC7_MODE_PRIO_CODE(3, 7, 56),
+        BC7_MODE_PRIO_CODE(4, 7, 42),
+        BC7_MODE_PRIO_CODE(2, 7, 54),
+        BC7_MODE_PRIO_CODE(3, 7, 54),
+        BC7_MODE_PRIO_CODE(4, 7, 54),
+        BC7_MODE_PRIO_CODE(2, 7, 47),
+        BC7_MODE_PRIO_CODE(3, 7, 47),
+        BC7_MODE_PRIO_CODE(4, 7, 43),
+        BC7_MODE_PRIO_CODE(4, 7, 31),
+        BC7_MODE_PRIO_CODE(3, 7, 37),
+        BC7_MODE_PRIO_CODE(3, 7, 48),
+        BC7_MODE_PRIO_CODE(4, 7, 48),
+        BC7_MODE_PRIO_CODE(4, 7, 45),
+        BC7_MODE_PRIO_CODE(4, 7, 47),
+        BC7_MODE_PRIO_CODE(2, 7, 36),
+        BC7_MODE_PRIO_CODE(1, 7, 44),
+        BC7_MODE_PRIO_CODE(4, 7, 35),
+        BC7_MODE_PRIO_CODE(4, 7, 58),
+        BC7_MODE_PRIO_CODE(3, 7, 36),
+        BC7_MODE_PRIO_CODE(2, 7, 50),
+        BC7_MODE_PRIO_CODE(3, 7, 50),
+        BC7_MODE_PRIO_CODE(4, 7, 50),
+        BC7_MODE_PRIO_CODE(4, 7, 52),
+        BC7_MODE_PRIO_CODE(1, 7, 39),
+        BC7_MODE_PRIO_CODE(1, 7, 34),
+        BC7_MODE_PRIO_CODE(1, 7, 38),
+        BC7_MODE_PRIO_CODE(2, 7, 38),
+        BC7_MODE_PRIO_CODE(3, 7, 38),
+        BC7_MODE_PRIO_CODE(4, 7, 30),
+        BC7_MODE_PRIO_CODE(2, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 41),
+        BC7_MODE_PRIO_CODE(4, 7, 53),
+        BC7_MODE_PRIO_CODE(2, 7, 46),
+        BC7_MODE_PRIO_CODE(3, 7, 46),
+        BC7_MODE_PRIO_CODE(4, 7, 49),
+        BC7_MODE_PRIO_CODE(4, 7, 56),
+        BC7_MODE_PRIO_CODE(4, 7, 37),
+        BC7_MODE_PRIO_CODE(2, 7, 44),
+        BC7_MODE_PRIO_CODE(3, 7, 44),
+        BC7_MODE_PRIO_CODE(4, 7, 36),
+        BC7_MODE_PRIO_CODE(2, 7, 39),
+        BC7_MODE_PRIO_CODE(2, 7, 34),
+        BC7_MODE_PRIO_CODE(4, 7, 38),
+        BC7_MODE_PRIO_CODE(3, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 51),
+        BC7_MODE_PRIO_CODE(4, 7, 46),
+        BC7_MODE_PRIO_CODE(4, 7, 44),
+        BC7_MODE_PRIO_CODE(3, 7, 39),
+        BC7_MODE_PRIO_CODE(3, 7, 34),
+        BC7_MODE_PRIO_CODE(4, 7, 39),
+        BC7_MODE_PRIO_CODE(4, 7, 34),
+    };
+
+    const uint16_t *g_bc7PrioCodesRGBA = &g_bc7PrioCodesRGBAData[0]; // Pointer alias to the first entry of the RGBA code table.
+    const int g_bc7NumPrioCodesRGBA = static_cast<int>(sizeof(g_bc7PrioCodesRGBAData) / sizeof(*g_bc7PrioCodesRGBAData)); // Entry count; the divisor is the array's own element (not the pointer alias's) so it cannot drift if the alias type changes.
+
+    int UnpackMode(uint16_t packed) // Returns the mode field: the BC7_MODE_BITS bits of 'packed' that start at bit BC7_MODE_OFFSET_BITS.
+    {
+        return (packed >> BC7_MODE_OFFSET_BITS) & ((1 << BC7_MODE_BITS) - 1); // shift the field down to bit 0, then mask; the return type converts the promoted result, so no explicit cast is needed
+    }
+
+    int UnpackSeedPointCount(uint16_t packed) // Returns the seed-point-count field (BC7_SEED_POINT_COUNT_BITS bits at bit BC7_SEED_POINT_COUNT_OFFSET_BITS) plus one.
+    {
+        return 1 + ((packed >> BC7_SEED_POINT_COUNT_OFFSET_BITS) & ((1 << BC7_SEED_POINT_COUNT_BITS) - 1)); // a stored field value of 0 therefore yields 1
+    }
+
+    int UnpackPartition(uint16_t packed) // Returns the partition field: the BC7_PARTITION_BITS bits of 'packed' that start at bit BC7_PARTITION_OFFSET_BITS.
+    {
+        return (packed >> BC7_PARTITION_OFFSET_BITS) & ((1 << BC7_PARTITION_BITS) - 1); // shift the field down to bit 0, then mask off everything above it
+    }
+
+    int UnpackRotation(uint16_t packed) // Returns the rotation field: the BC7_ROTATION_BITS bits of 'packed' that start at bit BC7_ROTATION_OFFSET_BITS.
+    {
+        return (packed >> BC7_ROTATION_OFFSET_BITS) & ((1 << BC7_ROTATION_BITS) - 1); // shift the field down to bit 0, then mask off everything above it
+    }
+
+    int UnpackIndexSelector(uint16_t packed) // Returns the index-mode field: the BC7_INDEX_MODE_BITS bits of 'packed' that start at bit BC7_INDEX_MODE_OFFSET_BITS.
+    {
+        return (packed >> BC7_INDEX_MODE_OFFSET_BITS) & ((1 << BC7_INDEX_MODE_BITS) - 1); // shift the field down to bit 0, then mask off everything above it
+    }
+}}}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h b/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
index b5564c0dab..b45ba5eca8 100644
--- a/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
+++ b/thirdparty/cvtt/ConvectionKernels_BC7_SingleColor.h
@@ -1,6 +1,8 @@
 #pragma once
 #include <stdint.h>
 
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
 namespace cvtt { namespace Tables { namespace BC7SC {
 
 struct TableEntry
diff --git a/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp b/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp
new file mode 100644
index 0000000000..be16d1db06
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BCCommon.cpp
@@ -0,0 +1,46 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_BCCommon.h"
+
+// Returns 3 when 'range' is exactly 3, and 4 for every other value.
+int cvtt::Internal::BCCommon::TweakRoundsForRange(int range)
+{
+    const bool isRangeThree = (range == 3);
+    return isRangeThree ? 3 : 4;
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_BCCommon.h b/thirdparty/cvtt/ConvectionKernels_BCCommon.h
new file mode 100644
index 0000000000..3e13151acd
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_BCCommon.h
@@ -0,0 +1,104 @@
+#pragma once
+#ifndef __CVTT_BCCOMMON_H__
+#define __CVTT_BCCOMMON_H__
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        // Static helpers for accumulating reconstruction error and pre-weighting pixel channels.
+        class BCCommon
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+
+            static int TweakRoundsForRange(int range);
+
+            // Adds the squared difference of each of the first numRealChannels channels to aggError; flags is not consulted.
+            template<int TVectorSize>
+            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
+            {
+                for (int channel = 0; channel < numRealChannels; ++channel)
+                    aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[channel], original[channel]), channel);
+            }
+
+            // Overload that accumulates over all TVectorSize channels.
+            template<int TVectorSize>
+            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
+            {
+                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
+            }
+
+            // Accumulates the LDR error into a local AggregatedError and returns its Finalize() result.
+            template<int TVectorSize>
+            static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
+            {
+                AggregatedError<TVectorSize> accumulated;
+                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, accumulated);
+                return accumulated.Finalize(flags, channelWeightsSq);
+            }
+
+            // Sums SqDiffSInt16 over all channels, scaling each term by channelWeightsSq unless Flags::Uniform is set.
+            template<int TVectorSize>
+            static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
+            {
+                const bool applyWeights = ((flags & Flags::Uniform) == 0);
+                MFloat total = ParallelMath::MakeFloatZero();
+                for (int channel = 0; channel < TVectorSize; ++channel)
+                {
+                    if (applyWeights)
+                        total = total + ParallelMath::SqDiffSInt16(reconstructed[channel], original[channel]) * ParallelMath::MakeFloat(channelWeightsSq[channel]);
+                    else
+                        total = total + ParallelMath::SqDiffSInt16(reconstructed[channel], original[channel]);
+                }
+
+                return total;
+            }
+
+            // Same accumulation as ComputeErrorHDRFast, but with SqDiff2CL as the per-channel metric.
+            template<int TVectorSize>
+            static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
+            {
+                const bool applyWeights = ((flags & Flags::Uniform) == 0);
+                MFloat total = ParallelMath::MakeFloatZero();
+                for (int channel = 0; channel < TVectorSize; ++channel)
+                {
+                    if (applyWeights)
+                        total = total + ParallelMath::SqDiff2CL(reconstructed[channel], original[channel]) * ParallelMath::MakeFloat(channelWeightsSq[channel]);
+                    else
+                        total = total + ParallelMath::SqDiff2CL(reconstructed[channel], original[channel]);
+                }
+
+                return total;
+            }
+
+            // Converts every channel of the 16 pixels to float and multiplies it by that channel's weight.
+            template<int TChannelCount>
+            static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
+            {
+                for (int pixel = 0; pixel < 16; ++pixel)
+                    for (int channel = 0; channel < TChannelCount; ++channel)
+                        preWeightedPixels[pixel][channel] = ParallelMath::ToFloat(pixels[pixel][channel]) * channelWeights[channel];
+            }
+
+            // Signed 16-bit (MSInt16) counterpart of PreWeightPixelsLDR.
+            template<int TChannelCount>
+            static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
+            {
+                for (int pixel = 0; pixel < 16; ++pixel)
+                    for (int channel = 0; channel < TChannelCount; ++channel)
+                        preWeightedPixels[pixel][channel] = ParallelMath::ToFloat(pixels[pixel][channel]) * channelWeights[channel];
+            }
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_Config.h b/thirdparty/cvtt/ConvectionKernels_Config.h
new file mode 100644
index 0000000000..e79d32b1da
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Config.h
@@ -0,0 +1,12 @@
+#pragma once
+#ifndef __CVTT_CONFIG_H__
+#define __CVTT_CONFIG_H__
+
+#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__)
+#define CVTT_USE_SSE2
+#endif
+
+// Define this to compile everything as a single source file
+//#define CVTT_SINGLE_FILE
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC.cpp b/thirdparty/cvtt/ConvectionKernels_ETC.cpp
new file mode 100644
index 0000000000..cb202a6e9c
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC.cpp
@@ -0,0 +1,3147 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ETC.h"
+#include "ConvectionKernels_ETC1.h"
+#include "ConvectionKernels_ETC2.h"
+#include "ConvectionKernels_ETC2_Rounding.h"
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_FakeBT709_Rounding.h"
+
+#include <cmath>
+
+const int cvtt::Internal::ETCComputer::g_flipTables[2][2][8] = // presumably indexed [flip][sector][entry] - confirm against the declaration
+{
+    {   // first table: indices 0..15 split as {0,1,4,5,8,9,12,13} and {2,3,6,7,10,11,14,15}
+        { 0, 1, 4, 5, 8, 9, 12, 13 },
+        { 2, 3, 6, 7, 10, 11, 14, 15 }
+    },
+    {   // second table: indices 0..15 split as 0..7 and 8..15
+        { 0, 1, 2, 3, 4, 5, 6, 7 },
+        { 8, 9, 10, 11, 12, 13, 14, 15 }
+    },
+};
+
+// Sum of the squared per-channel differences between two 3-channel pixels, evaluated in float.
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3])
+{
+    MFloat delta[3];
+    for (int channel = 0; channel < 3; ++channel)
+    {
+        MSInt16 signedA = ParallelMath::LosslessCast<MSInt16>::Cast(pixelA[channel]);
+        MSInt16 signedB = ParallelMath::LosslessCast<MSInt16>::Cast(pixelB[channel]);
+        delta[channel] = ParallelMath::ToFloat(signedA - signedB);
+    }
+    // Left-to-right sum keeps the same accumulation order as adding channel by channel.
+    return delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+}
+
+// Squared distance between the reconstructed color, scaled by the option weights, and a pre-weighted target pixel.
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3], const Options options)
+{
+    MFloat deltaRed = ParallelMath::ToFloat(reconstructed[0]) * options.redWeight - preWeightedPixel[0];
+    MFloat deltaGreen = ParallelMath::ToFloat(reconstructed[1]) * options.greenWeight - preWeightedPixel[1];
+    MFloat deltaBlue = ParallelMath::ToFloat(reconstructed[2]) * options.blueWeight - preWeightedPixel[2];
+    return deltaRed * deltaRed + deltaGreen * deltaGreen + deltaBlue * deltaBlue;
+}
+
+// Squared distance between ConvertToFakeBT709(reconstructed) and preWeightedPixel (presumably already in that space).
+cvtt::ParallelMath::Float cvtt::Internal::ETCComputer::ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat preWeightedPixel[3])
+{
+    MFloat converted[3];
+    ConvertToFakeBT709(converted, reconstructed);
+
+    MFloat delta[3];
+    for (int component = 0; component < 3; ++component)
+        delta[component] = converted[component] - preWeightedPixel[component];
+    return delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+}
+
+void cvtt::Internal::ETCComputer::TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options)
+{
+    // Split the packed color into 5-bit fields and expand each: (q << 3) | (q >> 2) if differential, else (q << 4) | q.
+    MUInt15 expanded[3];
+    for (int channel = 0; channel < 3; ++channel)
+    {
+        MUInt15 field = (ParallelMath::RightShift(quantizedPackedColor, (channel * 5)) & ParallelMath::MakeUInt15(31));
+
+        if (!isDifferential)
+            expanded[channel] = (field << 4) | field;
+        else
+            expanded[channel] = (field << 3) | ParallelMath::RightShift(field, 2);
+    }
+
+    // The four candidate colors: the expanded color plus each modifier, clamped to [0, 255].
+    MUInt15 upperClamp = ParallelMath::MakeUInt15(255);
+    MSInt16 lowerClamp = ParallelMath::MakeSInt16(0);
+    MUInt15 candidates[4][3];
+    for (unsigned int s = 0; s < 4; s++)
+        for (int channel = 0; channel < 3; ++channel)
+            candidates[s][channel] = ParallelMath::Min(ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::ToSInt16(expanded[channel]) + modifiers[s], lowerClamp)), upperClamp);
+
+    const bool useFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+    const bool useUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+
+    MUInt16 packedSelectors = ParallelMath::MakeUInt16(0);
+    MFloat errorSum = ParallelMath::MakeFloatZero();
+
+    for (int pixel = 0; pixel < 8; ++pixel)
+    {
+        MFloat lowestError = ParallelMath::MakeFloat(FLT_MAX);
+        MUInt16 lowestSelector = ParallelMath::MakeUInt16(0);
+
+        for (unsigned int s = 0; s < 4; s++)
+        {
+            MFloat candidateError;
+            if (useFakeBT709)
+                candidateError = ComputeErrorFakeBT709(candidates[s], preWeightedPixels[pixel]);
+            else if (useUniform)
+                candidateError = ComputeErrorUniform(pixels[pixel], candidates[s]);
+            else
+                candidateError = ComputeErrorWeighted(candidates[s], preWeightedPixels[pixel], options);
+
+            // Strict comparison: on a tie the earlier selector is kept.
+            ParallelMath::FloatCompFlag improves = ParallelMath::Less(candidateError, lowestError);
+            lowestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(improves), ParallelMath::MakeUInt16(s), lowestSelector);
+            lowestError = ParallelMath::Min(candidateError, lowestError);
+        }
+
+        errorSum = errorSum + lowestError;
+        packedSelectors = packedSelectors | (lowestSelector << (pixel * 2));
+    }
+
+    outError = errorSum;
+    outSelectors = packedSelectors;
+}
+
+// Punchthrough variant of the half-block test: three candidate colors; pixels flagged in isTransparent get selector 1 and zero error.
+void cvtt::Internal::ETCComputer::TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options)
+{
+    MUInt15 quantized[3];
+    MUInt15 unquantized[3];
+    for (int ch = 0; ch < 3; ch++)
+    {
+        quantized[ch] = (ParallelMath::RightShift(quantizedPackedColor, (ch * 5)) & ParallelMath::MakeUInt15(31));
+        unquantized[ch] = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
+    }
+
+    MUInt16 selectors = ParallelMath::MakeUInt16(0);
+    MFloat totalError = ParallelMath::MakeFloatZero();
+
+    // Candidates 0..2: base - modifier (floored at 0), base, and base + modifier (capped at 255).
+    MUInt15 u15_255 = ParallelMath::MakeUInt15(255);
+
+    MUInt15 unquantizedModified[3][3];
+    for (int ch = 0; ch < 3; ch++)
+    {
+        unquantizedModified[0][ch] = ParallelMath::Max(unquantized[ch], modifier) - modifier;
+        unquantizedModified[1][ch] = unquantized[ch];
+        unquantizedModified[2][ch] = ParallelMath::Min(unquantized[ch] + modifier, u15_255);
+    }
+
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    for (int px = 0; px < 8; px++)
+    {
+        ParallelMath::FloatCompFlag isTransparentFloat = ParallelMath::Int16FlagToFloat(isTransparent[px]);
+
+        MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+        MUInt15 bestSelector = ParallelMath::MakeUInt15(0);
+
+        for (unsigned int s = 0; s < 3; s++)
+        {
+            MFloat error;
+            if (isFakeBT709)
+                error = ComputeErrorFakeBT709(unquantizedModified[s], preWeightedPixels[px]);
+            else if (isUniform)
+                error = ComputeErrorUniform(pixels[px], unquantizedModified[s]);
+            else
+                error = ComputeErrorWeighted(unquantizedModified[s], preWeightedPixels[px], options);
+
+            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+            bestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(s), bestSelector);
+            bestError = ParallelMath::Min(error, bestError);
+        }
+
+        // Annoying quirk: The ETC encoding machinery assumes that selectors are in the table order in the spec, which isn't
+        // the same as their encoding bits, so the transparent index is actually 1 and the valid indexes are 0, 2, and 3.
+
+        // Remap selector 1 to 2, and 2 to 3
+        bestSelector = ParallelMath::Min(ParallelMath::MakeUInt15(3), bestSelector << 1);
+
+        // Transparent pixels contribute zero error and are always assigned selector 1
+        ParallelMath::ConditionalSet(bestError, isTransparentFloat, ParallelMath::MakeFloatZero());
+        ParallelMath::ConditionalSet(bestSelector, isTransparent[px], ParallelMath::MakeUInt15(1));
+
+        totalError = totalError + bestError;
+        selectors = selectors | (ParallelMath::LosslessCast<MUInt16>::Cast(bestSelector) << (px * 2));
+    }
+
+    outError = totalError;
+    outSelectors = selectors;
+}
+
+void cvtt::Internal::ETCComputer::FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs)
+{
+    // For each block lane, searches the per-sector attempts in drs for a low-error pair whose colors form a
+    // legal differential pair, and stores it into the best* outputs when it beats the lane's bestTotalError.
+    // Done scalar: the early-outs and index lookups vary heavily between blocks and save a lot of time.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        bool canIgnore[2] = { ParallelMath::Extract(canIgnoreSector[0], block), ParallelMath::Extract(canIgnoreSector[1], block) };
+        bool canIgnoreEither = canIgnore[0] || canIgnore[1];
+        float blockBestTotalError = ParallelMath::Extract(bestTotalError, block);
+        float bestDiffErrors[2] = { FLT_MAX, FLT_MAX };
+        uint16_t bestDiffSelectors[2] = { 0, 0 };
+        uint16_t bestDiffColors[2] = { 0, 0 };
+        uint16_t bestDiffTables[2] = { 0, 0 };
+        for (int sector = 0; sector < 2; sector++)
+        {
+            unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
+            for (unsigned int i = 0; i < sectorNumAttempts; i++)
+            {
+                float error = ParallelMath::Extract(drs.diffErrors[sector][i], block);
+                if (error < bestDiffErrors[sector])
+                {
+                    bestDiffErrors[sector] = error;
+                    bestDiffSelectors[sector] = ParallelMath::Extract(drs.diffSelectors[sector][i], block);
+                    bestDiffColors[sector] = ParallelMath::Extract(drs.diffColors[sector][i], block);
+                    bestDiffTables[sector] = ParallelMath::Extract(drs.diffTables[sector][i], block);
+                }
+            }
+        }
+
+        if (canIgnore[0])
+            bestDiffColors[0] = bestDiffColors[1];
+        else if (canIgnore[1])
+            bestDiffColors[1] = bestDiffColors[0];
+
+        // The best differential possibilities must be better than the best total error
+        if (bestDiffErrors[0] + bestDiffErrors[1] < blockBestTotalError)
+        {
+            // Fast path if the best possible case is legal
+            if (canIgnoreEither || ETCDifferentialIsLegalScalar(bestDiffColors[0], bestDiffColors[1]))
+            {
+                ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
+                ParallelMath::PutFloat(bestTotalError, block, bestDiffErrors[0] + bestDiffErrors[1]);
+                ParallelMath::PutUInt15(bestFlip, block, flip);
+                ParallelMath::PutUInt15(bestD, block, d);
+                for (int sector = 0; sector < 2; sector++)
+                {
+                    ParallelMath::PutUInt15(bestColors[sector], block, bestDiffColors[sector]);
+                    ParallelMath::PutUInt16(bestSelectors[sector], block, bestDiffSelectors[sector]);
+                    ParallelMath::PutUInt15(bestTables[sector], block, bestDiffTables[sector]);
+                }
+            }
+            else
+            {
+                // Slow path: Sort the possible cases by quality, and search valid combinations
+                // TODO: Pre-flatten the error lists so this is nicer to cache
+                unsigned int numSortIndexes[2] = { 0, 0 };
+                for (int sector = 0; sector < 2; sector++)
+                {
+                    unsigned int sectorNumAttempts = ParallelMath::Extract(drs.diffNumAttempts[sector], block);
+
+                    for (unsigned int i = 0; i < sectorNumAttempts; i++)
+                    {
+                        if (ParallelMath::Extract(drs.diffErrors[sector][i], block) < blockBestTotalError)
+                            drs.attemptSortIndexes[sector][numSortIndexes[sector]++] = i;
+                    }
+
+                    struct SortPredicate
+                    {
+                        const MFloat *diffErrors;
+                        int block;
+
+                        bool operator()(uint16_t a, uint16_t b) const
+                        {
+                            float errorA = ParallelMath::Extract(diffErrors[a], block);
+                            float errorB = ParallelMath::Extract(diffErrors[b], block);
+
+                            if (errorA < errorB)
+                                return true;
+                            if (errorA > errorB)
+                                return false;
+
+                            return a < b;
+                        }
+                    };
+
+                    SortPredicate sp;
+                    sp.diffErrors = drs.diffErrors[sector];
+                    sp.block = block;
+
+                    std::sort<uint16_t*, const SortPredicate&>(drs.attemptSortIndexes[sector], drs.attemptSortIndexes[sector] + numSortIndexes[sector], sp);
+                }
+
+                // Both index lists are now sorted by ascending error, so each scan can stop at the first entry that cannot win.
+                for (unsigned int i = 0; i < numSortIndexes[0]; i++)
+                {
+                    unsigned int attemptIndex0 = drs.attemptSortIndexes[0][i];
+                    float error0 = ParallelMath::Extract(drs.diffErrors[0][attemptIndex0], block);
+
+                    // Stop once sector 0 alone is no better than the best total found so far;
+                    // every later entry in the sorted list has an error at least as large.
+                    if (error0 >= blockBestTotalError)
+                        break;
+
+                    float maxError1 = ParallelMath::Extract(bestTotalError, block) - error0;
+                    uint16_t diffColor0 = ParallelMath::Extract(drs.diffColors[0][attemptIndex0], block);
+
+                    if (maxError1 < bestDiffErrors[1])
+                        break;
+
+                    for (unsigned int j = 0; j < numSortIndexes[1]; j++)
+                    {
+                        unsigned int attemptIndex1 = drs.attemptSortIndexes[1][j];
+                        float error1 = ParallelMath::Extract(drs.diffErrors[1][attemptIndex1], block);
+
+                        // Sector 1 must fit in the error budget left over by this sector 0 choice;
+                        // later entries only get worse, so give up on this sector 0 candidate.
+                        if (error1 >= maxError1)
+                            break;
+
+                        uint16_t diffColor1 = ParallelMath::Extract(drs.diffColors[1][attemptIndex1], block);
+
+                        if (ETCDifferentialIsLegalScalar(diffColor0, diffColor1))
+                        {
+                            blockBestTotalError = error0 + error1;
+
+                            ParallelMath::PutBoolInt16(bestIsThisMode, block, true);
+                            ParallelMath::PutFloat(bestTotalError, block, blockBestTotalError);
+                            ParallelMath::PutUInt15(bestFlip, block, flip);
+                            ParallelMath::PutUInt15(bestD, block, d);
+                            ParallelMath::PutUInt15(bestColors[0], block, diffColor0);
+                            ParallelMath::PutUInt15(bestColors[1], block, diffColor1);
+                            ParallelMath::PutUInt16(bestSelectors[0], block, ParallelMath::Extract(drs.diffSelectors[0][attemptIndex0], block));
+                            ParallelMath::PutUInt16(bestSelectors[1], block, ParallelMath::Extract(drs.diffSelectors[1][attemptIndex1], block));
+                            ParallelMath::PutUInt15(bestTables[0], block, ParallelMath::Extract(drs.diffTables[0][attemptIndex0], block));
+                            ParallelMath::PutUInt15(bestTables[1], block, ParallelMath::Extract(drs.diffTables[1][attemptIndex1], block));
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Flags the lanes where (b - a) is strictly greater than -5 and strictly less than 4, i.e. within [-4, 3].
+cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b)
+{
+    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(b) - ParallelMath::LosslessCast<MSInt16>::Cast(a);
+    return ParallelMath::Less(delta, ParallelMath::MakeSInt16(4)) & ParallelMath::Less(ParallelMath::MakeSInt16(-5), delta);
+}
+
+// Applies the per-channel legality test to the three 5-bit fields (shifts 10, 5 and 0) packed in a and b.
+cvtt::ParallelMath::Int16CompFlag cvtt::Internal::ETCComputer::ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b)
+{
+    MUInt15 fiveBits = ParallelMath::MakeUInt15(31);
+    ParallelMath::Int16CompFlag highOk = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 10), ParallelMath::RightShift(b, 10));
+    ParallelMath::Int16CompFlag midOk = ETCDifferentialIsLegalForChannel(ParallelMath::RightShift(a, 5) & fiveBits, ParallelMath::RightShift(b, 5) & fiveBits);
+    return highOk & midOk & ETCDifferentialIsLegalForChannel(a & fiveBits, b & fiveBits);
+}
+
+// Scalar form of the channel test: true when (b - a), stored as int16_t, lies in [-4, 3].
+bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b)
+{
+    const int16_t delta = static_cast<int16_t>(static_cast<int16_t>(b) - static_cast<int16_t>(a));
+    return !(delta < -4 || delta > 3);
+}
+
+// Scalar legality test for two packed colors: each of the three 5-bit channel deltas must pass the channel test.
+bool cvtt::Internal::ETCComputer::ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b)
+{
+    // The top field is shifted but not masked, as in ETCDifferentialIsLegal; assumes bit 15 is clear - confirm with callers.
+    return ETCDifferentialIsLegalForChannelScalar((a >> 10), (b >> 10))
+        && ETCDifferentialIsLegalForChannelScalar((a >> 5) & 31, (b >> 5) & 31)
+        && ETCDifferentialIsLegalForChannelScalar(a & 31, b & 31);
+}
+
+void cvtt::Internal::ETCComputer::EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
+
+    // To speed this up, we compute line total as the sum, then subtract out isolated
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
+            lineTotal[ch] = lineTotal[ch] + pixels[px][ch];
+        }
+        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        lineTotal[ch] = lineTotal[ch] - isolatedTotal[ch];
+
+    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(16) - numPixelsIsolated;
+
+    MUInt15 isolatedAverageQuantized[3];
+    MUInt15 isolatedAverageTargets[3];
+    {
+        int divisors[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
+
+        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
+        for (int ch = 0; ch < 3; ch++)
+        {
+            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
+
+            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
+            if (!isFakeBT709)
+                numerator = numerator + addend;
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int divisor = divisors[block];
+                if (divisor == 0)
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
+                else
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
+            }
+
+            isolatedAverageTargets[ch] = numerator;
+        }
+    }
+
+    if (isFakeBT709)
+        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
+
+    MUInt15 isolatedColor[3];
+    for (int ch = 0; ch < 3; ch++)
+        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
+
+    MFloat isolatedError[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (isFakeBT709)
+            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
+        else if (isUniform)
+            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
+        else
+            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
+    }
+
+    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
+
+    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
+    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
+
+    int16_t clusterMaxLine = 0;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
+        if (blockMaxLine > clusterMaxLine)
+            clusterMaxLine = blockMaxLine;
+    }
+
+    int16_t clusterMinLine = -clusterMaxLine;
+
+    int lineDivisors[ParallelMath::ParallelSize];
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
+
+    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
+
+    for (int table = 0; table < 8; table++)
+    {
+        int numUniqueColors[ParallelMath::ParallelSize];
+        MUInt15 uniqueQuantizedColors[31];
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            numUniqueColors[block] = 0;
+
+        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
+        MUInt15 modifierOffset = (modifier + modifier);
+
+        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier++)
+        {
+            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
+            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
+
+            MUInt15 quantized[3];
+            if (isFakeBT709)
+            {
+                MUInt15 targets[3];
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                    targets[ch] = numerator;
+                }
+
+                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
+            }
+            else
+            {
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                }
+            }
+
+            MUInt15 packedColor = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
+                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
+                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
+            }
+        }
+
+        // Stripe unfilled unique colors
+        int maxUniqueColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (numUniqueColors[block] > maxUniqueColors)
+                maxUniqueColors = numUniqueColors[block];
+        }
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
+
+            int numUnique = numUniqueColors[block];
+            for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
+                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
+        }
+
+        for (int ci = 0; ci < maxUniqueColors; ci++)
+        {
+            MUInt15 lineColors[3][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], (ch * 5)) & ParallelMath::MakeUInt15(15));
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
+                lineColors[1][ch] = unquantizedColor;
+                lineColors[2][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+            }
+
+            MSInt32 selectors = ParallelMath::MakeSInt32(0);
+            MFloat error = ParallelMath::MakeFloatZero();
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat pixelError = isolatedError[px];
+
+                MUInt15 pixelBestSelector = ParallelMath::MakeUInt15(0);
+                for (int i = 0; i < 3; i++)
+                {
+                    MFloat error = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
+                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, pixelError);
+                    pixelError = ParallelMath::Min(error, pixelError);
+                    pixelBestSelector = ParallelMath::Select(ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(i + 1), pixelBestSelector);
+                }
+
+                error = error + pixelError;
+                selectors = selectors | (ParallelMath::ToInt32(pixelBestSelector) << (px * 2));
+            }
+
+            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+            bestError = ParallelMath::Min(error, bestError);
+
+            if (ParallelMath::AnySet(errorBetter))
+            {
+                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
+                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
+                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
+                bestIsThisMode = bestIsThisMode | errorBetter;
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (ParallelMath::Extract(bestIsThisMode, block))
+        {
+            uint32_t lowBits = 0;
+            uint32_t highBits = 0;
+
+            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
+            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
+
+            for (int ch = 0; ch < 3; ch++)
+                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
+
+            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
+            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+
+            ParallelMath::ScalarUInt16 lineColor[3];
+            for (int ch = 0; ch < 3; ch++)
+                lineColor[ch] = (blockBestLineColor >> (ch * 5)) & 15;
+
+            EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, true);
+        }
+    }
+}
+
+// Tries H mode on every block of the parallel group and emits the blocks where it beats bestError.
+// outputBuffer      - destination; the block with index N is emitted at outputBuffer + N * 8
+// bestError         - in/out: lowest error so far per block; lowered whenever H mode improves on it
+// groupings         - per-pixel flags; flagged pixels are accumulated into sector 1, the rest into sector 0
+// pixels            - the 16 source pixels, 3 channels each
+// he                - working storage, filled here with candidate colors and their per-pixel errors and sign bits
+// preWeightedPixels - pixel data handed to the weighted and fake BT.709 error functions
+// options           - flags choose the error metric (ETC_UseFakeBT709, Uniform, otherwise weighted)
+void cvtt::Internal::ETCComputer::EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    MUInt15 zero15 = ParallelMath::MakeUInt15(0);
+    MUInt15 counts[2] = { zero15, zero15 };
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 totals[2][3] =
+    {
+        { zero15, zero15, zero15 },
+        { zero15, zero15, zero15 }
+    };
+
+    // Sum every pixel into totals[0] and only the flagged pixels into totals[1] / counts[1]; sector 0 is then obtained by subtraction.
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            totals[0][ch] = totals[0][ch] + pixels[px][ch];
+            totals[1][ch] = totals[1][ch] + ParallelMath::SelectOrZero(groupings[px], pixels[px][ch]);
+        }
+        counts[1] = counts[1] + ParallelMath::SelectOrZero(groupings[px], ParallelMath::MakeUInt15(1));
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        totals[0][ch] = totals[0][ch] - totals[1][ch];
+    counts[0] = ParallelMath::MakeUInt15(16) - counts[1];
+
+    MUInt16 bestSectorBits = ParallelMath::MakeUInt16(0);
+    MUInt16 bestSignBits = ParallelMath::MakeUInt16(0);
+    MUInt15 bestColors[2] = { zero15, zero15 };
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+
+    for (int table = 0; table < 8; table++)
+    {
+        int modifier = cvtt::Tables::ETC1::g_thModifierTable[table];
+        int modifierOffset = modifier * 2;
+
+        // Build the list of candidate base colors for each sector, one block at a time.
+        for (int sector = 0; sector < 2; sector++)
+        {
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int blockNumUniqueColors = 0;
+                uint16_t blockUniqueQuantizedColors[31];
+
+                int blockSectorCounts = ParallelMath::Extract(counts[sector], block);
+                int blockSectorTotals[3];
+                for (int ch = 0; ch < 3; ch++)
+                    blockSectorTotals[ch] = ParallelMath::Extract(totals[sector][ch], block);
+
+                // Shift the doubled sector total by modifierOffset * k for k in [-count, count]; store a packed color whenever it changes.
+                for (int offsetPremultiplier = -blockSectorCounts; offsetPremultiplier <= blockSectorCounts; offsetPremultiplier++)
+                {
+                    // TODO: This isn't ideal for FakeBT709
+                    int16_t quantized[3];
+                    for (int ch = 0; ch < 3; ch++)
+                    {
+                        if (blockSectorCounts == 0)
+                            quantized[ch] = 0;
+                        else
+                            quantized[ch] = std::min<int16_t>(15, std::max<int16_t>(0, (blockSectorTotals[ch] * 2 + blockSectorCounts * 17 + modifierOffset * offsetPremultiplier)) / (blockSectorCounts * 34));
+                    }
+
+                    uint16_t packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
+                    if (blockNumUniqueColors == 0 || packedColor != blockUniqueQuantizedColors[blockNumUniqueColors - 1])
+                    {
+                        // The array holds 31 entries, so the index written next must be at most 30.
+                        assert(blockNumUniqueColors < 31);
+                        blockUniqueQuantizedColors[blockNumUniqueColors++] = packedColor;
+                    }
+                }
+
+                ParallelMath::PutUInt15(he.numUniqueColors[sector], block, blockNumUniqueColors);
+
+                // Sector 1 candidates follow this block's sector 0 candidates in he.uniqueQuantizedColors.
+                const int baseIndex = (sector == 1) ? ParallelMath::Extract(he.numUniqueColors[0], block) : 0;
+                for (int i = 0; i < blockNumUniqueColors; i++)
+                    ParallelMath::PutUInt15(he.uniqueQuantizedColors[baseIndex + i], block, blockUniqueQuantizedColors[i]);
+            }
+        }
+
+        MUInt15 totalColors = he.numUniqueColors[0] + he.numUniqueColors[1];
+        int maxErrorColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            maxErrorColors = std::max<int>(maxErrorColors, ParallelMath::Extract(totalColors, block));
+
+        // Fill the unused slots of blocks with fewer candidates with their first candidate, so the loop below can run to maxErrorColors.
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            int lastColor = ParallelMath::Extract(totalColors, block);
+            uint16_t stripeColor = ParallelMath::Extract(he.uniqueQuantizedColors[0], block);
+            for (int i = lastColor; i < maxErrorColors; i++)
+                ParallelMath::PutUInt15(he.uniqueQuantizedColors[i], block, stripeColor);
+        }
+
+        // For each candidate, cache per pixel the smaller of the +modifier / -modifier errors and flag pixels where -modifier is smaller.
+        for (int ci = 0; ci < maxErrorColors; ci++)
+        {
+            MUInt15 fifteen = ParallelMath::MakeUInt15(15);
+            MUInt15 twoFiftyFive = ParallelMath::MakeUInt15(255);
+            MSInt16 zeroS16 = ParallelMath::MakeSInt16(0);
+
+            MUInt15 colors[2][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = ParallelMath::RightShift(he.uniqueQuantizedColors[ci], ((2 - ch) * 5)) & fifteen;
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                colors[0][ch] = ParallelMath::Min(twoFiftyFive, unquantizedColor + modifier);
+                colors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(zeroS16, ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::MakeSInt16(modifier)));
+            }
+
+            MUInt16 signBits = ParallelMath::MakeUInt16(0);
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat errors[2];
+                for (int i = 0; i < 2; i++)
+                {
+                    if (isFakeBT709)
+                        errors[i] = ComputeErrorFakeBT709(colors[i], preWeightedPixels[px]);
+                    else if (isUniform)
+                        errors[i] = ComputeErrorUniform(colors[i], pixels[px]);
+                    else
+                        errors[i] = ComputeErrorWeighted(colors[i], preWeightedPixels[px], options);
+                }
+
+                ParallelMath::Int16CompFlag errorOneLess = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errors[1], errors[0]));
+                he.errors[ci][px] = ParallelMath::Min(errors[0], errors[1]);
+                signBits = signBits | ParallelMath::SelectOrZero(errorOneLess, ParallelMath::MakeUInt16(1 << px));
+            }
+            he.signBits[ci] = signBits;
+        }
+
+        int maxUniqueColorCombos = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            int numUniqueColorCombos = ParallelMath::Extract(he.numUniqueColors[0], block) * ParallelMath::Extract(he.numUniqueColors[1], block);
+            if (numUniqueColorCombos > maxUniqueColorCombos)
+                maxUniqueColorCombos = numUniqueColorCombos;
+        }
+
+        MUInt15 indexes[2] = { zero15, zero15 };
+        MUInt15 maxIndex[2] = { he.numUniqueColors[0] - ParallelMath::MakeUInt15(1), he.numUniqueColors[1] - ParallelMath::MakeUInt15(1) };
+
+        int block1Starts[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            block1Starts[block] = ParallelMath::Extract(he.numUniqueColors[0], block);
+
+        for (int combo = 0; combo < maxUniqueColorCombos; combo++)
+        {
+            // Take the current pair, then advance: the sector 0 index wraps to 0 and carries into the sector 1 index, which saturates.
+            // Evaluating before advancing ensures that the (0, 0) pair is tested as well.
+            MUInt15 index0 = indexes[0];
+            MUInt15 index1 = indexes[1];
+            indexes[0] = index0 + ParallelMath::MakeUInt15(1);
+            ParallelMath::Int16CompFlag index0Overflow = ParallelMath::Less(maxIndex[0], indexes[0]);
+            ParallelMath::ConditionalSet(indexes[0], index0Overflow, ParallelMath::MakeUInt15(0));
+            indexes[1] = ParallelMath::Min(maxIndex[1], index1 + ParallelMath::SelectOrZero(index0Overflow, ParallelMath::MakeUInt15(1)));
+
+            int ci0[ParallelMath::ParallelSize];
+            int ci1[ParallelMath::ParallelSize];
+            MUInt15 color0;
+            MUInt15 color1;
+            MUInt16 signBits0;
+            MUInt16 signBits1;
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                ci0[block] = ParallelMath::Extract(index0, block);
+                ci1[block] = ParallelMath::Extract(index1, block) + block1Starts[block];
+                ParallelMath::PutUInt15(color0, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci0[block]], block));
+                ParallelMath::PutUInt15(color1, block, ParallelMath::Extract(he.uniqueQuantizedColors[ci1[block]], block));
+                ParallelMath::PutUInt16(signBits0, block, ParallelMath::Extract(he.signBits[ci0[block]], block));
+                ParallelMath::PutUInt16(signBits1, block, ParallelMath::Extract(he.signBits[ci1[block]], block));
+            }
+
+            MFloat totalError = ParallelMath::MakeFloatZero();
+            MUInt16 sectorBits = ParallelMath::MakeUInt16(0);
+            MUInt16 signBits = ParallelMath::MakeUInt16(0);
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat errorCI0;
+                MFloat errorCI1;
+                for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                {
+                    ParallelMath::PutFloat(errorCI0, block, ParallelMath::Extract(he.errors[ci0[block]][px], block));
+                    ParallelMath::PutFloat(errorCI1, block, ParallelMath::Extract(he.errors[ci1[block]][px], block));
+                }
+
+                totalError = totalError + ParallelMath::Min(errorCI0, errorCI1);
+                MUInt16 bitPosition = ParallelMath::MakeUInt16(1 << px);
+                ParallelMath::Int16CompFlag error1Better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(errorCI1, errorCI0));
+
+                sectorBits = sectorBits | ParallelMath::SelectOrZero(error1Better, bitPosition);
+                signBits = signBits | (bitPosition & ParallelMath::Select(error1Better, signBits1, signBits0));
+            }
+
+            ParallelMath::Int16CompFlag totalErrorBetter16 = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
+            if (ParallelMath::AnySet(totalErrorBetter16))
+            {
+                bestIsThisMode = bestIsThisMode | totalErrorBetter16;
+                ParallelMath::ConditionalSet(bestTable, totalErrorBetter16, ParallelMath::MakeUInt15(table));
+                ParallelMath::ConditionalSet(bestColors[0], totalErrorBetter16, color0);
+                ParallelMath::ConditionalSet(bestColors[1], totalErrorBetter16, color1);
+                ParallelMath::ConditionalSet(bestSectorBits, totalErrorBetter16, sectorBits);
+                ParallelMath::ConditionalSet(bestSignBits, totalErrorBetter16, signBits);
+                bestError = ParallelMath::Min(totalError, bestError);
+            }
+        }
+    }
+
+    if (ParallelMath::AnySet(bestIsThisMode))
+    {
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (!ParallelMath::Extract(bestIsThisMode, block))
+                continue;
+
+            ParallelMath::ScalarUInt16 blockBestColors[2] = { ParallelMath::Extract(bestColors[0], block), ParallelMath::Extract(bestColors[1], block) };
+            ParallelMath::ScalarUInt16 blockBestSectorBits = ParallelMath::Extract(bestSectorBits, block);
+            ParallelMath::ScalarUInt16 blockBestSignBits = ParallelMath::Extract(bestSignBits, block);
+            ParallelMath::ScalarUInt16 blockBestTable = ParallelMath::Extract(bestTable, block);
+
+            EmitHModeBlock(outputBuffer + block * 8, blockBestColors, blockBestSectorBits, blockBestSignBits, blockBestTable, true);
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolatedBase[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options)
+{
+    // We treat T and H mode as the same mode ("Virtual T mode") with punchthrough, because of how the colors work:
+    //
+    // T mode: C1, C2+M, Transparent, C2-M
+    // H mode: C1+M, C1-M, Transparent, C2-M
+    //
+    // So in either case, we have 2 colors +/- a modifier, and a third unique color, which is basically T mode except without the middle color.
+    // The only thing that matters is whether it's better to store the isolated color as T mode color 1, or store it offset in H mode color 2.
+    //
+    // Sometimes it won't even be possible to store it in H mode color 2 because the table low bit derives from a numeric comparison of the colors,
+    // but unlike opaque blocks, we can't flip them.
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    ParallelMath::FloatCompFlag isTransparentF[16];
+    for (int px = 0; px < 16; px++)
+        isTransparentF[px] = ParallelMath::Int16FlagToFloat(isTransparent[px]);
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+    ParallelMath::Int16CompFlag bestIsHMode = ParallelMath::MakeBoolInt16(false);
+
+    MUInt15 isolatedTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+    MUInt15 lineTotal[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    MUInt15 numPixelsIsolated = ParallelMath::MakeUInt15(0);
+    MUInt15 numPixelsLine = ParallelMath::MakeUInt15(0);
+
+    ParallelMath::Int16CompFlag isIsolated[16];
+    ParallelMath::Int16CompFlag isLine[16];
+
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        ParallelMath::Int16CompFlag isOpaque = ParallelMath::Not(isTransparent[px]);
+        isIsolated[px] = isIsolatedBase[px] & isOpaque;
+        isLine[px] = ParallelMath::Not(isIsolatedBase[px]) & isOpaque;
+    }
+
+    for (unsigned int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            isolatedTotal[ch] = isolatedTotal[ch] + ParallelMath::SelectOrZero(isIsolated[px], pixels[px][ch]);
+            lineTotal[ch] = lineTotal[ch] + ParallelMath::SelectOrZero(isLine[px], pixels[px][ch]);
+        }
+        numPixelsIsolated = numPixelsIsolated + ParallelMath::SelectOrZero(isIsolated[px], ParallelMath::MakeUInt15(1));
+        numPixelsLine = numPixelsLine + ParallelMath::SelectOrZero(isLine[px], ParallelMath::MakeUInt15(1));
+    }
+
+    MUInt15 isolatedAverageQuantized[3];
+    MUInt15 hModeIsolatedQuantized[8][3];
+    MUInt15 isolatedAverageTargets[3];
+    {
+        int divisors[ParallelMath::ParallelSize];
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            divisors[block] = ParallelMath::Extract(numPixelsIsolated, block) * 34;
+
+        MUInt15 addend = (numPixelsIsolated << 4) | numPixelsIsolated;
+        for (int ch = 0; ch < 3; ch++)
+        {
+            // isolatedAverageQuantized[ch] = (isolatedTotal[ch] * 2 + numPixelsIsolated * 17) / (numPixelsIsolated * 34);
+
+            MUInt15 numerator = isolatedTotal[ch] + isolatedTotal[ch];
+            if (!isFakeBT709)
+                numerator = numerator + addend;
+
+            MUInt15 hModeIsolatedNumerators[8];
+            for (int table = 0; table < 8; table++)
+            {
+                // FIXME: Handle fake BT.709 correctly
+                MUInt15 offsetTotal = isolatedTotal[ch] + ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]), numPixelsIsolated));
+
+                hModeIsolatedNumerators[table] = (offsetTotal + offsetTotal) + addend;
+            }
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                int divisor = divisors[block];
+                if (divisor == 0)
+                {
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, 0);
+                    for (int table = 0; table < 8; table++)
+                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, 0);
+                }
+                else
+                {
+                    ParallelMath::PutUInt15(isolatedAverageQuantized[ch], block, ParallelMath::Extract(numerator, block) / divisor);
+                    for (int table = 0; table < 8; table++)
+                        ParallelMath::PutUInt15(hModeIsolatedQuantized[table][ch], block, ParallelMath::Extract(hModeIsolatedNumerators[table], block) / divisor);
+                }
+            }
+
+            isolatedAverageTargets[ch] = numerator;
+        }
+    }
+
+    if (isFakeBT709)
+        ResolveTHFakeBT709Rounding(isolatedAverageQuantized, isolatedAverageTargets, numPixelsIsolated);
+
+    for (int table = 0; table < 8; table++)
+        for (int ch = 0; ch < 3; ch++)
+            hModeIsolatedQuantized[table][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), hModeIsolatedQuantized[table][ch]);
+
+    MUInt15 isolatedColor[3];
+    for (int ch = 0; ch < 3; ch++)
+        isolatedColor[ch] = (isolatedAverageQuantized[ch]) | (isolatedAverageQuantized[ch] << 4);
+
+    MFloat isolatedError[16];
+    for (int px = 0; px < 16; px++)
+    {
+        if (isFakeBT709)
+            isolatedError[px] = ComputeErrorFakeBT709(isolatedColor, preWeightedPixels[px]);
+        else if (isUniform)
+            isolatedError[px] = ComputeErrorUniform(pixels[px], isolatedColor);
+        else
+            isolatedError[px] = ComputeErrorWeighted(isolatedColor, preWeightedPixels[px], options);
+
+        ParallelMath::ConditionalSet(isolatedError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+    }
+
+    MSInt32 bestSelectors = ParallelMath::MakeSInt32(0);
+    MUInt15 bestTable = ParallelMath::MakeUInt15(0);
+    MUInt15 bestLineColor = ParallelMath::MakeUInt15(0);
+    MUInt15 bestIsolatedColor = ParallelMath::MakeUInt15(0);
+    MUInt15 bestHModeColor2 = ParallelMath::MakeUInt15(0);
+    ParallelMath::Int16CompFlag bestUseHMode = ParallelMath::MakeBoolInt16(false);
+
+    MSInt16 maxLine = ParallelMath::LosslessCast<MSInt16>::Cast(numPixelsLine);
+    MSInt16 minLine = ParallelMath::MakeSInt16(0) - maxLine;
+
+    int16_t clusterMaxLine = 0;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int16_t blockMaxLine = ParallelMath::Extract(maxLine, block);
+        if (blockMaxLine > clusterMaxLine)
+            clusterMaxLine = blockMaxLine;
+    }
+
+    int16_t clusterMinLine = -clusterMaxLine;
+
+    int lineDivisors[ParallelMath::ParallelSize];
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        lineDivisors[block] = ParallelMath::Extract(numPixelsLine, block) * 34;
+
+    MUInt15 lineAddend = (numPixelsLine << 4) | numPixelsLine;
+
+    for (int table = 0; table < 8; table++)
+    {
+        int numUniqueColors[ParallelMath::ParallelSize];
+        MUInt15 uniqueQuantizedColors[31];
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            numUniqueColors[block] = 0;
+
+        MUInt15 modifier = ParallelMath::MakeUInt15(cvtt::Tables::ETC2::g_thModifierTable[table]);
+        MUInt15 modifierOffset = (modifier + modifier);
+
+        for (int16_t offsetPremultiplier = clusterMinLine; offsetPremultiplier <= clusterMaxLine; offsetPremultiplier += 2)
+        {
+            MSInt16 clampedOffsetPremultiplier = ParallelMath::Max(minLine, ParallelMath::Min(maxLine, ParallelMath::MakeSInt16(offsetPremultiplier)));
+            MSInt16 modifierAddend = ParallelMath::CompactMultiply(clampedOffsetPremultiplier, modifierOffset);
+
+            MUInt15 quantized[3];
+            if (isFakeBT709)
+            {
+                MUInt15 targets[3];
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch]) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                    targets[ch] = numerator;
+                }
+
+                ResolveTHFakeBT709Rounding(quantized, targets, numPixelsLine);
+            }
+            else
+            {
+                for (int ch = 0; ch < 3; ch++)
+                {
+                    //quantized[ch] = std::min<int16_t>(15, std::max(0, (lineTotal[ch] * 2 + numDAIILine * 17 + modifierOffset * offsetPremultiplier)) / (numDAIILine * 34));
+                    MUInt15 numerator = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(lineTotal[ch] + lineTotal[ch] + lineAddend) + modifierAddend));
+                    MUInt15 divided = ParallelMath::MakeUInt15(0);
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int divisor = lineDivisors[block];
+                        if (divisor == 0)
+                            ParallelMath::PutUInt15(divided, block, 0);
+                        else
+                            ParallelMath::PutUInt15(divided, block, ParallelMath::Extract(numerator, block) / divisor);
+                    }
+                    quantized[ch] = ParallelMath::Min(ParallelMath::MakeUInt15(15), divided);
+                }
+            }
+
+            MUInt15 packedColor = (quantized[0] << 10) | (quantized[1] << 5) | quantized[2];
+
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t blockPackedColor = ParallelMath::Extract(packedColor, block);
+                if (numUniqueColors[block] == 0 || blockPackedColor != ParallelMath::Extract(uniqueQuantizedColors[numUniqueColors[block] - 1], block))
+                    ParallelMath::PutUInt15(uniqueQuantizedColors[numUniqueColors[block]++], block, blockPackedColor);
+            }
+        }
+
+        // Stripe unfilled unique colors
+        int maxUniqueColors = 0;
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (numUniqueColors[block] > maxUniqueColors)
+                maxUniqueColors = numUniqueColors[block];
+        }
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            uint16_t fillColor = ParallelMath::Extract(uniqueQuantizedColors[0], block);
+
+            int numUnique = numUniqueColors[block];
+            for (int fill = numUnique + 1; fill < maxUniqueColors; fill++)
+                ParallelMath::PutUInt15(uniqueQuantizedColors[fill], block, fillColor);
+        }
+
+        MFloat hModeErrors[16];
+        MUInt15 hModeUnquantizedColor[3];
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MUInt15 quantizedChannel = hModeIsolatedQuantized[table][ch];
+
+            MUInt15 unquantizedCh = (quantizedChannel << 4) | quantizedChannel;
+            hModeUnquantizedColor[ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedCh) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+        }
+
+        for (int px = 0; px < 16; px++)
+        {
+            hModeErrors[px] = isUniform ? ComputeErrorUniform(hModeUnquantizedColor, pixels[px]) : ComputeErrorWeighted(hModeUnquantizedColor, preWeightedPixels[px], options);
+            ParallelMath::ConditionalSet(hModeErrors[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+        }
+
+        MUInt15 packedHModeColor2 = (hModeIsolatedQuantized[table][0] << 10) | (hModeIsolatedQuantized[table][1] << 5) | hModeIsolatedQuantized[table][2];
+        ParallelMath::Int16CompFlag tableLowBitIsZero = ((table & 1) == 0) ? ParallelMath::MakeBoolInt16(true) : ParallelMath::MakeBoolInt16(false);
+
+        for (int ci = 0; ci < maxUniqueColors; ci++)
+        {
+            MUInt15 lineColors[2][3];
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MUInt15 quantizedChannel = (ParallelMath::RightShift(uniqueQuantizedColors[ci], 10 - (ch * 5)) & ParallelMath::MakeUInt15(15));
+
+                MUInt15 unquantizedColor = (quantizedChannel << 4) | quantizedChannel;
+                lineColors[0][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantizedColor + modifier);
+                lineColors[1][ch] = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), ParallelMath::LosslessCast<MSInt16>::Cast(unquantizedColor) - ParallelMath::LosslessCast<MSInt16>::Cast(modifier)));
+            }
+
+            MUInt15 bestLineSelector[16];
+            MFloat bestLineError[16];
+            for (int px = 0; px < 16; px++)
+            {
+                MFloat lineErrors[2];
+                for (int i = 0; i < 2; i++)
+                    lineErrors[i] = isUniform ? ComputeErrorUniform(lineColors[i], pixels[px]) : ComputeErrorWeighted(lineColors[i], preWeightedPixels[px], options);
+
+                ParallelMath::Int16CompFlag firstIsBetter = ParallelMath::FloatFlagToInt16(ParallelMath::LessOrEqual(lineErrors[0], lineErrors[1]));
+                bestLineSelector[px] = ParallelMath::Select(firstIsBetter, ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(3));
+                bestLineError[px] = ParallelMath::Min(lineErrors[0], lineErrors[1]);
+
+                ParallelMath::ConditionalSet(bestLineError[px], isTransparentF[px], ParallelMath::MakeFloatZero());
+            }
+
+            // One case considered here was if it was possible to force H mode to be valid when the line color is unused.
+            // That case isn't actually useful because it's equivalent to the isolated color being unused at maximum offset,
+            // which is always checked after a swap.
+            MFloat tModeError = ParallelMath::MakeFloatZero();
+            MFloat hModeError = ParallelMath::MakeFloatZero();
+            for (int px = 0; px < 16; px++)
+            {
+                tModeError = tModeError + ParallelMath::Min(bestLineError[px], isolatedError[px]);
+                hModeError = hModeError + ParallelMath::Min(bestLineError[px], hModeErrors[px]);
+            }
+
+            ParallelMath::FloatCompFlag hLessError = ParallelMath::Less(hModeError, tModeError);
+
+            MUInt15 packedHModeColor1 = uniqueQuantizedColors[ci];
+
+            ParallelMath::Int16CompFlag hModeTableLowBitMustBeZero = ParallelMath::Less(packedHModeColor1, packedHModeColor2);
+
+            ParallelMath::Int16CompFlag hModeIsLegal = ParallelMath::Equal(hModeTableLowBitMustBeZero, tableLowBitIsZero);
+            ParallelMath::Int16CompFlag useHMode = ParallelMath::FloatFlagToInt16(hLessError) & hModeIsLegal;
+
+            MFloat roundBestError = tModeError;
+            ParallelMath::ConditionalSet(roundBestError, ParallelMath::Int16FlagToFloat(useHMode), hModeError);
+
+            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(roundBestError, bestError));
+            ParallelMath::FloatCompFlag useHModeF = ParallelMath::Int16FlagToFloat(useHMode);
+
+            if (ParallelMath::AnySet(errorBetter))
+            {
+                MSInt32 selectors = ParallelMath::MakeSInt32(0);
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 selector = bestLineSelector[px];
+
+                    MFloat isolatedPixelError = ParallelMath::Select(useHModeF, hModeErrors[px], isolatedError[px]);
+                    ParallelMath::Int16CompFlag isolatedBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(isolatedPixelError, bestLineError[px]));
+
+                    ParallelMath::ConditionalSet(selector, isolatedBetter, ParallelMath::MakeUInt15(0));
+                    ParallelMath::ConditionalSet(selector, isTransparent[px], ParallelMath::MakeUInt15(2));
+                    selectors = selectors | (ParallelMath::ToInt32(selector) << (px * 2));
+                }
+
+                bestError = ParallelMath::Min(bestError, roundBestError);
+                ParallelMath::ConditionalSet(bestLineColor, errorBetter, uniqueQuantizedColors[ci]);
+                ParallelMath::ConditionalSet(bestSelectors, errorBetter, selectors);
+                ParallelMath::ConditionalSet(bestTable, errorBetter, ParallelMath::MakeUInt15(table));
+                ParallelMath::ConditionalSet(bestIsHMode, errorBetter, useHMode);
+                ParallelMath::ConditionalSet(bestHModeColor2, errorBetter, packedHModeColor2);
+                
+                bestIsThisMode = bestIsThisMode | errorBetter;
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (ParallelMath::Extract(bestIsThisMode, block))
+        {
+            uint32_t lowBits = 0;
+            uint32_t highBits = 0;
+
+            uint16_t blockBestLineColor = ParallelMath::Extract(bestLineColor, block);
+            ParallelMath::ScalarUInt16 blockIsolatedAverageQuantized[3];
+
+            for (int ch = 0; ch < 3; ch++)
+                blockIsolatedAverageQuantized[ch] = ParallelMath::Extract(isolatedAverageQuantized[ch], block);
+
+            uint16_t blockBestTable = ParallelMath::Extract(bestTable, block);
+            int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+
+            ParallelMath::ScalarUInt16 lineColor[3];
+            for (int ch = 0; ch < 3; ch++)
+                lineColor[ch] = (blockBestLineColor >> (10 - (ch * 5))) & 15;
+
+            if (ParallelMath::Extract(bestIsHMode, block))
+            {
+                // T mode: C1, C2+M, Transparent, C2-M
+                // H mode: C1+M, C1-M, Transparent, C2-M
+                static const ParallelMath::ScalarUInt16 selectorRemapSector[4] = { 1, 0, 1, 0 };
+                static const ParallelMath::ScalarUInt16 selectorRemapSign[4] = { 1, 0, 0, 1 };
+
+                // Remap selectors
+                ParallelMath::ScalarUInt16 signBits = 0;
+                ParallelMath::ScalarUInt16 sectorBits = 0;
+                int32_t blockBestSelectors = ParallelMath::Extract(bestSelectors, block);
+                for (int px = 0; px < 16; px++)
+                {
+                    int32_t selector = (blockBestSelectors >> (px * 2)) & 3;
+                    sectorBits |= (selectorRemapSector[selector] << px);
+                    signBits |= (selectorRemapSign[selector] << px);
+                }
+
+                ParallelMath::ScalarUInt16 blockColors[2] = { blockBestLineColor, ParallelMath::Extract(bestHModeColor2, block) };
+
+                EmitHModeBlock(outputBuffer + block * 8, blockColors, sectorBits, signBits, blockBestTable, false);
+            }
+            else
+                EmitTModeBlock(outputBuffer + block * 8, lineColor, blockIsolatedAverageQuantized, blockBestSelectors, blockBestTable, false);
+        }
+    }
+}
+
+
+cvtt::ParallelMath::UInt15 cvtt::Internal::ETCComputer::DecodePlanarCoeff(const MUInt15 &coeff, int ch)
+{
+    // Widens a packed planar coefficient: shift it up and OR its own high bits back into the vacated low bits.
+    if (ch != 1)
+        return ParallelMath::RightShift(coeff, 4) | (coeff << 2);
+    return ParallelMath::RightShift(coeff, 6) | (coeff << 1);
+}
+
+void cvtt::Internal::ETCComputer::EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options)
+{
+    // Planar-mode candidate: least-squares fits one plane (O, H, V) per channel, quantizes it, and, for every parallel block
+    // whose total error is lower than bestError, writes 8 bytes at outputBuffer + block * 8 and lowers bestError to match.
+    // NOTE: If it's desired to do this in another color space, the best way to do it would probably be to do everything in that color space and then transform it back to RGB.
+    // We compute H = (H-O)/4 and V= (V-O)/4 to simplify the math
+    // Per-pixel squared error being minimized, for pixel position (x, y) and pixel value C:
+    // error = (x*H + y*V + O - C)^2
+    MFloat h[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+    MFloat v[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+    MFloat o[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() };
+
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+    bool isUniform = ((options.flags & cvtt::Flags::Uniform) != 0);
+
+    MFloat totalError = ParallelMath::MakeFloatZero();
+    MUInt15 bestCoeffs[3][3];	// [Channel][Coeff], coefficient order is O, H, V
+    for (int ch = 0; ch < 3; ch++)
+    {
+        float fhh = 0.f; // These six sums depend only on pixel positions, so they are plain scalars
+        float fho = 0.f;
+        float fhv = 0.f;
+        float foo = 0.f;
+        float fov = 0.f;
+        float fvv = 0.f;
+        MFloat fc = ParallelMath::MakeFloatZero(); // Sum of c*c; only the commented-out error formula below refers to it
+        MFloat fh = ParallelMath::MakeFloatZero();
+        MFloat fv = ParallelMath::MakeFloatZero();
+        MFloat fo = ParallelMath::MakeFloatZero();
+        // The mirrored names below alias the same storage, so each cross term is accumulated twice (once per ordering)
+        float &foh = fho;
+        float &fvh = fhv;
+        float &fvo = fov;
+
+        for (int px = 0; px < 16; px++)
+        {
+            float x = static_cast<float>(px % 4);
+            float y = static_cast<float>(px / 4);
+            MFloat c = isFakeBT709 ? preWeightedPixels[px][ch] : ParallelMath::ToFloat(pixels[px][ch]); // Fit runs on pre-weighted values when the fake BT.709 flag is set
+
+            // (x*H + y*V + O - C)^2
+            fhh += x * x;
+            fhv += x * y;
+            fho += x;
+            fh = fh - c * x;
+
+            fvh += y * x;
+            fvv += y * y;
+            fvo += y;
+            fv = fv - c * y;
+
+            foh += x;
+            fov += y;
+            foo += 1;
+            fo = fo - c;
+            // Second subtraction: the linear terms of the expanded square carry a factor of -2
+            fh = fh - c * x;
+            fv = fv - c * y;
+            fo = fo - c;
+            fc = fc + c * c;
+        }
+
+        //float totalError = fhh * h * h + fho * h*o + fhv * h*v + foo * o * o + fov * o*v + fvv * v * v + fh * h + fv * v + fo * o + fc;
+
+        // error = fhh*h^2 + fho*h*o + fhv*h*v + foo*o^2 + fov*o*v + fvv*v^2 + fh*h + fv*v + fo*o + fc
+        // derror/dh = 2*fhh*h + fho*o + fhv*v + fh
+        // derror/dv = fhv*h + fov*o + 2*fvv*v + fv
+        // derror/do = fho*h + 2*foo*o + fov*v + fo
+
+        // Solve system of equations
+        // h o v 1 = 0
+        // -------
+        // d e f g  R0
+        // i j k l  R1
+        // m n p q  R2
+        // Row coefficients are scalar floats; only the constant column (gD, lD, qD) varies per parallel block
+        float d = 2.0f * fhh;
+        float e = fho;
+        float f = fhv;
+        MFloat gD = fh;
+
+        float i = fhv;
+        float j = fov;
+        float k = 2.0f * fvv;
+        MFloat lD = fv;
+
+        float m = fho;
+        float n = 2.0f * foo;
+        float p = fov;
+        MFloat qD = fo;
+
+        {
+            // Factor out first column from R1 and R2
+            float r0to1 = -i / d;
+            float r0to2 = -m / d;
+
+            // 0 j1 k1 l1D
+            float j1 = j + r0to1 * e;
+            float k1 = k + r0to1 * f;
+            MFloat l1D = lD + gD * r0to1;
+
+            // 0 n1 p1 q1D
+            float n1 = n + r0to2 * e;
+            float p1 = p + r0to2 * f;
+            MFloat q1D = qD + gD * r0to2;
+
+            // Factor out third column from R2
+            float r1to2 = -p1 / k1;
+
+            // 0 n2 0 q2D
+            float n2 = n1 + r1to2 * j1;
+            MFloat q2D = q1D + l1D * r1to2;
+
+            o[ch] = -q2D / n2;
+
+            // Factor out second column from R1
+            // 0 n2 0 q2D
+
+            float r2to1 = -j1 / n2;
+
+            // 0 0 k1 l2D
+            // 0 n2 0 q2D
+            MFloat l2D = l1D + q2D * r2to1;
+
+            float elim2 = -f / k1;
+            float elim1 = -e / n2;
+
+            // d 0 0 g2D
+            MFloat g2D = gD + l2D * elim2 + q2D * elim1;
+
+            // n2*o + q2 = 0
+            // o = -q2 / n2
+            h[ch] = -g2D / d;
+            v[ch] = -l2D / k1;
+        }
+
+        // Undo the local transformation
+        h[ch] = h[ch] * 4.0f + o[ch];
+        v[ch] = v[ch] * 4.0f + o[ch];
+    }
+
+    if (isFakeBT709)
+    {
+        MFloat oRGB[3];
+        MFloat hRGB[3];
+        MFloat vRGB[3];
+
+        ConvertFromFakeBT709(oRGB, o);
+        ConvertFromFakeBT709(hRGB, h);
+        ConvertFromFakeBT709(vRGB, v);
+
+        // Twiddling in fake BT.709 is a mess, just round off for now (the precision is pretty good anyway)
+        {
+            ParallelMath::RoundTowardNearestForScope rtn;
+
+            for (int ch = 0; ch < 3; ch++)
+            {
+                MFloat fcoeffs[3] = { oRGB[ch], hRGB[ch], vRGB[ch] };
+
+                for (int c = 0; c < 3; c++)
+                {
+                    MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
+                    if (ch == 1)
+                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f)); // Channel 1 is rescaled and clamped to 0..127
+                    else
+                        coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f)); // Channels 0 and 2 are rescaled and clamped to 0..63
+                    fcoeffs[c] = coeff;
+                }
+
+                for (int c = 0; c < 3; c++)
+                    bestCoeffs[ch][c] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rtn);
+            }
+        }
+
+        MUInt15 reconstructed[16][3]; // Pixels rebuilt from the quantized coefficients, used only to measure the error
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MUInt15 dO = DecodePlanarCoeff(bestCoeffs[ch][0], ch);
+            MUInt15 dH = DecodePlanarCoeff(bestCoeffs[ch][1], ch);
+            MUInt15 dV = DecodePlanarCoeff(bestCoeffs[ch][2], ch);
+
+            MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+            MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+            MFloat error = ParallelMath::MakeFloatZero(); // NOTE(review): never read in this branch; the error is summed after the loop instead
+
+            MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2; // 4*O plus 2, so the shift right by 2 below rounds
+
+            for (int px = 0; px < 16; px++)
+            {
+                MUInt15 pxv = ParallelMath::MakeUInt15(px);
+                MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
+                MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
+
+                MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
+                MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
+                reconstructed[px][ch] = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow); // Clamp to 0..255
+            }
+        }
+
+        totalError = ParallelMath::MakeFloatZero();
+        for (int px = 0; px < 16; px++)
+            totalError = totalError + ComputeErrorFakeBT709(reconstructed[px], preWeightedPixels[px]);
+    }
+    else
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            MFloat fcoeffs[3] = { o[ch], h[ch], v[ch] };
+            MUInt15 coeffRanges[3][2]; // [Coeff][0] is the rounded-down candidate, [Coeff][1] the rounded-up candidate
+
+            for (int c = 0; c < 3; c++)
+            {
+                MFloat coeff = ParallelMath::Max(ParallelMath::MakeFloatZero(), fcoeffs[c]);
+                if (ch == 1)
+                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(127.0f), coeff * (127.0f / 255.0f));
+                else
+                    coeff = ParallelMath::Min(ParallelMath::MakeFloat(63.0f), coeff * (63.0f / 255.0f));
+                fcoeffs[c] = coeff;
+            }
+
+            {
+                ParallelMath::RoundDownForScope rd;
+                for (int c = 0; c < 3; c++)
+                    coeffRanges[c][0] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &rd);
+            }
+
+            {
+                ParallelMath::RoundUpForScope ru;
+                for (int c = 0; c < 3; c++)
+                    coeffRanges[c][1] = ParallelMath::RoundAndConvertToU15(fcoeffs[c], &ru);
+            }
+            // Try all 2*2*2 down/up combinations of (O, H, V) and keep the one with the lowest squared error for this channel
+            MFloat bestChannelError = ParallelMath::MakeFloat(FLT_MAX);
+            for (int io = 0; io < 2; io++)
+            {
+                MUInt15 dO = DecodePlanarCoeff(coeffRanges[0][io], ch);
+
+                for (int ih = 0; ih < 2; ih++)
+                {
+                    MUInt15 dH = DecodePlanarCoeff(coeffRanges[1][ih], ch);
+                    MSInt16 hMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dH) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+                    for (int iv = 0; iv < 2; iv++)
+                    {
+                        MUInt15 dV = DecodePlanarCoeff(coeffRanges[2][iv], ch);
+                        MSInt16 vMinusO = ParallelMath::LosslessCast<MSInt16>::Cast(dV) - ParallelMath::LosslessCast<MSInt16>::Cast(dO);
+
+                        MFloat error = ParallelMath::MakeFloatZero();
+
+                        MSInt16 addend = ParallelMath::LosslessCast<MSInt16>::Cast(dO << 2) + 2; // 4*O plus 2, so the shift right by 2 below rounds
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 pxv = ParallelMath::MakeUInt15(px);
+                            MSInt16 x = ParallelMath::LosslessCast<MSInt16>::Cast(pxv & ParallelMath::MakeUInt15(3));
+                            MSInt16 y = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RightShift(pxv, 2));
+
+                            MSInt16 interpolated = ParallelMath::RightShift(ParallelMath::CompactMultiply(x, hMinusO) + ParallelMath::CompactMultiply(y, vMinusO) + addend, 2);
+                            MUInt15 clampedLow = ParallelMath::ToUInt15(ParallelMath::Max(ParallelMath::MakeSInt16(0), interpolated));
+                            MUInt15 dec = ParallelMath::Min(ParallelMath::MakeUInt15(255), clampedLow);
+
+                            MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(dec);
+
+                            MFloat deltaF = ParallelMath::ToFloat(delta);
+                            error = error + deltaF * deltaF;
+                        }
+
+                        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestChannelError));
+                        if (ParallelMath::AnySet(errorBetter))
+                        {
+                            bestChannelError = ParallelMath::Min(error, bestChannelError);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][0], errorBetter, coeffRanges[0][io]);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][1], errorBetter, coeffRanges[1][ih]);
+                            ParallelMath::ConditionalSet(bestCoeffs[ch][2], errorBetter, coeffRanges[2][iv]);
+                        }
+                    }
+                }
+            }
+            // The channel error was measured on raw pixel values; scale it by the squared channel weight unless Uniform is set
+            if (!isUniform)
+            {
+                switch (ch)
+                {
+                case 0:
+                    bestChannelError = bestChannelError * (options.redWeight * options.redWeight);
+                    break;
+                case 1:
+                    bestChannelError = bestChannelError * (options.greenWeight * options.greenWeight);
+                    break;
+                case 2:
+                    bestChannelError = bestChannelError * (options.blueWeight * options.blueWeight);
+                    break;
+                default:
+                    break;
+                }
+            }
+
+            totalError = totalError + bestChannelError;
+        }
+    }
+    // Only parallel blocks where this candidate strictly beats the incoming bestError are written out
+    ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(totalError, bestError));
+    if (ParallelMath::AnySet(errorBetter))
+    {
+        bestError = ParallelMath::Min(bestError, totalError);
+
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            if (!ParallelMath::Extract(errorBetter, block))
+                continue;
+
+            int ro = ParallelMath::Extract(bestCoeffs[0][0], block);
+            int rh = ParallelMath::Extract(bestCoeffs[0][1], block);
+            int rv = ParallelMath::Extract(bestCoeffs[0][2], block);
+
+            int go = ParallelMath::Extract(bestCoeffs[1][0], block);
+            int gh = ParallelMath::Extract(bestCoeffs[1][1], block);
+            int gv = ParallelMath::Extract(bestCoeffs[1][2], block);
+
+            int bo = ParallelMath::Extract(bestCoeffs[2][0], block);
+            int bh = ParallelMath::Extract(bestCoeffs[2][1], block);
+            int bv = ParallelMath::Extract(bestCoeffs[2][2], block);
+            // GO, BO and RH are stored split into pieces; the gaps between the pieces hold the padding bits set below
+            int go1 = go >> 6;
+            int go2 = go & 63;
+
+            int bo1 = bo >> 5;
+            int bo2 = (bo >> 3) & 3;
+            int bo3 = bo & 7;
+
+            int rh1 = (rh >> 1);
+            int rh2 = rh & 1;
+            // NOTE(review): the "fake" values look like the base/delta fields a differential-mode reading of these bits would see - confirm against the ETC2 spec
+            int fakeR = ro >> 2;
+            int fakeDR = go1 | ((ro & 3) << 1);
+
+            int fakeG = (go2 >> 2);
+            int fakeDG = ((go2 & 3) << 1) | bo1;
+
+            int fakeB = bo2;
+            int fakeDB = bo3 >> 1;
+
+            uint32_t highBits = 0;
+            uint32_t lowBits = 0;
+
+            // Avoid overflowing R
+            if ((fakeDR & 4) != 0 && fakeR + fakeDR < 8)
+                highBits |= 1 << (63 - 32);
+
+            // Avoid overflowing G
+            if ((fakeDG & 4) != 0 && fakeG + fakeDG < 8)
+                highBits |= 1 << (55 - 32);
+
+            // Overflow B
+            if (fakeB + fakeDB < 4)
+            {
+                // Overflow low
+                highBits |= 1 << (42 - 32);
+            }
+            else
+            {
+                // Overflow high
+                highBits |= 7 << (45 - 32);
+            }
+            // Shift amounts are written as (bit position in the 64-bit block) - 32
+            highBits |= ro << (57 - 32);
+            highBits |= go1 << (56 - 32);
+            highBits |= go2 << (49 - 32);
+            highBits |= bo1 << (48 - 32);
+            highBits |= bo2 << (43 - 32);
+            highBits |= bo3 << (39 - 32);
+            highBits |= rh1 << (34 - 32);
+            highBits |= 1 << (33 - 32);
+            highBits |= rh2 << (32 - 32);
+
+            lowBits |= gh << 25;
+            lowBits |= bh << 19;
+            lowBits |= rv << 13;
+            lowBits |= gv << 6;
+            lowBits |= bv << 0;
+            // Emit highBits then lowBits, most significant byte first
+            for (int i = 0; i < 4; i++)
+                outputBuffer[block * 8 + i] = (highBits >> (24 - i * 8)) & 0xff;
+            for (int i = 0; i < 4; i++)
+                outputBuffer[block * 8 + i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha)
+{
+    ParallelMath::Int16CompFlag pixelIsTransparent[16];
+    ParallelMath::Int16CompFlag anyTransparent = ParallelMath::MakeBoolInt16(false);
+    ParallelMath::Int16CompFlag allTransparent = ParallelMath::MakeBoolInt16(true);
+    // Runs a series of candidate encoders over ParallelSize blocks; all of them share outputBuffer and the running bestError.
+    if (punchthroughAlpha)
+    {
+        const float fThreshold = std::max<float>(std::min<float>(1.0f, options.threshold), 0.0f) * 255.0f; // options.threshold clamped to [0, 1], then scaled by 255
+
+        // +1.0f is intentional, we want to take the next valid integer (even if it's 256) since everything else lower is transparent
+        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(std::floor(fThreshold + 1.0f)));
+
+        for (int px = 0; px < 16; px++)
+        {
+            MUInt15 alpha;
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                ParallelMath::PutUInt15(alpha, block, pixelBlocks[block].m_pixels[px][3]); // Component 3 of each pixel is read as alpha
+
+            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(alpha, threshold);
+            anyTransparent = (anyTransparent | isTransparent);
+            allTransparent = (allTransparent & isTransparent);
+            pixelIsTransparent[px] = isTransparent;
+        }
+    }
+    else
+    {
+        for (int px = 0; px < 16; px++)
+            pixelIsTransparent[px] = ParallelMath::MakeBoolInt16(false);
+
+        allTransparent = anyTransparent = ParallelMath::MakeBoolInt16(false);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+
+    ETC2CompressionDataInternal* internalData = static_cast<ETC2CompressionDataInternal*>(compressionData); // Gives access to m_chromaSideAxis0/1, m_h and m_drs used below
+
+    MUInt15 pixels[16][3];
+    MFloat preWeightedPixels[16][3];
+    ExtractBlocks(pixels, preWeightedPixels, pixelBlocks, options);
+    // Zero the color of transparent pixels in both the integer and the pre-weighted representation
+    if (ParallelMath::AnySet(anyTransparent))
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::Int16CompFlag flag = pixelIsTransparent[px];
+            ParallelMath::FloatCompFlag fflag = ParallelMath::Int16FlagToFloat(flag);
+
+            for (int ch = 0; ch < 3; ch++)
+            {
+                ParallelMath::ConditionalSet(pixels[px][ch], flag, ParallelMath::MakeUInt15(0));
+                ParallelMath::ConditionalSet(preWeightedPixels[px][ch], fflag, ParallelMath::MakeFloat(0.0f));
+            }
+        }
+    }
+    // NOTE(review): presumably AllSet is true only when every parallel block is fully transparent - confirm against ParallelMath
+    if (!ParallelMath::AllSet(allTransparent))
+        EncodePlanar(outputBuffer, bestError, pixels, preWeightedPixels, options);
+
+    MFloat chromaDelta[16][2]; // Per-pixel 2D chroma offset from the centroid; in the default branches it is scaled by 16 rather than dividing the sum
+
+    MUInt15 numOpaque = ParallelMath::MakeUInt15(16);
+    for (int px = 0; px < 16; px++)
+        numOpaque = numOpaque - ParallelMath::SelectOrZero(pixelIsTransparent[px], ParallelMath::MakeUInt15(1));
+
+    if (options.flags & cvtt::Flags::Uniform)
+    {
+        MSInt16 chromaCoordinates3[16][2]; // [0] = ch0 - ch2, [1] = ch0 - 2*ch1 + ch2
+        for (int px = 0; px < 16; px++)
+        {
+            chromaCoordinates3[px][0] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
+            chromaCoordinates3[px][1] = ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][0]) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][1] << 1) + ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px][2]);
+        }
+
+        MSInt16 chromaCoordinateCentroid[2] = { ParallelMath::MakeSInt16(0), ParallelMath::MakeSInt16(0) }; // Holds the sum over all 16 pixels, not the mean
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 2; ch++)
+                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
+        }
+
+        if (punchthroughAlpha)
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                {
+                    MUInt15 chromaCoordinateMultiplied = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(chromaCoordinates3[px][ch], numOpaque)); // Scale the coordinate by numOpaque instead of dividing the sum
+                    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(chromaCoordinateMultiplied) - chromaCoordinateCentroid[ch];
+                    chromaDelta[px][ch] = ParallelMath::ToFloat(delta);
+                }
+            }
+        }
+        else
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                    chromaDelta[px][ch] = ParallelMath::ToFloat((chromaCoordinates3[px][ch] << 4) - chromaCoordinateCentroid[ch]); // << 4 multiplies by 16 to match the 16-pixel sum
+            }
+        }
+
+        const MFloat rcpSqrt3 = ParallelMath::MakeFloat(0.57735026918962576450914878050196f);
+
+        for (int px = 0; px < 16; px++)
+            chromaDelta[px][1] = chromaDelta[px][1] * rcpSqrt3;
+    }
+    else
+    {
+        const float chromaAxis0[3] = { internalData->m_chromaSideAxis0[0], internalData->m_chromaSideAxis0[1], internalData->m_chromaSideAxis0[2] };
+        const float chromaAxis1[3] = { internalData->m_chromaSideAxis1[0], internalData->m_chromaSideAxis1[1], internalData->m_chromaSideAxis1[2] };
+
+        MFloat chromaCoordinates3[16][2]; // Pre-weighted pixel projected onto the two chroma axes taken from the compression data
+        for (int px = 0; px < 16; px++)
+        {
+            const MFloat &px0 = preWeightedPixels[px][0];
+            const MFloat &px1 = preWeightedPixels[px][1];
+            const MFloat &px2 = preWeightedPixels[px][2];
+
+            chromaCoordinates3[px][0] = px0 * chromaAxis0[0] + px1 * chromaAxis0[1] + px2 * chromaAxis0[2];
+            chromaCoordinates3[px][1] = px0 * chromaAxis1[0] + px1 * chromaAxis1[1] + px2 * chromaAxis1[2];
+        }
+
+        MFloat chromaCoordinateCentroid[2] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() }; // Holds the sum over all 16 pixels, not the mean
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 2; ch++)
+                chromaCoordinateCentroid[ch] = chromaCoordinateCentroid[ch] + chromaCoordinates3[px][ch];
+        }
+
+        if (punchthroughAlpha)
+        {
+            const MFloat numOpaqueF = ParallelMath::ToFloat(numOpaque);
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                {
+                    MFloat chromaCoordinateMultiplied = chromaCoordinates3[px][ch] * numOpaqueF;
+                    MFloat delta = chromaCoordinateMultiplied - chromaCoordinateCentroid[ch];
+                    chromaDelta[px][ch] = delta;
+                }
+            }
+        }
+        else
+        {
+            for (int px = 0; px < 16; px++)
+            {
+                for (int ch = 0; ch < 2; ch++)
+                    chromaDelta[px][ch] = chromaCoordinates3[px][ch] * 16.0f - chromaCoordinateCentroid[ch];
+            }
+        }
+    }
+
+    // Accumulate the 2x2 scatter matrix of the chroma deltas
+    MFloat covXX = ParallelMath::MakeFloatZero();
+    MFloat covYY = ParallelMath::MakeFloatZero();
+    MFloat covXY = ParallelMath::MakeFloatZero();
+
+    for (int px = 0; px < 16; px++)
+    {
+        MFloat nx = chromaDelta[px][0];
+        MFloat ny = chromaDelta[px][1];
+
+        covXX = covXX + nx * nx;
+        covYY = covYY + ny * ny;
+        covXY = covXY + nx * ny;
+    }
+
+    MFloat halfTrace = (covXX + covYY) * 0.5f;
+    MFloat det = covXX * covYY - covXY * covXY;
+
+    MFloat mm = ParallelMath::Sqrt(ParallelMath::Max(ParallelMath::MakeFloatZero(), halfTrace * halfTrace - det)); // Clamped at zero before the square root
+
+    MFloat ev = halfTrace + mm; // Larger root of the matrix's characteristic polynomial
+
+    MFloat dx = (covYY - ev + covXY);
+    MFloat dy = -(covXX - ev + covXY);
+
+    // If evenly distributed, pick an arbitrary plane
+    ParallelMath::FloatCompFlag allZero = ParallelMath::Equal(dx, ParallelMath::MakeFloatZero()) & ParallelMath::Equal(dy, ParallelMath::MakeFloatZero());
+    ParallelMath::ConditionalSet(dx, allZero, ParallelMath::MakeFloat(1.f));
+
+    ParallelMath::Int16CompFlag sectorAssignments[16]; // Set for pixels whose chroma delta has a negative dot product with (dx, dy)
+    for (int px = 0; px < 16; px++)
+        sectorAssignments[px] = ParallelMath::FloatFlagToInt16(ParallelMath::Less(chromaDelta[px][0] * dx + chromaDelta[px][1] * dy, ParallelMath::MakeFloatZero()));
+
+    if (!ParallelMath::AllSet(allTransparent))
+    {
+        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
+
+        // Flip sector assignments
+        for (int px = 0; px < 16; px++)
+            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+
+        EncodeTMode(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, options);
+
+        EncodeHMode(outputBuffer, bestError, sectorAssignments, pixels, internalData->m_h, preWeightedPixels, options);
+
+        CompressETC1BlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, internalData->m_drs, options, true);
+    }
+
+    if (ParallelMath::AnySet(anyTransparent))
+    {
+        if (!ParallelMath::AllSet(allTransparent)) // Same condition as the branch above, so this undoes the flip made there
+        {
+            // Flip sector assignments
+            for (int px = 0; px < 16; px++)
+                sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+        }
+
+        // Reset the error of any transparent blocks to max and retry with punchthrough modes
+        ParallelMath::ConditionalSet(bestError, ParallelMath::Int16FlagToFloat(anyTransparent), ParallelMath::MakeFloat(FLT_MAX));
+
+        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
+
+        // Flip sector assignments
+        for (int px = 0; px < 16; px++)
+            sectorAssignments[px] = ParallelMath::Not(sectorAssignments[px]);
+
+        EncodeVirtualTModePunchthrough(outputBuffer, bestError, sectorAssignments, pixels, preWeightedPixels, pixelIsTransparent, anyTransparent, allTransparent, options);
+
+        CompressETC1PunchthroughBlockInternal(bestError, outputBuffer, pixels, preWeightedPixels, pixelIsTransparent, static_cast<ETC2CompressionDataInternal*>(compressionData)->m_drs, options); // Same object as internalData
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *pixelBlocks, const Options &options)
+{
+    // Gather component 3 of every pixel from each parallel block into per-pixel lanes, then encode with is11Bit and isSigned off.
+    MUInt15 alphaLanes[16];
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        const PixelBlockU8 &sourceBlock = pixelBlocks[block];
+        for (int px = 0; px < 16; px++)
+            ParallelMath::PutUInt15(alphaLanes[px], block, sourceBlock.m_pixels[px][3]);
+    }
+    CompressETC2AlphaBlockInternal(outputBuffer, alphaLanes, false, false, options);
+}
+
+void cvtt::Internal::ETCComputer::CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options)
+{   // Tries every modifier table with several offset ranges and two multipliers each; writes the best 8-byte block per lane.
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(is11Bit ? 2047 : 255);
+    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
+    // Per-lane minimum and maximum over the 16 input values
+    for (int px = 0; px < 16; px++)
+    {
+        minAlpha = ParallelMath::Min(minAlpha, pixels[px]);
+        maxAlpha = ParallelMath::Max(maxAlpha, pixels[px]);
+    }
+
+    MUInt15 alphaSpan = maxAlpha - minAlpha;
+    MUInt15 alphaSpanMidpointTimes2 = maxAlpha + minAlpha;
+
+    MUInt31 bestTotalError = ParallelMath::MakeUInt31(0x7fffffff);
+    MUInt15 bestTableIndex = ParallelMath::MakeUInt15(0);
+    MUInt15 bestBaseCodeword = ParallelMath::MakeUInt15(0);
+    MUInt15 bestMultiplier = ParallelMath::MakeUInt15(0);
+    MUInt15 bestIndexes[16];
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    const int numAlphaRanges = 10;
+    for (uint16_t tableIndex = 0; tableIndex < 16; tableIndex++)
+    {
+        for (int r = 0; r < numAlphaRanges; r++)
+        {
+            int subrange = r % 3;
+            int mainRange = r / 3;
+            // Offset extremes come from the positive modifier table; the negative side is formed as -x - 1
+            int16_t maxOffset = Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - (subrange & 1)];
+            int16_t minOffset = -Tables::ETC2::g_alphaModifierTablePositive[tableIndex][3 - mainRange - ((subrange >> 1) & 1)] - 1;
+            uint16_t offsetSpan = static_cast<uint16_t>(maxOffset - minOffset);
+
+            MSInt16 vminOffset = ParallelMath::MakeSInt16(minOffset);
+            MUInt15 vmaxOffset = ParallelMath::MakeUInt15(maxOffset);
+            MUInt15 voffsetSpan = ParallelMath::MakeUInt15(offsetSpan);
+            // Lower multiplier candidate per lane: the lane's alpha span divided by this range's offset span
+            MUInt15 minMultiplier = ParallelMath::MakeUInt15(0);
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                uint16_t singleAlphaSpan = ParallelMath::Extract(alphaSpan, block);
+
+                uint16_t lowMultiplier = singleAlphaSpan / offsetSpan;
+                ParallelMath::PutUInt15(minMultiplier, block, lowMultiplier);
+            }
+
+            if (is11Bit)
+            {
+                // Clamps this to valid multipliers under 15 and rounds down to nearest multiple of 8
+                minMultiplier = ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(112)) & ParallelMath::MakeUInt15(120);
+            }
+            else
+            {
+                // We cap at 1 and 14 so both multipliers are valid and dividable
+                // Cases where offset span is 0 should be caught by multiplier 1 of table 13
+                minMultiplier = ParallelMath::Max(ParallelMath::Min(minMultiplier, ParallelMath::MakeUInt15(14)), ParallelMath::MakeUInt15(1));
+            }
+            // Evaluate the lower candidate and the next multiplier step above it
+            for (uint16_t multiplierOffset = 0; multiplierOffset < 2; multiplierOffset++)
+            {
+                MUInt15 multiplier = minMultiplier;
+
+                if (is11Bit)
+                {
+                    if (multiplierOffset == 1)
+                        multiplier = multiplier + ParallelMath::MakeUInt15(8);
+                    else
+                        multiplier = ParallelMath::Max(multiplier, ParallelMath::MakeUInt15(1));
+                }
+                else
+                {
+                    if (multiplierOffset == 1)
+                        multiplier = multiplier + ParallelMath::MakeUInt15(1);
+                }
+
+                MSInt16 multipliedMinOffset = ParallelMath::CompactMultiply(ParallelMath::LosslessCast<MSInt16>::Cast(multiplier), vminOffset);
+                MUInt15 multipliedMaxOffset = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(multiplier, vmaxOffset));
+
+                // codeword = (maxOffset + minOffset + minAlpha + maxAlpha) / 2
+                MSInt16 unclampedBaseAlphaTimes2 = ParallelMath::LosslessCast<MSInt16>::Cast(alphaSpanMidpointTimes2) - ParallelMath::LosslessCast<MSInt16>::Cast(multipliedMaxOffset) - multipliedMinOffset;
+
+                MUInt15 baseAlpha;
+                if (is11Bit)
+                {
+                    // In unsigned, 4 is added to the unquantized alpha, so compensating for that cancels the 4 we have to add to do rounding.
+                    if (isSigned)
+                        unclampedBaseAlphaTimes2 = unclampedBaseAlphaTimes2 + ParallelMath::MakeSInt16(8);
+
+                    // -128 is illegal for some reason
+                    MSInt16 minBaseAlphaTimes2 = isSigned ? ParallelMath::MakeSInt16(16) : ParallelMath::MakeSInt16(0);
+
+                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, minBaseAlphaTimes2)), ParallelMath::MakeUInt15(4095));
+                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2, 1) & ParallelMath::MakeUInt15(2040);
+
+                    if (!isSigned)
+                        baseAlpha = baseAlpha + ParallelMath::MakeUInt15(4);
+                }
+                else
+                {
+                    MUInt15 clampedBaseAlphaTimes2 = ParallelMath::Min(ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(unclampedBaseAlphaTimes2, ParallelMath::MakeSInt16(0))), ParallelMath::MakeUInt15(510));
+                    baseAlpha = ParallelMath::RightShift(clampedBaseAlphaTimes2 + ParallelMath::MakeUInt15(1), 1);
+                }
+                // Quantize all 16 values with this candidate and accumulate the squared error
+                MUInt15 indexes[16];
+                MUInt31 totalError = ParallelMath::MakeUInt31(0);
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 quantizedValues;
+                    QuantizeETC2Alpha(tableIndex, pixels[px], baseAlpha, multiplier, is11Bit, isSigned, indexes[px], quantizedValues);
+
+                    if (is11Bit)
+                    {
+                        MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(quantizedValues) - ParallelMath::LosslessCast<MSInt16>::Cast(pixels[px]);
+                        MSInt32 deltaSq = ParallelMath::XMultiply(delta, delta);
+                        totalError = totalError + ParallelMath::LosslessCast<MUInt31>::Cast(deltaSq);
+                    }
+                    else
+                        totalError = totalError + ParallelMath::ToUInt31(ParallelMath::SqDiffUInt8(quantizedValues, pixels[px]));
+                }
+
+                ParallelMath::Int16CompFlag isBetter = ParallelMath::Int32FlagToInt16(ParallelMath::Less(totalError, bestTotalError));
+                if (ParallelMath::AnySet(isBetter))
+                {
+                    ParallelMath::ConditionalSet(bestTotalError, isBetter, totalError);
+                    ParallelMath::ConditionalSet(bestTableIndex, isBetter, ParallelMath::MakeUInt15(tableIndex));
+                    ParallelMath::ConditionalSet(bestBaseCodeword, isBetter, baseAlpha);
+                    ParallelMath::ConditionalSet(bestMultiplier, isBetter, multiplier);
+
+                    for (int px = 0; px < 16; px++)
+                        ParallelMath::ConditionalSet(bestIndexes[px], isBetter, indexes[px]);
+                }
+
+                // TODO: Do one refine pass
+            }
+        }
+    }
+    // The 11-bit search works on values scaled by 8 (base is a multiple of 8, +4 if unsigned); drop the low 3 bits before storing.
+    if (is11Bit)
+    {
+        bestMultiplier = ParallelMath::RightShift(bestMultiplier, 3);
+        bestBaseCodeword = ParallelMath::RightShift(bestBaseCodeword, 3);
+        if (isSigned)
+            bestBaseCodeword = bestBaseCodeword ^ ParallelMath::MakeUInt15(0x80);
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        uint8_t *output = outputBuffer + block * 8;
+
+        output[0] = static_cast<uint8_t>(ParallelMath::Extract(bestBaseCodeword, block));
+
+        ParallelMath::ScalarUInt16 multiplier = ParallelMath::Extract(bestMultiplier, block);
+        ParallelMath::ScalarUInt16 tableIndex = ParallelMath::Extract(bestTableIndex, block);
+
+        output[1] = static_cast<uint8_t>((multiplier << 4) | tableIndex);
+
+        static const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+        ParallelMath::ScalarUInt16 indexes[16];
+        for (int px = 0; px < 16; px++)
+            indexes[pixelSelectorOrder[px]] = ParallelMath::Extract(bestIndexes[px], block);
+        // Pack the sixteen 3-bit indexes, first index in the highest bits, into output[2..7]
+        int outputOffset = 2;
+        int outputBits = 0;
+        int numOutputBits = 0;
+        for (int s = 0; s < 16; s++)
+        {
+            outputBits = (outputBits << 3) | indexes[s];
+            numOutputBits += 3;
+
+            if (numOutputBits >= 8)
+            {
+                output[outputOffset++] = static_cast<uint8_t>(outputBits >> (numOutputBits - 8));
+                numOutputBits -= 8;
+
+                outputBits &= ((1 << numOutputBits) - 1);
+            }
+        }
+
+        assert(outputOffset == 8 && numOutputBits == 0);
+    }
+}
+
+void cvtt::Internal::ETCComputer::CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options)
+{
+    // Compresses 11-bit single-channel blocks: every input value is clamped and shifted into the
+    // range expected by CompressETC2AlphaBlockInternal, which then does the actual encoding.
+    //
+    // The range is shifted slightly so the unquantized base value can be kept in a UInt15:
+    // signed input ends up in 1..2047 (after adding 1024) and unsigned input in 0..2047.
+    const MSInt16 upperLimit = ParallelMath::MakeSInt16(isSigned ? 1023 : 2047);
+    const MSInt16 bias = ParallelMath::MakeSInt16(isSigned ? 1024 : 0);
+    const MSInt16 lowerLimit = ParallelMath::MakeSInt16(isSigned ? 1 : 0);
+
+    MUInt15 shiftedPixels[16];
+    for (int texel = 0; texel < 16; texel++)
+    {
+        // Gather this texel from every input block into one parallel value
+        MSInt16 gathered;
+        for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+            ParallelMath::PutSInt16(gathered, lane, inputBlocks[lane].m_pixels[texel]);
+
+        // Clamp from above, add the bias, then clamp from below
+        MSInt16 biased = ParallelMath::Min(gathered, upperLimit) + bias;
+        biased = ParallelMath::Max(lowerLimit, biased);
+
+        shiftedPixels[texel] = ParallelMath::LosslessCast<MUInt15>::Cast(biased);
+    }
+
+    CompressETC2AlphaBlockInternal(outputBuffer, shiftedPixels, true, isSigned, options);
+}
+
+void cvtt::Internal::ETCComputer::CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options)
+{
+    // Unpack the input blocks into parallel integer channels and their float counterparts
+    MUInt15 rgb[16][3];
+    MFloat weightedRGB[16][3];
+    ExtractBlocks(rgb, weightedRGB, inputBlocks, options);
+
+    // The running best error starts at FLT_MAX; the final argument (punchthrough) is false here
+    MFloat lowestError = ParallelMath::MakeFloat(FLT_MAX);
+    CompressETC1BlockInternal(lowestError, outputBuffer, rgb, weightedRGB, static_cast<ETC1CompressionDataInternal*>(compressionData)->m_drs, options, false);
+}
+
+void cvtt::Internal::ETCComputer::ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options)
+{
+    // Gathers the RGB channels of every input block into parallel values and fills in their float form
+    const bool useFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+    const bool useUniformWeights = ((options.flags & cvtt::Flags::Uniform) != 0);
+    for (int texel = 0; texel < 16; texel++)
+    {
+        for (int channel = 0; channel < 3; channel++)
+            for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+                ParallelMath::PutUInt15(pixels[texel][channel], lane, inputBlocks[lane].m_pixels[texel][channel]);
+
+        if (useFakeBT709)
+        {
+            ConvertToFakeBT709(preWeightedPixels[texel], pixels[texel]);
+            continue;
+        }
+        // Plain float conversion, scaled by the per-channel weights unless uniform weighting is requested
+        MFloat *weighted = preWeightedPixels[texel];
+        for (int channel = 0; channel < 3; channel++)
+            weighted[channel] = ParallelMath::ToFloat(pixels[texel][channel]);
+        if (!useUniformWeights)
+        {
+            weighted[0] = weighted[0] * options.redWeight;
+            weighted[1] = weighted[1] * options.greenWeight;
+            weighted[2] = weighted[2] * options.blueWeight;
+        }
+    }
+}
+
+void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
+{   // Quantizes each channel downward, then adds the per-channel round-up octant with the least FakeBT709 error.
+    for (int ch = 0; ch < 3; ch++)
+    {
+        const MUInt15& cu15 = sectorCumulative[ch];
+
+        if (isDifferential)
+        {
+            //quantized[ch] = (cu * 31 + (cu >> 3)) >> 11;
+            quantized[ch] = ParallelMath::ToUInt15(
+                ParallelMath::RightShift(
+                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
+                    , 11)
+            );
+        }
+        else
+        {
+            //quantized[ch] = (cu * 30 + (cu >> 3)) >> 12;
+            quantized[ch] = ParallelMath::ToUInt15(
+                ParallelMath::RightShift(
+                (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3))
+                    , 12)
+            );
+        }
+    }
+    // Candidate reconstructions per channel: the quantized value and the next step up (capped), both scaled by 8
+    MFloat lowOctantRGBFloat[3];
+    MFloat highOctantRGBFloat[3];
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MUInt15 unquantized;
+        MUInt15 unquantizedNext;
+        if (isDifferential)
+        {
+            unquantized = (quantized[ch] << 3) | ParallelMath::RightShift(quantized[ch], 2);
+            MUInt15 quantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(31), quantized[ch] + ParallelMath::MakeUInt15(1));
+            unquantizedNext = (quantizedNext << 3) | ParallelMath::RightShift(quantizedNext, 2);
+        }
+        else
+        {
+            unquantized = (quantized[ch] << 4) | quantized[ch];
+            unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
+        }
+        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantized << 3);
+        highOctantRGBFloat[ch] = ParallelMath::ToFloat(unquantizedNext << 3);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
+
+    MFloat cumulativeYUV[3];
+    ConvertToFakeBT709(cumulativeYUV, sectorCumulative);
+    // Bit n of the octant number selects the high (set) or low (clear) candidate for channel n
+    for (uint16_t octant = 0; octant < 8; octant++)
+    {
+        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
+        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
+        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
+
+        MFloat octantYUV[3];
+        ConvertToFakeBT709(octantYUV, r, g, b);
+
+        MFloat delta[3];
+        for (int ch = 0; ch < 3; ch++)
+            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
+        // Sum of squared component differences (the middle term was previously added instead of squared)
+        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
+        bestError = ParallelMath::Min(error, bestError);
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
+}
+
+void cvtt::Internal::ETCComputer::ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential)
+{
+    // sectorCumulative range is 0..2040 (11 bits)
+    // Quantizes each channel by truncation, then uses a lookup table to decide which channels round up.
+
+    MUInt15 rOffset;
+    MUInt15 gOffset;
+    MUInt15 bOffset;
+    MUInt15 quantizedBase[3];
+    MUInt15 upperBound;
+
+    MUInt15 sectorCumulativeFillIn[3];
+    for (int ch = 0; ch < 3; ch++)
+        sectorCumulativeFillIn[ch] = sectorCumulative[ch] + ParallelMath::RightShift(sectorCumulative[ch], 8);
+    // The 4 bits just below each channel's kept bits form a 12-bit table index: R in bits 11..8, G in 7..4, B in 3..0
+    if (isDifferential)
+    {
+        rOffset = (sectorCumulativeFillIn[0] << 6) & ParallelMath::MakeUInt15(0xf00);
+        gOffset = (sectorCumulativeFillIn[1] << 2) & ParallelMath::MakeUInt15(0x0f0);
+        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 2) & ParallelMath::MakeUInt15(0x00f);
+
+        for (int ch = 0; ch < 3; ch++)
+            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 6);
+
+        upperBound = ParallelMath::MakeUInt15(31);
+    }
+    else
+    {
+        rOffset = (sectorCumulativeFillIn[0] << 5) & ParallelMath::MakeUInt15(0xf00);
+        gOffset = (sectorCumulativeFillIn[1] << 1) & ParallelMath::MakeUInt15(0x0f0);
+        bOffset = ParallelMath::RightShift(sectorCumulativeFillIn[2], 3) & ParallelMath::MakeUInt15(0x00f);
+
+        for (int ch = 0; ch < 3; ch++)
+            quantizedBase[ch] = ParallelMath::RightShift(sectorCumulativeFillIn[ch], 7);
+
+        upperBound = ParallelMath::MakeUInt15(15);
+    }
+
+    MUInt15 lookupIndex = (rOffset | gOffset | bOffset);
+
+    MUInt15 octant;
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        ParallelMath::PutUInt15(octant, block, Tables::FakeBT709::g_rounding16[ParallelMath::Extract(lookupIndex, block)]);
+    // Bit n of the looked-up octant says whether channel n rounds up by one
+    quantizedBase[0] = quantizedBase[0] + (octant & ParallelMath::MakeUInt15(1));
+    quantizedBase[1] = quantizedBase[1] + (ParallelMath::RightShift(octant, 1) & ParallelMath::MakeUInt15(1));
+    quantizedBase[2] = quantizedBase[2] + (ParallelMath::RightShift(octant, 2) & ParallelMath::MakeUInt15(1));
+
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = ParallelMath::Min(quantizedBase[ch], upperBound);
+}
+
+void cvtt::Internal::ETCComputer::ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 targets[3], const MUInt15 &granularity)
+{   // Adds to each 4-bit channel of quantized the round-up bit (0 or 1) whose octant is closest to targets in FakeBT709.
+    MFloat lowOctantRGBFloat[3];
+    MFloat highOctantRGBFloat[3];
+
+    for (int ch = 0; ch < 3; ch++)
+    {
+        MUInt15 unquantized = (quantized[ch] << 4) | quantized[ch];
+        MUInt15 unquantizedNext = ParallelMath::Min(ParallelMath::MakeUInt15(255), unquantized + ParallelMath::MakeUInt15(17));
+        // Candidates are scaled by 2 * granularity before comparison against the targets
+        lowOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantized, granularity) << 1);
+        highOctantRGBFloat[ch] = ParallelMath::ToFloat(ParallelMath::CompactMultiply(unquantizedNext, granularity) << 1);
+    }
+
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestOctant = ParallelMath::MakeUInt15(0);
+
+    MFloat cumulativeYUV[3];
+    ConvertToFakeBT709(cumulativeYUV, ParallelMath::ToFloat(targets[0]), ParallelMath::ToFloat(targets[1]), ParallelMath::ToFloat(targets[2]));
+    // Bit n of the octant number selects the high (set) or low (clear) candidate for channel n
+    for (uint16_t octant = 0; octant < 8; octant++)
+    {
+        const MFloat &r = (octant & 1) ? highOctantRGBFloat[0] : lowOctantRGBFloat[0];
+        const MFloat &g = (octant & 2) ? highOctantRGBFloat[1] : lowOctantRGBFloat[1];
+        const MFloat &b = (octant & 4) ? highOctantRGBFloat[2] : lowOctantRGBFloat[2];
+
+        MFloat octantYUV[3];
+        ConvertToFakeBT709(octantYUV, r, g, b);
+
+        MFloat delta[3];
+        for (int ch = 0; ch < 3; ch++)
+            delta[ch] = octantYUV[ch] - cumulativeYUV[ch];
+        // Sum of squared component differences (the middle term was previously added instead of squared)
+        MFloat error = delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2];
+        ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestError));
+        ParallelMath::ConditionalSet(bestOctant, errorBetter, ParallelMath::MakeUInt15(octant));
+        bestError = ParallelMath::Min(error, bestError);
+    }
+
+    for (int ch = 0; ch < 3; ch++)
+        quantized[ch] = quantized[ch] + (ParallelMath::RightShift(bestOctant, ch) & ParallelMath::MakeUInt15(1));
+}
+
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3])
+{
+    // Integer overload: widen each channel to float, then defer to the three-argument overload
+    MFloat red = ParallelMath::ToFloat(color[0]);
+    MFloat green = ParallelMath::ToFloat(color[1]);
+    MFloat blue = ParallelMath::ToFloat(color[2]);
+    ConvertToFakeBT709(yuv, red, green, blue);
+}
+
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3])
+{   // Array overload: forwards the three channels unchanged
+    ConvertToFakeBT709(yuv, color[0], color[1], color[2]);
+}
+
+void cvtt::Internal::ETCComputer::ConvertToFakeBT709(MFloat yuv[3], const MFloat &pr, const MFloat &pg, const MFloat &pb)
+{
+    MFloat red = pr;
+    MFloat green = pg;
+    MFloat blue = pb;
+    // Inputs are read from the local copies above, so the result is unaffected if pr/pg/pb refer to elements of yuv
+    yuv[0] = red * 0.368233989135369f + green * 1.23876274963149f + blue * 0.125054068802017f;
+    yuv[1] = red * 0.5f - green * 0.4541529f - blue * 0.04584709f;
+    yuv[2] = red * -0.081014709086133f - green * 0.272538676238785f + blue * 0.353553390593274f;
+}
+
+void cvtt::Internal::ETCComputer::ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3])
+{
+    MFloat scaledLuma = yuv[0] * 0.57735026466774571071f;
+    MFloat chromaU = yuv[1];
+    MFloat chromaV = yuv[2];
+    // All three inputs are read before any output is written; luma is rescaled, chroma is used as-is
+    rgb[0] = scaledLuma + chromaU * 1.5748000207960953486f;
+    rgb[1] = scaledLuma - chromaU * 0.46812425854364753669f - chromaV * 0.26491652528157560861f;
+    rgb[2] = scaledLuma + chromaV * 2.6242146882856944069f;
+}
+
+
+void cvtt::Internal::ETCComputer::QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues)
+{
+    // Chooses, through the alpha rounding table for tableIndex, a selector for value relative to
+    // baseValue and multiplier. The selector goes to outIndexes and the clamped reconstructed value
+    // to outQuantizedValues.
+
+    MSInt16 delta = ParallelMath::LosslessCast<MSInt16>::Cast(value) - ParallelMath::LosslessCast<MSInt16>::Cast(baseValue);
+
+    // ETC2's offset tables all have a reflect about 0.5*multiplier, so work with 2*delta + multiplier
+    MSInt16 mirroredTimes2 = (delta + delta) + ParallelMath::LosslessCast<MSInt16>::Cast(multiplier);
+    MUInt15 magnitudeTimes2 = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Abs(mirroredTimes2));
+    MUInt15 magnitude = ParallelMath::RightShift(magnitudeTimes2, 1);
+
+    MUInt15 selectorMagnitude;
+    MUInt15 modifierMagnitude;
+    for (int lane = 0; lane < ParallelMath::ParallelSize; lane++)
+    {
+        // Scale down by the lane's multiplier and clamp to the last column of the rounding table
+        uint16_t column = ParallelMath::Extract(magnitude, lane) / ParallelMath::Extract(multiplier, lane);
+        if (column >= Tables::ETC2::g_alphaRoundingTableWidth)
+            column = Tables::ETC2::g_alphaRoundingTableWidth - 1;
+
+        uint16_t selector = Tables::ETC2::g_alphaRoundingTables[tableIndex][column];
+        ParallelMath::PutUInt15(selectorMagnitude, lane, selector);
+        ParallelMath::PutUInt15(modifierMagnitude, lane, Tables::ETC2::g_alphaModifierTablePositive[tableIndex][selector]);
+
+        // TODO: This is suboptimal when the offset is capped.  We should detect 0 and 255 values and always map them to the maximum offsets.
+        // Doing that will also affect refinement though.
+    }
+
+    // Presumably all-ones in lanes where the mirrored offset is negative (arithmetic shift); XOR then maps x to -x - 1
+    MSInt16 negativeMask = ParallelMath::RightShift(mirroredTimes2, 15);
+    MSInt16 signedModifier = ParallelMath::LosslessCast<MSInt16>::Cast(modifierMagnitude) ^ negativeMask;
+    MSInt16 scaledModifier = ParallelMath::CompactMultiply(signedModifier, multiplier);
+    MSInt16 reconstructed = ParallelMath::LosslessCast<MSInt16>::Cast(baseValue) + scaledModifier;
+
+    // Clamp range: 0..255 for 8-bit, 0..2047 for unsigned 11-bit, 1..2047 for signed 11-bit
+    const int16_t lowest = (is11Bit && isSigned) ? 1 : 0;
+    const uint16_t highest = is11Bit ? 2047 : 255;
+    outQuantizedValues = ParallelMath::Min(ParallelMath::MakeUInt15(highest), ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Max(ParallelMath::MakeSInt16(lowest), reconstructed)));
+
+    // Lanes with the mask clear get the table selector plus 4; lanes with it set keep the selector as-is
+    MUInt15 negativeAdjust = ParallelMath::LosslessCast<MUInt15>::Cast(negativeMask) & ParallelMath::MakeUInt15(4);
+    outIndexes = selectorMagnitude + ParallelMath::MakeUInt15(4) - negativeAdjust;
+}
+
+
+
+void cvtt::Internal::ETCComputer::EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque)
+{
+    // Serializes a single T-mode block into 8 bytes: a 32-bit word of colors, table bits and flags
+    // followed by a 32-bit word of selector bits, each written most-significant byte first.
+    //
+    // isolatedColor[0] is split into two 2-bit halves; every other channel is shifted in whole.
+    // packedSelectors carries one 2-bit selector per texel, texel n at bits 2n and 2n+1.
+    // NOTE(review): channel values are presumably expected to fit their fields; nothing here masks them.
+
+    static const int texelOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+    const int redHigh = ((isolatedColor[0] >> 2) & 3);
+    const int redLow = (isolatedColor[0] & 3);
+
+    // Padding bits: "overflow low" when the two red halves sum below 4, otherwise "overflow high"
+    uint32_t upperWord = (redHigh + redLow < 4) ? (1 << (58 - 32)) : (7 << (61 - 32));
+
+    // Field positions are written as (bit index within the 64-bit block) - 32
+    upperWord |= redHigh << (59 - 32);
+    upperWord |= redLow << (56 - 32);
+    upperWord |= isolatedColor[1] << (52 - 32);
+    upperWord |= isolatedColor[2] << (48 - 32);
+    upperWord |= lineColor[0] << (44 - 32);
+    upperWord |= lineColor[1] << (40 - 32);
+    upperWord |= lineColor[2] << (36 - 32);
+    upperWord |= ((table >> 1) & 3) << (34 - 32);
+    upperWord |= (opaque ? 1 : 0) << (33 - 32);
+    upperWord |= (table & 1) << (32 - 32);
+
+    // Selectors are visited in transposed 4x4 order; bit 0 of each selector lands in the low
+    // 16 bits of the word and bit 1 in the high 16 bits.
+    uint32_t lowerWord = 0;
+    for (int slot = 0; slot < 16; slot++)
+    {
+        const int selector = (packedSelectors >> (2 * texelOrder[slot])) & 3;
+        lowerWord |= (selector & 0x1) << slot;
+        lowerWord |= ((selector >> 1) & 0x1) << (16 + slot);
+    }
+
+    // Write both words most-significant byte first
+    for (int byteIndex = 0; byteIndex < 4; byteIndex++)
+    {
+        const int shift = 24 - byteIndex * 8;
+        outputBuffer[byteIndex] = (upperWord >> shift) & 0xff;
+        outputBuffer[byteIndex + 4] = (lowerWord >> shift) & 0xff;
+    }
+}
+
+void cvtt::Internal::ETCComputer::EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque)
+{
+    // Serializes a single H-mode block into 8 bytes: a 32-bit word of colors, table bits and flags
+    // followed by a 32-bit word of per-texel bits, each written most-significant byte first.
+    //
+    // blockColors[n] packs three channels at bit offsets 10, 5 and 0. sectorBits and signBits hold one
+    // bit per texel. table contributes three bits: bit 0 through the color ordering, bits 1 and 2 directly.
+
+    if (blockColors[0] == blockColors[1])
+    {
+        // Base colors are the same.
+        // If the table low bit isn't 1, then we can't encode this, because swapping the block colors will have no effect
+        // on their order.
+        // Instead, we encode this as T mode where all of the indexes are on the line.
+        ParallelMath::ScalarUInt16 sharedColor[3];
+        for (int ch = 0; ch < 3; ch++)
+            sharedColor[ch] = (blockColors[0] >> ((2 - ch) * 5)) & 0x1f;
+
+        // The low bit of every selector is preset (0x55555555); the sign bit becomes the selector's high bit
+        int32_t tModeSelectors = 0x55555555;
+        for (int texel = 0; texel < 16; texel++)
+            tModeSelectors |= ((signBits >> texel) & 1) << ((texel * 2) + 1);
+
+        EmitTModeBlock(outputBuffer, sharedColor, sharedColor, tModeSelectors, table, opaque);
+        return;
+    }
+
+    static const int texelOrder[] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+    // Only the low 4 bits of each channel are kept on this path
+    int16_t unpacked[2][3];
+    for (int sector = 0; sector < 2; sector++)
+        for (int ch = 0; ch < 3; ch++)
+            unpacked[sector][ch] = (blockColors[sector] >> ((2 - ch) * 5)) & 15;
+
+    // When the table's low bit disagrees with whether color 0 compares greater than color 1, the two
+    // colors trade places and every sector bit is inverted to match.
+    const bool tradePlaces = (((table & 1) == 1) != (blockColors[0] > blockColors[1]));
+    const int16_t *firstColor = unpacked[tradePlaces ? 1 : 0];
+    const int16_t *secondColor = unpacked[tradePlaces ? 0 : 1];
+    if (tradePlaces)
+        sectorBits ^= 0xffff;
+
+    // The first color's green and blue channels are each split into two pieces that are placed separately
+    const int red1 = firstColor[0];
+    const int green1High = firstColor[1] >> 1;
+    const int green1Low = (firstColor[1] & 1);
+    const int blue1High = firstColor[2] >> 3;
+    const int blue1Low = firstColor[2] & 7;
+    const int red2 = secondColor[0];
+    const int green2 = secondColor[1];
+    const int blue2 = secondColor[2];
+
+    uint32_t upperWord = 0;
+
+    // Avoid overflowing R
+    if ((green1High & 4) != 0 && red1 + green1High < 8)
+        upperWord |= 1 << (63 - 32);
+
+    // NOTE(review): presumably the values a differential-mode decoder would read as G and dG - confirm against the format spec
+    const int fakeDG = blue1Low >> 1;
+    const int fakeG = blue1High | (green1Low << 1);
+
+    // Padding bits: "overflow low" when the two sum below 4, otherwise "overflow high"
+    upperWord |= (fakeG + fakeDG < 4) ? (1 << (50 - 32)) : (7 << (53 - 32));
+
+    // Table bits 2 and 1 are stored directly
+    const int da = (table >> 2) & 1;
+    const int db = (table >> 1) & 1;
+
+    // Field positions are written as (bit index within the 64-bit block) - 32
+    upperWord |= red1 << (59 - 32);
+    upperWord |= green1High << (56 - 32);
+    upperWord |= green1Low << (52 - 32);
+    upperWord |= blue1High << (51 - 32);
+    upperWord |= blue1Low << (47 - 32);
+    upperWord |= red2 << (43 - 32);
+    upperWord |= green2 << (39 - 32);
+    upperWord |= blue2 << (35 - 32);
+    upperWord |= da << (34 - 32);
+    upperWord |= (opaque ? 1 : 0) << (33 - 32);
+    upperWord |= db << (32 - 32);
+
+    // Texels are visited in transposed 4x4 order; sign bits fill the low 16 bits of the word and
+    // sector bits the high 16 bits.
+    uint32_t lowerWord = 0;
+    for (int slot = 0; slot < 16; slot++)
+    {
+        const int sectorBit = (sectorBits >> texelOrder[slot]) & 1;
+        const int signBit = (signBits >> texelOrder[slot]) & 1;
+        lowerWord |= (signBit << slot);
+        lowerWord |= (sectorBit << (16 + slot));
+    }
+
+    // Write both words most-significant byte first
+    for (int byteIndex = 0; byteIndex < 4; byteIndex++)
+    {
+        const int shift = 24 - byteIndex * 8;
+        outputBuffer[byteIndex] = (upperWord >> shift) & 0xff;
+        outputBuffer[byteIndex + 4] = (lowerWord >> shift) & 0xff;
+    }
+}
+
+void cvtt::Internal::ETCComputer::EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent)
+{   // Packs one block into 8 bytes: the color/table/flag word first, then the selector word, each most-significant byte first.
+    uint32_t highBits = 0;
+    uint32_t lowBits = 0;
+    // blockBestD == 0 stores both colors in 4-bit-wide fields; otherwise color 0 plus a 3-bit wrapped difference per channel
+    if (blockBestD == 0)
+    {
+        highBits |= static_cast<uint32_t>(blockBestColors[0][0]) << 28;
+        highBits |= static_cast<uint32_t>(blockBestColors[1][0]) << 24;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][1]) << 20;
+        highBits |= static_cast<uint32_t>(blockBestColors[1][1]) << 16;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][2]) << 12;
+        highBits |= static_cast<uint32_t>(blockBestColors[1][2]) << 8;
+    }
+    else
+    {
+        highBits |= static_cast<uint32_t>(blockBestColors[0][0]) << 27;
+        highBits |= ((blockBestColors[1][0] - blockBestColors[0][0]) & 7) << 24;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][1]) << 19;
+        highBits |= ((blockBestColors[1][1] - blockBestColors[0][1]) & 7) << 16;
+        highBits |= static_cast<uint32_t>(blockBestColors[0][2]) << 11;
+        highBits |= ((blockBestColors[1][2] - blockBestColors[0][2]) & 7) << 8;
+    }
+    // The D bit is left clear when the block is flagged transparent
+    highBits |= (blockBestTables[0] << 5);
+    highBits |= (blockBestTables[1] << 2);
+    if (!transparent)
+        highBits |= (blockBestD << 1);
+    highBits |= blockBestFlip;
+
+    const uint8_t modifierCodes[4] = { 3, 2, 0, 1 };
+
+    // Stored selector codes, indexed by texel position within the block
+    uint8_t unpackedSelectorCodes[16];
+    for (int sector = 0; sector < 2; sector++)
+    {
+        int blockSectorBestSelectors = blockBestSelectors[sector];
+
+        for (int px = 0; px < 8; px++)
+        {
+            // Translate the texel's 2-bit selector to its stored code and scatter it through the flip table
+            int selector = (blockSectorBestSelectors >> (2 * px)) & 3;
+            unpackedSelectorCodes[g_flipTables[blockBestFlip][sector][px]] = modifierCodes[selector];
+        }
+    }
+
+    const int pixelSelectorOrder[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+    // Bit plane sb of every code fills 16 consecutive bits of the low word, texels taken in transposed 4x4 order
+    for (int sb = 0; sb < 2; sb++)
+        for (int px = 0; px < 16; px++)
+            lowBits |= ((unpackedSelectorCodes[pixelSelectorOrder[px]] >> sb) & 1) << (px + sb * 16);
+
+    for (int i = 0; i < 4; i++)
+        outputBuffer[i] = (highBits >> (24 - i * 8)) & 0xff;
+    for (int i = 0; i < 4; i++)
+        outputBuffer[i + 4] = (lowBits >> (24 - i * 8)) & 0xff;
+}
+
+void cvtt::Internal::ETCComputer::CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage &drs, const Options &options, bool punchthrough)
+{
+	int numTries = 0;
+
+    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
+    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
+
+    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
+    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
+    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
+    MUInt15 bestFlip = zeroU15;
+    MUInt15 bestD = zeroU15;
+
+    MUInt15 sectorPixels[2][2][8][3];
+    MFloat sectorPreWeightedPixels[2][2][8][3];
+    MUInt15 sectorCumulative[2][2][3];
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    for (int flip = 0; flip < 2; flip++)
+	{
+		for (int sector = 0; sector < 2; sector++)
+		{
+			for (int ch = 0; ch < 3; ch++)
+				sectorCumulative[flip][sector][ch] = zeroU15;
+
+			for (int px = 0; px < 8; px++)
+			{
+				for (int ch = 0; ch < 3; ch++)
+				{
+					MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
+					sectorPixels[flip][sector][px][ch] = pixelChannelValue;
+                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
+					sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
+				}
+			}
+		}
+	}
+
+	static const MSInt16 modifierTables[8][4] =
+	{
+		{ ParallelMath::MakeSInt16(-8), ParallelMath::MakeSInt16(-2), ParallelMath::MakeSInt16(2), ParallelMath::MakeSInt16(8) },
+		{ ParallelMath::MakeSInt16(-17), ParallelMath::MakeSInt16(-5), ParallelMath::MakeSInt16(5), ParallelMath::MakeSInt16(17) },
+		{ ParallelMath::MakeSInt16(-29), ParallelMath::MakeSInt16(-9), ParallelMath::MakeSInt16(9), ParallelMath::MakeSInt16(29) },
+		{ ParallelMath::MakeSInt16(-42), ParallelMath::MakeSInt16(-13), ParallelMath::MakeSInt16(13), ParallelMath::MakeSInt16(42) },
+		{ ParallelMath::MakeSInt16(-60), ParallelMath::MakeSInt16(-18), ParallelMath::MakeSInt16(18), ParallelMath::MakeSInt16(60) },
+		{ ParallelMath::MakeSInt16(-80), ParallelMath::MakeSInt16(-24), ParallelMath::MakeSInt16(24), ParallelMath::MakeSInt16(80) },
+		{ ParallelMath::MakeSInt16(-106), ParallelMath::MakeSInt16(-33), ParallelMath::MakeSInt16(33), ParallelMath::MakeSInt16(106) },
+		{ ParallelMath::MakeSInt16(-183), ParallelMath::MakeSInt16(-47), ParallelMath::MakeSInt16(47), ParallelMath::MakeSInt16(183) },
+	};
+
+    bool isFakeBT709 = ((options.flags & cvtt::Flags::ETC_UseFakeBT709) != 0);
+
+    int minD = punchthrough ? 1 : 0;
+
+	for (int flip = 0; flip < 2; flip++)
+	{
+		drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15;
+
+		MFloat bestIndError[2] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) };
+		MUInt16 bestIndSelectors[2] = { ParallelMath::MakeUInt16(0), ParallelMath::MakeUInt16(0) };
+		MUInt15 bestIndColors[2] = { zeroU15, zeroU15 };
+		MUInt15 bestIndTable[2] = { zeroU15, zeroU15 };
+
+		for (int d = minD; d < 2; d++)
+		{
+			for (int sector = 0; sector < 2; sector++)
+			{
+				const int16_t *potentialOffsets = cvtt::Tables::ETC1::g_potentialOffsets4;
+
+				for (int table = 0; table < 8; table++)
+				{
+					int16_t numOffsets = *potentialOffsets++;
+
+					MUInt15 possibleColors[cvtt::Tables::ETC1::g_maxPotentialOffsets];
+
+                    MUInt15 quantized[3];
+                    for (int oi = 0; oi < numOffsets; oi++)
+                    {
+                        if (!isFakeBT709)
+                        {
+						    for (int ch = 0; ch < 3; ch++)
+						    {
+                                // cu is in range 0..2040
+                                MUInt15 cu15 = ParallelMath::Min(
+                                    ParallelMath::MakeUInt15(2040),
+                                    ParallelMath::ToUInt15(
+                                        ParallelMath::Max(
+                                            ParallelMath::MakeSInt16(0),
+                                            ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
+                                        )
+                                    )
+                                );
+
+                                if (d == 1)
+                                {
+                                    //quantized[ch] = (cu * 31 + (cu >> 3) + 1024) >> 11;
+                                    quantized[ch] = ParallelMath::ToUInt15(
+                                        ParallelMath::RightShift(
+                                            (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(1024)
+                                            , 11)
+                                        );
+                                }
+                                else
+                                {
+                                    //quantized[ch] = (cu * 30 + (cu >> 3) + 2048) >> 12;
+                                    quantized[ch] = ParallelMath::ToUInt15(
+                                        ParallelMath::RightShift(
+                                        (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15 << 1) + ParallelMath::LosslessCast<MUInt16>::Cast(ParallelMath::RightShift(cu15, 3)) + ParallelMath::MakeUInt16(2048)
+                                            , 12)
+                                    );
+                                }
+						    }
+                        }
+                        else
+                        {
+                            MUInt15 offsetCumulative[3];
+						    for (int ch = 0; ch < 3; ch++)
+						    {
+                                // cu is in range 0..2040
+                                MUInt15 cu15 = ParallelMath::Min(
+                                    ParallelMath::MakeUInt15(2040),
+                                    ParallelMath::ToUInt15(
+                                        ParallelMath::Max(
+                                            ParallelMath::MakeSInt16(0),
+                                            ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + ParallelMath::MakeSInt16(potentialOffsets[oi])
+                                        )
+                                    )
+                                );
+
+                                offsetCumulative[ch] = cu15;
+						    }
+
+                            if ((options.flags & cvtt::Flags::ETC_FakeBT709Accurate) != 0)
+                                ResolveHalfBlockFakeBT709RoundingAccurate(quantized, offsetCumulative, d == 1);
+                            else
+                                ResolveHalfBlockFakeBT709RoundingFast(quantized, offsetCumulative, d == 1);
+                        }
+
+						possibleColors[oi] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10);
+					}
+
+					potentialOffsets += numOffsets;
+
+                    ParallelMath::UInt15 numUniqueColors;
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        uint16_t blockNumUniqueColors = 1;
+                        for (int i = 1; i < numOffsets; i++)
+                        {
+                            uint16_t color = ParallelMath::Extract(possibleColors[i], block);
+                            if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
+                                ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
+                        }
+
+                        ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
+                    }
+
+                    int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
+                    for (int block = 1; block < ParallelMath::ParallelSize; block++)
+                        maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
+
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
+                        for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
+                            ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
+                    }
+
+					for (int i = 0; i < maxUniqueColors; i++)
+					{
+						MFloat error = ParallelMath::MakeFloatZero();
+						MUInt16 selectors = ParallelMath::MakeUInt16(0);
+                        MUInt15 quantized = possibleColors[i];
+						TestHalfBlock(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], modifierTables[table], d == 1, options);
+
+						if (d == 0)
+						{
+                            ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, bestIndError[sector]));
+							if (ParallelMath::AnySet(errorBetter))
+							{
+								bestIndError[sector] = ParallelMath::Min(error, bestIndError[sector]);
+								ParallelMath::ConditionalSet(bestIndSelectors[sector], errorBetter, selectors);
+                                ParallelMath::ConditionalSet(bestIndColors[sector], errorBetter, quantized);
+                                ParallelMath::ConditionalSet(bestIndTable[sector], errorBetter, ParallelMath::MakeUInt15(table));
+							}
+						}
+						else
+						{
+                            ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
+
+							MUInt15 storageIndexes = drs.diffNumAttempts[sector];
+                            drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1));
+
+                            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                            {
+                                int storageIndex = ParallelMath::Extract(storageIndexes, block);
+
+                                ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
+                                ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
+                                ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
+                                ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
+                            }
+						}
+					}
+				}
+			}
+
+			if (d == 0)
+			{
+				MFloat bestIndErrorTotal = bestIndError[0] + bestIndError[1];
+                ParallelMath::Int16CompFlag errorBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(bestIndErrorTotal, bestTotalError));
+				if (ParallelMath::AnySet(errorBetter))
+				{
+                    bestIsThisMode = bestIsThisMode | errorBetter;
+
+					bestTotalError = ParallelMath::Min(bestTotalError, bestIndErrorTotal);
+					ParallelMath::ConditionalSet(bestFlip, errorBetter, ParallelMath::MakeUInt15(flip));
+                    ParallelMath::ConditionalSet(bestD, errorBetter, ParallelMath::MakeUInt15(d));
+					for (int sector = 0; sector < 2; sector++)
+					{
+                        ParallelMath::ConditionalSet(bestColors[sector], errorBetter, bestIndColors[sector]);
+                        ParallelMath::ConditionalSet(bestSelectors[sector], errorBetter, bestIndSelectors[sector]);
+                        ParallelMath::ConditionalSet(bestTables[sector], errorBetter, bestIndTable[sector]);
+					}
+				}
+			}
+			else
+			{
+                ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(false), ParallelMath::MakeBoolInt16(false) };
+                FindBestDifferentialCombination(flip, d, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestD, bestColors, bestSelectors, bestTables, drs);
+			}
+		}
+	}
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        if (!ParallelMath::Extract(bestIsThisMode, block))
+            continue;
+
+        uint32_t highBits = 0;
+        uint32_t lowBits = 0;
+
+        int blockBestFlip = ParallelMath::Extract(bestFlip, block);
+        int blockBestD = ParallelMath::Extract(bestD, block);
+        int blockBestTables[2] = { ParallelMath::Extract(bestTables[0], block), ParallelMath::Extract(bestTables[1], block) };
+        ParallelMath::ScalarUInt16 blockBestSelectors[2] = { ParallelMath::Extract(bestSelectors[0], block), ParallelMath::Extract(bestSelectors[1], block) };
+
+        int colors[2][3];
+        for (int sector = 0; sector < 2; sector++)
+        {
+            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
+            for (int ch = 0; ch < 3; ch++)
+                colors[sector][ch] = (sectorColor >> (ch * 5)) & 31;
+        }
+
+        EmitETC1Block(outputBuffer + block * 8, blockBestFlip, blockBestD, colors, blockBestTables, blockBestSelectors, false);
+    }
+}
+
+
+void cvtt::Internal::ETCComputer::CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage &drs, const Options &options)
+{
+    // For both flip orientations: builds candidate base colours per sector and modifier table, scores each with TestHalfBlockPunchthrough,
+    // records the results in drs and lets FindBestDifferentialCombination choose; lanes flagged in bestIsThisMode are then written to outputBuffer.
+    MUInt15 zeroU15 = ParallelMath::MakeUInt15(0);
+    MUInt16 zeroU16 = ParallelMath::MakeUInt16(0);
+
+    MUInt15 bestColors[2] = { zeroU15, zeroU15 };
+    MUInt16 bestSelectors[2] = { zeroU16, zeroU16 };
+    MUInt15 bestTables[2] = { zeroU15, zeroU15 };
+    MUInt15 bestFlip = zeroU15;
+
+    MUInt15 sectorPixels[2][2][8][3]; // indexed [flip][sector][px][ch]
+    ParallelMath::Int16CompFlag sectorTransparent[2][2][8];
+    MFloat sectorPreWeightedPixels[2][2][8][3];
+    MUInt15 sectorCumulative[2][2][3]; // per-channel sum of the 8 pixel values in each sector
+
+    ParallelMath::Int16CompFlag bestIsThisMode = ParallelMath::MakeBoolInt16(false);
+
+    for (int flip = 0; flip < 2; flip++) // Regroup the 16 input pixels into two 8-pixel sectors for each flip orientation.
+	{
+		for (int sector = 0; sector < 2; sector++)
+		{
+			for (int ch = 0; ch < 3; ch++)
+				sectorCumulative[flip][sector][ch] = zeroU15;
+
+			for (int px = 0; px < 8; px++)
+			{
+				for (int ch = 0; ch < 3; ch++)
+				{
+					MUInt15 pixelChannelValue = pixels[g_flipTables[flip][sector][px]][ch];
+					sectorPixels[flip][sector][px][ch] = pixelChannelValue;
+                    sectorPreWeightedPixels[flip][sector][px][ch] = preWeightedPixels[g_flipTables[flip][sector][px]][ch];
+					sectorCumulative[flip][sector][ch] = sectorCumulative[flip][sector][ch] + pixelChannelValue;
+				}
+
+                sectorTransparent[flip][sector][px] = isTransparent[g_flipTables[flip][sector][px]];
+			}
+		}
+	}
+
+	static const MUInt15 modifiers[8] = // one modifier magnitude per table index
+	{
+		ParallelMath::MakeUInt15(8),
+		ParallelMath::MakeUInt15(17),
+		ParallelMath::MakeUInt15(29),
+		ParallelMath::MakeUInt15(42),
+		ParallelMath::MakeUInt15(60),
+		ParallelMath::MakeUInt15(80),
+		ParallelMath::MakeUInt15(106),
+		ParallelMath::MakeUInt15(183),
+	};
+
+    // This function does not branch on options.flags itself: candidates always use the single rounding formula below.
+
+    const int maxSectorCumulativeOffsets = 17; // 2 * 8 + 1: one candidate per offset multiplier in -8..8 (a sector has 8 pixels)
+
+	for (int flip = 0; flip < 2; flip++)
+	{
+        ParallelMath::Int16CompFlag canIgnoreSector[2] = { ParallelMath::MakeBoolInt16(true), ParallelMath::MakeBoolInt16(false) };
+        // NOTE(review): sector 1 is seeded with false, so the AND below can never make it true; only sector 0 can end up ignorable - confirm this is intended.
+        for (int sector = 0; sector < 2; sector++)
+            for (int px = 0; px < 8; px++)
+                canIgnoreSector[sector] = canIgnoreSector[sector] & sectorTransparent[flip][sector][px];
+
+		drs.diffNumAttempts[0] = drs.diffNumAttempts[1] = zeroU15; // restart the attempt lists for this flip
+
+		for (int sector = 0; sector < 2; sector++)
+		{
+            MUInt15 sectorNumOpaque = ParallelMath::MakeUInt15(0);
+            for (int px = 0; px < 8; px++) // NOTE(review): adds 1 where the *transparent* flag is set (assuming SelectOrZero keeps the value for set flags), despite the "opaque" name - confirm flag polarity.
+                sectorNumOpaque = sectorNumOpaque + ParallelMath::SelectOrZero(sectorTransparent[flip][sector][px], ParallelMath::MakeUInt15(1));
+
+            int sectorMaxOpaque = 0; // largest per-lane count; bounds the scalar loops shared by all lanes
+            for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                sectorMaxOpaque = std::max<int>(sectorMaxOpaque, ParallelMath::Extract(sectorNumOpaque, block));
+
+            int sectorNumOpaqueMultipliers = sectorMaxOpaque * 2 + 1; // number of multipliers in -sectorMaxOpaque..sectorMaxOpaque
+
+            MUInt15 sectorNumOpaqueDenominator = ParallelMath::Max(ParallelMath::MakeUInt15(1), sectorNumOpaque) << 8; // count * 256, with the count floored at 1 so the division below never divides by zero
+            MUInt15 sectorNumOpaqueAddend = sectorNumOpaque << 7; // count * 128, the rounding term of the quantization formula
+
+            MSInt16 sectorNumOpaqueSigned = ParallelMath::LosslessCast<MSInt16>::Cast(sectorNumOpaque);
+            MSInt16 negSectorNumOpaqueSigned = ParallelMath::MakeSInt16(0) - sectorNumOpaqueSigned;
+
+            MUInt15 sectorCumulativeMax = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(ParallelMath::MakeUInt15(255), sectorNumOpaque));
+
+			for (int table = 0; table < 8; table++)
+			{
+				MUInt15 possibleColors[maxSectorCumulativeOffsets];
+
+                MUInt15 quantized[3];
+                for (int om = -sectorMaxOpaque; om <= sectorMaxOpaque; om++)
+                {
+                    MSInt16 clampedOffsetMult = ParallelMath::Max(ParallelMath::Min(ParallelMath::MakeSInt16(om), sectorNumOpaqueSigned), negSectorNumOpaqueSigned); // clamp the multiplier to each lane's own +/- count
+                    MSInt16 offset = ParallelMath::CompactMultiply(clampedOffsetMult, modifiers[table]);
+
+                    for (int ch = 0; ch < 3; ch++)
+                    {
+                        // cu is in range 0..255*numOpaque (at most 0..2040)
+                        MUInt15 cu15 = ParallelMath::Min(
+                            sectorCumulativeMax,
+                            ParallelMath::ToUInt15(
+                                ParallelMath::Max(
+                                    ParallelMath::MakeSInt16(0),
+                                    ParallelMath::LosslessCast<MSInt16>::Cast(sectorCumulative[flip][sector][ch]) + offset
+                                )
+                            )
+                        );
+
+                        //quantized[ch] = (cu * 31 + (cu >> 3) + (numOpaque * 128)) / (numOpaque * 256)
+                        MUInt16 cuTimes31 = (ParallelMath::LosslessCast<MUInt16>::Cast(cu15) << 5) - ParallelMath::LosslessCast<MUInt16>::Cast(cu15);
+                        MUInt15 cuDiv8 = ParallelMath::RightShift(cu15, 3);
+                        MUInt16 numerator = cuTimes31 + ParallelMath::LosslessCast<MUInt16>::Cast(cuDiv8 + sectorNumOpaqueAddend);
+                        for (int block = 0; block < ParallelMath::ParallelSize; block++) // the divisor differs per lane, so the division is done lane by lane
+                            ParallelMath::PutUInt15(quantized[ch], block, ParallelMath::Extract(numerator, block) / ParallelMath::Extract(sectorNumOpaqueDenominator, block));
+                    }
+
+					possibleColors[om + sectorMaxOpaque] = quantized[0] | (quantized[1] << 5) | (quantized[2] << 10); // pack three 5-bit channels into one value
+				}
+
+                ParallelMath::UInt15 numUniqueColors;
+                for (int block = 0; block < ParallelMath::ParallelSize; block++) // per lane: drop candidates equal to the previously kept one, compacting in place
+                {
+                    uint16_t blockNumUniqueColors = 1;
+                    for (int i = 1; i < sectorNumOpaqueMultipliers; i++)
+                    {
+                        uint16_t color = ParallelMath::Extract(possibleColors[i], block);
+                        if (color != ParallelMath::Extract(possibleColors[blockNumUniqueColors - 1], block))
+                            ParallelMath::PutUInt15(possibleColors[blockNumUniqueColors++], block, color);
+                    }
+
+                    ParallelMath::PutUInt15(numUniqueColors, block, blockNumUniqueColors);
+                }
+
+                int maxUniqueColors = ParallelMath::Extract(numUniqueColors, 0);
+                for (int block = 1; block < ParallelMath::ParallelSize; block++)
+                    maxUniqueColors = std::max<int>(maxUniqueColors, ParallelMath::Extract(numUniqueColors, block));
+
+                for (int block = 0; block < ParallelMath::ParallelSize; block++) // pad lanes with fewer candidates using their first colour so every lane has maxUniqueColors entries
+                {
+                    uint16_t fillColor = ParallelMath::Extract(possibleColors[0], block);
+                    for (int i = ParallelMath::Extract(numUniqueColors, block); i < maxUniqueColors; i++)
+                        ParallelMath::PutUInt15(possibleColors[i], block, fillColor);
+                }
+
+				for (int i = 0; i < maxUniqueColors; i++)
+				{
+					MFloat error = ParallelMath::MakeFloatZero();
+					MUInt16 selectors = ParallelMath::MakeUInt16(0);
+                    MUInt15 quantized = possibleColors[i];
+					TestHalfBlockPunchthrough(error, selectors, quantized, sectorPixels[flip][sector], sectorPreWeightedPixels[flip][sector], sectorTransparent[flip][sector], modifiers[table], options);
+
+                    ParallelMath::Int16CompFlag isInBounds = ParallelMath::Less(ParallelMath::MakeUInt15(i), numUniqueColors);
+
+					MUInt15 storageIndexes = drs.diffNumAttempts[sector]; // slot = attempt count before this candidate
+                    drs.diffNumAttempts[sector] = drs.diffNumAttempts[sector] + ParallelMath::SelectOrZero(isInBounds, ParallelMath::MakeUInt15(1)); // the count is advanced via isInBounds (i < numUniqueColors), so padding entries are meant to stay uncounted
+
+                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+                    {
+                        int storageIndex = ParallelMath::Extract(storageIndexes, block);
+
+                        ParallelMath::PutFloat(drs.diffErrors[sector][storageIndex], block, ParallelMath::Extract(error, block));
+                        ParallelMath::PutUInt16(drs.diffSelectors[sector][storageIndex], block, ParallelMath::Extract(selectors, block));
+                        ParallelMath::PutUInt15(drs.diffColors[sector][storageIndex], block, ParallelMath::Extract(quantized, block));
+                        ParallelMath::PutUInt15(drs.diffTables[sector][storageIndex], block, table);
+                    }
+                }
+            }
+        }
+
+        MUInt15 bestDDummy = ParallelMath::MakeUInt15(0); // receives the bestD output, which is not read afterwards: the emit below always passes 1
+        FindBestDifferentialCombination(flip, 1, canIgnoreSector, bestIsThisMode, bestTotalError, bestFlip, bestDDummy, bestColors, bestSelectors, bestTables, drs);
+	}
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++) // write out only the lanes for which this mode was recorded as best
+    {
+        if (!ParallelMath::Extract(bestIsThisMode, block))
+            continue;
+
+        int blockBestColors[2][3];
+        int blockBestTables[2];
+        ParallelMath::ScalarUInt16 blockBestSelectors[2];
+        for (int sector = 0; sector < 2; sector++)
+        {
+            int sectorColor = ParallelMath::Extract(bestColors[sector], block);
+            for (int ch = 0; ch < 3; ch++)
+                blockBestColors[sector][ch] = (sectorColor >> (ch * 5)) & 31; // undo the 5-bits-per-channel packing
+
+            blockBestTables[sector] = ParallelMath::Extract(bestTables[sector], block);
+            blockBestSelectors[sector] = ParallelMath::Extract(bestSelectors[sector], block);
+        }
+
+        EmitETC1Block(outputBuffer + block * 8, ParallelMath::Extract(bestFlip, block), 1, blockBestColors, blockBestTables, blockBestSelectors, true); // 8 output bytes per lane
+    }
+}
+
+
+cvtt::ETC1CompressionData *cvtt::Internal::ETCComputer::AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context)
+{ // Gets raw storage from allocFunc and constructs an ETC1CompressionDataInternal in it; returns NULL when allocFunc does.
+    void *const storage = allocFunc(context, sizeof(ETC1CompressionDataInternal));
+    if (storage == NULL)
+        return NULL;
+    new (storage) ETC1CompressionDataInternal(context); // Placement new into the memory allocFunc returned; context is kept for the release call.
+    return static_cast<ETC1CompressionData*>(storage);
+}
+
+void cvtt::Internal::ETCComputer::ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
+{ // Destroys and frees an object built by AllocETC1Data. compressionData is dereferenced unconditionally, so it must not be NULL.
+    ETC1CompressionDataInternal *const impl = static_cast<ETC1CompressionDataInternal*>(compressionData);
+    void *const allocContext = impl->m_context; // Copied out first: the member must not be read once the destructor has run.
+    impl->~ETC1CompressionDataInternal();
+    freeFunc(allocContext, compressionData, sizeof(ETC1CompressionDataInternal)); // Same context and size that AllocETC1Data handed to allocFunc.
+}
+
+cvtt::ETC2CompressionData *cvtt::Internal::ETCComputer::AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options)
+{ // Gets raw storage from allocFunc and constructs an ETC2CompressionDataInternal in it; returns NULL when allocFunc does.
+    void *const storage = allocFunc(context, sizeof(ETC2CompressionDataInternal));
+    if (storage == NULL)
+        return NULL;
+    new (storage) ETC2CompressionDataInternal(context, options); // Placement new; the constructor derives the chroma axes from options.
+    return static_cast<ETC2CompressionData*>(storage);
+}
+
+void cvtt::Internal::ETCComputer::ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc)
+{ // Destroys and frees an object built by AllocETC2Data. compressionData is dereferenced unconditionally, so it must not be NULL.
+    ETC2CompressionDataInternal *const impl = static_cast<ETC2CompressionDataInternal*>(compressionData);
+    void *const allocContext = impl->m_context; // Copied out first: the member must not be read once the destructor has run.
+    impl->~ETC2CompressionDataInternal();
+    freeFunc(allocContext, compressionData, sizeof(ETC2CompressionDataInternal)); // Same context and size that AllocETC2Data handed to allocFunc.
+}
+
+cvtt::Internal::ETCComputer::ETC2CompressionDataInternal::ETC2CompressionDataInternal(void *context, const cvtt::Options &options)
+    : m_context(context)
+{ // Derives the two chroma-side axes from the RGB channel weights: both perpendicular to the weight vector and of equal length.
+    const float weights[3] = { options.redWeight, options.greenWeight, options.blueWeight };
+    const float rotated[3] = { weights[1], weights[2], weights[0] }; // the weight vector with its channels rotated by one position
+    // -(rotated . weights) / (weights . weights): the multiple of weights that cancels rotated's component along weights.
+    const float projScale = -(rotated[0] * weights[0] + rotated[1] * weights[1] + rotated[2] * weights[2]) / (weights[0] * weights[0] + weights[1] * weights[1] + weights[2] * weights[2]);
+    // First axis: rotated with its projection onto weights removed.
+    const float axisA[3] = { rotated[0] + weights[0] * projScale, rotated[1] + weights[1] * projScale, rotated[2] + weights[2] * projScale };
+    // Second axis before rescaling: the cross product axisA x weights.
+    const float axisBRaw[3] =
+    {
+        axisA[1] * weights[2] - axisA[2] * weights[1],
+        axisA[2] * weights[0] - axisA[0] * weights[2],
+        axisA[0] * weights[1] - axisA[1] * weights[0]
+    };
+    // Rescale the cross product so that it ends up with the same length as axisA.
+    const float axisALenSq = (axisA[0] * axisA[0] + axisA[1] * axisA[1] + axisA[2] * axisA[2]);
+    const float axisBRawLenSq = (axisBRaw[0] * axisBRaw[0] + axisBRaw[1] * axisBRaw[1] + axisBRaw[2] * axisBRaw[2]);
+    const float rescale = static_cast<float>(std::sqrt(axisALenSq / axisBRawLenSq));
+
+    const float axisB[3] = { axisBRaw[0] * rescale, axisBRaw[1] * rescale, axisBRaw[2] * rescale };
+
+    for (int component = 0; component < 3; ++component)
+    {
+        m_chromaSideAxis0[component] = axisA[component];
+        m_chromaSideAxis1[component] = axisB[component];
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC.h b/thirdparty/cvtt/ConvectionKernels_ETC.h
new file mode 100644
index 0000000000..5e3c4d74fd
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC.h
@@ -0,0 +1,126 @@
+#pragma once
+#ifndef __CVTT_CONVECTIONKERNELS_ETC_H__
+#define __CVTT_CONVECTIONKERNELS_ETC_H__
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    struct Options;
+
+    namespace Internal
+    {
+        class ETCComputer // Groups the ETC1 / ETC2 / EAC block compression routines; every member function declared here is static.
+        {
+        public:
+            static void CompressETC1Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC1CompressionData *compressionData, const Options &options);
+            static void CompressETC2Block(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, ETC2CompressionData *compressionData, const Options &options, bool punchthroughAlpha);
+            static void CompressETC2AlphaBlock(uint8_t *outputBuffer, const PixelBlockU8 *inputBlocks, const Options &options);
+            static void CompressEACBlock(uint8_t *outputBuffer, const PixelBlockScalarS16 *inputBlocks, bool isSigned, const Options &options);
+            // Alloc* constructs the compression data in memory obtained from allocFunc (NULL if that fails); Release* destroys it and returns the memory through freeFunc.
+            static ETC2CompressionData *AllocETC2Data(cvtt::Kernels::allocFunc_t allocFunc, void *context, const cvtt::Options &options);
+            static void ReleaseETC2Data(ETC2CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc);
+
+            static ETC1CompressionData *AllocETC1Data(cvtt::Kernels::allocFunc_t allocFunc, void *context);
+            static void ReleaseETC1Data(ETC1CompressionData *compressionData, cvtt::Kernels::freeFunc_t freeFunc);
+
+        private:
+            typedef ParallelMath::Float MFloat; // Short aliases for the ParallelMath value types used throughout the implementation.
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+            // Per-sector lists of evaluated half-block candidates, filled by the CompressETC1*BlockInternal routines and handed to FindBestDifferentialCombination.
+            struct DifferentialResolveStorage
+            {
+                static const unsigned int MaxAttemptsPerSector = 57 + 81 + 81 + 81 + 81 + 81 + 81 + 81; // = 624; presumably the per-table offset counts of Tables::ETC1::g_potentialOffsets4 summed - verify against that table
+
+                MUInt15 diffNumAttempts[2]; // number of stored attempts per sector
+                MFloat diffErrors[2][MaxAttemptsPerSector]; // error of each attempt
+                MUInt16 diffSelectors[2][MaxAttemptsPerSector]; // selector bits of each attempt
+                MUInt15 diffColors[2][MaxAttemptsPerSector]; // packed base colour of each attempt (5 bits per channel)
+                MUInt15 diffTables[2][MaxAttemptsPerSector]; // modifier table index of each attempt
+
+                uint16_t attemptSortIndexes[2][MaxAttemptsPerSector]; // presumably scratch used while ordering attempts - TODO confirm in FindBestDifferentialCombination
+            };
+            // Working storage that EncodeHMode receives through its `he` parameter.
+            struct HModeEval
+            {
+                MFloat errors[62][16];
+                MUInt16 signBits[62];
+                MUInt15 uniqueQuantizedColors[62];
+                MUInt15 numUniqueColors[2];
+            };
+            // Concrete type behind the ETC1CompressionData pointer returned by AllocETC1Data.
+            struct ETC1CompressionDataInternal : public cvtt::ETC1CompressionData
+            {
+                explicit ETC1CompressionDataInternal(void *context)
+                    : m_context(context)
+                {
+                }
+
+                DifferentialResolveStorage m_drs;
+                void *m_context; // allocator context; ReleaseETC1Data passes it back to freeFunc
+            };
+            // Concrete type behind the ETC2CompressionData pointer returned by AllocETC2Data.
+            struct ETC2CompressionDataInternal : public cvtt::ETC2CompressionData
+            {
+                explicit ETC2CompressionDataInternal(void *context, const cvtt::Options &options); // stores context and computes both chroma-side axes from the colour weights in options
+
+                HModeEval m_h;
+                DifferentialResolveStorage m_drs;
+
+                void *m_context; // allocator context; ReleaseETC2Data passes it back to freeFunc
+                float m_chromaSideAxis0[3]; // set by the constructor
+                float m_chromaSideAxis1[3]; // set by the constructor, scaled to the same length as m_chromaSideAxis0
+            };
+
+            static MFloat ComputeErrorUniform(const MUInt15 pixelA[3], const MUInt15 pixelB[3]);
+            static MFloat ComputeErrorWeighted(const MUInt15 reconstructed[3], const MFloat pixelB[3], const Options options); // NOTE(review): Options is taken by value here, unlike the const reference used elsewhere; changing it requires the matching definition to change too
+            static MFloat ComputeErrorFakeBT709(const MUInt15 reconstructed[3], const MFloat pixelB[3]);
+
+            static void TestHalfBlock(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const MSInt16 modifiers[4], bool isDifferential, const Options &options);
+            static void TestHalfBlockPunchthrough(MFloat &outError, MUInt16 &outSelectors, MUInt15 quantizedPackedColor, const MUInt15 pixels[8][3], const MFloat preWeightedPixels[8][3], const ParallelMath::Int16CompFlag isTransparent[8], const MUInt15 modifier, const Options &options);
+            static void FindBestDifferentialCombination(int flip, int d, const ParallelMath::Int16CompFlag canIgnoreSector[2], ParallelMath::Int16CompFlag& bestIsThisMode, MFloat& bestTotalError, MUInt15& bestFlip, MUInt15& bestD, MUInt15 bestColors[2], MUInt16 bestSelectors[2], MUInt15 bestTables[2], DifferentialResolveStorage &drs);
+
+            static ParallelMath::Int16CompFlag ETCDifferentialIsLegalForChannel(const MUInt15 &a, const MUInt15 &b);
+            static ParallelMath::Int16CompFlag ETCDifferentialIsLegal(const MUInt15 &a, const MUInt15 &b);
+            static bool ETCDifferentialIsLegalForChannelScalar(const uint16_t &a, const uint16_t &b);
+            static bool ETCDifferentialIsLegalScalar(const uint16_t &a, const uint16_t &b);
+
+            static void EncodeTMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options);
+            static void EncodeHMode(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag groupings[16], const MUInt15 pixels[16][3], HModeEval &he, const MFloat preWeightedPixels[16][3], const Options &options);
+
+            static void EncodeVirtualTModePunchthrough(uint8_t *outputBuffer, MFloat &bestError, const ParallelMath::Int16CompFlag isIsolated[16], const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], const ParallelMath::Int16CompFlag& anyTransparent, const ParallelMath::Int16CompFlag& allTransparent, const Options &options);
+
+            static MUInt15 DecodePlanarCoeff(const MUInt15 &coeff, int ch);
+            static void EncodePlanar(uint8_t *outputBuffer, MFloat &bestError, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const Options &options);
+
+            static void CompressETC1BlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], DifferentialResolveStorage& compressionData, const Options &options, bool punchthrough);
+            static void CompressETC1PunchthroughBlockInternal(MFloat &bestTotalError, uint8_t *outputBuffer, const MUInt15 pixels[16][3], const MFloat preWeightedPixels[16][3], const ParallelMath::Int16CompFlag isTransparent[16], DifferentialResolveStorage& compressionData, const Options &options);
+            static void CompressETC2AlphaBlockInternal(uint8_t *outputBuffer, const MUInt15 pixels[16], bool is11Bit, bool isSigned, const Options &options);
+
+            static void ExtractBlocks(MUInt15 pixels[16][3], MFloat preWeightedPixels[16][3], const PixelBlockU8 *inputBlocks, const Options &options);
+
+            static void ResolveHalfBlockFakeBT709RoundingAccurate(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential);
+            static void ResolveHalfBlockFakeBT709RoundingFast(MUInt15 quantized[3], const MUInt15 sectorCumulative[3], bool isDifferential);
+            static void ResolveTHFakeBT709Rounding(MUInt15 quantized[3], const MUInt15 target[3], const MUInt15 &granularity);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MUInt15 color[3]);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MFloat color[3]);
+            static void ConvertToFakeBT709(MFloat yuv[3], const MFloat &r, const MFloat &g, const MFloat &b);
+            static void ConvertFromFakeBT709(MFloat rgb[3], const MFloat yuv[3]);
+
+            static void QuantizeETC2Alpha(int tableIndex, const MUInt15& value, const MUInt15& baseValue, const MUInt15& multiplier, bool is11Bit, bool isSigned, MUInt15& outIndexes, MUInt15& outQuantizedValues);
+
+            static void EmitTModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 lineColor[3], const ParallelMath::ScalarUInt16 isolatedColor[3], int32_t packedSelectors, ParallelMath::ScalarUInt16 table, bool opaque);
+            static void EmitHModeBlock(uint8_t *outputBuffer, const ParallelMath::ScalarUInt16 blockColors[2], ParallelMath::ScalarUInt16 sectorBits, ParallelMath::ScalarUInt16 signBits, ParallelMath::ScalarUInt16 table, bool opaque);
+            static void EmitETC1Block(uint8_t *outputBuffer, int blockBestFlip, int blockBestD, const int blockBestColors[2][3], const int blockBestTables[2], const ParallelMath::ScalarUInt16 blockBestSelectors[2], bool transparent);
+
+            static const int g_flipTables[2][2][8]; // [flip][sector][px] -> index of that pixel within the 16-pixel block
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC1.h b/thirdparty/cvtt/ConvectionKernels_ETC1.h
new file mode 100644
index 0000000000..775e41669f
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC1.h
@@ -0,0 +1,29 @@
+#include <stdint.h>
+
+namespace cvtt
+{   // NOTE(review): this header has no include guard or #pragma once; each including translation unit gets its own copy of these tables.
+    namespace Tables
+    {
+        namespace ETC1
+        {
+            const int16_t g_potentialOffsets4[] = // Flattened variable-length records, one per source row below:
+            {                                      // a count N followed by N offsets in ascending order (presumably one record per modifier table - confirm against the user of this table).
+                57, -64, -58, -54, -52, -48, -46, -44, -42, -40, -38, -36, -34, -32, -30, -28, -26, -24, -22, -20, -18, -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 52, 54, 58, 64,
+                81, -136, -124, -114, -112, -102, -100, -92, -90, -88, -80, -78, -76, -70, -68, -66, -64, -58, -56, -54, -52, -48, -46, -44, -42, -40, -36, -34, -32, -30, -26, -24, -22, -20, -18, -14, -12, -10, -8, -4, -2, 0, 2, 4, 8, 10, 12, 14, 18, 20, 22, 24, 26, 30, 32, 34, 36, 40, 42, 44, 46, 48, 52, 54, 56, 58, 64, 66, 68, 70, 76, 78, 80, 88, 90, 92, 100, 102, 112, 114, 124, 136,
+                81, -232, -212, -194, -192, -174, -172, -156, -154, -152, -136, -134, -132, -118, -116, -114, -112, -98, -96, -94, -92, -80, -78, -76, -74, -72, -60, -58, -56, -54, -42, -40, -38, -36, -34, -22, -20, -18, -16, -4, -2, 0, 2, 4, 16, 18, 20, 22, 34, 36, 38, 40, 42, 54, 56, 58, 60, 72, 74, 76, 78, 80, 92, 94, 96, 98, 112, 114, 116, 118, 132, 134, 136, 152, 154, 156, 172, 174, 192, 194, 212, 232,
+                81, -336, -307, -281, -278, -252, -249, -226, -223, -220, -197, -194, -191, -171, -168, -165, -162, -142, -139, -136, -133, -116, -113, -110, -107, -104, -87, -84, -81, -78, -61, -58, -55, -52, -49, -32, -29, -26, -23, -6, -3, 0, 3, 6, 23, 26, 29, 32, 49, 52, 55, 58, 61, 78, 81, 84, 87, 104, 107, 110, 113, 116, 133, 136, 139, 142, 162, 165, 168, 171, 191, 194, 197, 220, 223, 226, 249, 252, 278, 281, 307, 336,
+                81, -480, -438, -402, -396, -360, -354, -324, -318, -312, -282, -276, -270, -246, -240, -234, -228, -204, -198, -192, -186, -168, -162, -156, -150, -144, -126, -120, -114, -108, -90, -84, -78, -72, -66, -48, -42, -36, -30, -12, -6, 0, 6, 12, 30, 36, 42, 48, 66, 72, 78, 84, 90, 108, 114, 120, 126, 144, 150, 156, 162, 168, 186, 192, 198, 204, 228, 234, 240, 246, 270, 276, 282, 312, 318, 324, 354, 360, 396, 402, 438, 480,
+                81, -640, -584, -536, -528, -480, -472, -432, -424, -416, -376, -368, -360, -328, -320, -312, -304, -272, -264, -256, -248, -224, -216, -208, -200, -192, -168, -160, -152, -144, -120, -112, -104, -96, -88, -64, -56, -48, -40, -16, -8, 0, 8, 16, 40, 48, 56, 64, 88, 96, 104, 112, 120, 144, 152, 160, 168, 192, 200, 208, 216, 224, 248, 256, 264, 272, 304, 312, 320, 328, 360, 368, 376, 416, 424, 432, 472, 480, 528, 536, 584, 640,
+                81, -848, -775, -709, -702, -636, -629, -570, -563, -556, -497, -490, -483, -431, -424, -417, -410, -358, -351, -344, -337, -292, -285, -278, -271, -264, -219, -212, -205, -198, -153, -146, -139, -132, -125, -80, -73, -66, -59, -14, -7, 0, 7, 14, 59, 66, 73, 80, 125, 132, 139, 146, 153, 198, 205, 212, 219, 264, 271, 278, 285, 292, 337, 344, 351, 358, 410, 417, 424, 431, 483, 490, 497, 556, 563, 570, 629, 636, 702, 709, 775, 848,
+                81, -1464, -1328, -1234, -1192, -1098, -1056, -1004, -962, -920, -868, -826, -784, -774, -732, -690, -648, -638, -596, -554, -544, -512, -502, -460, -418, -408, -376, -366, -324, -314, -282, -272, -230, -188, -178, -146, -136, -94, -84, -52, -42, 0, 42, 52, 84, 94, 136, 146, 178, 188, 230, 272, 282, 314, 324, 366, 376, 408, 418, 460, 502, 512, 544, 554, 596, 638, 648, 690, 732, 774, 784, 826, 868, 920, 962, 1004, 1056, 1098, 1192, 1234, 1328, 1464
+            };
+
+            const unsigned int g_maxPotentialOffsets = 81; // Equals the largest leading count among the records of g_potentialOffsets4.
+
+            const int16_t g_thModifierTable[8] = // NOTE(review): same eight values as Tables::ETC2::g_thModifierTable; presumably the T/H-mode distance modifiers - confirm.
+            {
+                3, 6, 11, 16, 23, 32, 41, 64
+            };
+        }
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC2.h b/thirdparty/cvtt/ConvectionKernels_ETC2.h
new file mode 100644
index 0000000000..4befc8e8c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC2.h
@@ -0,0 +1,35 @@
+#include <stdint.h>
+
+namespace cvtt
+{   // NOTE(review): this header has no include guard or #pragma once; each including translation unit gets its own copy of these tables.
+    namespace Tables
+    {
+        namespace ETC2
+        {
+            const int16_t g_thModifierTable[8] = // Eight ascending modifiers; presumably the ETC2 T/H-mode distance table - confirm against the format specification.
+            {
+                3, 6, 11, 16, 23, 32, 41, 64
+            };
+
+            const int16_t g_alphaModifierTablePositive[16][4] = // 16 rows of 4 non-negative, ascending modifiers; presumably the positive half of each alpha modifier row - confirm.
+            {
+                { 2, 5, 8, 14, },
+                { 2, 6, 9, 12, },
+                { 1, 4, 7, 12, },
+                { 1, 3, 5, 12, },
+                { 2, 5, 7, 11, },
+                { 2, 6, 8, 10, },
+                { 3, 6, 7, 10, },
+                { 2, 4, 7, 10, },
+                { 1, 5, 7, 9, },
+                { 1, 4, 7, 9, },
+                { 1, 3, 7, 9, },
+                { 1, 4, 6, 9, },
+                { 2, 3, 6, 9, },
+                { 0, 1, 2, 9, },
+                { 3, 5, 7, 8, },
+                { 2, 4, 6, 8, },
+            };
+        }
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h b/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h
new file mode 100644
index 0000000000..a4f5a3ddfa
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ETC2_Rounding.h
@@ -0,0 +1,27 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace ETC2 {
+    const int g_alphaRoundingTableWidth = 13;
+    const uint8_t g_alphaRoundingTables[16][13] =
+    {
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3 },
+        { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3 },
+        { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 3 },
+    };
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h b/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h
new file mode 100644
index 0000000000..c1276553b2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_EndpointRefiner.h
@@ -0,0 +1,181 @@
+#pragma once
+#ifndef __CVTT_ENDPOINTREFINER_H__
+#define __CVTT_ENDPOINTREFINER_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        // Accumulates the sums needed for a least-squares fit of v = a*t + b, where t is a
+        // pixel's index multiplied by 1/(indexRange - 1) and v is its pre-weighted channel value.
+        // The fitted line is then evaluated at t=0 and t=1 to give the two refined endpoints:
+        // a = (tv - t*v/w)/(tt - t*t/w)
+        // b = (v - a*t)/w
+        template<int TVectorSize>
+        class EndpointRefiner
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+
+            MFloat m_tv[TVectorSize];   // running sum of t * v per channel
+            MFloat m_v[TVectorSize];    // running sum of v per channel
+            MFloat m_tt;                // running sum of weight * t * t
+            MFloat m_t;                 // running sum of weight * t
+            MFloat m_w;                 // running sum of the weights passed to ContributePW
+            int m_wu;                   // number of ContributeUnweightedPW calls (each counts as weight 1)
+
+            float m_rcpMaxIndex;                        // 1 / (indexRange - 1)
+            float m_channelWeights[TVectorSize];        // copy of the weights passed to Init
+            float m_rcpChannelWeights[TVectorSize];     // 1 / weight, or 1 where the weight is zero
+
+            void Init(int indexRange, const float channelWeights[TVectorSize])
+            {
+                // Reset every accumulator; nothing has been contributed yet.
+                m_wu = 0;
+                m_w = ParallelMath::MakeFloatZero();
+                m_t = ParallelMath::MakeFloatZero();
+                m_tt = ParallelMath::MakeFloatZero();
+
+                // Multiplying an index by this yields t (1 at index indexRange - 1).
+                m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1);
+
+                for (int c = 0; c < TVectorSize; c++)
+                {
+                    const float weight = channelWeights[c];
+
+                    m_tv[c] = ParallelMath::MakeFloatZero();
+                    m_v[c] = ParallelMath::MakeFloatZero();
+
+                    // A zero weight has no reciprocal; 1 is stored so unweighting leaves the value as is.
+                    m_channelWeights[c] = weight;
+                    m_rcpChannelWeights[c] = (weight != 0.0f) ? (1.0f / weight) : 1.0f;
+                }
+            }
+
+            void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
+            {
+                // Adds one pre-weighted pixel with an explicit weight; param is this pixel's t.
+                MFloat param = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
+
+                m_w = m_w + weight;
+                m_t = m_t + weight * param;
+                m_tt = m_tt + weight * param * param;
+                for (int c = 0; c < TVectorSize; c++)
+                {
+                    MFloat weighted = pwFloatPixel[c] * weight;
+                    m_v[c] = m_v[c] + weighted;
+                    m_tv[c] = m_tv[c] + param * weighted;
+                }
+            }
+
+            void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
+            {
+                // Same sums as ContributePW with an implicit weight of one, counted in m_wu.
+                MFloat param = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
+
+                m_wu++;
+                m_t = m_t + param;
+                m_tt = m_tt + param * param;
+                for (int c = 0; c < numRealChannels; c++)
+                {
+                    MFloat sample = pwFloatPixel[c];
+                    m_v[c] = m_v[c] + sample;
+                    m_tv[c] = m_tv[c] + param * sample;
+                }
+            }
+
+            void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
+            {
+                ContributeUnweightedPW(floatPixel, index, TVectorSize); // treat every channel as real
+            }
+
+            void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
+            {
+                // Total weight w: the explicit weights plus one per unweighted contribution.
+                MFloat totalWeight = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu));
+                ParallelMath::MakeSafeDenominator(totalWeight);
+
+                MFloat rcpTotalWeight = ParallelMath::Reciprocal(totalWeight);
+
+                // Denominator of the slope, tt - t*t/w.
+                MFloat slopeDenom = (m_tt * totalWeight - m_t * m_t) * rcpTotalWeight;
+
+                // Record where that denominator is exactly zero, then overwrite it with 1 there
+                // so the division in the loop stays well defined; those results are replaced below.
+                ParallelMath::FloatCompFlag isDegenerate = ParallelMath::Equal(slopeDenom, ParallelMath::MakeFloatZero());
+                ParallelMath::ConditionalSet(slopeDenom, isDegenerate, ParallelMath::MakeFloat(1.0f));
+
+                for (int c = 0; c < TVectorSize; c++)
+                {
+                    // Scalar equivalent of this iteration:
+                    //   if (slopeDenom == 0)
+                    //       atZero = atOne = v / w;
+                    //   else
+                    //   {
+                    //       slope = (tv - t*v/w) / slopeDenom;
+                    //       intercept = (v - slope*t) / w;
+                    //       atZero = intercept;
+                    //       atOne = slope + intercept;
+                    //   }
+                    MFloat slope = (m_tv[c] - m_t * m_v[c] * rcpTotalWeight) / slopeDenom;
+                    MFloat intercept = (m_v[c] - slope * m_t) * rcpTotalWeight;
+
+                    MFloat atZero = intercept;
+                    MFloat atOne = slope + intercept;
+
+                    ParallelMath::ConditionalSet(atZero, isDegenerate, (m_v[c] * rcpTotalWeight));
+                    ParallelMath::ConditionalSet(atOne, isDegenerate, atZero);
+
+                    // Remove the channel weighting (the stored reciprocal is 1 for zero weights).
+                    const float unweight = m_rcpChannelWeights[c];
+
+                    endPoint[0][c] = atZero * unweight;
+                    endPoint[1][c] = atOne * unweight;
+                }
+            }
+
+            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                // numRealChannels is accepted but not consulted: all TVectorSize channels are clamped to [0, 255] and converted.
+                MFloat refined[2][TVectorSize];
+                GetRefinedEndpoints(refined);
+                for (int side = 0; side < 2; side++)
+                    for (int c = 0; c < TVectorSize; c++)
+                        endPoint[side][c] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(refined[side][c], 0.0f, 255.0f), roundingMode);
+            }
+
+            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode); // treat every channel as real
+            }
+
+            void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                // Clamp to [-31743, 31743] when signed, else [0, 31743] (31743 == 0x7BFF; presumably the largest finite half-float pattern - confirm).
+                const float lowerBound = isSigned ? -31743.0f : 0.0f;
+                MFloat refined[2][TVectorSize];
+                GetRefinedEndpoints(refined);
+
+                for (int side = 0; side < 2; side++)
+                    for (int c = 0; c < TVectorSize; c++)
+                    {
+                        MFloat clamped = ParallelMath::Clamp(refined[side][c], lowerBound, 31743.0f);
+                        if (isSigned)
+                            endPoint[side][c] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(clamped, roundingMode));
+                        else
+                            endPoint[side][c] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(clamped, roundingMode));
+                    }
+            }
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h b/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h
new file mode 100644
index 0000000000..e09dfd248c
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_EndpointSelector.h
@@ -0,0 +1,153 @@
+#pragma once
+#ifndef __CVTT_ENDPOINTSELECTOR_H__
+#define __CVTT_ENDPOINTSELECTOR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+#include "ConvectionKernels_PackedCovarianceMatrix.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        static const int NumEndpointSelectorPasses = 3; // passes 0, 1 and 2 as dispatched by EndpointSelector::ContributePass
+        // Fits a line to weighted points over three passes: 0 = centroid, 1 = dominant direction, 2 = extent along it.
+        template<int TVectorSize, int TIterationCount>
+        class EndpointSelector
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+
+            EndpointSelector()
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_centroid[ch] = ParallelMath::MakeFloatZero();
+                    m_direction[ch] = ParallelMath::MakeFloatZero();
+                }
+                m_weightTotal = ParallelMath::MakeFloatZero();
+                m_minDist = ParallelMath::MakeFloat(FLT_MAX);   // starts above every finite distance
+                m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);  // starts below every finite distance
+            }
+
+            void ContributePass(const MFloat *value, int pass, const MFloat &weight)
+            {
+                if (pass == 0)
+                    ContributeCentroid(value, weight);      // accumulate the weighted sum for the centroid
+                else if (pass == 1)
+                    ContributeDirection(value, weight);     // accumulate covariance about the centroid
+                else if (pass == 2)
+                    ContributeMinMax(value);                // track extents along the direction; weight is not used
+            }
+
+            void FinishPass(int pass)
+            {
+                if (pass == 0)
+                    FinishCentroid();
+                else if (pass == 1)
+                    FinishDirection();  // any other pass, including 2, needs no finishing work
+            }
+
+            UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
+            {
+                // Returns the fitted segment in unweighted space: base = min end, offset = max end - min end.
+                MFloat unweightedBase[TVectorSize];
+                MFloat unweightedOffset[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
+                    MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
+
+                    // Undo the channel weighting. A zero weight cannot be divided out, so 1 is
+                    // substituted for it; dividing by the raw weight here would divide by zero.
+                    const float safeWeight = (channelWeights[ch] == 0.f) ? 1.0f : channelWeights[ch];
+
+                    unweightedBase[ch] = min / safeWeight;
+                    unweightedOffset[ch] = (max - min) / safeWeight;
+                }
+
+                return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
+            }
+
+        private:
+            void ContributeCentroid(const MFloat *value, const MFloat &weight)
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_centroid[ch] = m_centroid[ch] + value[ch] * weight;   // weighted sum; divided by the total in FinishCentroid
+                m_weightTotal = m_weightTotal + weight;
+            }
+
+            void FinishCentroid()
+            {
+                MFloat denom = m_weightTotal;
+                ParallelMath::MakeSafeDenominator(denom);   // presumably guards against a zero total weight - confirm
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_centroid[ch] = m_centroid[ch] / denom;    // weighted sum -> weighted mean
+            }
+
+            void ContributeDirection(const MFloat *value, const MFloat &weight)
+            {
+                MFloat diff[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    diff[ch] = value[ch] - m_centroid[ch];  // offset from the centroid computed in pass 0
+
+                m_covarianceMatrix.Add(diff, weight);
+            }
+
+            void FinishDirection()
+            {
+                MFloat approx[TVectorSize];     // direction estimate; starts as all ones
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    approx[ch] = ParallelMath::MakeFloat(1.0f);
+
+                for (int i = 0; i < TIterationCount; i++)   // repeatedly multiply by the covariance matrix and rescale
+                {
+                    MFloat product[TVectorSize];
+                    m_covarianceMatrix.Product(product, approx);
+
+                    MFloat largestComponent = product[0];
+                    for (int ch = 1; ch < TVectorSize; ch++)
+                        largestComponent = ParallelMath::Max(largestComponent, product[ch]);
+
+                    // Rescale by the largest component: product = largestComponent*newApprox
+                    ParallelMath::MakeSafeDenominator(largestComponent);
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        approx[ch] = product[ch] / largestComponent;
+                }
+
+                // Normalize the estimate by its Euclidean length
+                MFloat approxLen = ParallelMath::MakeFloatZero();
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    approxLen = approxLen + approx[ch] * approx[ch];
+
+                approxLen = ParallelMath::Sqrt(approxLen);
+
+                ParallelMath::MakeSafeDenominator(approxLen);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_direction[ch] = approx[ch] / approxLen;
+            }
+
+            void ContributeMinMax(const MFloat *value)
+            {
+                MFloat dist = ParallelMath::MakeFloatZero();    // dot product of m_direction with (value - centroid)
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]);
+
+                m_minDist = ParallelMath::Min(m_minDist, dist);
+                m_maxDist = ParallelMath::Max(m_maxDist, dist);
+            }
+
+            ParallelMath::Float m_centroid[TVectorSize];            // weighted sum during pass 0, mean after FinishCentroid
+            ParallelMath::Float m_direction[TVectorSize];           // normalized direction written by FinishDirection
+            PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix; // accumulated during pass 1
+            ParallelMath::Float m_weightTotal;                      // sum of the pass-0 weights
+
+            ParallelMath::Float m_minDist;                          // smallest pass-2 distance along m_direction
+            ParallelMath::Float m_maxDist;                          // largest pass-2 distance along m_direction
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h b/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h
new file mode 100644
index 0000000000..1eb924befe
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_FakeBT709_Rounding.h
@@ -0,0 +1,282 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace FakeBT709 {
+    const uint8_t g_rounding16[] =
+    {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 
+        0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 
+        3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 
+
+    };
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp b/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp
new file mode 100644
index 0000000000..b3d1b5497e
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelector.cpp
@@ -0,0 +1,66 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_IndexSelector.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        const ParallelMath::UInt16 g_weightReciprocals[17] = // entry [i] holds 32768 / (i - 1) rounded to nearest; each trailing comment is the divisor (i - 1)
+        {
+            ParallelMath::MakeUInt16(0),        // -1 (no reciprocal exists; stored as 0)
+            ParallelMath::MakeUInt16(0),        // 0 (no reciprocal exists; stored as 0)
+            ParallelMath::MakeUInt16(32768),    // 1
+            ParallelMath::MakeUInt16(16384),    // 2
+            ParallelMath::MakeUInt16(10923),    // 3
+            ParallelMath::MakeUInt16(8192),     // 4
+            ParallelMath::MakeUInt16(6554),     // 5
+            ParallelMath::MakeUInt16(5461),     // 6
+            ParallelMath::MakeUInt16(4681),     // 7
+            ParallelMath::MakeUInt16(4096),     // 8
+            ParallelMath::MakeUInt16(3641),     // 9
+            ParallelMath::MakeUInt16(3277),     // 10
+            ParallelMath::MakeUInt16(2979),     // 11
+            ParallelMath::MakeUInt16(2731),     // 12
+            ParallelMath::MakeUInt16(2521),     // 13
+            ParallelMath::MakeUInt16(2341),     // 14
+            ParallelMath::MakeUInt16(2185),     // 15
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelector.h b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
new file mode 100644
index 0000000000..0f9d209183
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelector.h
@@ -0,0 +1,147 @@
+#pragma once
+#ifndef __CVTT_INDEXSELECTOR_H__
+#define __CVTT_INDEXSELECTOR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        extern const ParallelMath::UInt16 g_weightReciprocals[17];
+
+        template<int TVectorSize>
+        class IndexSelector
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::AInt16 MAInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::UInt31 MUInt31;
+
+
+            template<class TInterpolationEPType, class TColorEPType>
+            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range) // prepares selection among 'range' indexes between the two endpoints
+            {
+                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
+                // We need to select indexes using the color-space endpoints.
+
+                m_isUniform = true; // remains true only if every channel weight equals channelWeights[0]
+                for (int ch = 1; ch < TVectorSize; ch++)
+                {
+                    if (channelWeights[ch] != channelWeights[0])
+                        m_isUniform = false;
+                }
+
+                // To work with channel weights, we need something where:
+                // pxDiff = px - ep[0]
+                // epDiff = ep[1] - ep[0]
+                //
+                // weightedEPDiff = epDiff * channelWeights
+                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
+                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
+                // index = normalizedIndex * maxValue
+                //
+                // Equivalent to:
+                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
+                // index = dot(axis, pxDiff)
+
+                for (int ep = 0; ep < 2; ep++)
+                    for (int ch = 0; ch < TVectorSize; ch++)
+                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]); // stored for the Reconstruct* methods
+
+                m_range = range;
+                m_maxValue = static_cast<float>(range - 1); // highest selectable index
+
+                MFloat epDiffWeighted[TVectorSize];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]); // color-space endpoint 0 is the origin of the projection
+                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
+                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
+                }
+
+                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
+
+                ParallelMath::MakeSafeDenominator(lenSquared); // NOTE(review): presumably protects the division below when both endpoints coincide - confirm in ParallelMath
+
+                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared; // the "axis" from the derivation above
+            }
+
+            template<bool TSigned>
+            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
+            {
+                // Convenience overload for callers whose interpolation endpoints and color-space
+                // endpoints are the same 15-bit values: both roles are filled from endPoints.
+                // TSigned does not influence this overload; it stays in the signature so that
+                // existing call sites keep compiling. The generic Init performs the lossless
+                // cast to AInt16 itself, so no local conversion of the endpoints is needed here.
+                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
+            }
+
+            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels) // writes the first numRealChannels channels of the color interpolated at 'index' into pixel
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9)); // rounded 64 * index / (m_range - 1), using the 32768-scaled reciprocal table
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
+                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
+                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6)); // (ep0 * (64 - weight) + ep1 * weight + 32) >> 6
+                }
+            }
+
+            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels) // same blend as ReconstructLDR_BC7 but with weights on a 0..256 scale instead of 0..64
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7)); // rounded 256 * index / (m_range - 1)
+
+                for (int ch = 0; ch < numRealChannels; ch++)
+                {
+                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
+                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
+                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8)); // (ep0 * (256 - weight) + ep1 * weight + 128) >> 8
+                }
+            }
+
+            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel) // reconstructs all TVectorSize channels
+            {
+                ReconstructLDR_BC7(index, pixel, TVectorSize);
+            }
+
+            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel) // reconstructs all TVectorSize channels
+            {
+                ReconstructLDRPrecise(index, pixel, TVectorSize);
+            }
+
+            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const // projects the pixel onto the endpoint axis and returns the resulting index
+            {
+                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0]; // dot(pixel - m_origin, m_axis), accumulated channel by channel
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
+
+                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn); // clamp to [0, m_maxValue] before converting to an integer index
+            }
+
+        protected:
+            MAInt16 m_endPoint[2][TVectorSize]; // interpolation endpoints as passed to Init; read by the Reconstruct* methods
+
+        private:
+            MFloat m_origin[TVectorSize]; // color-space endpoint 0 converted to float
+            MFloat m_axis[TVectorSize]; // weighted, scaled endpoint direction; dotted with (pixel - m_origin) in SelectIndexLDR
+            int m_range; // number of selectable indexes; also the lookup position in g_weightReciprocals
+            float m_maxValue; // range - 1
+            bool m_isUniform; // true when all channel weights are equal; written by Init, not read in this class
+        };
+    }
+}
+
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h b/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h
new file mode 100644
index 0000000000..84795cd689
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_IndexSelectorHDR.h
@@ -0,0 +1,155 @@
+#pragma once
+#ifndef __CVTT_INDEXSELECTORHDR_H__
+#define __CVTT_INDEXSELECTORHDR_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+#include "ConvectionKernels_IndexSelector.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v);
+        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v);
+
+        template<int TVectorSize>
+        class IndexSelectorHDR : public IndexSelector<TVectorSize> // adds HDR reconstruction, optional index inversion and an exhaustive index search to IndexSelector
+        {
+        public:
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::UInt31 MUInt31;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            typedef ParallelMath::Float MFloat;
+
+        private:
+
+            MUInt15 InvertSingle(const MUInt15& anIndex) const // returns (range - 1) - anIndex in lanes where m_isInverted is set, anIndex elsewhere
+            {
+                MUInt15 inverted = m_maxValueMinusOne - anIndex;
+                return ParallelMath::Select(m_isInverted, inverted, anIndex);
+            }
+
+            void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const // interpolates the signed endpoints at 'index' (no inversion applied) and unscales the result
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9)); // rounded 64 * index / (m_range - 1)
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);
+                    MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
+
+                    MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1); // NOTE(review): XMultiply presumably widens to 32 bits - confirm in ParallelMath
+
+                    pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6); // rounded division by 64
+
+                    pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));
+                }
+            }
+
+            void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const // unsigned counterpart of ReconstructHDRSignedUninverted
+            {
+                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9)); // rounded 64 * index / (m_range - 1)
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);
+                    MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
+
+                    MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
+
+                    pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6); // rounded division by 64
+
+                    pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));
+                }
+            }
+
+            MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const // squared difference between pixel[ch] and the cached interpolator value
+            {
+                MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];
+                return diff * diff;
+            }
+
+            MFloat ErrorForInterpolator(int index, const MFloat *pixel) const // sum of the per-channel squared differences for one interpolator
+            {
+                MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
+                for (int ch = 1; ch < TVectorSize; ch++)
+                    error = error + ErrorForInterpolatorComponent(index, ch, pixel);
+                return error;
+            }
+
+        public:
+
+            void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights) // resets inversion state and, unless fastIndexing, caches every interpolated color for SelectIndexHDRSlow
+            {
+                assert(range <= 16); // m_reconstructedInterpolators has 16 rows
+
+                m_range = range; // this class's own m_range; the base class keeps a separate private copy
+
+                m_isInverted = ParallelMath::MakeBoolInt16(false);
+                m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1)); // holds range - 1 despite the name
+
+                if (!fastIndexing)
+                {
+                    for (int i = 0; i < range; i++)
+                    {
+                        MSInt16 recon2CL[TVectorSize];
+
+                        if (isSigned)
+                            ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL); // reads this->m_endPoint, so the base Init must already have run
+                        else
+                            ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
+
+                        for (int ch = 0; ch < TVectorSize; ch++)
+                            m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch]; // cached values are pre-multiplied by the channel weights
+                    }
+                }
+            }
+
+            void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const // applies the inversion state, then reconstructs
+            {
+                ReconstructHDRSignedUninverted(InvertSingle(index), pixel);
+            }
+
+            void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const // applies the inversion state, then reconstructs
+            {
+                ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);
+            }
+
+            void ConditionalInvert(const ParallelMath::Int16CompFlag &invert) // replaces the per-lane inversion flags
+            {
+                m_isInverted = invert;
+            }
+
+            MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const // exhaustive search over the interpolators cached by InitHDR (requires fastIndexing == false there)
+            {
+                MUInt15 index = ParallelMath::MakeUInt15(0);
+
+                MFloat bestError = ErrorForInterpolator(0, pixel); // NOTE(review): pixel is presumably pre-multiplied by the channel weights like the cache - confirm at call sites
+                for (int i = 1; i < m_range; i++)
+                {
+                    MFloat error = ErrorForInterpolator(i, pixel);
+                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                    ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
+                    bestError = ParallelMath::Min(bestError, error);
+                }
+
+                return InvertSingle(index);
+            }
+
+            MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const // reuses the base-class projection, then applies the inversion state
+            {
+                return InvertSingle(this->SelectIndexLDR(pixel, rtn));
+            }
+
+        private:
+            MFloat m_reconstructedInterpolators[16][TVectorSize]; // filled by InitHDR only when fastIndexing is false
+            ParallelMath::Int16CompFlag m_isInverted;
+            MUInt15 m_maxValueMinusOne;
+            int m_range;
+        };
+    }
+}
+#endif
+
diff --git a/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h b/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h
new file mode 100644
index 0000000000..7ac3d4fdda
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_PackedCovarianceMatrix.h
@@ -0,0 +1,68 @@
+#pragma once
+#ifndef __CVTT_COVARIANCEMATRIX_H__
+#define __CVTT_COVARIANCEMATRIX_H__
+
+namespace cvtt
+{
+    namespace Internal
+    {
+
+        template<int TMatrixSize>
+        class PackedCovarianceMatrix // symmetric TMatrixSize x TMatrixSize accumulator that stores only the lower triangle
+        {
+        public:
+            // Packed row-major lower triangle; the number before each row is the index of its first element:
+            // 0: xx,
+            // 1: xy, yy
+            // 3: xz, yz, zz
+            // 6: xw, yw, zw, ww
+            // ... etc.
+            static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;
+
+            typedef ParallelMath::Float MFloat;
+
+            PackedCovarianceMatrix() // starts with every stored element at zero
+            {
+                for (int i = 0; i < PyramidSize; i++)
+                    m_values[i] = ParallelMath::MakeFloatZero();
+            }
+
+            void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight) // accumulates weight * vec[row] * vec[col] into every stored element
+            {
+                int index = 0;
+                for (int row = 0; row < TMatrixSize; row++)
+                {
+                    for (int col = 0; col <= row; col++)
+                    {
+                        m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
+                        index++;
+                    }
+                }
+            }
+
+            void Product(MFloat *outVec, const MFloat *inVec) // outVec = M * inVec, where M is the full symmetric matrix
+            {
+                for (int row = 0; row < TMatrixSize; row++)
+                {
+                    MFloat sum = ParallelMath::MakeFloatZero();
+
+                    int index = (row * (row + 1)) >> 1; // packed position of element (row, 0)
+                    for (int col = 0; col < TMatrixSize; col++)
+                    {
+                        sum = sum + inVec[col] * m_values[index];
+                        if (col >= row)
+                            index += col + 1; // from the diagonal on, read the mirrored element: step from (col, row) to (col + 1, row)
+                        else
+                            index++; // left of the diagonal the elements of this row are contiguous
+                    }
+
+                    outVec[row] = sum;
+                }
+            }
+
+        private:
+            ParallelMath::Float m_values[PyramidSize];
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_ParallelMath.h b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
new file mode 100644
index 0000000000..9e25280f45
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h
@@ -0,0 +1,1816 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+#pragma once
+#ifndef __CVTT_PARALLELMATH_H__
+#define __CVTT_PARALLELMATH_H__
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_Config.h"
+
+#ifdef CVTT_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#include <float.h>
+#include <assert.h>
+#include <string.h>
+#include <algorithm>
+#include <math.h>
+
+#define UNREFERENCED_PARAMETER(n) ((void)n)
+
+// Parallel math implementation
+//
+// After preprocessor defs are handled, what this should do is expose the following types:
+// SInt16 - Signed 16-bit integer
+// UInt16 - Unsigned 16-bit integer
+// UInt15 - Unsigned 15-bit integer
+// SInt32 - Signed 32-bit integer
+// UInt31 - Unsigned 31-bit integer
+// AInt16 - 16-bit integer of unknown signedness (only used for storage)
+// Int16CompFlag - Comparison flags from comparing 16-bit integers
+// Int32CompFlag - Comparison flags from comparing 32-bit integers
+// FloatCompFlag - Comparison flags from comparing 32-bit floats
+//
+// The reason for these distinctions is that depending on the instruction set, signed or unsigned versions of certain ops
+// (particularly max, min, compares, and right shift) may not be available.  In cases where ops are not available, it's
+// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers.  The 15-bit and 31-bit uint types
+// can elide the bit flips if unsigned versions are not available.
+
+namespace cvtt
+{
+#ifdef CVTT_USE_SSE2
+    // SSE2 version
+    struct ParallelMath
+    {
+        typedef uint16_t ScalarUInt16;
+        typedef int16_t ScalarSInt16;
+
+        template<unsigned int TRoundingMode>
+        struct RoundForScope // switches the SSE rounding mode on construction and restores the saved register value on destruction
+        {
+            unsigned int m_oldCSR; // control/status register value captured before the change
+
+            RoundForScope()
+            {
+                m_oldCSR = _mm_getcsr();
+                _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode)); // replace only the rounding-control bits
+            }
+
+            ~RoundForScope()
+            {
+                _mm_setcsr(m_oldCSR);
+            }
+        };
+
+        struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO> // selects _MM_ROUND_TOWARD_ZERO for the lifetime of the object
+        {
+        };
+
+        struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST> // selects _MM_ROUND_NEAREST for the lifetime of the object
+        {
+        };
+
+        struct RoundUpForScope : RoundForScope<_MM_ROUND_UP> // selects _MM_ROUND_UP for the lifetime of the object
+        {
+        };
+
+        struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN> // selects _MM_ROUND_DOWN for the lifetime of the object
+        {
+        };
+
+        static const int ParallelSize = 8; // lanes processed per vector value in this SSE2 implementation
+
+        enum Int16Subtype // tag used as the template argument of VInt16 / VInt32
+        {
+            IntSubtype_Signed, // SInt16, SInt32
+            IntSubtype_UnsignedFull, // UInt16, UInt32
+            IntSubtype_UnsignedTruncated, // UInt15, UInt31
+            IntSubtype_Abstract, // AInt16, AInt32
+        };
+
+        template<int TSubtype>
+        struct VInt16 // eight 16-bit integer lanes in one SSE register; TSubtype only tags the signedness interpretation
+        {
+            __m128i m_value;
+
+            inline VInt16 operator+(int16_t other) const // adds the same scalar to every lane
+            {
+                VInt16 result;
+                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
+                return result;
+            }
+
+            inline VInt16 operator+(const VInt16 &other) const // lane-wise 16-bit add
+            {
+                VInt16 result;
+                result.m_value = _mm_add_epi16(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator|(const VInt16 &other) const // bitwise OR
+            {
+                VInt16 result;
+                result.m_value = _mm_or_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator&(const VInt16 &other) const // bitwise AND
+            {
+                VInt16 result;
+                result.m_value = _mm_and_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator-(const VInt16 &other) const // lane-wise 16-bit subtract
+            {
+                VInt16 result;
+                result.m_value = _mm_sub_epi16(m_value, other.m_value);
+                return result;
+            }
+
+            inline VInt16 operator<<(int bits) const // shifts every lane left by the same bit count
+            {
+                VInt16 result;
+                result.m_value = _mm_slli_epi16(m_value, bits);
+                return result;
+            }
+
+            inline VInt16 operator^(const VInt16 &other) const // bitwise XOR
+            {
+                VInt16 result;
+                result.m_value = _mm_xor_si128(m_value, other.m_value);
+                return result;
+            }
+        };
+
+        typedef VInt16<IntSubtype_Signed> SInt16; // same storage for all four; only the subtype tag differs
+        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
+        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
+        typedef VInt16<IntSubtype_Abstract> AInt16;
+
+        template<int TSubtype>
+        struct VInt32 // eight 32-bit integer lanes split across two SSE registers; TSubtype only tags the signedness interpretation
+        {
+            __m128i m_values[2];
+
+            inline VInt32 operator+(const VInt32& other) const // lane-wise 32-bit add
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline VInt32 operator-(const VInt32& other) const // lane-wise 32-bit subtract
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline VInt32 operator<<(const int other) const // shifts every lane left by the same bit count
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
+                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
+                return result;
+            }
+
+            inline VInt32 operator|(const VInt32& other) const // bitwise OR
+            {
+                VInt32 result;
+                result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        typedef VInt32<IntSubtype_Signed> SInt32; // same storage for all four; only the subtype tag differs
+        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
+        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
+        typedef VInt32<IntSubtype_Abstract> AInt32;
+
+        template<class TTargetType>
+        struct LosslessCast // relabels an integer vector as TTargetType (another subtype of the same width); lane bits are not modified
+        {
+#ifdef CVTT_PERMIT_ALIASING
+            template<int TSrcSubtype>
+            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
+            {
+                return reinterpret_cast<const TTargetType&>(src); // alias in place: cast to the declared return type and keep src's constness
+            }
+
+            template<int TSrcSubtype>
+            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
+            {
+                return reinterpret_cast<const TTargetType&>(src); // alias in place: cast to the declared return type and keep src's constness
+            }
+#else
+            template<int TSrcSubtype>
+            static TTargetType Cast(const VInt32<TSrcSubtype> &src) // copying variant: duplicates both registers into a fresh TTargetType
+            {
+                TTargetType result;
+                result.m_values[0] = src.m_values[0];
+                result.m_values[1] = src.m_values[1];
+                return result;
+            }
+
+            template<int TSrcSubtype>
+            static TTargetType Cast(const VInt16<TSrcSubtype> &src) // copying variant: duplicates the register into a fresh TTargetType
+            {
+                TTargetType result;
+                result.m_value = src.m_value;
+                return result;
+            }
+#endif
+        };
+
+        struct Int64 // storage only: four SSE registers (two 64-bit lanes each when used as 64-bit values)
+        {
+            __m128i m_values[4];
+        };
+
+        struct Float // eight single-precision lanes split across two SSE registers
+        {
+            __m128 m_values[2];
+
+            inline Float operator+(const Float &other) const // lane-wise add
+            {
+                Float result;
+                result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator+(float other) const // adds the same scalar to every lane
+            {
+                Float result;
+                result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+
+            inline Float operator-(const Float& other) const // lane-wise subtract
+            {
+                Float result;
+                result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator-() const // negation computed as 0 - value
+            {
+                Float result;
+                result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
+                result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
+                return result;
+            }
+
+            inline Float operator*(const Float& other) const // lane-wise multiply
+            {
+                Float result;
+                result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator*(float other) const // multiplies every lane by the same scalar
+            {
+                Float result;
+                result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+
+            inline Float operator/(const Float &other) const // lane-wise divide
+            {
+                Float result;
+                result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Float operator/(float other) const // divides every lane by the same scalar
+            {
+                Float result;
+                result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
+                result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
+                return result;
+            }
+        };
+
+        struct Int16CompFlag // per-lane mask for 16-bit values; consumed by Select as a bit mask
+        {
+            __m128i m_value;
+
+            inline Int16CompFlag operator&(const Int16CompFlag &other) const // bitwise AND of the two masks
+            {
+                Int16CompFlag result;
+                result.m_value = _mm_and_si128(m_value, other.m_value);
+                return result;
+            }
+
+            inline Int16CompFlag operator|(const Int16CompFlag &other) const // bitwise OR of the two masks
+            {
+                Int16CompFlag result;
+                result.m_value = _mm_or_si128(m_value, other.m_value);
+                return result;
+            }
+        };
+
+        struct Int32CompFlag // mask for 32-bit values, split across two registers like VInt32
+        {
+            __m128i m_values[2];
+
+            inline Int32CompFlag operator&(const Int32CompFlag &other) const // bitwise AND of the two masks
+            {
+                Int32CompFlag result;
+                result.m_values[0] = _mm_and_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_and_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline Int32CompFlag operator|(const Int32CompFlag &other) const // bitwise OR of the two masks
+            {
+                Int32CompFlag result;
+                result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        struct FloatCompFlag // mask for float values, split across two registers like Float
+        {
+            __m128 m_values[2];
+
+            inline FloatCompFlag operator&(const FloatCompFlag &other) const // bitwise AND of the two masks
+            {
+                FloatCompFlag result;
+                result.m_values[0] = _mm_and_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_and_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+
+            inline FloatCompFlag operator|(const FloatCompFlag &other) const // bitwise OR of the two masks
+            {
+                FloatCompFlag result;
+                result.m_values[0] = _mm_or_ps(m_values[0], other.m_values[0]);
+                result.m_values[1] = _mm_or_ps(m_values[1], other.m_values[1]);
+                return result;
+            }
+        };
+
+        template<int TSubtype>
+        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // Lane-wise 16-bit a + b.
+        {
+            VInt16<TSubtype> sum;
+            sum.m_value = _mm_add_epi16(a.m_value, b.m_value);
+            return sum;
+        }
+
+        template<int TSubtype>
+        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // Lane-wise 16-bit a - b.
+        {
+            VInt16<TSubtype> difference;
+            difference.m_value = _mm_sub_epi16(a.m_value, b.m_value);
+            return difference;
+        }
+
+        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b) // Bitwise blend: a where flag bits are set, b where they are clear.
+        {
+            Float picked;
+            picked.m_values[0] = _mm_or_ps(_mm_and_ps(flag.m_values[0], a.m_values[0]), _mm_andnot_ps(flag.m_values[0], b.m_values[0]));
+            picked.m_values[1] = _mm_or_ps(_mm_and_ps(flag.m_values[1], a.m_values[1]), _mm_andnot_ps(flag.m_values[1], b.m_values[1]));
+            return picked;
+        }
+
+        template<int TSubtype>
+        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // Same bitwise blend for the 16-bit integer vector.
+        {
+            VInt16<TSubtype> picked;
+            picked.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
+            return picked;
+        }
+
+        template<int TSubtype>
+        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a) // Keeps the bits of a where flag is set; all other bits become zero.
+        {
+            VInt16<TSubtype> masked;
+            masked.m_value = _mm_and_si128(flag.m_value, a.m_value);
+            return masked;
+        }
+
+        template<int TSubtype>
+        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) // dest takes src where flag bits are set; other bits of dest are kept.
+        {
+            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, src.m_value), _mm_andnot_si128(flag.m_value, dest.m_value));
+        }
+
+        template<int TSubtype>
+        static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src) // Each 16-bit flag lane is duplicated so it covers the matching 32-bit lane.
+        {
+            __m128i maskLo = _mm_unpacklo_epi16(flag.m_value, flag.m_value);
+            __m128i maskHi = _mm_unpackhi_epi16(flag.m_value, flag.m_value);
+            dest.m_values[0] = _mm_or_si128(_mm_and_si128(maskLo, src.m_values[0]), _mm_andnot_si128(maskLo, dest.m_values[0]));
+            dest.m_values[1] = _mm_or_si128(_mm_and_si128(maskHi, src.m_values[1]), _mm_andnot_si128(maskHi, dest.m_values[1]));
+        }
+
+        static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src) // Same blend, applied to a flag value.
+        {
+            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, src.m_value), _mm_andnot_si128(flag.m_value, dest.m_value));
+        }
+
+        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v) // Lanes with an all-ones flag become (~v + 1); lanes with a zero flag are unchanged.
+        {
+            SInt16 negated;
+            negated.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
+            return negated;
+        }
+
+        template<int TSubtype>
+        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) // dest takes src where flag bits are CLEAR; bits under a set flag are kept.
+        {
+            dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, src.m_value), _mm_and_si128(flag.m_value, dest.m_value));
+        }
+
+        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) // dest takes src where flag bits are set; other bits of dest are kept.
+        {
+            dest.m_values[0] = _mm_or_ps(_mm_andnot_ps(flag.m_values[0], dest.m_values[0]), _mm_and_ps(flag.m_values[0], src.m_values[0]));
+            dest.m_values[1] = _mm_or_ps(_mm_andnot_ps(flag.m_values[1], dest.m_values[1]), _mm_and_ps(flag.m_values[1], src.m_values[1]));
+        }
+
+        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) // dest takes src where flag bits are CLEAR; bits under a set flag are kept.
+        {
+            dest.m_values[0] = _mm_or_ps(_mm_and_ps(flag.m_values[0], dest.m_values[0]), _mm_andnot_ps(flag.m_values[0], src.m_values[0]));
+            dest.m_values[1] = _mm_or_ps(_mm_and_ps(flag.m_values[1], dest.m_values[1]), _mm_andnot_ps(flag.m_values[1], src.m_values[1]));
+        }
+
+        static void MakeSafeDenominator(Float& v) // Overwrites every lane of v that compares equal to zero with 1.0f; other lanes are untouched.
+        {
+            ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
+        }
+
+        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision) // Keeps the low `precision` bits of each lane and sign-extends them back to 16 bits.
+        {
+            int discarded = 16 - precision;
+            if (discarded == 0)
+                return v;
+
+            SInt16 truncated;
+            truncated.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, discarded), discarded);
+            return truncated;
+        }
+
+        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision) // Keeps the low `precision` bits of each lane and zero-fills the bits above them.
+        {
+            int discarded = 16 - precision;
+            if (discarded == 0)
+                return v;
+
+            UInt16 truncated;
+            truncated.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, discarded), discarded);
+            return truncated;
+        }
+
+        static UInt16 Min(const UInt16 &a, const UInt16 &b) // Unsigned min built on the signed instruction: flip the top bit, compare, flip back.
+        {
+            const __m128i signFlip = _mm_set1_epi16(-32768);
+
+            UInt16 smaller;
+            smaller.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, signFlip), _mm_xor_si128(b.m_value, signFlip)), signFlip);
+            return smaller;
+        }
+
+        static SInt16 Min(const SInt16 &a, const SInt16 &b) // Signed 16-bit min per lane.
+        {
+            SInt16 smaller;
+            smaller.m_value = _mm_min_epi16(a.m_value, b.m_value);
+            return smaller;
+        }
+
+        static UInt15 Min(const UInt15 &a, const UInt15 &b) // Uses the signed 16-bit min directly.
+        {
+            UInt15 smaller;
+            smaller.m_value = _mm_min_epi16(a.m_value, b.m_value);
+            return smaller;
+        }
+
+        static Float Min(const Float &a, const Float &b) // Float min per lane (_mm_min_ps, operands in a, b order).
+        {
+            Float smaller;
+            smaller.m_values[0] = _mm_min_ps(a.m_values[0], b.m_values[0]);
+            smaller.m_values[1] = _mm_min_ps(a.m_values[1], b.m_values[1]);
+            return smaller;
+        }
+
+        static UInt16 Max(const UInt16 &a, const UInt16 &b) // Unsigned max built on the signed instruction: flip the top bit, compare, flip back.
+        {
+            const __m128i signFlip = _mm_set1_epi16(-32768);
+
+            UInt16 larger;
+            larger.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, signFlip), _mm_xor_si128(b.m_value, signFlip)), signFlip);
+            return larger;
+        }
+
+        static SInt16 Max(const SInt16 &a, const SInt16 &b) // Signed 16-bit max per lane.
+        {
+            SInt16 larger;
+            larger.m_value = _mm_max_epi16(a.m_value, b.m_value);
+            return larger;
+        }
+
+        static UInt15 Max(const UInt15 &a, const UInt15 &b) // Uses the signed 16-bit max directly.
+        {
+            UInt15 larger;
+            larger.m_value = _mm_max_epi16(a.m_value, b.m_value);
+            return larger;
+        }
+
+        static Float Max(const Float &a, const Float &b) // Float max per lane (_mm_max_ps, operands in a, b order).
+        {
+            Float larger;
+            larger.m_values[0] = _mm_max_ps(a.m_values[0], b.m_values[0]);
+            larger.m_values[1] = _mm_max_ps(a.m_values[1], b.m_values[1]);
+            return larger;
+        }
+
+        static Float Clamp(const Float &v, float min, float max) // Per lane: applies the upper bound `max` first, then the lower bound `min`.
+        {
+            Float bounded;
+            bounded.m_values[0] = _mm_max_ps(_mm_min_ps(v.m_values[0], _mm_set1_ps(max)), _mm_set1_ps(min));
+            bounded.m_values[1] = _mm_max_ps(_mm_min_ps(v.m_values[1], _mm_set1_ps(max)), _mm_set1_ps(min));
+            return bounded;
+        }
+
+        static Float Reciprocal(const Float &v) // Per-lane reciprocal via _mm_rcp_ps (an approximation instruction, not an exact divide).
+        {
+            Float inverse;
+            inverse.m_values[0] = _mm_rcp_ps(v.m_values[0]);
+            inverse.m_values[1] = _mm_rcp_ps(v.m_values[1]);
+            return inverse;
+        }
+
+        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut) // Gathers one channel of pixel `pxOffset` from 8 consecutive blocks into the 8 lanes of chOut.
+        {
+            int16_t lanes[8];
+            for (int block = 0; block < 8; block++)
+                lanes[block] = inputBlocks[block].m_pixels[pxOffset][channel];
+
+            chOut.m_value = _mm_set_epi16(lanes[7], lanes[6], lanes[5], lanes[4], lanes[3], lanes[2], lanes[1], lanes[0]);
+        }
+
+        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut) // Gathers one channel of pixel `pxOffset` from 8 consecutive blocks into the 8 lanes of chOut.
+        {
+            int16_t lanes[8];
+            for (int block = 0; block < 8; block++)
+                lanes[block] = inputBlocks[block].m_pixels[pxOffset][channel];
+
+            chOut.m_value = _mm_set_epi16(lanes[7], lanes[6], lanes[5], lanes[4], lanes[3], lanes[2], lanes[1], lanes[0]);
+        }
+
+        static Float MakeFloat(float v) // Broadcasts v to every lane of both halves.
+        {
+            Float splat;
+            splat.m_values[0] = splat.m_values[1] = _mm_set1_ps(v);
+            return splat;
+        }
+
+        static Float MakeFloatZero() // All lanes of both halves set to 0.0f.
+        {
+            Float zeros;
+            zeros.m_values[0] = zeros.m_values[1] = _mm_setzero_ps();
+            return zeros;
+        }
+
+        static UInt16 MakeUInt16(uint16_t v) // Broadcasts v to all 8 lanes (bit pattern preserved through the cast to short).
+        {
+            UInt16 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static SInt16 MakeSInt16(int16_t v) // Broadcasts v to all 8 lanes.
+        {
+            SInt16 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static AInt16 MakeAInt16(int16_t v) // Broadcasts v to all 8 lanes.
+        {
+            AInt16 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static UInt15 MakeUInt15(uint16_t v) // Broadcasts v to all 8 lanes (bit pattern preserved through the cast to short).
+        {
+            UInt15 splat;
+            splat.m_value = _mm_set1_epi16(static_cast<short>(v));
+            return splat;
+        }
+
+        static SInt32 MakeSInt32(int32_t v) // Broadcasts v to every 32-bit lane of both halves.
+        {
+            SInt32 splat;
+            for (int half = 0; half < 2; half++)
+                splat.m_values[half] = _mm_set1_epi32(v);
+            return splat;
+        }
+
+        static UInt31 MakeUInt31(uint32_t v) // Broadcasts v to every 32-bit lane of both halves.
+        {
+            UInt31 splat;
+            for (int half = 0; half < 2; half++)
+                splat.m_values[half] = _mm_set1_epi32(v);
+            return splat;
+        }
+
+        static uint16_t Extract(const UInt16 &v, int offset) // Reads 16-bit lane `offset` of v through a scalar view of the register.
+        {
+            return *(reinterpret_cast<const uint16_t*>(&v.m_value) + offset);
+        }
+
+        static int16_t Extract(const SInt16 &v, int offset) // Reads 16-bit lane `offset` as signed.
+        {
+            return *(reinterpret_cast<const int16_t*>(&v.m_value) + offset);
+        }
+
+        static uint16_t Extract(const UInt15 &v, int offset) // Reads 16-bit lane `offset` as unsigned.
+        {
+            return *(reinterpret_cast<const uint16_t*>(&v.m_value) + offset);
+        }
+
+        static int16_t Extract(const AInt16 &v, int offset) // Reads 16-bit lane `offset` as signed.
+        {
+            return *(reinterpret_cast<const int16_t*>(&v.m_value) + offset);
+        }
+
+        static int32_t Extract(const SInt32 &v, int offset) // offset >> 2 picks the half, offset & 3 the 32-bit lane inside it.
+        {
+            return *(reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2]) + (offset & 3));
+        }
+
+        static float Extract(const Float &v, int offset) // offset >> 2 picks the half, offset & 3 the float lane inside it.
+        {
+            return *(reinterpret_cast<const float*>(&v.m_values[offset >> 2]) + (offset & 3));
+        }
+
+        static bool Extract(const ParallelMath::Int16CompFlag &v, int offset) // True when 16-bit lane `offset` of the flag is non-zero.
+        {
+            return *(reinterpret_cast<const int16_t*>(&v.m_value) + offset) != 0;
+        }
+
+        static void PutUInt16(UInt16 &dest, int offset, uint16_t v) // Stores v into 16-bit lane `offset`. NOTE(review): indexes from &dest, so assumes the vector sits at offset 0 of UInt16 - confirm.
+        {
+            *(reinterpret_cast<uint16_t*>(&dest) + offset) = v;
+        }
+
+        static void PutUInt15(UInt15 &dest, int offset, uint16_t v) // Stores v into 16-bit lane `offset`. NOTE(review): indexes from &dest, so assumes the vector sits at offset 0 of UInt15 - confirm.
+        {
+            *(reinterpret_cast<uint16_t*>(&dest) + offset) = v;
+        }
+
+        static void PutSInt16(SInt16 &dest, int offset, int16_t v) // Stores v into 16-bit lane `offset`. NOTE(review): indexes from &dest, so assumes the vector sits at offset 0 of SInt16 - confirm.
+        {
+            *(reinterpret_cast<int16_t*>(&dest) + offset) = v;
+        }
+
+        static float ExtractFloat(const Float& v, int offset) // Reads float number `offset` counted from the start of the object. NOTE(review): assumes m_values is at offset 0 of Float - confirm.
+        {
+            return *(reinterpret_cast<const float*>(&v) + offset);
+        }
+
+        static void PutFloat(Float &dest, int offset, float v) // Stores v as float number `offset` counted from the start of the object. NOTE(review): assumes m_values is at offset 0 of Float - confirm.
+        {
+            *(reinterpret_cast<float*>(&dest) + offset) = v;
+        }
+
+        static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v) // Sets 16-bit lane `offset` of the flag to all-ones (true) or zero (false).
+        {
+            *(reinterpret_cast<int16_t*>(&dest) + offset) = v ? -1 : 0;
+        }
+
+        static Int32CompFlag Less(const UInt31 &a, const UInt31 &b) // Per 32-bit lane: all-ones where a < b (signed compare instruction).
+        {
+            Int32CompFlag isLess;
+            for (int half = 0; half < 2; half++)
+                isLess.m_values[half] = _mm_cmplt_epi32(a.m_values[half], b.m_values[half]);
+            return isLess;
+        }
+
+        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b) // Per 16-bit lane: all-ones where a < b (signed).
+        {
+            Int16CompFlag isLess;
+            isLess.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
+            return isLess;
+        }
+
+        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b) // Per 16-bit lane: all-ones where a < b (signed compare instruction).
+        {
+            Int16CompFlag isLess;
+            isLess.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
+            return isLess;
+        }
+
+        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b) // Per 16-bit lane: all-ones where a <= b, zero otherwise.
+        { // Fix: the previous _mm_cmplt_epi16 reported false for a == b. SSE2 has no 16-bit "<=", so test min(a, b) == a instead.
+            Int16CompFlag result;
+            result.m_value = _mm_cmpeq_epi16(_mm_min_epi16(a.m_value, b.m_value), a.m_value);
+            return result;
+        }
+
+        static FloatCompFlag Less(const Float &a, const Float &b) // Per float lane: all-ones where a < b.
+        {
+            FloatCompFlag isLess;
+            isLess.m_values[0] = _mm_cmplt_ps(a.m_values[0], b.m_values[0]);
+            isLess.m_values[1] = _mm_cmplt_ps(a.m_values[1], b.m_values[1]);
+            return isLess;
+        }
+
+        static FloatCompFlag LessOrEqual(const Float &a, const Float &b) // Per float lane: all-ones where a <= b.
+        {
+            FloatCompFlag isLessOrEqual;
+            isLessOrEqual.m_values[0] = _mm_cmple_ps(a.m_values[0], b.m_values[0]);
+            isLessOrEqual.m_values[1] = _mm_cmple_ps(a.m_values[1], b.m_values[1]);
+            return isLessOrEqual;
+        }
+
+        template<int TSubtype>
+        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) // Per 16-bit lane: all-ones where a == b.
+        {
+            Int16CompFlag same;
+            same.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
+            return same;
+        }
+
+        static FloatCompFlag Equal(const Float &a, const Float &b) // Per float lane: all-ones where a == b.
+        {
+            FloatCompFlag same;
+            same.m_values[0] = _mm_cmpeq_ps(a.m_values[0], b.m_values[0]);
+            same.m_values[1] = _mm_cmpeq_ps(a.m_values[1], b.m_values[1]);
+            return same;
+        }
+
+        static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b) // Bitwise XNOR of two flags: NOT (a XOR b).
+        {
+            Int16CompFlag differs;
+            differs.m_value = _mm_xor_si128(a.m_value, b.m_value);
+            return Not(differs);
+        }
+
+        static Float ToFloat(const UInt16 &v) // Zero-extends each 16-bit lane to 32 bits, then converts to float.
+        {
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
+            converted.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
+            return converted;
+        }
+
+        static UInt31 ToUInt31(const UInt16 &v) // Zero-extends each 16-bit lane to a 32-bit lane.
+        {
+            UInt31 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
+            return widened;
+        }
+
+        static SInt32 ToInt32(const UInt16 &v) // Zero-extends each 16-bit lane to a 32-bit lane.
+        {
+            SInt32 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
+            return widened;
+        }
+
+        static SInt32 ToInt32(const UInt15 &v) // Zero-extends each 16-bit lane to a 32-bit lane.
+        {
+            SInt32 widened;
+            widened.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
+            widened.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
+            return widened;
+        }
+
+        static SInt32 ToInt32(const SInt16 &v) // Sign-extends: places each lane in the upper 16 bits, then arithmetic-shifts right by 16.
+        {
+            SInt32 widened;
+            widened.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
+            widened.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
+            return widened;
+        }
+
+        static Float ToFloat(const SInt16 &v) // Sign-extends each 16-bit lane to 32 bits, then converts to float.
+        {
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
+            converted.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
+            return converted;
+        }
+
+        static Float ToFloat(const UInt15 &v) // Zero-extends each 16-bit lane to 32 bits, then converts to float.
+        {
+            Float converted;
+            converted.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
+            converted.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
+            return converted;
+        }
+
+        static Float ToFloat(const UInt31 &v) // Converts each 32-bit lane to float (signed conversion instruction).
+        {
+            Float converted;
+            for (int half = 0; half < 2; half++)
+                converted.m_values[half] = _mm_cvtepi32_ps(v.m_values[half]);
+            return converted;
+        }
+
+        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v) // Reinterprets both float masks as integers and packs the eight 32-bit lanes into eight 16-bit lanes.
+        {
+            __m128i firstHalf = _mm_castps_si128(v.m_values[0]);
+            __m128i secondHalf = _mm_castps_si128(v.m_values[1]);
+
+            Int16CompFlag narrowed;
+            narrowed.m_value = _mm_packs_epi32(firstHalf, secondHalf);
+            return narrowed;
+        }
+
+        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v) // Duplicates each 16-bit flag lane into a 32-bit lane and reinterprets the result as float masks.
+        {
+            __m128i firstHalf = _mm_unpacklo_epi16(v.m_value, v.m_value);
+            __m128i secondHalf = _mm_unpackhi_epi16(v.m_value, v.m_value);
+
+            FloatCompFlag widened;
+            widened.m_values[0] = _mm_castsi128_ps(firstHalf);
+            widened.m_values[1] = _mm_castsi128_ps(secondHalf);
+            return widened;
+        }
+
+        static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v) // Packs the eight 32-bit flag lanes into eight 16-bit lanes.
+        {
+            __m128i firstHalf = v.m_values[0];
+            __m128i secondHalf = v.m_values[1];
+
+            Int16CompFlag narrowed;
+            narrowed.m_value = _mm_packs_epi32(firstHalf, secondHalf);
+            return narrowed;
+        }
+
+        static Int16CompFlag MakeBoolInt16(bool b) // Uniform flag: every lane all-ones when b is true, all-zero when false.
+        {
+            Int16CompFlag uniform;
+            if (!b)
+                uniform.m_value = _mm_setzero_si128();
+            else
+                uniform.m_value = _mm_set1_epi16(-1);
+            return uniform;
+        }
+
+        static FloatCompFlag MakeBoolFloat(bool b) // Uniform flag: every bit set when b is true, every bit clear when false.
+        {
+            FloatCompFlag uniform;
+            if (!b)
+                uniform.m_values[0] = uniform.m_values[1] = _mm_setzero_ps();
+            else
+                uniform.m_values[0] = uniform.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
+            return uniform;
+        }
+
+        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b) // Computes a AND (NOT b); the intrinsic negates its first operand, hence the swapped argument order.
+        {
+            Int16CompFlag masked;
+            masked.m_value = _mm_andnot_si128(b.m_value, a.m_value);
+            return masked;
+        }
+
+        static Int16CompFlag Not(const Int16CompFlag &b) // Inverts every bit of the flag (XOR with all-ones).
+        {
+            Int16CompFlag inverted;
+            inverted.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-1));
+            return inverted;
+        }
+
+        static Int32CompFlag Not(const Int32CompFlag &b) // Inverts every bit of both halves.
+        {
+            Int32CompFlag inverted;
+            for (int half = 0; half < 2; half++)
+                inverted.m_values[half] = _mm_xor_si128(b.m_values[half], _mm_set1_epi32(-1));
+            return inverted;
+        }
+
+        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/) // Float -> 16-bit lanes; the roundingMode argument is ignored here.
+        { // Biases by -32768 so the signed saturating pack can be used, then flips the top bit of each lane to undo the bias.
+            __m128i biasedLo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
+            __m128i biasedHi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
+
+            __m128i biasedPacked = _mm_packs_epi32(biasedLo, biasedHi);
+
+            UInt16 converted;
+            converted.m_value = _mm_xor_si128(biasedPacked, _mm_set1_epi16(-32768));
+            return converted;
+        }
+
+        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/) // Float -> 16-bit lanes via _mm_cvtps_epi32 and a signed saturating pack; roundingMode is ignored here.
+        {
+            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
+            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
+
+            __m128i packed = _mm_packs_epi32(lo, hi);
+
+            UInt15 result;
+            result.m_value = packed; // Reuse the packed value; it was previously computed a second time and this local left unused.
+            return result;
+        }
+
+        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/) // Float -> signed 16-bit lanes via _mm_cvtps_epi32 and a signed saturating pack; roundingMode is ignored here.
+        {
+            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
+            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
+
+            __m128i packed = _mm_packs_epi32(lo, hi);
+
+            SInt16 result;
+            result.m_value = packed; // Reuse the packed value; it was previously computed a second time and this local left unused.
+            return result;
+        }
+
+        static Float Sqrt(const Float &f) // Per-lane square root.
+        {
+            Float root;
+            root.m_values[0] = _mm_sqrt_ps(f.m_values[0]);
+            root.m_values[1] = _mm_sqrt_ps(f.m_values[1]);
+            return root;
+        }
+
+        static UInt16 Abs(const SInt16 &a) // Two's-complement absolute value: XOR with the sign mask, then add the sign bit.
+        {
+            __m128i signMask = _mm_srai_epi16(a.m_value, 15);
+            __m128i signBit = _mm_srli_epi16(a.m_value, 15);
+
+            UInt16 magnitude;
+            magnitude.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signMask), signBit);
+            return magnitude;
+        }
+
+        static Float Abs(const Float& a) // Clears the sign bit of every float lane.
+        {
+            __m128 signOnly = _mm_set1_ps(-0.0f);
+
+            Float magnitude;
+            for (int half = 0; half < 2; half++)
+                magnitude.m_values[half] = _mm_andnot_ps(signOnly, a.m_values[half]);
+            return magnitude;
+        }
+
+        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b) // (a - b)^2 per lane, keeping only the low 16 bits of the product.
+        {
+            __m128i delta = _mm_sub_epi16(a.m_value, b.m_value);
+
+            UInt16 squared;
+            squared.m_value = _mm_mullo_epi16(delta, delta);
+            return squared;
+        }
+
+        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b) // (a - b)^2 per lane, returned as float.
+        {
+            __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value)); // |a - b| as an unsigned 16-bit value (max - min)
+
+            __m128i mulHi = _mm_mulhi_epu16(diffU, diffU); // upper 16 bits of each unsigned 16x16 product
+            __m128i mulLo = _mm_mullo_epi16(diffU, diffU); // lower 16 bits of each product
+            __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi); // interleave low/high words into 32-bit squares, lanes 4-7
+            __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi); // same for lanes 0-3
+
+            Float result;
+            result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo); // NOTE(review): signed conversion; a difference >= 46341 squares past INT32_MAX and would convert as negative - confirm the input range
+            result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
+
+            return result;
+        }
+
+        static Float TwosCLHalfToFloat(const SInt16 &v) // Rebuilds a 32-bit float per lane from the sign / exponent / mantissa bit fields of a 16-bit value.
+        {
+            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15)); // NOTE(review): absV is never read below; the fields are masked from the raw v.m_value. If negative inputs are two's-complement encoded (as the name suggests) they presumably should come from absV - confirm against the scalar path.
+
+            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768)); // bit 15
+            __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff)); // bits 0-9
+            __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00)); // bits 10-14
+
+            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128()); // all-ones for lanes whose exponent field is zero
+
+            // Move the exponent field into its position in the upper float word and rebias it: 14336 == (127 - 15) << 7.
+            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
+
+            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336))); // upper word of a same-signed float with that exponent and zero mantissa; zero for other lanes
+
+            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3))); // upper float word: sign, exponent, top 7 mantissa bits
+            __m128i lowBits = _mm_slli_epi16(mantissa, 13); // lower float word: remaining 3 mantissa bits in the top positions
+
+            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits); // assemble 32-bit patterns for lanes 0-3
+            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits); // and lanes 4-7
+
+            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
+            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
+
+            Float result;
+            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow)); // zero-exponent lanes: remove the implicit leading one added by the exponent rebias
+            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
+
+            return result;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) // (TwosCLHalfToFloat(a) - b)^2 per lane.
+        {
+            Float decodedA = TwosCLHalfToFloat(a);
+
+            Float delta = decodedA - b;
+            return delta * delta;
+        }
+
+        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) // (TwosCLHalfToFloat(a) - TwosCLHalfToFloat(b))^2 per lane.
+        {
+            Float decodedA = TwosCLHalfToFloat(a);
+            Float decodedB = TwosCLHalfToFloat(b);
+
+            Float delta = decodedA - decodedB;
+            return delta * delta;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b) // (TwosCLHalfToFloat(a) * aWeight - b)^2 per lane.
+        {
+            Float weightedA = TwosCLHalfToFloat(a) * aWeight;
+
+            Float delta = weightedA - b;
+            return delta * delta;
+        }
+
+        static UInt16 RightShift(const UInt16 &v, int bits) // Logical (zero-filling) shift of each 16-bit lane.
+        {
+            UInt16 shifted;
+            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        static UInt31 RightShift(const UInt31 &v, int bits) // Logical (zero-filling) shift of each 32-bit lane.
+        {
+            UInt31 shifted;
+            for (int half = 0; half < 2; half++)
+                shifted.m_values[half] = _mm_srli_epi32(v.m_values[half], bits);
+            return shifted;
+        }
+
+        static SInt16 RightShift(const SInt16 &v, int bits) // Arithmetic (sign-filling) shift of each 16-bit lane.
+        {
+            SInt16 shifted;
+            shifted.m_value = _mm_srai_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        static UInt15 RightShift(const UInt15 &v, int bits) // Logical (zero-filling) shift of each 16-bit lane.
+        {
+            UInt15 shifted;
+            shifted.m_value = _mm_srli_epi16(v.m_value, bits);
+            return shifted;
+        }
+
+        static SInt32 RightShift(const SInt32 &v, int bits) // Arithmetic (sign-filling) shift of each 32-bit lane.
+        {
+            SInt32 shifted;
+            for (int half = 0; half < 2; half++)
+                shifted.m_values[half] = _mm_srai_epi32(v.m_values[half], bits);
+            return shifted;
+        }
+
+        static SInt16 ToSInt16(const SInt32 &v) // Narrows 32-bit lanes to 16-bit with signed saturation.
+        {
+            SInt16 narrowed;
+            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
+            return narrowed;
+        }
+
+        static SInt16 ToSInt16(const UInt16 &v) // Reinterprets the same 128 bits; no value conversion.
+        {
+            SInt16 sameBits;
+            sameBits.m_value = v.m_value;
+            return sameBits;
+        }
+
+        static SInt16 ToSInt16(const UInt15 &v) // Reinterprets the same 128 bits; no value conversion.
+        {
+            SInt16 sameBits;
+            sameBits.m_value = v.m_value;
+            return sameBits;
+        }
+
+        static UInt16 ToUInt16(const UInt32 &v) // Keeps the low 16 bits of each 32-bit lane.
+        { // Sign-extending the low half first keeps every lane in int16 range, so the saturating pack passes the bits through.
+            __m128i loHalf = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
+            __m128i hiHalf = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
+
+            UInt16 narrowed;
+            narrowed.m_value = _mm_packs_epi32(loHalf, hiHalf);
+            return narrowed;
+        }
+
+        static UInt16 ToUInt16(const UInt31 &v) // Keeps the low 16 bits of each 32-bit lane.
+        { // Sign-extending the low half first keeps every lane in int16 range, so the saturating pack passes the bits through.
+            __m128i loHalf = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
+            __m128i hiHalf = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
+
+            UInt16 narrowed;
+            narrowed.m_value = _mm_packs_epi32(loHalf, hiHalf);
+            return narrowed;
+        }
+
+        static UInt15 ToUInt15(const UInt31 &v) // Narrows 32-bit lanes to 16-bit with signed saturation.
+        {
+            UInt15 narrowed;
+            narrowed.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
+            return narrowed;
+        }
+
+        static UInt15 ToUInt15(const SInt16 &v) // Reinterprets the same 128 bits; no value conversion.
+        {
+            UInt15 sameBits;
+            sameBits.m_value = v.m_value;
+            return sameBits;
+        }
+
+        static UInt15 ToUInt15(const UInt16 &v) // Reinterprets the same 128 bits; no value conversion.
+        {
+            UInt15 sameBits;
+            sameBits.m_value = v.m_value;
+            return sameBits;
+        }
+
+        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b) // Widening signed 16x16 -> 32 multiply: high and low product words are interleaved per lane.
+        {
+            __m128i upperWords = _mm_mulhi_epi16(a.m_value, b.m_value);
+            __m128i lowerWords = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            SInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(lowerWords, upperWords);
+            product.m_values[1] = _mm_unpackhi_epi16(lowerWords, upperWords);
+            return product;
+        }
+
+        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b) // Same widening multiply, using the signed high-word instruction.
+        {
+            __m128i upperWords = _mm_mulhi_epi16(a.m_value, b.m_value);
+            __m128i lowerWords = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            SInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(lowerWords, upperWords);
+            product.m_values[1] = _mm_unpackhi_epi16(lowerWords, upperWords);
+            return product;
+        }
+
+        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b) // Forwards to the (SInt16, UInt15) overload with the operands swapped.
+        {
+            return XMultiply(b, a);
+        }
+
+        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b) // Widening unsigned 16x16 -> 32 multiply.
+        {
+            __m128i upperWords = _mm_mulhi_epu16(a.m_value, b.m_value);
+            __m128i lowerWords = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            UInt32 product;
+            product.m_values[0] = _mm_unpacklo_epi16(lowerWords, upperWords);
+            product.m_values[1] = _mm_unpackhi_epi16(lowerWords, upperWords);
+            return product;
+        }
+
+        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b) // 16-bit multiply keeping only the low 16 bits of each product.
+        {
+            UInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b) // 16-bit multiply keeping only the low 16 bits of each product.
+        {
+            UInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b) // 16-bit multiply keeping only the low 16 bits of each product.
+        {
+            SInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b) // 16-bit multiply keeping only the low 16 bits of each product.
+        {
+            SInt16 product;
+            product.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
+            return product;
+        }
+
+        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b) // Widening unsigned 16x16 -> 32 multiply: high and low product words are interleaved per lane.
+        {
+            __m128i upperWords = _mm_mulhi_epu16(a.m_value, b.m_value);
+            __m128i lowerWords = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            UInt31 product;
+            product.m_values[0] = _mm_unpacklo_epi16(lowerWords, upperWords);
+            product.m_values[1] = _mm_unpackhi_epi16(lowerWords, upperWords);
+            return product;
+        }
+
+        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b) // Same widening unsigned multiply.
+        {
+            __m128i upperWords = _mm_mulhi_epu16(a.m_value, b.m_value);
+            __m128i lowerWords = _mm_mullo_epi16(a.m_value, b.m_value);
+
+            UInt31 product;
+            product.m_values[0] = _mm_unpacklo_epi16(lowerWords, upperWords);
+            product.m_values[1] = _mm_unpackhi_epi16(lowerWords, upperWords);
+            return product;
+        }
+
+        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b) // Forwards to the (UInt16, UInt15) overload with the operands swapped.
+        {
+            return XMultiply(b, a);
+        }
+
+        static bool AnySet(const Int16CompFlag &v) // True when at least one byte of the flag has its top bit set.
+        {
+            return _mm_movemask_epi8(v.m_value) != 0;
+        }
+
+        static bool AllSet(const Int16CompFlag &v) // True when all 16 bytes of the flag have their top bit set.
+        {
+            return _mm_movemask_epi8(v.m_value) == 0xffff;
+        }
+
+        static bool AnySet(const FloatCompFlag &v) // True when at least one of the eight float lanes has its sign bit set.
+        {
+            return (_mm_movemask_ps(v.m_values[0]) | _mm_movemask_ps(v.m_values[1])) != 0;
+        }
+
+        static bool AllSet(const FloatCompFlag &v) // True when all eight float lanes have their sign bit set.
+        {
+            return (_mm_movemask_ps(v.m_values[0]) & _mm_movemask_ps(v.m_values[1])) == 0xf;
+        }
+    };
+
+#else
+    // Scalar version
+    struct ParallelMath
+    {
+        struct RoundTowardZeroForScope
+        {
+        };
+        // The *ForScope rounding tags are empty here; inside this struct they only select a RoundAndConvertToInt overload.
+        struct RoundTowardNearestForScope
+        {
+        };
+
+        struct RoundUpForScope
+        {
+        };
+
+        struct RoundDownForScope
+        {
+        };
+        // Lane count: the scalar build carries one value per operation.
+        static const int ParallelSize = 1;
+
+        enum Int16Subtype
+        {
+            IntSubtype_Signed,
+            IntSubtype_UnsignedFull,
+            IntSubtype_UnsignedTruncated,
+            IntSubtype_Abstract,
+        };
+        // Every integer flavor shares a plain int32_t representation in the scalar build.
+        typedef int32_t SInt16;
+        typedef int32_t UInt15;
+        typedef int32_t UInt16;
+        typedef int32_t AInt16;
+
+        typedef int32_t SInt32;
+        typedef int32_t UInt31;
+        typedef int32_t UInt32;
+        typedef int32_t AInt32;
+
+        typedef int32_t ScalarUInt16;
+        typedef int32_t ScalarSInt16;
+
+        typedef float Float;
+        // Casting between integer flavors is a no-op because they are all int32_t.
+        template<class TTargetType>
+        struct LosslessCast
+        {
+            static const int32_t& Cast(const int32_t &src)
+            {
+                return src;
+            }
+        };
+        // Comparison results are plain bools in the scalar build.
+        typedef bool Int16CompFlag;
+        typedef bool FloatCompFlag;
+
+        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
+        {
+            return a + b;
+        }
+
+        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
+        {
+            return a - b;
+        }
+        // Select returns a when flag is true and b otherwise.
+        static float Select(bool flag, float a, float b)
+        {
+            return flag ? a : b;
+        }
+
+        static int32_t Select(bool flag, int32_t a, int32_t b)
+        {
+            return flag ? a : b;
+        }
+
+        static int32_t SelectOrZero(bool flag, int32_t a)
+        {
+            return flag ? a : 0;
+        }
+        // ConditionalSet writes src into dest only when flag is true; NotConditionalSet only when it is false.
+        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static void ConditionalSet(bool& dest, bool flag, bool src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static int32_t ConditionalNegate(bool flag, int32_t v)
+        {
+            return (flag) ? -v : v;
+        }
+
+        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
+        {
+            if (!flag)
+                dest = src;
+        }
+
+        static void ConditionalSet(float& dest, bool flag, float src)
+        {
+            if (flag)
+                dest = src;
+        }
+
+        static void NotConditionalSet(float& dest, bool flag, float src)
+        {
+            if (!flag)
+                dest = src;
+        }
+        // Replaces an exact zero with 1 so the value can be used as a divisor.
+        static void MakeSafeDenominator(float& v)
+        {
+            if (v == 0.0f)
+                v = 1.0f;
+        }
+
+        static int32_t SignedRightShift(int32_t v, int bits)
+        {
+            return v >> bits;
+        }
+        // Keeps the low `precision` bits of v and sign-extends them; the left shift runs on uint32_t to avoid signed-shift overflow.
+        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
+        {
+            v = static_cast<int32_t>(static_cast<uint32_t>(v) << (32 - precision));
+            return SignedRightShift(v, 32 - precision);
+        }
+        // Keeps the low `precision` bits of v and clears the rest.
+        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
+        {
+            return v & ((1 << precision) - 1);
+        }
+
+        static int32_t Min(int32_t a, int32_t b)
+        {
+            if (a < b)
+                return a;
+            return b;
+        }
+
+        static float Min(float a, float b)
+        {
+            if (a < b)
+                return a;
+            return b;
+        }
+
+        static int32_t Max(int32_t a, int32_t b)
+        {
+            if (a > b)
+                return a;
+            return b;
+        }
+
+        static float Max(float a, float b)
+        {
+            if (a > b)
+                return a;
+            return b;
+        }
+
+        static float Abs(float a)
+        {
+            return fabsf(a);
+        }
+
+        static int32_t Abs(int32_t a)
+        {
+            if (a < 0)
+                return -a;
+            return a;
+        }
+
+        static float Clamp(float v, float min, float max)
+        {
+            if (v < min)
+                return min;
+            if (v > max)
+                return max;
+            return v;
+        }
+
+        static float Reciprocal(float v)
+        {
+            return 1.0f / v;
+        }
+        // The Convert*Inputs helpers read one channel of one pixel from inputBlocks[0] only.
+        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
+        {
+            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
+        }
+
+        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
+        {
+            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
+        }
+
+        static float MakeFloat(float v)
+        {
+            return v;
+        }
+
+        static float MakeFloatZero()
+        {
+            return 0.0f;
+        }
+
+        static int32_t MakeUInt16(uint16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeSInt16(int16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeAInt16(int16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeUInt15(uint16_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeSInt32(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t MakeUInt31(int32_t v)
+        {
+            return v;
+        }
+        // Extract*/Put* ignore the lane offset because there is only one lane.
+        static int32_t Extract(int32_t v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static bool Extract(bool v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static float Extract(float v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static float ExtractFloat(float v, int offset)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            return v;
+        }
+
+        static void PutFloat(float &dest, int offset, float v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static void PutBoolInt16(bool &dest, int offset, bool v)
+        {
+            UNREFERENCED_PARAMETER(offset);
+            dest = v;
+        }
+
+        static bool Less(int32_t a, int32_t b)
+        {
+            return a < b;
+        }
+
+        static bool Less(float a, float b)
+        {
+            return a < b;
+        }
+        // LessOrEqual is inclusive: equal operands compare true (unlike Less).
+        static bool LessOrEqual(int32_t a, int32_t b)
+        {
+            return a <= b;
+        }
+
+        static bool LessOrEqual(float a, float b)
+        {
+            return a <= b;
+        }
+
+        static bool Equal(int32_t a, int32_t b)
+        {
+            return a == b;
+        }
+
+        static bool Equal(float a, float b)
+        {
+            return a == b;
+        }
+
+        static float ToFloat(int32_t v)
+        {
+            return static_cast<float>(v);
+        }
+
+        static int32_t ToUInt31(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToInt32(int32_t v)
+        {
+            return v;
+        }
+
+        static bool FloatFlagToInt16(bool v)
+        {
+            return v;
+        }
+
+        static bool Int32FlagToInt16(bool v)
+        {
+            return v;
+        }
+
+        static bool Int16FlagToFloat(bool v)
+        {
+            return v;
+        }
+
+        static bool MakeBoolInt16(bool b)
+        {
+            return b;
+        }
+
+        static bool MakeBoolFloat(bool b)
+        {
+            return b;
+        }
+
+        static bool AndNot(bool a, bool b)
+        {
+            return a && !b;
+        }
+
+        static bool Not(bool b)
+        {
+            return !b;
+        }
+        // RoundAndConvertToInt: the tag pointer's type picks the rounding rule; its value is never read.
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
+        {
+            UNREFERENCED_PARAMETER(rtz);
+            return static_cast<int>(v);
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
+        {
+            UNREFERENCED_PARAMETER(ru);
+            return static_cast<int>(ceilf(v));
+        }
+
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
+        {
+            UNREFERENCED_PARAMETER(rd);
+            return static_cast<int>(floorf(v));
+        }
+        // Nearest rounding is computed as floor(v + 0.5), so exact halves round upward.
+        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
+        {
+            UNREFERENCED_PARAMETER(rtn);
+            return static_cast<int>(floorf(v + 0.5f));
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        template<class TRoundMode>
+        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
+        {
+            return RoundAndConvertToInt(v, roundingMode);
+        }
+
+        static float Sqrt(float f)
+        {
+            return sqrtf(f);
+        }
+
+        static int32_t SqDiffUInt8(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+
+        static int32_t SqDiffInt16(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+
+        static int32_t SqDiffSInt16(int32_t a, int32_t b)
+        {
+            int32_t delta = a - b;
+            return delta * delta;
+        }
+        // Builds a float from the exponent/mantissa bit fields of |v| (presumably a half-float pattern - confirm with callers).
+        static float TwosCLHalfToFloat(int32_t v)
+        {
+            int32_t absV = (v < 0) ? -v : v;
+
+            int32_t signBits = (absV & -32768);
+            int32_t mantissa = (absV & 0x03ff);
+            int32_t exponent = (absV & 0x7c00);
+
+            bool isDenormal = (exponent == 0);
+
+            // Convert exponent to high-bits
+            exponent = (exponent >> 3) + 14336;
+
+            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
+
+            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
+
+            float f, correction;
+            memcpy(&f, &fBits, 4);
+            memcpy(&correction, &denormalCorrection, 4);
+
+            return f - correction;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
+        {
+            Float fa = TwosCLHalfToFloat(a);
+
+            Float diff = fa - b;
+            return diff * diff;
+        }
+
+        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
+        {
+            Float fa = TwosCLHalfToFloat(a);
+            Float fb = TwosCLHalfToFloat(b);
+
+            Float diff = fa - fb;
+            return diff * diff;
+        }
+
+        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
+        {
+            Float fa = TwosCLHalfToFloat(a) * aWeight;
+
+            Float diff = fa - b;
+            return diff * diff;
+        }
+
+        static int32_t RightShift(int32_t v, int bits)
+        {
+            return SignedRightShift(v, bits);
+        }
+
+        static int32_t ToSInt16(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToUInt16(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t ToUInt15(int32_t v)
+        {
+            return v;
+        }
+
+        static int32_t XMultiply(int32_t a, int32_t b)
+        {
+            return a * b;
+        }
+
+        static int32_t CompactMultiply(int32_t a, int32_t b)
+        {
+            return a * b;
+        }
+
+        static bool AnySet(bool v)
+        {
+            return v;
+        }
+
+        static bool AllSet(bool v)
+        {
+            return v;
+        }
+    };
+
+#endif
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC.cpp b/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
new file mode 100644
index 0000000000..23f1bd3314
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC.cpp
@@ -0,0 +1,1054 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels_S3TC.h"
+
+#include "ConvectionKernels_AggregatedError.h"
+#include "ConvectionKernels_BCCommon.h"
+#include "ConvectionKernels_EndpointRefiner.h"
+#include "ConvectionKernels_EndpointSelector.h"
+#include "ConvectionKernels_IndexSelector.h"
+#include "ConvectionKernels_UnfinishedEndpoints.h"
+#include "ConvectionKernels_S3TC_SingleColor.h"
+
+void cvtt::Internal::S3TCComputer::Init(MFloat& error) // Resets the caller's error accumulator.
+{
+    error = ParallelMath::MakeFloat(FLT_MAX); // start at the largest finite float
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v) // Snaps v in place to a 6-bit level (assumes v is 0..255 - TODO confirm).
+{
+    MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10)); // (v * 253 + 512) >> 10
+    v = (reduced << 2) | ParallelMath::RightShift(reduced, 4); // widen again: shift up by 2 and refill the low bits from the top bits
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v) // Snaps v in place to a 5-bit level (assumes v is 0..255 - TODO confirm).
+{
+    MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11)); // (v * 249 + 1024) >> 11
+    v = (reduced << 3) | ParallelMath::RightShift(reduced, 2); // widen again: shift up by 3 and refill the low bits from the top bits
+}
+
+void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3]) // Quantizes a 3-channel endpoint in place.
+{
+    QuantizeTo5Bits(endPoint[0]); // channel 0: 5 bits
+    QuantizeTo6Bits(endPoint[1]); // channel 1: 6 bits
+    QuantizeTo5Bits(endPoint[2]); // channel 2: 5 bits
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span) // Penalty term proportional to the span magnitude.
+{
+    return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f; // |span| * 0.03
+}
+
+cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d) // Returns (|a - b| + d) squared.
+{
+    MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b))); // subtract as signed so a < b is representable
+    absDiff = absDiff + d; // add the caller-supplied penalty before squaring
+    return absDiff * absDiff;
+}
+
+void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
+    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    float channelWeightsSq[3];
+    // Tries a single-color encoding looked up from the block's average color; floatPixels and rtn are not used here.
+    for (int ch = 0; ch < 3; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+    // Sum each of the three color channels over all 16 pixels.
+    MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+            totals[ch] = totals[ch] + pixels[px][ch];
+    }
+    // Rounded average: (total + 8) >> 4.
+    MUInt15 average[3];
+    for (int ch = 0; ch < 3; ch++)
+        average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
+    // Table choice: the *_p tables in paranoid mode, *_3 for range 4, *_2 otherwise (range 3 is asserted).
+    const Tables::S3TCSC::TableEntry* rbTable = NULL;
+    const Tables::S3TCSC::TableEntry* gTable = NULL;
+    if (flags & cvtt::Flags::S3TC_Paranoid)
+    {
+        if (range == 4)
+        {
+            rbTable = Tables::S3TCSC::g_singleColor5_3_p;
+            gTable = Tables::S3TCSC::g_singleColor6_3_p;
+        }
+        else
+        {
+            assert(range == 3);
+            rbTable = Tables::S3TCSC::g_singleColor5_2_p;
+            gTable = Tables::S3TCSC::g_singleColor6_2_p;
+        }
+    }
+    else
+    {
+        if (range == 4)
+        {
+            rbTable = Tables::S3TCSC::g_singleColor5_3;
+            gTable = Tables::S3TCSC::g_singleColor6_3;
+        }
+        else
+        {
+            assert(range == 3);
+            rbTable = Tables::S3TCSC::g_singleColor5_2;
+            gTable = Tables::S3TCSC::g_singleColor6_2;
+        }
+    }
+    // Per lane and channel, look up the endpoints, resulting color and span; channel 1 uses gTable, the others rbTable.
+    MUInt15 interpolated[3];
+    MUInt15 eps[2][3];
+    MSInt16 spans[3];
+    for (int i = 0; i < ParallelMath::ParallelSize; i++)
+    {
+        for (int ch = 0; ch < 3; ch++)
+        {
+            uint16_t avg = ParallelMath::Extract(average[ch], i);
+            const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
+            ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
+            ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
+            ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
+            ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
+        }
+    }
+    // Accumulate the weighted error of the looked-up color against every pixel.
+    MFloat error = ParallelMath::MakeFloatZero();
+    if (flags & cvtt::Flags::S3TC_Paranoid)
+    {
+        MFloat spanParanoidFactors[3];
+        for (int ch = 0; ch < 3; ch++)
+            spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
+
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
+        }
+    }
+    else
+    {
+        for (int px = 0; px < 16; px++)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
+        }
+    }
+
+    ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
+    ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
+    // Commit only in lanes where this candidate's error is strictly lower than the best so far.
+    if (ParallelMath::AnySet(better16))
+    {
+        bestError = ParallelMath::Min(bestError, error);
+        for (int epi = 0; epi < 2; epi++)
+            for (int ch = 0; ch < 3; ch++)
+                ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
+        // Every pixel receives the same index value, 1.
+        MUInt15 vindexes = ParallelMath::MakeUInt15(1);
+        for (int px = 0; px < 16; px++)
+            ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
+
+        ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
+    }
+}
+
+void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
+    MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
+{
+    float channelWeightsSq[3];
+    // Quantizes the candidate endpoints, selects an index per pixel, and updates the best* outputs in lanes where error drops.
+    for (int ch = 0; ch < 3; ch++)
+        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
+
+    MUInt15 endPoints[2][3];
+    // Work on a copy so the caller's unquantized endpoints stay untouched.
+    for (int ep = 0; ep < 2; ep++)
+        for (int ch = 0; ch < 3; ch++)
+            endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
+
+    QuantizeTo565(endPoints[0]);
+    QuantizeTo565(endPoints[1]);
+
+    IndexSelector<3> selector;
+    selector.Init<false>(channelWeights, endPoints, range);
+
+    MUInt15 indexes[16];
+    // Paranoid penalty per channel, derived from the signed distance between the two quantized endpoints.
+    MFloat paranoidFactors[3];
+    for (int ch = 0; ch < 3; ch++)
+        paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
+
+    MFloat error = ParallelMath::MakeFloatZero();
+    AggregatedError<3> aggError;
+    for (int px = 0; px < 16; px++)
+    {
+        MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
+        indexes[px] = index;
+        // A null refiner means the caller does not want the index contributions collected.
+        if (refiner)
+            refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
+
+        MUInt15 reconstructed[3];
+        selector.ReconstructLDRPrecise(index, reconstructed);
+        // Paranoid mode sums the error directly; otherwise it is collected in aggError.
+        if (flags & Flags::S3TC_Paranoid)
+        {
+            for (int ch = 0; ch < 3; ch++)
+                error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
+        }
+        else
+            BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
+    }
+    // Resolve the aggregated error for the non-paranoid path.
+    if (!(flags & Flags::S3TC_Paranoid))
+        error = aggError.Finalize(flags, channelWeightsSq);
+
+    ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
+    // Only lanes with a strictly lower error overwrite the best-so-far outputs.
+    if (ParallelMath::AnySet(better))
+    {
+        ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
+
+        ParallelMath::ConditionalSet(bestError, better, error);
+
+        for (int ep = 0; ep < 2; ep++)
+            for (int ch = 0; ch < 3; ch++)
+                ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
+
+        for (int px = 0; px < 16; px++)
+            ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
+
+        ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
+    }
+}
+
+void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
+    const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
+    const ParallelMath::RoundTowardNearestForScope* rtn)
+{
+    UNREFERENCED_PARAMETER(alphaTest);
+    UNREFERENCED_PARAMETER(flags); // NOTE(review): flags is in fact forwarded to TestEndpoints below
+    // Assigns index i to counts[i] consecutive sorted inputs, refines endpoints from that assignment, then tests them.
+    EndpointRefiner<3> refiner;
+
+    refiner.Init(nCounts, channelWeights);
+    // NOTE(review): validity below compares the per-bucket counter n (not the running index e) with numElements - confirm intent.
+    bool escape = false;
+    int e = 0;
+    for (int i = 0; i < nCounts; i++)
+    {
+        for (int n = 0; n < counts[i]; n++)
+        {
+            ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
+            if (!ParallelMath::AnySet(valid))
+            {
+                escape = true;
+                break;
+            }
+            // Fully valid lanes take the unweighted path; mixed lanes contribute with weight 1 or 0 per lane.
+            if (ParallelMath::AllSet(valid))
+                refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
+            else
+            {
+                MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
+                refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
+            }
+        }
+        // Propagate the inner break out of the outer loop as well.
+        if (escape)
+            break;
+    }
+
+    MUInt15 endPoints[2][3];
+    refiner.GetRefinedEndpointsLDR(endPoints, rtn);
+    // nCounts is passed as the range, and no refiner is handed on (NULL).
+    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
+}
+
+void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
+{
+    UNREFERENCED_PARAMETER(flags);
+    ParallelMath::RoundTowardNearestForScope rtn;
+    // Selects an index per pixel of inputChannel against fixed endpoints 0 and 255, then packs two indexes per output byte.
+    float weights[1] = { 1.0f };
+
+    MUInt15 pixels[16];
+    MFloat floatPixels[16];
+
+    for (int px = 0; px < 16; px++)
+    {
+        ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
+        floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
+    }
+
+    MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
+    // The selector is initialized with range 16 (presumably 16 levels, i.e. 4-bit indexes - confirm against IndexSelector).
+    IndexSelector<1> selector;
+    selector.Init<false>(weights, ep, 16);
+
+    MUInt15 indexes[16];
+
+    for (int px = 0; px < 16; px++)
+        indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
+    // Write 8 bytes per lane, stepping packedBlocks by packedBlockStride after each lane.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        for (int px = 0; px < 16; px += 2)
+        {
+            int index0 = ParallelMath::Extract(indexes[px], block);
+            int index1 = ParallelMath::Extract(indexes[px + 1], block);
+            // Even pixel goes in the low nibble, odd pixel in the high nibble.
+            packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
+        }
+
+        packedBlocks += packedBlockStride;
+    }
+}
+
+void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
+{
+    if (maxTweakRounds < 1)
+        maxTweakRounds = 1;
+
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    ParallelMath::RoundTowardNearestForScope rtn;
+
+    float oneWeight[1] = { 1.0f };
+
+    MUInt15 pixels[16];
+    MFloat floatPixels[16];
+
+    MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
+    MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
+
+    for (int px = 0; px < 16; px++)
+    {
+        ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
+
+        if (isSigned)
+            pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
+
+        floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
+    }
+
+    MUInt15 sortedPixels[16];
+    for (int px = 0; px < 16; px++)
+        sortedPixels[px] = pixels[px];
+
+    for (int sortEnd = 15; sortEnd > 0; sortEnd--)
+    {
+        for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
+        {
+            MUInt15 a = sortedPixels[sortOffset];
+            MUInt15 b = sortedPixels[sortOffset + 1];
+
+            sortedPixels[sortOffset] = ParallelMath::Min(a, b);
+            sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
+        }
+    }
+
+    MUInt15 zero = ParallelMath::MakeUInt15(0);
+    MUInt15 one = ParallelMath::MakeUInt15(1);
+
+    MUInt15 bestIsFullRange = zero;
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+    MUInt15 bestEP[2] = { zero, zero };
+    MUInt15 bestIndexes[16] = {
+        zero, zero, zero, zero,
+        zero, zero, zero, zero,
+        zero, zero, zero, zero,
+        zero, zero, zero, zero
+    };
+
+    // Full-precision
+    {
+        MUInt15 minEP = sortedPixels[0];
+        MUInt15 maxEP = sortedPixels[15];
+
+        MFloat base[1] = { ParallelMath::ToFloat(minEP) };
+        MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
+
+        UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
+
+        int numTweakRounds = BCCommon::TweakRoundsForRange(8);
+        if (numTweakRounds > maxTweakRounds)
+            numTweakRounds = maxTweakRounds;
+
+        for (int tweak = 0; tweak < numTweakRounds; tweak++)
+        {
+            MUInt15 ep[2][1];
+
+            ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
+
+            for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
+            {
+                EndpointRefiner<1> refiner;
+                refiner.Init(8, oneWeight);
+
+                if (isSigned)
+                    for (int epi = 0; epi < 2; epi++)
+                        ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
+
+                IndexSelector<1> indexSelector;
+                indexSelector.Init<false>(oneWeight, ep, 8);
+
+                MUInt15 indexes[16];
+
+                AggregatedError<1> aggError;
+                for (int px = 0; px < 16; px++)
+                {
+                    MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
+
+                    MUInt15 reconstructedPixel;
+
+                    indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
+                    BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
+
+                    if (refinePass != numRefineRounds - 1)
+                        refiner.ContributeUnweightedPW(&floatPixels[px], index);
+
+                    indexes[px] = index;
+                }
+                MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
+
+                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                if (ParallelMath::AnySet(errorBetter16))
+                {
+                    bestError = ParallelMath::Min(error, bestError);
+                    ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
+                    for (int px = 0; px < 16; px++)
+                        ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
+
+                    for (int epi = 0; epi < 2; epi++)
+                        ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
+                }
+
+                if (refinePass != numRefineRounds - 1)
+                    refiner.GetRefinedEndpointsLDR(ep, &rtn);
+            }
+        }
+    }
+
+    // Reduced precision with special endpoints
+    {
+        MUInt15 bestHeuristicMin = sortedPixels[0];
+        MUInt15 bestHeuristicMax = sortedPixels[15];
+
+        ParallelMath::Int16CompFlag canTryClipping;
+
+        // In reduced precision, we want to try putting endpoints at the reserved indexes at the ends.
+        // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
+        // This will usually not find anything, but it's cheap to check.
+
+        {
+            MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
+            MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
+
+            MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
+            canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
+        }
+
+        if (ParallelMath::AnySet(canTryClipping))
+        {
+            MUInt15 lowClearances[16];
+            MUInt15 highClearances[16];
+            MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
+
+            lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
+
+            for (int px = 1; px < 16; px++)
+            {
+                lowClearances[px] = sortedPixels[px - 1];
+                highClearances[px] = highTerminal - sortedPixels[16 - px];
+            }
+
+            for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
+            {
+                uint16_t numSkippedLow = firstIndex;
+
+                MUInt15 lowClearance = lowClearances[firstIndex];
+
+                for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
+                {
+                    uint16_t numSkippedHigh = 15 - lastIndex;
+                    uint16_t numSkipped = numSkippedLow + numSkippedHigh;
+
+                    MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
+
+                    ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
+
+                    if (!ParallelMath::AnySet(areMoreSkipped))
+                        continue;
+
+                    MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
+                    MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
+
+                    MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
+
+                    ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
+                    ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
+                    ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
+                }
+            }
+        }
+
+        MUInt15 bestSimpleMin = one;
+        MUInt15 bestSimpleMax = highTerminalMinusOne;
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
+            ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
+        }
+
+        MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
+        MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
+
+        int minEPRange = 2;
+        if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
+            minEPRange = 1;
+
+        int maxEPRange = 2;
+        if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
+            maxEPRange = 1;
+
+        for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
+        {
+            for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
+            {
+                MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
+                MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
+
+                UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
+
+                int numTweakRounds = BCCommon::TweakRoundsForRange(6);
+                if (numTweakRounds > maxTweakRounds)
+                    numTweakRounds = maxTweakRounds;
+
+                for (int tweak = 0; tweak < numTweakRounds; tweak++)
+                {
+                    MUInt15 ep[2][1];
+
+                    ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
+
+                    for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
+                    {
+                        EndpointRefiner<1> refiner;
+                        refiner.Init(6, oneWeight);
+
+                        if (isSigned)
+                            for (int epi = 0; epi < 2; epi++)
+                                ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
+
+                        IndexSelector<1> indexSelector;
+                        indexSelector.Init<false>(oneWeight, ep, 6);
+
+                        MUInt15 indexes[16];
+                        MFloat error = ParallelMath::MakeFloatZero();
+
+                        for (int px = 0; px < 16; px++)
+                        {
+                            MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
+
+                            MUInt15 reconstructedPixel;
+
+                            indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
+
+                            MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
+                            MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
+                            MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
+
+                            MFloat bestPixelError = zeroError;
+                            MUInt15 index = ParallelMath::MakeUInt15(6);
+
+                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
+                            bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
+
+                            ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
+
+                            if (ParallelMath::AllSet(selectedIndexBetter))
+                            {
+                                if (refinePass != numRefineRounds - 1)
+                                    refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
+                            }
+                            else
+                            {
+                                MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
+
+                                if (refinePass != numRefineRounds - 1)
+                                    refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
+                            }
+
+                            ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
+                            bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
+
+                            error = error + bestPixelError;
+
+                            indexes[px] = index;
+                        }
+
+                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
+                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
+
+                        if (ParallelMath::AnySet(errorBetter16))
+                        {
+                            bestError = ParallelMath::Min(error, bestError);
+                            ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
+                            for (int px = 0; px < 16; px++)
+                                ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
+
+                            for (int epi = 0; epi < 2; epi++)
+                                ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
+                        }
+
+                        if (refinePass != numRefineRounds - 1)
+                            refiner.GetRefinedEndpointsLDR(ep, &rtn);
+                    }
+                }
+            }
+        }
+    }
+
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        int ep0 = ParallelMath::Extract(bestEP[0], block);
+        int ep1 = ParallelMath::Extract(bestEP[1], block);
+        int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
+
+        if (isSigned)
+        {
+            ep0 -= 127;
+            ep1 -= 127;
+
+            assert(ep0 >= -127 && ep0 <= 127);
+            assert(ep1 >= -127 && ep1 <= 127);
+        }
+
+
+        bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
+
+        if (swapEndpoints)
+            std::swap(ep0, ep1);
+
+        uint16_t dumpBits = 0;
+        int dumpBitsOffset = 0;
+        int dumpByteOffset = 2;
+        packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
+        packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
+
+        int maxValue = (isFullRange != 0) ? 7 : 5;
+
+        for (int px = 0; px < 16; px++)
+        {
+            int index = ParallelMath::Extract(bestIndexes[px], block);
+
+            if (swapEndpoints && index <= maxValue)
+                index = maxValue - index;
+
+            if (index != 0)
+            {
+                if (index == maxValue)
+                    index = 1;
+                else if (index < maxValue)
+                    index++;
+            }
+
+            assert(index >= 0 && index < 8);
+
+            dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
+            dumpBitsOffset += 3;
+
+            if (dumpBitsOffset >= 8)
+            {
+                assert(dumpByteOffset < 8);
+                packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
+                dumpBits >>= 8;
+                dumpBitsOffset -= 8;
+                dumpByteOffset++;
+            }
+        }
+
+        assert(dumpBitsOffset == 0);
+        assert(dumpByteOffset == 8);
+
+        packedBlocks += packedBlockStride;
+    }
+}
+
+void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
+{ // Encodes ParallelMath::ParallelSize 4x4 blocks; each gets 8 output bytes: two 16-bit 5:6:5 endpoints (bytes 0-3), then sixteen 2-bit indexes (bytes 4-7).
+    ParallelMath::RoundTowardNearestForScope rtn;
+    // maxTweakRounds / numRefineRounds are clamped to at least 1; they are only read by the non-exhaustive search below.
+    if (numRefineRounds < 1)
+        numRefineRounds = 1;
+
+    if (maxTweakRounds < 1)
+        maxTweakRounds = 1;
+
+    EndpointSelector<3, 8> endpointSelector;
+
+    MUInt15 pixels[16][4];
+    MFloat floatPixels[16][4];
+
+    MFloat preWeightedPixels[16][4];
+    // Load 16 pixels x 4 channels from the inputs, then mirror them into floatPixels.
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
+    }
+
+    for (int px = 0; px < 16; px++)
+    {
+        for (int ch = 0; ch < 4; ch++)
+            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
+    }
+    // Alpha test: binarize the integer alpha to 0 or 255 around the rounded threshold. floatPixels was filled above and is not updated here.
+    if (alphaTest)
+    {
+        MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
+
+        for (int px = 0; px < 16; px++)
+        {
+            ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
+            pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
+        }
+    }
+    // Pre-weighting runs after the alpha test, so it is given the binarized integer alpha.
+    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
+
+    MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
+    // NOTE(review): minAlpha is accumulated here but is not read again anywhere else in this function.
+    for (int px = 0; px < 16; px++)
+        minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
+    // Per-pixel weights: 1.0 by default, 0.0 for pixels whose alpha is below 255 when alpha testing is enabled.
+    MFloat pixelWeights[16];
+    for (int px = 0; px < 16; px++)
+    {
+        pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
+        if (alphaTest)
+        {
+            ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
+
+            ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
+        }
+    }
+    // Feed every pre-weighted pixel (with its weight) through each endpoint-selector pass, then fetch the unfinished endpoints.
+    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
+    {
+        for (int px = 0; px < 16; px++)
+            endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
+
+        endpointSelector.FinishPass(pass);
+    }
+
+    UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
+    // Best-so-far state handed to the Test* helpers. bestRange starts at 0 and is asserted to be 3 or 4 before packing.
+    MUInt15 bestEndpoints[2][3];
+    MUInt15 bestIndexes[16];
+    MUInt15 bestRange = ParallelMath::MakeUInt15(0);
+    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
+
+    for (int px = 0; px < 16; px++)
+        bestIndexes[px] = ParallelMath::MakeUInt15(0);
+
+    for (int ep = 0; ep < 2; ep++)
+        for (int ch = 0; ch < 3; ch++)
+            bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
+
+    if (exhaustive)
+    {
+        MSInt16 sortBins[16];
+
+        {
+            // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
+            // and pack the original indexes into the low bits.
+
+            MUInt15 sortEP[2][3];
+            ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
+
+            IndexSelector<3> sortSelector;
+            sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
+
+            for (int16_t px = 0; px < 16; px++)
+            {
+                MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
+                // With alpha testing, non-opaque pixels get bin -16, so they stay negative even after the pixel number (0..15) is added.
+                if (alphaTest)
+                {
+                    ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
+
+                    ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
+                }
+
+                sortBin = sortBin + ParallelMath::MakeSInt16(px);
+
+                sortBins[px] = sortBin;
+            }
+        }
+
+        // Sort bins into ascending order using adjacent Min/Max compare-exchange steps.
+        for (int sortEnd = 1; sortEnd < 16; sortEnd++)
+        {
+            for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
+            {
+                MSInt16 a = sortBins[sortLoc];
+                MSInt16 b = sortBins[sortLoc - 1];
+
+                sortBins[sortLoc] = ParallelMath::Max(a, b);
+                sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
+            }
+        }
+        // firstElement ends up as the number of leading negative bins (they come first after the ascending sort); the scan stops once no block has one.
+        MUInt15 firstElement = ParallelMath::MakeUInt15(0);
+        for (uint16_t e = 0; e < 16; e++)
+        {
+            ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
+            ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
+            if (!ParallelMath::AnySet(isInvalid))
+                break;
+        }
+
+        MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
+
+        MUInt15 sortedInputs[16][4];
+        MFloat floatSortedInputs[16][4];
+        MFloat pwFloatSortedInputs[16][4];
+
+        for (int e = 0; e < 16; e++)
+        {
+            for (int ch = 0; ch < 4; ch++)
+                sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
+        }
+        // Gather the pixels in reverse sort order (highest bin at index 0); slots belonging to the skipped negative bins keep the zero fill from above.
+        for (int block = 0; block < ParallelMath::ParallelSize; block++)
+        {
+            for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
+            {
+                ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
+                int originalIndex = (sortBin & 15); // low 4 bits hold the original pixel number added when the bin was built
+
+                for (int ch = 0; ch < 4; ch++)
+                    ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
+            }
+        }
+
+        for (int e = 0; e < 16; e++)
+        {
+            for (int ch = 0; ch < 4; ch++)
+            {
+                MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
+                floatSortedInputs[e][ch] = f;
+                pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
+            }
+        }
+        // Enumerate every split of 16 into four counts (n0 + n1 + n2 + n3 == 16), skipping splits that put all 16 into a single count.
+        for (int n0 = 0; n0 <= 15; n0++)
+        {
+            int remainingFor1 = 16 - n0;
+            if (remainingFor1 == 16)
+                remainingFor1 = 15;
+
+            for (int n1 = 0; n1 <= remainingFor1; n1++)
+            {
+                int remainingFor2 = 16 - n1 - n0;
+                if (remainingFor2 == 16)
+                    remainingFor2 = 15;
+
+                for (int n2 = 0; n2 <= remainingFor2; n2++)
+                {
+                    int n3 = 16 - n2 - n1 - n0;
+
+                    if (n3 == 16)
+                        continue;
+
+                    int counts[4] = { n0, n1, n2, n3 };
+
+                    TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+                }
+            }
+        }
+
+        TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+        // With alpha testing, repeat both searches with three counts (range 3).
+        if (alphaTest)
+        {
+            for (int n0 = 0; n0 <= 15; n0++)
+            {
+                int remainingFor1 = 16 - n0;
+                if (remainingFor1 == 16)
+                    remainingFor1 = 15;
+
+                for (int n1 = 0; n1 <= remainingFor1; n1++)
+                {
+                    int n2 = 16 - n1 - n0;
+
+                    if (n2 == 16)
+                        continue;
+
+                    int counts[3] = { n0, n1, n2 };
+
+                    TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+                }
+            }
+
+            TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
+        }
+    }
+    else
+    {
+        int minRange = alphaTest ? 3 : 4; // range 3 is only tried when alpha testing is enabled
+
+        for (int range = minRange; range <= 4; range++)
+        {
+            int tweakRounds = BCCommon::TweakRoundsForRange(range);
+            if (tweakRounds > maxTweakRounds)
+                tweakRounds = maxTweakRounds;
+
+            for (int tweak = 0; tweak < tweakRounds; tweak++)
+            {
+                MUInt15 endPoints[2][3];
+
+                ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
+
+                for (int refine = 0; refine < numRefineRounds; refine++)
+                {
+                    EndpointRefiner<3> refiner;
+                    refiner.Init(range, channelWeights);
+
+                    TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
+                    // Refined endpoints are only requested when another refine pass follows.
+                    if (refine != numRefineRounds - 1)
+                        refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
+                }
+            }
+        }
+    }
+    // Serialize the best endpoints and indexes of each block into packedBlocks, stepping by packedBlockStride.
+    for (int block = 0; block < ParallelMath::ParallelSize; block++)
+    {
+        ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
+        assert(range == 3 || range == 4);
+
+        ParallelMath::ScalarUInt16 compressedEP[2];
+        for (int ep = 0; ep < 2; ep++)
+        {
+            ParallelMath::ScalarUInt16 endPoint[3];
+            for (int ch = 0; ch < 3; ch++)
+                endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
+            // Pack to 16 bits: top 5 bits of channel 0 in bits 15-11, top 6 bits of channel 1 in bits 10-5, top 5 bits of channel 2 in bits 4-0.
+            int compressed = (endPoint[0] & 0xf8) << 8;
+            compressed |= (endPoint[1] & 0xfc) << 3;
+            compressed |= (endPoint[2] & 0xf8) >> 3;
+
+            compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
+        }
+        // indexOrder maps each stored best index to the 2-bit code written out; it depends on the range and on how the two packed endpoints compare.
+        int indexOrder[4];
+
+        if (range == 4)
+        {
+            if (compressedEP[0] == compressedEP[1]) // identical packed endpoints: every pixel gets code 0
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 0;
+                indexOrder[2] = 0;
+                indexOrder[3] = 0;
+            }
+            else if (compressedEP[0] < compressedEP[1])
+            {
+                std::swap(compressedEP[0], compressedEP[1]); // range 4 writes the numerically larger packed endpoint first
+                indexOrder[0] = 1;
+                indexOrder[1] = 3;
+                indexOrder[2] = 2;
+                indexOrder[3] = 0;
+            }
+            else
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 2;
+                indexOrder[2] = 3;
+                indexOrder[3] = 1;
+            }
+        }
+        else
+        {
+            assert(range == 3);
+
+            if (compressedEP[0] > compressedEP[1])
+            {
+                std::swap(compressedEP[0], compressedEP[1]); // range 3 writes the numerically smaller (or equal) packed endpoint first
+                indexOrder[0] = 1;
+                indexOrder[1] = 2;
+                indexOrder[2] = 0;
+            }
+            else
+            {
+                indexOrder[0] = 0;
+                indexOrder[1] = 2;
+                indexOrder[2] = 1;
+            }
+            indexOrder[3] = 3;
+        }
+        // Each 16-bit endpoint is written low byte first.
+        packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
+        packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
+        packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
+        packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
+        // Four pixels per byte, 2 bits each, with the first pixel of each group in the lowest bits.
+        for (int i = 0; i < 16; i += 4)
+        {
+            int packedIndexes = 0;
+            for (int subi = 0; subi < 4; subi++)
+            {
+                ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
+                packedIndexes |= (indexOrder[index] << (subi * 2));
+            }
+
+            packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
+        }
+
+        packedBlocks += packedBlockStride;
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC.h b/thirdparty/cvtt/ConvectionKernels_S3TC.h
new file mode 100644
index 0000000000..aa197229c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC.h
@@ -0,0 +1,51 @@
+#pragma once
+#ifndef __CVTT_S3TC_H__
+#define __CVTT_S3TC_H__
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        template<int TVectorSize>
+        class EndpointRefiner; // Forward declaration: this header only names EndpointRefiner<3> through a pointer parameter.
+    }
+
+    struct PixelBlockU8; // Forward declaration: only used through const pointers in the Pack* signatures of this header.
+}
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        class S3TCComputer // Groups static routines only; the class declares no data members and no instance methods.
+        {
+        public:
+            typedef ParallelMath::Float MFloat; // Shorthand aliases for the ParallelMath types used in the signatures below.
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::UInt16 MUInt16;
+            typedef ParallelMath::SInt32 MSInt32;
+            // In the Test* helpers, bestError / bestEndpoints / bestIndexes / bestRange are non-const so the callee can update the caller's best-so-far state.
+            static void Init(MFloat& error);
+            static void QuantizeTo6Bits(MUInt15& v);
+            static void QuantizeTo5Bits(MUInt15& v);
+            static void QuantizeTo565(MUInt15 endPoint[3]);
+            static MFloat ParanoidFactorForSpan(const MSInt16& span);
+            static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d);
+            static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
+                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
+                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn);
+            static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
+                const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
+                const ParallelMath::RoundTowardNearestForScope* rtn);
+            static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride);
+            static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds);
+            static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds); // Writes 8 bytes per block (two 16-bit 5:6:5 endpoints, then sixteen 2-bit indexes), advancing packedBlocks by packedBlockStride per block.
+        };
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h b/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h
new file mode 100644
index 0000000000..c772b163c2
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_S3TC_SingleColor.h
@@ -0,0 +1,304 @@
+#pragma once
+#include <stdint.h>
+
+// This file is generated by the MakeTables app.  Do not edit this file manually.
+
+namespace cvtt { namespace Tables { namespace S3TCSC {
+
+struct TableEntry
+{
+    uint8_t m_min;
+    uint8_t m_max;
+    uint8_t m_actualColor;
+    uint8_t m_span;
+};
+
+TableEntry g_singleColor5_3[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 2, 8 }, { 0, 8, 2, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 10, 8 }, { 0, 33, 11, 33 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 18, 8 }, { 8, 41, 19, 33 }, { 24, 16, 21, 8 }, { 24, 16, 21, 8 }, { 33, 0, 22, 33 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 41, 29, 17 }, { 33, 24, 30, 9 }, { 33, 24, 30, 9 },
+    { 24, 49, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 33, 41, 35, 8 }, { 33, 41, 35, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 },
+    { 49, 24, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 43, 8 }, { 33, 66, 44, 33 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 51, 8 }, { 41, 74, 52, 33 }, { 57, 49, 54, 8 }, { 57, 49, 54, 8 }, { 66, 33, 55, 33 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 74, 62, 17 }, { 66, 57, 63, 9 },
+    { 66, 57, 63, 9 }, { 57, 82, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 66, 74, 68, 8 }, { 66, 74, 68, 8 }, { 74, 66, 71, 8 }, { 74, 66, 71, 8 },
+    { 74, 66, 71, 8 }, { 82, 57, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 76, 8 }, { 66, 99, 77, 33 }, { 82, 74, 79, 8 }, { 82, 74, 79, 8 },
+    { 82, 74, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 84, 8 }, { 74, 107, 85, 33 }, { 90, 82, 87, 8 }, { 90, 82, 87, 8 },
+    { 99, 66, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 107, 95, 17 },
+    { 99, 90, 96, 9 }, { 99, 90, 96, 9 }, { 90, 115, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 99, 107, 101, 8 }, { 99, 107, 101, 8 }, { 107, 99, 104, 8 },
+    { 107, 99, 104, 8 }, { 107, 99, 104, 8 }, { 115, 90, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 109, 8 }, { 99, 132, 110, 33 }, { 115, 107, 112, 8 },
+    { 115, 107, 112, 8 }, { 115, 107, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 117, 8 }, { 107, 140, 118, 33 }, { 123, 115, 120, 8 },
+    { 123, 115, 120, 8 }, { 132, 99, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 },
+    { 123, 140, 128, 17 }, { 132, 123, 129, 9 }, { 132, 123, 129, 9 }, { 123, 148, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 132, 140, 134, 8 }, { 132, 140, 134, 8 },
+    { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 148, 123, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 142, 8 }, { 132, 165, 143, 33 },
+    { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 150, 8 }, { 140, 173, 151, 33 },
+    { 156, 148, 153, 8 }, { 156, 148, 153, 8 }, { 165, 132, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 159, 9 }, { 156, 165, 159, 9 },
+    { 156, 165, 159, 9 }, { 156, 173, 161, 17 }, { 165, 156, 162, 9 }, { 165, 156, 162, 9 }, { 156, 181, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 165, 173, 167, 8 },
+    { 165, 173, 167, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 181, 156, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 181, 175, 8 },
+    { 165, 198, 176, 33 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 189, 183, 8 },
+    { 173, 206, 184, 33 }, { 189, 181, 186, 8 }, { 189, 181, 186, 8 }, { 198, 165, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 198, 192, 9 },
+    { 189, 198, 192, 9 }, { 189, 198, 192, 9 }, { 189, 206, 194, 17 }, { 198, 189, 195, 9 }, { 198, 189, 195, 9 }, { 189, 214, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 198, 206, 200, 8 }, { 198, 206, 200, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 214, 189, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 214, 208, 8 }, { 198, 231, 209, 33 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 222, 216, 8 }, { 206, 239, 217, 33 }, { 222, 214, 219, 8 }, { 222, 214, 219, 8 }, { 231, 198, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 239, 227, 17 }, { 231, 222, 228, 9 }, { 231, 222, 228, 9 }, { 222, 247, 230, 25 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 239, 233, 8 }, { 231, 239, 233, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 247, 222, 238, 25 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 247, 241, 8 }, { 239, 247, 241, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 255, 249, 8 }, { 247, 255, 249, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_3[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 4, 1, 4 }, { 4, 0, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 5, 4 }, { 8, 4, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 12, 9, 4 }, { 12, 8, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 13, 4 }, { 16, 12, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 20, 17, 4 }, { 20, 16, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 21, 4 }, { 24, 20, 22, 4 }, { 0, 69, 23, 69 },
+    { 24, 24, 24, 0 }, { 24, 28, 25, 4 }, { 28, 24, 26, 4 }, { 8, 65, 27, 57 }, { 28, 28, 28, 0 }, { 28, 32, 29, 4 }, { 32, 28, 30, 4 }, { 12, 69, 31, 57 },
+    { 32, 32, 32, 0 }, { 32, 36, 33, 4 }, { 36, 32, 34, 4 }, { 20, 65, 35, 45 }, { 36, 36, 36, 0 }, { 36, 40, 37, 4 }, { 40, 36, 38, 4 }, { 24, 69, 39, 45 },
+    { 40, 40, 40, 0 }, { 40, 44, 41, 4 }, { 44, 40, 42, 4 }, { 32, 65, 43, 33 }, { 44, 44, 44, 0 }, { 44, 48, 45, 4 }, { 48, 44, 46, 4 }, { 36, 69, 47, 33 },
+    { 48, 48, 48, 0 }, { 48, 52, 49, 4 }, { 52, 48, 50, 4 }, { 44, 65, 51, 21 }, { 52, 52, 52, 0 }, { 52, 56, 53, 4 }, { 56, 52, 54, 4 }, { 48, 69, 55, 21 },
+    { 56, 56, 56, 0 }, { 56, 60, 57, 4 }, { 60, 56, 58, 4 }, { 56, 65, 59, 9 }, { 60, 60, 60, 0 }, { 60, 65, 61, 5 }, { 65, 56, 62, 9 }, { 65, 60, 63, 5 },
+    { 60, 73, 64, 13 }, { 65, 65, 65, 0 }, { 65, 69, 66, 4 }, { 69, 65, 67, 4 }, { 73, 60, 68, 13 }, { 69, 69, 69, 0 }, { 69, 73, 70, 4 }, { 73, 69, 71, 4 },
+    { 81, 56, 72, 25 }, { 73, 73, 73, 0 }, { 73, 77, 74, 4 }, { 77, 73, 75, 4 }, { 85, 60, 76, 25 }, { 77, 77, 77, 0 }, { 77, 81, 78, 4 }, { 81, 77, 79, 4 },
+    { 93, 56, 80, 37 }, { 81, 81, 81, 0 }, { 81, 85, 82, 4 }, { 85, 81, 83, 4 }, { 97, 60, 84, 37 }, { 85, 85, 85, 0 }, { 85, 89, 86, 4 }, { 89, 85, 87, 4 },
+    { 105, 56, 88, 49 }, { 89, 89, 89, 0 }, { 89, 93, 90, 4 }, { 93, 89, 91, 4 }, { 109, 60, 92, 49 }, { 93, 93, 93, 0 }, { 93, 97, 94, 4 }, { 97, 93, 95, 4 },
+    { 77, 134, 96, 57 }, { 97, 97, 97, 0 }, { 97, 101, 98, 4 }, { 101, 97, 99, 4 }, { 85, 130, 100, 45 }, { 101, 101, 101, 0 }, { 101, 105, 102, 4 }, { 105, 101, 103, 4 },
+    { 89, 134, 104, 45 }, { 105, 105, 105, 0 }, { 105, 109, 106, 4 }, { 109, 105, 107, 4 }, { 97, 130, 108, 33 }, { 109, 109, 109, 0 }, { 109, 113, 110, 4 }, { 113, 109, 111, 4 },
+    { 101, 134, 112, 33 }, { 113, 113, 113, 0 }, { 113, 117, 114, 4 }, { 117, 113, 115, 4 }, { 109, 130, 116, 21 }, { 117, 117, 117, 0 }, { 117, 121, 118, 4 }, { 121, 117, 119, 4 },
+    { 113, 134, 120, 21 }, { 121, 121, 121, 0 }, { 121, 125, 122, 4 }, { 125, 121, 123, 4 }, { 121, 130, 124, 9 }, { 125, 125, 125, 0 }, { 125, 130, 126, 5 }, { 130, 121, 127, 9 },
+    { 130, 125, 128, 5 }, { 125, 138, 129, 13 }, { 130, 130, 130, 0 }, { 130, 134, 131, 4 }, { 134, 130, 132, 4 }, { 138, 125, 133, 13 }, { 134, 134, 134, 0 }, { 134, 138, 135, 4 },
+    { 138, 134, 136, 4 }, { 146, 121, 137, 25 }, { 138, 138, 138, 0 }, { 138, 142, 139, 4 }, { 142, 138, 140, 4 }, { 150, 125, 141, 25 }, { 142, 142, 142, 0 }, { 142, 146, 143, 4 },
+    { 146, 142, 144, 4 }, { 158, 121, 145, 37 }, { 146, 146, 146, 0 }, { 146, 150, 147, 4 }, { 150, 146, 148, 4 }, { 162, 125, 149, 37 }, { 150, 150, 150, 0 }, { 150, 154, 151, 4 },
+    { 154, 150, 152, 4 }, { 170, 121, 153, 49 }, { 154, 154, 154, 0 }, { 154, 158, 155, 4 }, { 158, 154, 156, 4 }, { 174, 125, 157, 49 }, { 158, 158, 158, 0 }, { 158, 162, 159, 4 },
+    { 162, 158, 160, 4 }, { 142, 199, 161, 57 }, { 162, 162, 162, 0 }, { 162, 166, 163, 4 }, { 166, 162, 164, 4 }, { 150, 195, 165, 45 }, { 166, 166, 166, 0 }, { 166, 170, 167, 4 },
+    { 170, 166, 168, 4 }, { 154, 199, 169, 45 }, { 170, 170, 170, 0 }, { 170, 174, 171, 4 }, { 174, 170, 172, 4 }, { 162, 195, 173, 33 }, { 174, 174, 174, 0 }, { 174, 178, 175, 4 },
+    { 178, 174, 176, 4 }, { 166, 199, 177, 33 }, { 178, 178, 178, 0 }, { 178, 182, 179, 4 }, { 182, 178, 180, 4 }, { 174, 195, 181, 21 }, { 182, 182, 182, 0 }, { 182, 186, 183, 4 },
+    { 186, 182, 184, 4 }, { 178, 199, 185, 21 }, { 186, 186, 186, 0 }, { 186, 190, 187, 4 }, { 190, 186, 188, 4 }, { 186, 195, 189, 9 }, { 190, 190, 190, 0 }, { 190, 195, 191, 5 },
+    { 195, 186, 192, 9 }, { 195, 190, 193, 5 }, { 190, 203, 194, 13 }, { 195, 195, 195, 0 }, { 195, 199, 196, 4 }, { 199, 195, 197, 4 }, { 203, 190, 198, 13 }, { 199, 199, 199, 0 },
+    { 199, 203, 200, 4 }, { 203, 199, 201, 4 }, { 211, 186, 202, 25 }, { 203, 203, 203, 0 }, { 203, 207, 204, 4 }, { 207, 203, 205, 4 }, { 215, 190, 206, 25 }, { 207, 207, 207, 0 },
+    { 207, 211, 208, 4 }, { 211, 207, 209, 4 }, { 223, 186, 210, 37 }, { 211, 211, 211, 0 }, { 211, 215, 212, 4 }, { 215, 211, 213, 4 }, { 227, 190, 214, 37 }, { 215, 215, 215, 0 },
+    { 215, 219, 216, 4 }, { 219, 215, 217, 4 }, { 235, 186, 218, 49 }, { 219, 219, 219, 0 }, { 219, 223, 220, 4 }, { 223, 219, 221, 4 }, { 239, 190, 222, 49 }, { 223, 223, 223, 0 },
+    { 223, 227, 224, 4 }, { 227, 223, 225, 4 }, { 247, 186, 226, 61 }, { 227, 227, 227, 0 }, { 227, 231, 228, 4 }, { 231, 227, 229, 4 }, { 251, 190, 230, 61 }, { 231, 231, 231, 0 },
+    { 231, 235, 232, 4 }, { 235, 231, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 236, 4 }, { 239, 235, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 243, 240, 4 }, { 243, 239, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 244, 4 }, { 247, 243, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 251, 248, 4 }, { 251, 247, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 252, 4 }, { 255, 251, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_2[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
+    { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
+    { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
+    { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
+    { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
+    { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
+    { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
+    { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
+    { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
+    { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
+    { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
+    { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
+    { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
+    { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
+    { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
+    { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
+    { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
+    { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
+    { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
+    { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_2[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
+    { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
+    { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
+    { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
+    { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
+    { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
+    { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
+    { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
+    { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
+    { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
+    { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
+    { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
+    { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
+    { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
+    { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
+    { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
+    { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
+    { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
+    { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
+    { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
+    { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
+    { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_3_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 2, 8 }, { 0, 8, 2, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 0, 5, 8 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 10, 8 }, { 0, 33, 11, 33 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 8, 13, 8 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 18, 8 }, { 8, 41, 19, 33 }, { 24, 16, 21, 8 }, { 24, 16, 21, 8 }, { 33, 0, 22, 33 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 33, 27, 9 }, { 24, 41, 29, 17 }, { 33, 24, 30, 9 }, { 33, 24, 30, 9 },
+    { 24, 49, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 33, 41, 35, 8 }, { 33, 41, 35, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 }, { 41, 33, 38, 8 },
+    { 49, 24, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 43, 8 }, { 33, 66, 44, 33 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 }, { 49, 41, 46, 8 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 51, 8 }, { 41, 74, 52, 33 }, { 57, 49, 54, 8 }, { 57, 49, 54, 8 }, { 66, 33, 55, 33 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 66, 60, 9 }, { 57, 74, 62, 17 }, { 66, 57, 63, 9 },
+    { 66, 57, 63, 9 }, { 57, 82, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 66, 74, 68, 8 }, { 66, 74, 68, 8 }, { 74, 66, 71, 8 }, { 74, 66, 71, 8 },
+    { 74, 66, 71, 8 }, { 82, 57, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 76, 8 }, { 66, 99, 77, 33 }, { 82, 74, 79, 8 }, { 82, 74, 79, 8 },
+    { 82, 74, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 84, 8 }, { 74, 107, 85, 33 }, { 90, 82, 87, 8 }, { 90, 82, 87, 8 },
+    { 99, 66, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 99, 93, 9 }, { 90, 107, 95, 17 },
+    { 99, 90, 96, 9 }, { 99, 90, 96, 9 }, { 90, 115, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 99, 107, 101, 8 }, { 99, 107, 101, 8 }, { 107, 99, 104, 8 },
+    { 107, 99, 104, 8 }, { 107, 99, 104, 8 }, { 115, 90, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 109, 8 }, { 99, 132, 110, 33 }, { 115, 107, 112, 8 },
+    { 115, 107, 112, 8 }, { 115, 107, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 117, 8 }, { 107, 140, 118, 33 }, { 123, 115, 120, 8 },
+    { 123, 115, 120, 8 }, { 132, 99, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 }, { 123, 132, 126, 9 },
+    { 123, 140, 128, 17 }, { 132, 123, 129, 9 }, { 132, 123, 129, 9 }, { 123, 148, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 132, 140, 134, 8 }, { 132, 140, 134, 8 },
+    { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 140, 132, 137, 8 }, { 148, 123, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 142, 8 }, { 132, 165, 143, 33 },
+    { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 140, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 150, 8 }, { 140, 173, 151, 33 },
+    { 156, 148, 153, 8 }, { 156, 148, 153, 8 }, { 165, 132, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 159, 9 }, { 156, 165, 159, 9 },
+    { 156, 165, 159, 9 }, { 156, 173, 161, 17 }, { 165, 156, 162, 9 }, { 165, 156, 162, 9 }, { 156, 181, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 165, 173, 167, 8 },
+    { 165, 173, 167, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 173, 165, 170, 8 }, { 181, 156, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 181, 175, 8 },
+    { 165, 198, 176, 33 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 173, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 189, 183, 8 },
+    { 173, 206, 184, 33 }, { 189, 181, 186, 8 }, { 189, 181, 186, 8 }, { 198, 165, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 198, 192, 9 },
+    { 189, 198, 192, 9 }, { 189, 198, 192, 9 }, { 189, 206, 194, 17 }, { 198, 189, 195, 9 }, { 198, 189, 195, 9 }, { 189, 214, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 198, 206, 200, 8 }, { 198, 206, 200, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 206, 198, 203, 8 }, { 214, 189, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 214, 208, 8 }, { 198, 231, 209, 33 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 206, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 222, 216, 8 }, { 206, 239, 217, 33 }, { 222, 214, 219, 8 }, { 222, 214, 219, 8 }, { 231, 198, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 231, 225, 9 }, { 222, 239, 227, 17 }, { 231, 222, 228, 9 }, { 231, 222, 228, 9 }, { 222, 247, 230, 25 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 239, 233, 8 }, { 231, 239, 233, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 239, 231, 236, 8 }, { 247, 222, 238, 25 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 247, 241, 8 }, { 239, 247, 241, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 239, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 255, 249, 8 }, { 247, 255, 249, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 247, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_3_p[256] =
+{
+    { 0, 0, 0, 0 }, { 0, 4, 1, 4 }, { 4, 0, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 5, 4 }, { 8, 4, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 12, 9, 4 }, { 12, 8, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 13, 4 }, { 16, 12, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 20, 17, 4 }, { 20, 16, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 21, 4 }, { 24, 20, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 28, 25, 4 }, { 28, 24, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 29, 4 }, { 32, 28, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 36, 33, 4 }, { 36, 32, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 37, 4 }, { 40, 36, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 44, 41, 4 }, { 44, 40, 42, 4 }, { 32, 65, 43, 33 }, { 44, 44, 44, 0 }, { 44, 48, 45, 4 }, { 48, 44, 46, 4 }, { 36, 69, 47, 33 },
+    { 48, 48, 48, 0 }, { 48, 52, 49, 4 }, { 52, 48, 50, 4 }, { 44, 65, 51, 21 }, { 52, 52, 52, 0 }, { 52, 56, 53, 4 }, { 56, 52, 54, 4 }, { 48, 69, 55, 21 },
+    { 56, 56, 56, 0 }, { 56, 60, 57, 4 }, { 60, 56, 58, 4 }, { 56, 65, 59, 9 }, { 60, 60, 60, 0 }, { 60, 65, 61, 5 }, { 65, 56, 62, 9 }, { 65, 60, 63, 5 },
+    { 60, 73, 64, 13 }, { 65, 65, 65, 0 }, { 65, 69, 66, 4 }, { 69, 65, 67, 4 }, { 73, 60, 68, 13 }, { 69, 69, 69, 0 }, { 69, 73, 70, 4 }, { 73, 69, 71, 4 },
+    { 81, 56, 72, 25 }, { 73, 73, 73, 0 }, { 73, 77, 74, 4 }, { 77, 73, 75, 4 }, { 85, 60, 76, 25 }, { 77, 77, 77, 0 }, { 77, 81, 78, 4 }, { 81, 77, 79, 4 },
+    { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 82, 4 }, { 85, 81, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 86, 4 }, { 89, 85, 87, 4 },
+    { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 90, 4 }, { 93, 89, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 94, 4 }, { 97, 93, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 98, 4 }, { 101, 97, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 102, 4 }, { 105, 101, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 106, 4 }, { 109, 105, 107, 4 }, { 97, 130, 108, 33 }, { 109, 109, 109, 0 }, { 109, 113, 110, 4 }, { 113, 109, 111, 4 },
+    { 101, 134, 112, 33 }, { 113, 113, 113, 0 }, { 113, 117, 114, 4 }, { 117, 113, 115, 4 }, { 109, 130, 116, 21 }, { 117, 117, 117, 0 }, { 117, 121, 118, 4 }, { 121, 117, 119, 4 },
+    { 113, 134, 120, 21 }, { 121, 121, 121, 0 }, { 121, 125, 122, 4 }, { 125, 121, 123, 4 }, { 121, 130, 124, 9 }, { 125, 125, 125, 0 }, { 125, 130, 126, 5 }, { 130, 121, 127, 9 },
+    { 130, 125, 128, 5 }, { 125, 138, 129, 13 }, { 130, 130, 130, 0 }, { 130, 134, 131, 4 }, { 134, 130, 132, 4 }, { 138, 125, 133, 13 }, { 134, 134, 134, 0 }, { 134, 138, 135, 4 },
+    { 138, 134, 136, 4 }, { 146, 121, 137, 25 }, { 138, 138, 138, 0 }, { 138, 142, 139, 4 }, { 142, 138, 140, 4 }, { 150, 125, 141, 25 }, { 142, 142, 142, 0 }, { 142, 146, 143, 4 },
+    { 146, 142, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 147, 4 }, { 150, 146, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 154, 151, 4 },
+    { 154, 150, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 155, 4 }, { 158, 154, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 162, 159, 4 },
+    { 162, 158, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 163, 4 }, { 166, 162, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 170, 167, 4 },
+    { 170, 166, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 171, 4 }, { 174, 170, 172, 4 }, { 162, 195, 173, 33 }, { 174, 174, 174, 0 }, { 174, 178, 175, 4 },
+    { 178, 174, 176, 4 }, { 166, 199, 177, 33 }, { 178, 178, 178, 0 }, { 178, 182, 179, 4 }, { 182, 178, 180, 4 }, { 174, 195, 181, 21 }, { 182, 182, 182, 0 }, { 182, 186, 183, 4 },
+    { 186, 182, 184, 4 }, { 178, 199, 185, 21 }, { 186, 186, 186, 0 }, { 186, 190, 187, 4 }, { 190, 186, 188, 4 }, { 186, 195, 189, 9 }, { 190, 190, 190, 0 }, { 190, 195, 191, 5 },
+    { 195, 186, 192, 9 }, { 195, 190, 193, 5 }, { 190, 203, 194, 13 }, { 195, 195, 195, 0 }, { 195, 199, 196, 4 }, { 199, 195, 197, 4 }, { 203, 190, 198, 13 }, { 199, 199, 199, 0 },
+    { 199, 203, 200, 4 }, { 203, 199, 201, 4 }, { 211, 186, 202, 25 }, { 203, 203, 203, 0 }, { 203, 207, 204, 4 }, { 207, 203, 205, 4 }, { 215, 190, 206, 25 }, { 207, 207, 207, 0 },
+    { 207, 211, 208, 4 }, { 211, 207, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 212, 4 }, { 215, 211, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
+    { 215, 219, 216, 4 }, { 219, 215, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 220, 4 }, { 223, 219, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
+    { 223, 227, 224, 4 }, { 227, 223, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 228, 4 }, { 231, 227, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 235, 232, 4 }, { 235, 231, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 236, 4 }, { 239, 235, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 243, 240, 4 }, { 243, 239, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 244, 4 }, { 247, 243, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 251, 248, 4 }, { 251, 247, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 252, 4 }, { 255, 251, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor5_2_p[256] = // 256 entries; presumably indexed by an 8-bit target value - TODO confirm against the code that reads this table.
+{ // Every entry below reads { a, b, floor((a + b) / 2), b - a } with a <= b, and a/b are 5-bit values expanded to 8 bits ((v << 3) | (v >> 2)). NOTE(review): field names come from TableEntry, which is declared outside this chunk.
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
+    { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
+    { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
+    { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
+    { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
+    { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
+    { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
+    { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
+    { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
+    { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
+    { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
+    { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
+    { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
+    { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
+    { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
+    { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
+    { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
+    { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
+    { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
+    { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
+    { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
+    { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
+    { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
+    { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
+    { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
+    { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+TableEntry g_singleColor6_2_p[256] = // 256 entries; presumably indexed by an 8-bit target value - TODO confirm against the code that reads this table.
+{ // Every entry below reads { a, b, floor((a + b) / 2), b - a } with a <= b, and a/b are 6-bit values expanded to 8 bits ((v << 2) | (v >> 4)). NOTE(review): field names come from TableEntry, which is declared outside this chunk.
+    { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
+    { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
+    { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
+    { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
+    { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
+    { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
+    { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
+    { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
+    { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
+    { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
+    { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
+    { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
+    { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
+    { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
+    { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
+    { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
+    { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
+    { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
+    { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
+    { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
+    { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
+    { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
+    { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
+    { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
+    { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
+    { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
+    { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
+    { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
+    { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
+    { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
+    { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
+    { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
+};
+
+}}}
diff --git a/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp b/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp
new file mode 100644
index 0000000000..ad59988655
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_SingleFile.cpp
@@ -0,0 +1,48 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if defined(CVTT_SINGLE_FILE)
+#define CVTT_SINGLE_FILE_IMPL
+
+#include "ConvectionKernels_API.cpp"
+#include "ConvectionKernels_BC67.cpp"
+#include "ConvectionKernels_BC6H_IO.cpp"
+#include "ConvectionKernels_BC7_PrioData.cpp"
+#include "ConvectionKernels_BCCommon.cpp"
+#include "ConvectionKernels_ETC.cpp"
+#include "ConvectionKernels_IndexSelector.cpp"
+#include "ConvectionKernels_S3TC.cpp"
+#include "ConvectionKernels_Util.cpp"
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h b/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h
new file mode 100644
index 0000000000..371cbe54bf
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_UnfinishedEndpoints.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include "ConvectionKernels_Util.h"
+
+namespace cvtt
+{
+    namespace Internal
+    {
+        template<int TVectorSize>
+        class UnfinishedEndpoints // Endpoint pair held as TVectorSize float channels of base + offset; the Finish* methods turn it into two integer endpoints.
+        {
+        public:
+            typedef ParallelMath::Float MFloat;
+            typedef ParallelMath::UInt16 MUInt16; // not referenced elsewhere in this class
+            typedef ParallelMath::UInt15 MUInt15;
+            typedef ParallelMath::SInt16 MSInt16;
+            typedef ParallelMath::SInt32 MSInt32; // not referenced elsewhere in this class
+            // Default constructor: empty body, so m_base/m_offset hold whatever MFloat's default construction provides.
+            UnfinishedEndpoints()
+            {
+            }
+            // Copies TVectorSize channels from each of `base` and `offset`; both arrays must hold at least that many elements.
+            UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_base[ch] = base[ch];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_offset[ch] = offset[ch];
+            }
+            // Copy constructor: channel-by-channel copy of both arrays.
+            UnfinishedEndpoints(const UnfinishedEndpoints& other)
+            {
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_base[ch] = other.m_base[ch];
+                for (int ch = 0; ch < TVectorSize; ch++)
+                    m_offset[ch] = other.m_offset[ch];
+            }
+            // Unsigned HDR: per channel, endpoint i = base + offset * tweakFactors[i], clamped to [0, 31743], rounded via roundingMode, written as MSInt16 to outEP0/outEP1.
+            void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
+            {
+                float tweakFactors[2];
+                Util::ComputeTweakFactors(tweak, range, tweakFactors);
+                // tweakFactors[0] scales the offset for endpoint 0 (outEP0), tweakFactors[1] for endpoint 1 (outEP1).
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MUInt15 channelEPs[2];
+                    for (int epi = 0; epi < 2; epi++)
+                    {
+                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f); // 31743 == 0x7BFF; presumably the largest finite half-float bit pattern - TODO confirm
+                        channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode);
+                    }
+                    // Values were clamped to [0, 31743] above, so the U15 -> S16 cast is presumably value-preserving (LosslessCast is defined elsewhere).
+                    outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]);
+                    outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]);
+                }
+            }
+            // Signed HDR: same computation as FinishHDRUnsigned, but clamped to [-31743, 31743] and converted with RoundAndConvertToS16.
+            void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
+            {
+                float tweakFactors[2];
+                Util::ComputeTweakFactors(tweak, range, tweakFactors);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MSInt16 channelEPs[2];
+                    for (int epi = 0; epi < 2; epi++)
+                    {
+                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f);
+                        channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode);
+                    }
+
+                    outEP0[ch] = channelEPs[0];
+                    outEP1[ch] = channelEPs[1];
+                }
+            }
+            // LDR: per channel, endpoint i = base + offset * tweakFactors[i], clamped to [0, 255] and written as MUInt15; unlike the HDR variants it takes no rounding-mode argument.
+            void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
+            {
+                ParallelMath::RoundTowardNearestForScope roundingMode; // local instance passed to the conversions below; presumably a scope guard for the FP rounding mode - TODO confirm
+
+                float tweakFactors[2];
+                Util::ComputeTweakFactors(tweak, range, tweakFactors);
+
+                for (int ch = 0; ch < TVectorSize; ch++)
+                {
+                    MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f);
+                    MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f);
+                    outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode);
+                    outEP1[ch] = ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode);
+                }
+            }
+            // Returns a TNewVectorSize-channel copy: existing channels are copied (up to the smaller size); any added channels get base = filler and offset = 0.
+            template<int TNewVectorSize>
+            UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
+            {
+                MFloat newBase[TNewVectorSize];
+                MFloat newOffset[TNewVectorSize];
+
+                for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++)
+                {
+                    newBase[ch] = m_base[ch];
+                    newOffset[ch] = m_offset[ch];
+                }
+
+                MFloat fillerV = ParallelMath::MakeFloat(filler);
+                // This loop runs only when TNewVectorSize > TVectorSize; a zero offset makes both finished endpoints of an added channel equal the filler (before clamping).
+                for (int ch = TVectorSize; ch < TNewVectorSize; ch++)
+                {
+                    newBase[ch] = fillerV;
+                    newOffset[ch] = ParallelMath::MakeFloatZero();
+                }
+
+                return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset);
+            }
+            // Per-channel state: a finished endpoint is m_base + m_offset * factor, with the factor produced by Util::ComputeTweakFactors.
+        private:
+            MFloat m_base[TVectorSize];
+            MFloat m_offset[TVectorSize];
+        };
+    }
+}
diff --git a/thirdparty/cvtt/ConvectionKernels_Util.cpp b/thirdparty/cvtt/ConvectionKernels_Util.cpp
new file mode 100644
index 0000000000..d9c25c7845
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Util.cpp
@@ -0,0 +1,88 @@
+/*
+Convection Texture Tools
+Copyright (c) 2018-2019 Eric Lasota
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject
+to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-------------------------------------------------------------------------------------
+
+Portions based on DirectX Texture Library (DirectXTex)
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+
+http://go.microsoft.com/fwlink/?LinkId=248926
+*/
+#include "ConvectionKernels_Config.h"
+
+#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
+
+#include "ConvectionKernels.h"
+#include "ConvectionKernels_ParallelMath.h"
+
+#include <algorithm>
+
+namespace cvtt
+{
+    namespace Util
+    {
+        // Signed input blocks are converted into unsigned space, with the maximum value being 254
+        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
+        { // Converts ParallelMath::ParallelSize blocks; each block is walked as 16 pixels x 4 channels of m_pixels.
+            for (size_t block = 0; block < ParallelMath::ParallelSize; block++)
+            {
+                const PixelBlockS8& inputSignedBlock = inputSigned[block];
+                PixelBlockU8& inputNormalizedBlock = inputNormalized[block];
+                // Each output channel is max(input, -127) + 127: inputs of -127 and below map to 0, and an input of 127 maps to 254.
+                for (size_t px = 0; px < 16; px++)
+                {
+                    for (size_t ch = 0; ch < 4; ch++)
+                        inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127);
+                }
+            }
+        }
+
+        void FillWeights(const Options &options, float channelWeights[4])
+        { // Writes channelWeights[0..3] in red, green, blue, alpha order.
+            if (options.flags & Flags::Uniform) // Uniform flag set: every channel weighs 1.0 and the per-channel option weights are ignored
+                channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f;
+            else
+            {
+                channelWeights[0] = options.redWeight;
+                channelWeights[1] = options.greenWeight;
+                channelWeights[2] = options.blueWeight;
+                channelWeights[3] = options.alphaWeight;
+            }
+        }
+
+        void ComputeTweakFactors(int tweak, int range, float *outFactors)
+        { // Writes two floats to outFactors; UnfinishedEndpoints uses them as endpoint = base + offset * factor.
+            int totalUnits = range - 1;
+            int minOutsideUnits = ((tweak >> 1) & 1); // bit 1 of tweak: 0 or 1
+            int maxOutsideUnits = (tweak & 1); // bit 0 of tweak: 0 or 1
+            int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits;
+            // With insideUnits > 0: outFactors[0] = -minOutside/inside (<= 0), outFactors[1] = 1 + maxOutside/inside (>= 1). NOTE(review): insideUnits is the divisor and is not guarded against zero here; confirm callers never pass range - 1 equal to the number of tweak bits set.
+            outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits);
+            outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f;
+        }
+    }
+}
+
+#endif
diff --git a/thirdparty/cvtt/ConvectionKernels_Util.h b/thirdparty/cvtt/ConvectionKernels_Util.h
new file mode 100644
index 0000000000..c07b9bf2aa
--- /dev/null
+++ b/thirdparty/cvtt/ConvectionKernels_Util.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "ConvectionKernels_ParallelMath.h"
+
+namespace cvtt // Forward declarations of the types named in the Util function signatures later in this header; their definitions are not in this file (presumably ConvectionKernels.h, which ConvectionKernels_Util.cpp includes - TODO confirm).
+{
+    struct PixelBlockU8;
+    struct PixelBlockS8;
+    struct Options;
+}
+
+namespace cvtt
+{
+    namespace Util
+    {
+        // Signed input blocks are converted into unsigned space, with the maximum value being 254
+        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize]);
+        void FillWeights(const Options &options, float channelWeights[4]); // Fills channelWeights with 1.0 for all channels when Flags::Uniform is set, otherwise with the red/green/blue/alpha weights from options.
+        void ComputeTweakFactors(int tweak, int range, float *outFactors); // Writes two scale factors to outFactors, derived from the low two bits of tweak and from range - 1.
+    }
+}
diff --git a/thirdparty/cvtt/etc_notes.txt b/thirdparty/cvtt/etc_notes.txt
new file mode 100644
index 0000000000..bb041a8435
--- /dev/null
+++ b/thirdparty/cvtt/etc_notes.txt
@@ -0,0 +1,27 @@
+The ETC1 compressor uses modified cluster fit:
+
+Assume that there exists an ideal base color and set of selectors for a given table.
+For a given table and set of selectors, the ideal base color can be determined by subtracting the offsets from each pixel and averaging them.
+Doing that is equivalent to subtracting the average offset from the average color.
+Because positive and negative selectors of the same magnitude cancel out, the search space of possible average offsets is reduced: 57 unique offsets for the first table and 81 for the others.
+Most of the offsets result in the same color as another average offset due to quantization of the base color, so those can be de-duplicated.
+So:
+- Start with a high-precision average color.
+- Apply precomputed luma offsets to it.
+- Quantize and de-duplicate the base colors.
+- Find the ideal selectors for each base color.
+
+Differential mode is solved by just finding the best legal combination from those attempts.
+
+There are several scenarios where this is not ideal:
+- Clamping behavior can sometimes be leveraged for a more accurate block.
+- Differentials can sometimes be moved slightly closer to become legal.
+- This only works when MSE is the error metric (i.e. not normal maps)
+- This only works when pixel weights are of equal importance (i.e. not using weight by alpha or edge deblocking)
+
+T and H modes work by computing a chrominance line, splitting the block in half at the chrominance midpoint to generate cluster assignments, and using those assignments to determine the averages.
+
+Planar mode is just solved algebraically.
+
+If you want to emulate etc2comp's default settings, add the flag ETC_UseFakeBT709 to use its modified Rec. 709 error coefficients.
+Doing that will significantly slow down encoding because it requires much more complicated quantization math.
+\ No newline at end of file
diff --git a/thirdparty/fonts/Hack_Regular.ttf b/thirdparty/fonts/Hack_Regular.ttf
deleted file mode 100644
index 92a90cb06e..0000000000
--- a/thirdparty/fonts/Hack_Regular.ttf
+++ /dev/null
diff --git a/thirdparty/fonts/JetBrainsMono_Regular.ttf b/thirdparty/fonts/JetBrainsMono_Regular.ttf
new file mode 100644
index 0000000000..8da8aa4051
--- /dev/null
+++ b/thirdparty/fonts/JetBrainsMono_Regular.ttf
diff --git a/thirdparty/fonts/LICENSE.JetBrainsMono.txt b/thirdparty/fonts/LICENSE.JetBrainsMono.txt
new file mode 100644
index 0000000000..e5f5dd62fc
--- /dev/null
+++ b/thirdparty/fonts/LICENSE.JetBrainsMono.txt
@@ -0,0 +1,93 @@
+Copyright 2020, The JetBrains Mono Project Authors (https://github.com/JetBrains/JetBrainsMono)
+
+This Font Software is licensed under the SIL Open Font License, Version 1.1.
+
+This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL
+
+
+-----------------------------------------------------------
+SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
+-----------------------------------------------------------
+
+PREAMBLE
+The goals of the Open Font License (OFL) are to stimulate worldwide
+development of collaborative font projects, to support the font creation
+efforts of academic and linguistic communities, and to provide a free and
+open framework in which fonts may be shared and improved in partnership
+with others.
+
+The OFL allows the licensed fonts to be used, studied, modified and
+redistributed freely as long as they are not sold by themselves. The
+fonts, including any derivative works, can be bundled, embedded,
+redistributed and/or sold with any software provided that any reserved
+names are not used by derivative works. The fonts and derivatives,
+however, cannot be released under any other type of license. The
+requirement for fonts to remain under this license does not apply
+to any document created using the fonts or their derivatives.
+
+DEFINITIONS
+"Font Software" refers to the set of files released by the Copyright
+Holder(s) under this license and clearly marked as such. This may
+include source files, build scripts and documentation.
+
+"Reserved Font Name" refers to any names specified as such after the
+copyright statement(s).
+
+"Original Version" refers to the collection of Font Software components as
+distributed by the Copyright Holder(s).
+
+"Modified Version" refers to any derivative made by adding to, deleting,
+or substituting -- in part or in whole -- any of the components of the
+Original Version, by changing formats or by porting the Font Software to a
+new environment.
+
+"Author" refers to any designer, engineer, programmer, technical
+writer or other person who contributed to the Font Software.
+
+PERMISSION & CONDITIONS
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Font Software, to use, study, copy, merge, embed, modify,
+redistribute, and sell modified and unmodified copies of the Font
+Software, subject to the following conditions:
+
+1) Neither the Font Software nor any of its individual components,
+in Original or Modified Versions, may be sold by itself.
+
+2) Original or Modified Versions of the Font Software may be bundled,
+redistributed and/or sold with any software, provided that each copy
+contains the above copyright notice and this license. These can be
+included either as stand-alone text files, human-readable headers or
+in the appropriate machine-readable metadata fields within text or
+binary files as long as those fields can be easily viewed by the user.
+
+3) No Modified Version of the Font Software may use the Reserved Font
+Name(s) unless explicit written permission is granted by the corresponding
+Copyright Holder. This restriction only applies to the primary font name as
+presented to the users.
+
+4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
+Software shall not be used to promote, endorse or advertise any
+Modified Version, except to acknowledge the contribution(s) of the
+Copyright Holder(s) and the Author(s) or with their explicit written
+permission.
+
+5) The Font Software, modified or unmodified, in part or in whole,
+must be distributed entirely under this license, and must not be
+distributed under any other license. The requirement for fonts to
+remain under this license does not apply to any document created
+using the Font Software.
+
+TERMINATION
+This license becomes null and void if any of the above conditions are
+not met.
+
+DISCLAIMER
+THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
+DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
+OTHER DEALINGS IN THE FONT SOFTWARE.
diff --git a/thirdparty/fonts/LICENSE_Hack.md b/thirdparty/fonts/LICENSE_Hack.md
deleted file mode 100644
index 08927e504f..0000000000
--- a/thirdparty/fonts/LICENSE_Hack.md
+++ /dev/null
@@ -1,45 +0,0 @@
-The work in the Hack project is Copyright 2018 Source Foundry Authors and licensed under the MIT License
-
-The work in the DejaVu project was committed to the public domain.
-
-Bitstream Vera Sans Mono Copyright 2003 Bitstream Inc. and licensed under the Bitstream Vera License with Reserved Font Names "Bitstream" and "Vera"
-
-### MIT License
-
-Copyright (c) 2018 Source Foundry Authors
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-### BITSTREAM VERA LICENSE
-
-Copyright (c) 2003 by Bitstream, Inc. All Rights Reserved. Bitstream Vera is a trademark of Bitstream, Inc.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of the fonts accompanying this license ("Fonts") and associated documentation files (the "Font Software"), to reproduce and distribute the Font Software, including without limitation the rights to use, copy, merge, publish, distribute, and/or sell copies of the Font Software, and to permit persons to whom the Font Software is furnished to do so, subject to the following conditions:
-
-The above copyright and trademark notices and this permission notice shall be included in all copies of one or more of the Font Software typefaces.
-
-The Font Software may be modified, altered, or added to, and in particular the designs of glyphs or characters in the Fonts may be modified and additional glyphs or characters may be added to the Fonts, only if the fonts are renamed to names not containing either the words "Bitstream" or the word "Vera".
-
-This License becomes null and void to the extent applicable to Fonts or Font Software that has been modified and is distributed under the "Bitstream Vera" names.
-
-The Font Software may be sold as part of a larger software package but no copy of one or more of the Font Software typefaces may be sold by itself.
-
-THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL BITSTREAM OR THE GNOME FOUNDATION BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM OTHER DEALINGS IN THE FONT SOFTWARE.
-
-Except as contained in this notice, the names of Gnome, the Gnome Foundation, and Bitstream Inc., shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Font Software without prior written authorization from the Gnome Foundation or Bitstream Inc., respectively. For further information, contact: fonts at gnome dot org.
diff --git a/thirdparty/harfbuzz/src/hb-aat-layout-common.hh b/thirdparty/harfbuzz/src/hb-aat-layout-common.hh
index 1dcbe92904..1db0f1df92 100644
--- a/thirdparty/harfbuzz/src/hb-aat-layout-common.hh
+++ b/thirdparty/harfbuzz/src/hb-aat-layout-common.hh
@@ -839,7 +839,7 @@ struct StateTableDriver
     }
 
     if (!c->in_place)
-      buffer->swap_buffers ();
+      buffer->sync ();
   }
 
   public:
diff --git a/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh b/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh
index d745c11431..0bf9bd2912 100644
--- a/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh
+++ b/thirdparty/harfbuzz/src/hb-aat-layout-just-table.hh
@@ -146,7 +146,7 @@ struct DuctileGlyphAction
   HBUINT32	variationAxis;	/* The 4-byte tag identifying the ductile axis.
 				 * This would normally be 0x64756374 ('duct'),
 				 * but you may use any axis the font contains. */
-  HBFixed	minimumLimit;	/* The lowest value for the ductility axis tha
+  HBFixed	minimumLimit;	/* The lowest value for the ductility axis that
 				 * still yields an acceptable appearance. Normally
 				 * this will be 1.0. */
   HBFixed	noStretchValue; /* This is the default value that corresponds to
diff --git a/thirdparty/harfbuzz/src/hb-algs.hh b/thirdparty/harfbuzz/src/hb-algs.hh
index 446d87e28b..3a3ab08046 100644
--- a/thirdparty/harfbuzz/src/hb-algs.hh
+++ b/thirdparty/harfbuzz/src/hb-algs.hh
@@ -36,6 +36,7 @@
 
 #include <algorithm>
 #include <initializer_list>
+#include <functional>
 #include <new>
 
 /*
@@ -210,12 +211,23 @@ struct
 }
 HB_FUNCOBJ (hb_bool);
 
+template <typename T>
+static inline
+T hb_coerce (const T v) { return v; }
+template <typename T, typename V,
+	  hb_enable_if (!hb_is_same (hb_decay<T>, hb_decay<V>) && std::is_pointer<V>::value)>
+static inline
+T hb_coerce (const V v) { return *v; }
+
 struct
 {
   private:
 
   template <typename T> constexpr auto
-  impl (const T& v, hb_priority<1>) const HB_RETURN (uint32_t, hb_deref (v).hash ())
+  impl (const T& v, hb_priority<2>) const HB_RETURN (uint32_t, hb_deref (v).hash ())
+
+  template <typename T> constexpr auto
+  impl (const T& v, hb_priority<1>) const HB_RETURN (uint32_t, std::hash<hb_decay<decltype (hb_deref (v))>>{} (hb_deref (v)))
 
   template <typename T,
 	    hb_enable_if (std::is_integral<T>::value)> constexpr auto
@@ -435,23 +447,29 @@ struct
   private:
 
   template <typename T1, typename T2> auto
-  impl (T1&& v1, T2 &&v2, hb_priority<2>) const HB_AUTO_RETURN
+  impl (T1&& v1, T2 &&v2, hb_priority<3>) const HB_AUTO_RETURN
   (
     std::forward<T2> (v2).cmp (std::forward<T1> (v1)) == 0
   )
 
   template <typename T1, typename T2> auto
-  impl (T1&& v1, T2 &&v2, hb_priority<1>) const HB_AUTO_RETURN
+  impl (T1&& v1, T2 &&v2, hb_priority<2>) const HB_AUTO_RETURN
   (
     std::forward<T1> (v1).cmp (std::forward<T2> (v2)) == 0
   )
 
   template <typename T1, typename T2> auto
-  impl (T1&& v1, T2 &&v2, hb_priority<0>) const HB_AUTO_RETURN
+  impl (T1&& v1, T2 &&v2, hb_priority<1>) const HB_AUTO_RETURN
   (
     std::forward<T1> (v1) == std::forward<T2> (v2)
   )
 
+  template <typename T1, typename T2> auto
+  impl (T1&& v1, T2 &&v2, hb_priority<0>) const HB_AUTO_RETURN
+  (
+    std::forward<T2> (v2) == std::forward<T1> (v1)
+  )
+
   public:
 
   template <typename T1, typename T2> auto
@@ -472,6 +490,10 @@ struct hb_pair_t
   typedef T2 second_t;
   typedef hb_pair_t<T1, T2> pair_t;
 
+  template <typename U1 = T1, typename U2 = T2,
+	    hb_enable_if (std::is_default_constructible<U1>::value &&
+			  std::is_default_constructible<U2>::value)>
+  hb_pair_t () : first (), second () {}
   hb_pair_t (T1 a, T2 b) : first (a), second (b) {}
 
   template <typename Q1, typename Q2,
@@ -870,7 +892,7 @@ hb_bsearch_impl (unsigned *pos, /* Out */
 #pragma GCC diagnostic ignored "-Wcast-align"
     V* p = (V*) (((const char *) base) + (mid * stride));
 #pragma GCC diagnostic pop
-    int c = compar ((const void *) hb_addressof (key), (const void *) p, ds...);
+    int c = compar ((const void *) std::addressof (key), (const void *) p, ds...);
     if (c < 0)
       max = mid - 1;
     else if (c > 0)
diff --git a/thirdparty/harfbuzz/src/hb-array.hh b/thirdparty/harfbuzz/src/hb-array.hh
index 0beffb078f..1d1476d7cd 100644
--- a/thirdparty/harfbuzz/src/hb-array.hh
+++ b/thirdparty/harfbuzz/src/hb-array.hh
@@ -412,7 +412,7 @@ bool hb_array_t<T>::operator == (const hb_array_t<T> &o) const
   return true;
 }
 
-/* TODO Specialize opeator== for hb_bytes_t and hb_ubytes_t. */
+/* TODO Specialize operator== for hb_bytes_t and hb_ubytes_t. */
 
 template <>
 inline uint32_t hb_array_t<const char>::hash () const {
diff --git a/thirdparty/harfbuzz/src/hb-bimap.hh b/thirdparty/harfbuzz/src/hb-bimap.hh
index d466af8b60..a9e1278de7 100644
--- a/thirdparty/harfbuzz/src/hb-bimap.hh
+++ b/thirdparty/harfbuzz/src/hb-bimap.hh
@@ -33,20 +33,6 @@
 /* Bi-directional map */
 struct hb_bimap_t
 {
-  /* XXX(remove) */
-  void init ()
-  {
-    forw_map.init ();
-    back_map.init ();
-  }
-
-  /* XXX(remove) */
-  void fini ()
-  {
-    forw_map.fini ();
-    back_map.fini ();
-  }
-
   void reset ()
   {
     forw_map.reset ();
diff --git a/thirdparty/harfbuzz/src/hb-buffer.cc b/thirdparty/harfbuzz/src/hb-buffer.cc
index be3161a54d..e50afcb203 100644
--- a/thirdparty/harfbuzz/src/hb-buffer.cc
+++ b/thirdparty/harfbuzz/src/hb-buffer.cc
@@ -86,7 +86,46 @@ hb_segment_properties_hash (const hb_segment_properties_t *p)
 	 (intptr_t) (p->language);
 }
 
+/**
+ * hb_segment_properties_overlay:
+ * @p: #hb_segment_properties_t to fill in.
+ * @src: #hb_segment_properties_t to fill in from.
+ *
+ * Fills in missing fields of @p from @src in a considered manner.
+ *
+ * First, if @p does not have direction set, direction is copied from @src.
+ *
+ * Next, if @p and @src have the same direction (which can be unset), if @p
+ * does not have script set, script is copied from @src.
+ *
+ * Finally, if @p and @src have the same direction and script (which either
+ * can be unset), if @p does not have language set, language is copied from
+ * @src.
+ *
+ * Since: 3.3.0
+ **/
+void
+hb_segment_properties_overlay (hb_segment_properties_t *p,
+			       const hb_segment_properties_t *src)
+{
+  if (unlikely (!p || !src))
+    return;
 
+  if (!p->direction)
+    p->direction = src->direction;
+
+  if (p->direction != src->direction)
+    return;
+
+  if (!p->script)
+    p->script = src->script;
+
+  if (p->script != src->script)
+    return;
+
+  if (!p->language)
+    p->language = src->language;
+}
 
 /* Here is how the buffer works internally:
  *
@@ -96,14 +135,14 @@ hb_segment_properties_hash (const hb_segment_properties_t *p)
  * As an optimization, both info and out_info may point to the
  * same piece of memory, which is owned by info.  This remains the
  * case as long as out_len doesn't exceed i at any time.
- * In that case, swap_buffers() is mostly no-op and the glyph operations
+ * In that case, sync() is mostly no-op and the glyph operations
  * operate mostly in-place.
  *
  * As soon as out_info gets longer than info, out_info is moved over
  * to an alternate buffer (which we reuse the pos buffer for), and its
  * current contents (out_len entries) are copied to the new place.
  *
- * This should all remain transparent to the user.  swap_buffers() then
+ * This should all remain transparent to the user.  sync() then
  * switches info over to out_info and does housekeeping.
  */
 
@@ -217,11 +256,24 @@ hb_buffer_t::get_scratch_buffer (unsigned int *size)
 /* HarfBuzz-Internal API */
 
 void
+hb_buffer_t::similar (const hb_buffer_t &src)
+{ /* Copy the configuration fields of src (not its contents) into this buffer. */
+  hb_unicode_funcs_destroy (unicode); /* Release the funcs currently held... */
+  unicode = hb_unicode_funcs_reference (src.unicode); /* ...and take a reference on src's. */
+  flags = src.flags;
+  cluster_level = src.cluster_level;
+  replacement = src.replacement; /* Each codepoint setting comes from its own field in src. */
+  invisible = src.invisible;
+  not_found = src.not_found;
+}
+
+void
 hb_buffer_t::reset ()
 {
   hb_unicode_funcs_destroy (unicode);
   unicode = hb_unicode_funcs_reference (hb_unicode_funcs_get_default ());
   flags = HB_BUFFER_FLAG_DEFAULT;
+  cluster_level = HB_BUFFER_CLUSTER_LEVEL_DEFAULT;
   replacement = HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT;
   invisible = 0;
   not_found = 0;
@@ -232,11 +284,10 @@ hb_buffer_t::reset ()
 void
 hb_buffer_t::clear ()
 {
+  content_type = HB_BUFFER_CONTENT_TYPE_INVALID;
   hb_segment_properties_t default_props = HB_SEGMENT_PROPERTIES_DEFAULT;
   props = default_props;
-  scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
 
-  content_type = HB_BUFFER_CONTENT_TYPE_INVALID;
   successful = true;
   have_output = false;
   have_positions = false;
@@ -244,16 +295,44 @@ hb_buffer_t::clear ()
   idx = 0;
   len = 0;
   out_len = 0;
-  out_info = info;
 
-  serial = 0;
+  out_info = info;
 
   memset (context, 0, sizeof context);
   memset (context_len, 0, sizeof context_len);
 
   deallocate_var_all ();
+  serial = 0;
+  scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
+}
+
+void
+hb_buffer_t::enter ()
+{ /* Reset per-shape state and scale the len/ops limits with the current len. */
+  deallocate_var_all ();
+  serial = 0;
+  scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
+  if (likely (!hb_unsigned_mul_overflows (len, HB_BUFFER_MAX_LEN_FACTOR))) /* On overflow max_len is left unchanged. */
+  {
+    max_len = hb_max (len * HB_BUFFER_MAX_LEN_FACTOR,
+		      (unsigned) HB_BUFFER_MAX_LEN_MIN); /* Never below the minimum. */
+  }
+  if (likely (!hb_unsigned_mul_overflows (len, HB_BUFFER_MAX_OPS_FACTOR))) /* On overflow max_ops is left unchanged. */
+  {
+    max_ops = hb_max (len * HB_BUFFER_MAX_OPS_FACTOR,
+		      (unsigned) HB_BUFFER_MAX_OPS_MIN); /* Never below the minimum. */
+  }
+}
+void
+hb_buffer_t::leave ()
+{ /* Counterpart of enter(): restore the default limits and clear per-shape state. */
+  max_len = HB_BUFFER_MAX_LEN_DEFAULT;
+  max_ops = HB_BUFFER_MAX_OPS_DEFAULT;
+  deallocate_var_all ();
+  serial = 0;
 }
 
+
 void
 hb_buffer_t::add (hb_codepoint_t  codepoint,
 		  unsigned int    cluster)
@@ -307,7 +386,7 @@ hb_buffer_t::clear_positions ()
 }
 
 void
-hb_buffer_t::swap_buffers ()
+hb_buffer_t::sync ()
 {
   assert (have_output);
 
@@ -494,33 +573,6 @@ done:
 }
 
 void
-hb_buffer_t::unsafe_to_break_impl (unsigned int start, unsigned int end)
-{
-  unsigned int cluster = UINT_MAX;
-  cluster = _infos_find_min_cluster (info, start, end, cluster);
-  _unsafe_to_break_set_mask (info, start, end, cluster);
-}
-void
-hb_buffer_t::unsafe_to_break_from_outbuffer (unsigned int start, unsigned int end)
-{
-  if (!have_output)
-  {
-    unsafe_to_break_impl (start, end);
-    return;
-  }
-
-  assert (start <= out_len);
-  assert (idx <= end);
-
-  unsigned int cluster = UINT_MAX;
-  cluster = _infos_find_min_cluster (out_info, start, out_len, cluster);
-  cluster = _infos_find_min_cluster (info, idx, end, cluster);
-
-  _unsafe_to_break_set_mask (out_info, start, out_len, cluster);
-  _unsafe_to_break_set_mask (info, idx, end, cluster);
-}
-
-void
 hb_buffer_t::guess_segment_properties ()
 {
   assert_unicode ();
@@ -565,12 +617,11 @@ DEFINE_NULL_INSTANCE (hb_buffer_t) =
   HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT,
   0, /* invisible */
   0, /* not_found */
-  HB_BUFFER_SCRATCH_FLAG_DEFAULT,
-  HB_BUFFER_MAX_LEN_DEFAULT,
-  HB_BUFFER_MAX_OPS_DEFAULT,
+
 
   HB_BUFFER_CONTENT_TYPE_INVALID,
   HB_SEGMENT_PROPERTIES_DEFAULT,
+
   false, /* successful */
   false, /* have_output */
   true  /* have_positions */
@@ -610,6 +661,46 @@ hb_buffer_create ()
 }
 
 /**
+ * hb_buffer_create_similar:
+ * @src: An #hb_buffer_t
+ *
+ * Creates a new #hb_buffer_t, similar to hb_buffer_create(). The only
+ * difference is that the buffer is configured similarly to @src.
+ *
+ * Return value: (transfer full):
+ * A newly allocated #hb_buffer_t, similar to hb_buffer_create().
+ *
+ * Since: 3.3.0
+ **/
+hb_buffer_t *
+hb_buffer_create_similar (const hb_buffer_t *src)
+{
+  hb_buffer_t *buffer = hb_buffer_create ();
+
+  buffer->similar (*src);
+
+  return buffer;
+}
+
+/**
+ * hb_buffer_reset:
+ * @buffer: An #hb_buffer_t
+ *
+ * Resets the buffer to its initial status, as if it was just newly created
+ * with hb_buffer_create().
+ *
+ * Since: 0.9.2
+ **/
+void
+hb_buffer_reset (hb_buffer_t *buffer)
+{
+  if (unlikely (hb_object_is_immutable (buffer)))
+    return;
+
+  buffer->reset ();
+}
+
+/**
  * hb_buffer_get_empty:
  *
  * Fetches an empty #hb_buffer_t.
@@ -1157,24 +1248,6 @@ hb_buffer_get_not_found_glyph (hb_buffer_t    *buffer)
 
 
 /**
- * hb_buffer_reset:
- * @buffer: An #hb_buffer_t
- *
- * Resets the buffer to its initial status, as if it was just newly created
- * with hb_buffer_create().
- *
- * Since: 0.9.2
- **/
-void
-hb_buffer_reset (hb_buffer_t *buffer)
-{
-  if (unlikely (hb_object_is_immutable (buffer)))
-    return;
-
-  buffer->reset ();
-}
-
-/**
  * hb_buffer_clear_contents:
  * @buffer: An #hb_buffer_t
  *
@@ -1749,6 +1822,8 @@ hb_buffer_append (hb_buffer_t *buffer,
   if (!buffer->have_positions && source->have_positions)
     buffer->clear_positions ();
 
+  hb_segment_properties_overlay (&buffer->props, &source->props);
+
   memcpy (buffer->info + orig_len, source->info + start, (end - start) * sizeof (buffer->info[0]));
   if (buffer->have_positions)
     memcpy (buffer->pos + orig_len, source->pos + start, (end - start) * sizeof (buffer->pos[0]));
diff --git a/thirdparty/harfbuzz/src/hb-buffer.h b/thirdparty/harfbuzz/src/hb-buffer.h
index a183cb9d4a..9fbd7b1ec3 100644
--- a/thirdparty/harfbuzz/src/hb-buffer.h
+++ b/thirdparty/harfbuzz/src/hb-buffer.h
@@ -76,18 +76,68 @@ typedef struct hb_glyph_info_t {
  * @HB_GLYPH_FLAG_UNSAFE_TO_BREAK: Indicates that if input text is broken at the
  * 				   beginning of the cluster this glyph is part of,
  * 				   then both sides need to be re-shaped, as the
- * 				   result might be different.  On the flip side,
- * 				   it means that when this flag is not present,
- * 				   then it's safe to break the glyph-run at the
- * 				   beginning of this cluster, and the two sides
- * 				   represent the exact same result one would get
- * 				   if breaking input text at the beginning of
- * 				   this cluster and shaping the two sides
- * 				   separately.  This can be used to optimize
- * 				   paragraph layout, by avoiding re-shaping
- * 				   of each line after line-breaking, or limiting
- * 				   the reshaping to a small piece around the
- * 				   breaking point only.
+ * 				   result might be different.
+ * 				   On the flip side, it means that when this
+ * 				   flag is not present, then it is safe to break
+ * 				   the glyph-run at the beginning of this
+ * 				   cluster, and the two sides will represent the
+ * 				   exact same result one would get if breaking
+ * 				   input text at the beginning of this cluster
+ * 				   and shaping the two sides separately.
+ * 				   This can be used to optimize paragraph
+ * 				   layout, by avoiding re-shaping of each line
+ * 				   after line-breaking.
+ * @HB_GLYPH_FLAG_UNSAFE_TO_CONCAT: Indicates that if input text is changed on one
+ * 				   side of the beginning of the cluster this glyph
+ * 				   is part of, then the shaping results for the
+ * 				   other side might change.
+ * 				   Note that the absence of this flag will NOT by
+ * 				   itself mean that it IS safe to concat text.
+ * 				   Only two pieces of text, both of which are clear of
+ * 				   this flag can be concatenated safely.
+ * 				   This can be used to optimize paragraph
+ * 				   layout, by avoiding re-shaping of each line
+ * 				   after line-breaking, by limiting the
+ * 				   reshaping to a small piece around the
+ * 				   breaking position only, even if the breaking
+ * 				   position carries the
+ * 				   #HB_GLYPH_FLAG_UNSAFE_TO_BREAK or when
+ * 				   hyphenation or other text transformation
+ * 				   happens at line-break position, in the following
+ * 				   way:
+ * 				   1. Iterate back from the line-break position
+ * 				   until the first cluster start position that is
+ * 				   NOT unsafe-to-concat, 2. shape the segment from
+ * 				   there till the end of line, 3. check whether the
+ * 				   resulting glyph-run also is clear of the
+ * 				   unsafe-to-concat at its start-of-text position;
+ * 				   if it is, just splice it into place and the line
+ * 				   is shaped; If not, move on to a position further
+ * 				   back that is clear of unsafe-to-concat and retry
+ * 				   from there, and repeat.
+ * 				   At the start of next line a similar algorithm can
+ * 				   be implemented. That is: 1. Iterate forward from
+ * 				   the line-break position until the first cluster
+ * 				   start position that is NOT unsafe-to-concat, 2.
+ * 				   shape the segment from beginning of the line to
+ * 				   that position, 3. check whether the resulting
+ * 				   glyph-run also is clear of the unsafe-to-concat
+ * 				   at its end-of-text position; if it is, just splice
+ * 				   it into place and the beginning is shaped; If not,
+ * 				   move on to a position further forward that is clear
+ * 				   of unsafe-to-concat and retry up to there, and repeat.
+ * 				   A slight complication will arise in the
+ * 				   implementation of the algorithm above,
+ * 				   because while our buffer API has a way to
+ * 				   return flags for position corresponding to
+ * 				   start-of-text, there is currently no position
+ * 				   corresponding to end-of-text.  This limitation
+ * 				   can be alleviated by shaping more text than needed
+ * 				   and looking for unsafe-to-concat flag within text
+ * 				   clusters.
+ * 				   The #HB_GLYPH_FLAG_UNSAFE_TO_BREAK flag will
+ * 				   always imply this flag.
+ * 				   Since: 3.3.0
  * @HB_GLYPH_FLAG_DEFINED: All the currently defined flags.
  *
  * Flags for #hb_glyph_info_t.
@@ -96,8 +146,9 @@ typedef struct hb_glyph_info_t {
  */
 typedef enum { /*< flags >*/
   HB_GLYPH_FLAG_UNSAFE_TO_BREAK		= 0x00000001,
+  HB_GLYPH_FLAG_UNSAFE_TO_CONCAT	= 0x00000002,
 
-  HB_GLYPH_FLAG_DEFINED			= 0x00000001 /* OR of all defined flags */
+  HB_GLYPH_FLAG_DEFINED			= 0x00000003 /* OR of all defined flags */
 } hb_glyph_flags_t;
 
 HB_EXTERN hb_glyph_flags_t
@@ -170,6 +221,9 @@ hb_segment_properties_equal (const hb_segment_properties_t *a,
 HB_EXTERN unsigned int
 hb_segment_properties_hash (const hb_segment_properties_t *p);
 
+HB_EXTERN void
+hb_segment_properties_overlay (hb_segment_properties_t *p,
+			       const hb_segment_properties_t *src);
 
 
 /**
@@ -185,6 +239,13 @@ HB_EXTERN hb_buffer_t *
 hb_buffer_create (void);
 
 HB_EXTERN hb_buffer_t *
+hb_buffer_create_similar (const hb_buffer_t *src);
+
+HB_EXTERN void
+hb_buffer_reset (hb_buffer_t *buffer);
+
+
+HB_EXTERN hb_buffer_t *
 hb_buffer_get_empty (void);
 
 HB_EXTERN hb_buffer_t *
@@ -391,8 +452,9 @@ HB_EXTERN hb_codepoint_t
 hb_buffer_get_not_found_glyph (hb_buffer_t    *buffer);
 
 
-HB_EXTERN void
-hb_buffer_reset (hb_buffer_t *buffer);
+/*
+ * Content API.
+ */
 
 HB_EXTERN void
 hb_buffer_clear_contents (hb_buffer_t *buffer);
diff --git a/thirdparty/harfbuzz/src/hb-buffer.hh b/thirdparty/harfbuzz/src/hb-buffer.hh
index 0f8140f1b3..adf4aa2b6f 100644
--- a/thirdparty/harfbuzz/src/hb-buffer.hh
+++ b/thirdparty/harfbuzz/src/hb-buffer.hh
@@ -67,8 +67,8 @@ enum hb_buffer_scratch_flags_t {
   HB_BUFFER_SCRATCH_FLAG_HAS_DEFAULT_IGNORABLES		= 0x00000002u,
   HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK		= 0x00000004u,
   HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT		= 0x00000008u,
-  HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK		= 0x00000010u,
-  HB_BUFFER_SCRATCH_FLAG_HAS_CGJ			= 0x00000020u,
+  HB_BUFFER_SCRATCH_FLAG_HAS_CGJ			= 0x00000010u,
+  HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS		= 0x00000020u,
 
   /* Reserved for complex shapers' internal use. */
   HB_BUFFER_SCRATCH_FLAG_COMPLEX0			= 0x01000000u,
@@ -87,18 +87,21 @@ struct hb_buffer_t
 {
   hb_object_header_t header;
 
-  /* Information about how the text in the buffer should be treated */
+  /*
+   * Information about how the text in the buffer should be treated.
+   */
+
   hb_unicode_funcs_t *unicode; /* Unicode functions */
   hb_buffer_flags_t flags; /* BOT / EOT / etc. */
   hb_buffer_cluster_level_t cluster_level;
   hb_codepoint_t replacement; /* U+FFFD or something else. */
   hb_codepoint_t invisible; /* 0 or something else. */
   hb_codepoint_t not_found; /* 0 or something else. */
-  hb_buffer_scratch_flags_t scratch_flags; /* Have space-fallback, etc. */
-  unsigned int max_len; /* Maximum allowed len. */
-  int max_ops; /* Maximum allowed operations. */
 
-  /* Buffer contents */
+  /*
+   * Buffer contents
+   */
+
   hb_buffer_content_type_t content_type;
   hb_segment_properties_t props; /* Script, language, direction */
 
@@ -115,8 +118,6 @@ struct hb_buffer_t
   hb_glyph_info_t     *out_info;
   hb_glyph_position_t *pos;
 
-  unsigned int serial;
-
   /* Text before / after the main buffer contents.
    * Always in Unicode, and ordered outward.
    * Index 0 is for "pre-context", 1 for "post-context". */
@@ -124,7 +125,25 @@ struct hb_buffer_t
   hb_codepoint_t context[2][CONTEXT_LENGTH];
   unsigned int context_len[2];
 
-  /* Debugging API */
+
+  /*
+   * Managed by enter / leave
+   */
+
+#ifndef HB_NDEBUG
+  uint8_t allocated_var_bits;
+#endif
+  uint8_t serial;
+  hb_buffer_scratch_flags_t scratch_flags; /* Have space-fallback, etc. */
+  unsigned int max_len; /* Maximum allowed len. */
+  int max_ops; /* Maximum allowed operations. */
+  /* The bits of allocated_var_bits (debug builds only) reflect current allocations of the bytes in glyph_info_t's var1 and var2. */
+
+
+  /*
+   * Messaging callback
+   */
+
 #ifndef HB_NO_BUFFER_MESSAGE
   hb_buffer_message_func_t message_func;
   void *message_data;
@@ -134,11 +153,6 @@ struct hb_buffer_t
   static constexpr unsigned message_depth = 0u;
 #endif
 
-  /* Internal debugging. */
-  /* The bits here reflect current allocations of the bytes in glyph_info_t's var1 and var2. */
-#ifndef HB_NDEBUG
-  uint8_t allocated_var_bits;
-#endif
 
 
   /* Methods */
@@ -190,12 +204,17 @@ struct hb_buffer_t
   hb_glyph_info_t &prev ()      { return out_info[out_len ? out_len - 1 : 0]; }
   hb_glyph_info_t prev () const { return out_info[out_len ? out_len - 1 : 0]; }
 
+  HB_INTERNAL void similar (const hb_buffer_t &src);
   HB_INTERNAL void reset ();
   HB_INTERNAL void clear ();
 
+  /* Called around shape() */
+  HB_INTERNAL void enter ();
+  HB_INTERNAL void leave ();
+
   unsigned int backtrack_len () const { return have_output ? out_len : idx; }
   unsigned int lookahead_len () const { return len - idx; }
-  unsigned int next_serial () { return serial++; }
+  uint8_t next_serial () { return ++serial ? serial : ++serial; }
 
   HB_INTERNAL void add (hb_codepoint_t  codepoint,
 			unsigned int    cluster);
@@ -252,7 +271,7 @@ struct hb_buffer_t
 
   HB_INTERNAL void guess_segment_properties ();
 
-  HB_INTERNAL void swap_buffers ();
+  HB_INTERNAL void sync ();
   HB_INTERNAL void clear_output ();
   HB_INTERNAL void clear_positions ();
 
@@ -366,15 +385,80 @@ struct hb_buffer_t
   /* Merge clusters for deleting current glyph, and skip it. */
   HB_INTERNAL void delete_glyph ();
 
-  void unsafe_to_break (unsigned int start,
-			unsigned int end)
+
+  void set_glyph_flags (hb_mask_t mask, /* Bits OR'ed into the mask of each selected glyph. */
+			unsigned start = 0, /* First index of the range. */
+			unsigned end = (unsigned) -1, /* One past the last index; clamped to len below. */
+			bool interior = false, /* If true, glyphs in the range's minimum cluster are skipped. */
+			bool from_out_buffer = false) /* If true and have_output, the range spans out_info and info. */
+  {
+    end = hb_min (end, len);
+
+    if (interior && !from_out_buffer && end - start < 2)
+      return; /* A lone glyph is always in the minimum cluster, so nothing would be set. */
+
+    scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS;
+
+    if (!from_out_buffer || !have_output)
+    { /* Range lives entirely in info[start, end). */
+      if (!interior)
+      {
+	for (unsigned i = start; i < end; i++)
+	  info[i].mask |= mask;
+      }
+      else
+      {
+	unsigned cluster = _infos_find_min_cluster (info, start, end);
+	_infos_set_glyph_flags (info, start, end, cluster, mask);
+      }
+    }
+    else
+    { /* Range is out_info[start, out_len) followed by info[idx, end). */
+      assert (start <= out_len);
+      assert (idx <= end);
+
+      if (!interior)
+      {
+	for (unsigned i = start; i < out_len; i++)
+	  out_info[i].mask |= mask;
+	for (unsigned i = idx; i < end; i++)
+	  info[i].mask |= mask;
+      }
+      else
+      {
+	unsigned cluster = _infos_find_min_cluster (info, idx, end); /* Minimum cluster is taken over both pieces. */
+	cluster = _infos_find_min_cluster (out_info, start, out_len, cluster);
+
+	_infos_set_glyph_flags (out_info, start, out_len, cluster, mask);
+	_infos_set_glyph_flags (info, idx, end, cluster, mask);
+      }
+    }
+  }
+
+  void unsafe_to_break (unsigned int start = 0, unsigned int end = -1)
+  {
+    set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK | HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		     start, end,
+		     true);
+  }
+  void unsafe_to_concat (unsigned int start = 0, unsigned int end = -1)
+  {
+    set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		     start, end,
+		     true);
+  }
+  void unsafe_to_break_from_outbuffer (unsigned int start = 0, unsigned int end = -1)
+  {
+    set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK | HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		     start, end,
+		     true, true);
+  }
+  void unsafe_to_concat_from_outbuffer (unsigned int start = 0, unsigned int end = -1)
+  {
+    set_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_CONCAT,
+		     start, end,
+		     false, true);
   }
-  HB_INTERNAL void unsafe_to_break_impl (unsigned int start, unsigned int end);
-  HB_INTERNAL void unsafe_to_break_from_outbuffer (unsigned int start, unsigned int end);
 
 
   /* Internal methods */
@@ -465,36 +549,31 @@ struct hb_buffer_t
   set_cluster (hb_glyph_info_t &inf, unsigned int cluster, unsigned int mask = 0)
   {
     if (inf.cluster != cluster)
-    {
-      if (mask & HB_GLYPH_FLAG_UNSAFE_TO_BREAK)
-	inf.mask |= HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-      else
-	inf.mask &= ~HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-    }
+      inf.mask = (inf.mask & ~HB_GLYPH_FLAG_DEFINED) | (mask & HB_GLYPH_FLAG_DEFINED);
     inf.cluster = cluster;
   }
-
+  void
+  _infos_set_glyph_flags (hb_glyph_info_t *infos,
+			  unsigned int start, unsigned int end,
+			  unsigned int cluster,
+			  hb_mask_t mask)
+  {
+    for (unsigned int i = start; i < end; i++)
+      if (cluster != infos[i].cluster)
+      {
+	scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS;
+	infos[i].mask |= mask;
+      }
+  }
   static unsigned
   _infos_find_min_cluster (const hb_glyph_info_t *infos,
 			   unsigned start, unsigned end,
-			   unsigned cluster)
+			   unsigned cluster = UINT_MAX)
   {
     for (unsigned int i = start; i < end; i++)
       cluster = hb_min (cluster, infos[i].cluster);
     return cluster;
   }
-  void
-  _unsafe_to_break_set_mask (hb_glyph_info_t *infos,
-			     unsigned int start, unsigned int end,
-			     unsigned int cluster)
-  {
-    for (unsigned int i = start; i < end; i++)
-      if (cluster != infos[i].cluster)
-      {
-	scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK;
-	infos[i].mask |= HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-      }
-  }
 
   void clear_glyph_flags (hb_mask_t mask = 0)
   {
diff --git a/thirdparty/harfbuzz/src/hb-cff-interp-common.hh b/thirdparty/harfbuzz/src/hb-cff-interp-common.hh
index c251e2d0ed..641de0eff2 100644
--- a/thirdparty/harfbuzz/src/hb-cff-interp-common.hh
+++ b/thirdparty/harfbuzz/src/hb-cff-interp-common.hh
@@ -217,9 +217,6 @@ inline unsigned int OpCode_Size (op_code_t op) { return Is_OpCode_ESC (op) ? 2:
 
 struct number_t
 {
-  void init () { set_real (0.0); }
-  void fini () {}
-
   void set_int (int v)       { value = v; }
   int to_int () const        { return value; }
 
@@ -245,7 +242,7 @@ struct number_t
   }
 
   protected:
-  double value;
+  double value = 0.;
 };
 
 /* byte string */
@@ -380,10 +377,8 @@ struct cff_stack_t
     count = 0;
     elements.init ();
     elements.resize (kSizeLimit);
-    for (unsigned int i = 0; i < elements.length; i++)
-      elements[i].init ();
   }
-  void fini () { elements.fini_deep (); }
+  void fini () { elements.fini (); }
 
   ELEM& operator [] (unsigned int i)
   {
@@ -523,9 +518,6 @@ struct arg_stack_t : cff_stack_t<ARG, 513>
 /* an operator prefixed by its operands in a byte string */
 struct op_str_t
 {
-  void init () {}
-  void fini () {}
-
   op_code_t  op;
   byte_str_t str;
 };
@@ -553,7 +545,7 @@ struct parsed_values_t
     opStart = 0;
     values.init ();
   }
-  void fini () { values.fini_deep (); }
+  void fini () { values.fini (); }
 
   void add_op (op_code_t op, const byte_str_ref_t& str_ref = byte_str_ref_t ())
   {
diff --git a/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh b/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh
index 52d778ffe2..ef299369b5 100644
--- a/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh
+++ b/thirdparty/harfbuzz/src/hb-cff-interp-cs-common.hh
@@ -94,12 +94,6 @@ struct biased_subrs_t
 
 struct point_t
 {
-  void init ()
-  {
-    x.init ();
-    y.init ();
-  }
-
   void set_int (int _x, int _y)
   {
     x.set_int (_x);
@@ -128,7 +122,7 @@ struct cs_interp_env_t : interp_env_t<ARG>
     hstem_count = 0;
     vstem_count = 0;
     hintmask_size = 0;
-    pt.init ();
+    pt.set_int (0, 0);
     callStack.init ();
     globalSubrs.init (globalSubrs_);
     localSubrs.init (localSubrs_);
@@ -841,7 +835,6 @@ struct path_procs_t
     if (likely (env.argStack.get_count () == 11))
     {
       point_t d;
-      d.init ();
       for (unsigned int i = 0; i < 10; i += 2)
 	d.move (env.eval_arg (i), env.eval_arg (i+1));
 
diff --git a/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh b/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh
index d961566447..766183760e 100644
--- a/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh
+++ b/thirdparty/harfbuzz/src/hb-cff2-interp-cs.hh
@@ -35,18 +35,6 @@ using namespace OT;
 
 struct blend_arg_t : number_t
 {
-  void init ()
-  {
-    number_t::init ();
-    deltas.init ();
-  }
-
-  void fini ()
-  {
-    number_t::fini ();
-    deltas.fini_deep ();
-  }
-
   void set_int (int v) { reset_blends (); number_t::set_int (v); }
   void set_fixed (int32_t v) { reset_blends (); number_t::set_fixed (v); }
   void set_real (double v) { reset_blends (); number_t::set_real (v); }
@@ -202,7 +190,7 @@ struct cff2_cs_opset_t : cs_opset_t<blend_arg_t, OPSET, cff2_cs_interp_env_t, PA
     switch (op) {
       case OpCode_callsubr:
       case OpCode_callgsubr:
-	/* a subroutine number shoudln't be a blended value */
+	/* a subroutine number shouldn't be a blended value */
 	if (unlikely (env.argStack.peek ().blending ()))
 	{
 	  env.set_error ();
diff --git a/thirdparty/harfbuzz/src/hb-common.cc b/thirdparty/harfbuzz/src/hb-common.cc
index 26c8ad0f49..249a8a8010 100644
--- a/thirdparty/harfbuzz/src/hb-common.cc
+++ b/thirdparty/harfbuzz/src/hb-common.cc
@@ -29,10 +29,31 @@
 #include "hb.hh"
 #include "hb-machinery.hh"
 
+#if !defined(HB_NO_SETLOCALE) && (!defined(HAVE_NEWLOCALE) || !defined(HAVE_USELOCALE))
+#define HB_NO_SETLOCALE 1
+#endif
+
+#ifndef HB_NO_SETLOCALE
+
 #include <locale.h>
+#ifdef HAVE_XLOCALE_H
+#include <xlocale.h> // Needed on BSD/OS X for uselocale
+#endif
+
+#ifdef WIN32
+#define hb_locale_t _locale_t
+#else
+#define hb_locale_t locale_t
+#endif
+#define hb_setlocale setlocale
+#define hb_uselocale uselocale
+
+#else
+
+#define hb_locale_t void *
+#define hb_setlocale(Category, Locale) "C"
+#define hb_uselocale(Locale) ((hb_locale_t) 0)
 
-#ifdef HB_NO_SETLOCALE
-#define setlocale(Category, Locale) "C"
 #endif
 
 /**
@@ -122,7 +143,7 @@ hb_tag_from_string (const char *str, int len)
  * @tag: #hb_tag_t to convert
  * @buf: (out caller-allocates) (array fixed-size=4) (element-type uint8_t): Converted string
  *
- * Converts an #hb_tag_t to a string and returns it in @buf. 
+ * Converts an #hb_tag_t to a string and returns it in @buf.
  * Strings will be four characters long.
  *
  * Since: 0.9.5
@@ -151,13 +172,13 @@ const char direction_strings[][4] = {
  * @str: (array length=len) (element-type uint8_t): String to convert
  * @len: Length of @str, or -1 if it is %NULL-terminated
  *
- * Converts a string to an #hb_direction_t. 
+ * Converts a string to an #hb_direction_t.
  *
  * Matching is loose and applies only to the first letter. For
  * examples, "LTR" and "left-to-right" will both return #HB_DIRECTION_LTR.
  *
  * Unmatched strings will return #HB_DIRECTION_INVALID.
- * 
+ *
  * Return value: The #hb_direction_t matching @str
  *
  * Since: 0.9.2
@@ -413,7 +434,7 @@ hb_language_get_default ()
   hb_language_t language = default_language;
   if (unlikely (language == HB_LANGUAGE_INVALID))
   {
-    language = hb_language_from_string (setlocale (LC_CTYPE, nullptr), -1);
+    language = hb_language_from_string (hb_setlocale (LC_CTYPE, nullptr), -1);
     (void) default_language.cmpexch (HB_LANGUAGE_INVALID, language);
   }
 
@@ -1039,6 +1060,47 @@ hb_variation_from_string (const char *str, int len,
   return false;
 }
 
+#ifndef HB_NO_SETLOCALE
+
+static inline void free_static_C_locale ();
+
+static struct hb_C_locale_lazy_loader_t : hb_lazy_loader_t<hb_remove_pointer<hb_locale_t>,
+							     hb_C_locale_lazy_loader_t>
+{
+  static hb_locale_t create ()
+  {
+    hb_locale_t l = newlocale (LC_ALL_MASK, "C", NULL);
+    if (!l)
+      return l;
+
+    hb_atexit (free_static_C_locale);
+
+    return l;
+  }
+  static void destroy (hb_locale_t l)
+  {
+    freelocale (l);
+  }
+  static hb_locale_t get_null ()
+  {
+    return (hb_locale_t) 0;
+  }
+} static_C_locale;
+
+static inline
+void free_static_C_locale ()
+{
+  static_C_locale.free_instance ();
+}
+
+static hb_locale_t
+get_C_locale ()
+{
+  return static_C_locale.get_unconst ();
+}
+
+#endif
+
 /**
  * hb_variation_to_string:
  * @variation: an #hb_variation_t to convert
@@ -1064,7 +1126,11 @@ hb_variation_to_string (hb_variation_t *variation,
   while (len && s[len - 1] == ' ')
     len--;
   s[len++] = '=';
+
+  hb_locale_t oldlocale HB_UNUSED;
+  oldlocale = hb_uselocale (get_C_locale ());
   len += hb_max (0, snprintf (s + len, ARRAY_LENGTH (s) - len, "%g", (double) variation->value));
+  (void) hb_uselocale (oldlocale);
 
   assert (len < ARRAY_LENGTH (s));
   len = hb_min (len, size - 1);
diff --git a/thirdparty/harfbuzz/src/hb-coretext.cc b/thirdparty/harfbuzz/src/hb-coretext.cc
index a512f3b8b7..5f383064c4 100644
--- a/thirdparty/harfbuzz/src/hb-coretext.cc
+++ b/thirdparty/harfbuzz/src/hb-coretext.cc
@@ -481,8 +481,8 @@ struct active_feature_t {
 	   a->rec.setting < b->rec.setting ? -1 : a->rec.setting > b->rec.setting ? 1 :
 	   0;
   }
-  bool operator== (const active_feature_t *f) {
-    return cmp (this, f) == 0;
+  bool operator== (const active_feature_t& f) const {
+    return cmp (this, &f) == 0;
   }
 };
 
@@ -677,7 +677,7 @@ _hb_coretext_shape (hb_shape_plan_t    *shape_plan,
       {
 	active_features.push (event->feature);
       } else {
-	active_feature_t *feature = active_features.find (&event->feature);
+	active_feature_t *feature = active_features.lsearch (event->feature);
 	if (feature)
 	  active_features.remove (feature - active_features.arrayZ);
       }
@@ -1213,7 +1213,8 @@ resize_and_retry:
     }
   }
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
 #undef FAIL
 
diff --git a/thirdparty/harfbuzz/src/hb-directwrite.cc b/thirdparty/harfbuzz/src/hb-directwrite.cc
index dea87b8cd0..f177ff31c0 100644
--- a/thirdparty/harfbuzz/src/hb-directwrite.cc
+++ b/thirdparty/harfbuzz/src/hb-directwrite.cc
@@ -762,7 +762,8 @@ retry_getglyphs:
 
   if (isRightToLeft) hb_buffer_reverse (buffer);
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
   delete [] clusterMap;
   delete [] glyphIndices;
diff --git a/thirdparty/harfbuzz/src/hb-draw.h b/thirdparty/harfbuzz/src/hb-draw.h
index bddc876399..f82cc34842 100644
--- a/thirdparty/harfbuzz/src/hb-draw.h
+++ b/thirdparty/harfbuzz/src/hb-draw.h
@@ -50,7 +50,7 @@ typedef void (*hb_draw_close_path_func_t) (void *user_data);
  *
  * Glyph draw callbacks.
  *
- * _move_to, _line_to and _cubic_to calls are nessecary to be defined but we
+ * _move_to, _line_to and _cubic_to calls are necessary to be defined but we
  * translate _quadratic_to calls to _cubic_to if the callback isn't defined.
  *
  * Since: EXPERIMENTAL
diff --git a/thirdparty/harfbuzz/src/hb-face.cc b/thirdparty/harfbuzz/src/hb-face.cc
index 2c0087370c..5365598636 100644
--- a/thirdparty/harfbuzz/src/hb-face.cc
+++ b/thirdparty/harfbuzz/src/hb-face.cc
@@ -143,7 +143,7 @@ hb_face_create_for_tables (hb_reference_table_func_t  reference_table_func,
 
 typedef struct hb_face_for_data_closure_t {
   hb_blob_t *blob;
-  unsigned int  index;
+  uint16_t  index;
 } hb_face_for_data_closure_t;
 
 static hb_face_for_data_closure_t *
@@ -156,7 +156,7 @@ _hb_face_for_data_closure_create (hb_blob_t *blob, unsigned int index)
     return nullptr;
 
   closure->blob = blob;
-  closure->index = index;
+  closure->index = (uint16_t) (index & 0xFFFFu);
 
   return closure;
 }
@@ -195,9 +195,19 @@ _hb_face_for_data_reference_table (hb_face_t *face HB_UNUSED, hb_tag_t tag, void
  * @index: The index of the face within @blob
  *
  * Constructs a new face object from the specified blob and
- * a face index into that blob. This is used for blobs of
- * file formats such as Dfont and TTC that can contain more
- * than one face.
+ * a face index into that blob.
+ *
+ * The face index is used for blobs of file formats such as TTC and
+ * DFont that can contain more than one face.  Face indices within
+ * such collections are zero-based.
+ *
+ * <note>Note: If the blob font format is not a collection, @index
+ * is ignored.  Otherwise, only the lower 16-bits of @index are used.
+ * The unmodified @index can be accessed via hb_face_get_index().</note>
+ *
+ * <note>Note: The high 16-bits of @index, if non-zero, are used by
+ * hb_font_create() to load named-instances in variable fonts.  See
+ * hb_font_create() for details.</note>
  *
  * Return value: (transfer full): The new face object
  *
@@ -420,7 +430,8 @@ hb_face_reference_blob (hb_face_t *face)
  * Assigns the specified face-index to @face. Fails if the
  * face is immutable.
  *
- * <note>Note: face indices within a collection are zero-based.</note>
+ * <note>Note: changing the index has no effect on the face itself.
+ * This only changes the value returned by hb_face_get_index().</note>
  *
  * Since: 0.9.2
  **/
diff --git a/thirdparty/harfbuzz/src/hb-font.cc b/thirdparty/harfbuzz/src/hb-font.cc
index fa8da96395..350fcac139 100644
--- a/thirdparty/harfbuzz/src/hb-font.cc
+++ b/thirdparty/harfbuzz/src/hb-font.cc
@@ -631,7 +631,7 @@ hb_font_funcs_destroy (hb_font_funcs_t *ffuncs)
  * @destroy: (nullable): A callback to call when @data is not needed anymore
  * @replace: Whether to replace an existing data with the same key
  *
- * Attaches a user-data key/data pair to the specified font-functions structure. 
+ * Attaches a user-data key/data pair to the specified font-functions structure.
  *
  * Return value: %true if success, %false otherwise
  *
@@ -821,7 +821,7 @@ hb_font_get_glyph (hb_font_t      *font,
  * @glyph: (out): The glyph ID retrieved
  *
  * Fetches the nominal glyph ID for a Unicode code point in the
- * specified font. 
+ * specified font.
  *
  * This version of the function should not be used to fetch glyph IDs
  * for code points modified by variation selectors. For variation-selector
@@ -940,7 +940,7 @@ hb_font_get_glyph_v_advance (hb_font_t      *font,
  * @advance_stride: The stride between successive advances
  *
  * Fetches the advances for a sequence of glyph IDs in the specified
- * font, for horizontal text segments. 
+ * font, for horizontal text segments.
  *
  * Since: 1.8.6
  **/
@@ -964,7 +964,7 @@ hb_font_get_glyph_h_advances (hb_font_t*            font,
  * @advance_stride: (out): The stride between successive advances
  *
  * Fetches the advances for a sequence of glyph IDs in the specified
- * font, for vertical text segments.  
+ * font, for vertical text segments.
  *
  * Since: 1.8.6
  **/
@@ -1278,7 +1278,7 @@ hb_font_get_glyph_origin_for_direction (hb_font_t      *font,
  * @font: #hb_font_t to work upon
  * @glyph: The glyph ID to query
  * @direction: The direction of the text segment
- * @x: (inout): Input = The original X coordinate 
+ * @x: (inout): Input = The original X coordinate
  *     Output = The X coordinate plus the X-coordinate of the origin
  * @y: (inout): Input = The original Y coordinate
  *     Output = The Y coordinate plus the Y-coordinate of the origin
@@ -1306,7 +1306,7 @@ hb_font_add_glyph_origin_for_direction (hb_font_t      *font,
  * @font: #hb_font_t to work upon
  * @glyph: The glyph ID to query
  * @direction: The direction of the text segment
- * @x: (inout): Input = The original X coordinate 
+ * @x: (inout): Input = The original X coordinate
  *     Output = The X coordinate minus the X-coordinate of the origin
  * @y: (inout): Input = The original Y coordinate
  *     Output = The Y coordinate minus the Y-coordinate of the origin
@@ -1477,6 +1477,8 @@ DEFINE_NULL_INSTANCE (hb_font_t) =
 
   1000, /* x_scale */
   1000, /* y_scale */
+  0., /* slant */
+  0., /* slant_xy */
   1<<16, /* x_mult */
   1<<16, /* y_mult */
 
@@ -1521,6 +1523,13 @@ _hb_font_create (hb_face_t *face)
  *
  * Constructs a new font object from the specified face.
  *
+ * <note>Note: If @face's index value (as passed to hb_face_create())
+ * has non-zero top 16-bits, those bits minus one are passed to
+ * hb_font_set_var_named_instance(), effectively loading a named-instance
+ * of a variable font, instead of the default-instance.  This allows
+ * specifying which named-instance to load by default when creating the
+ * face.</note>
+ *
  * Return value: (transfer full): The new font object
  *
  * Since: 0.9.2
@@ -1535,6 +1544,11 @@ hb_font_create (hb_face_t *face)
   hb_ot_font_set_funcs (font);
 #endif
 
+#ifndef HB_NO_VAR
+  if (face && face->index >> 16)
+    hb_font_set_var_named_instance (font, (face->index >> 16) - 1);
+#endif
+
   return font;
 }
 
@@ -1578,6 +1592,7 @@ hb_font_create_sub_font (hb_font_t *parent)
 
   font->x_scale = parent->x_scale;
   font->y_scale = parent->y_scale;
+  font->slant = parent->slant;
   font->mults_changed ();
   font->x_ppem = parent->x_ppem;
   font->y_ppem = parent->y_ppem;
@@ -1668,12 +1683,12 @@ hb_font_destroy (hb_font_t *font)
 /**
  * hb_font_set_user_data: (skip)
  * @font: #hb_font_t to work upon
- * @key: The user-data key 
+ * @key: The user-data key
  * @data: A pointer to the user data
  * @destroy: (nullable): A callback to call when @data is not needed anymore
  * @replace: Whether to replace an existing data with the same key
  *
- * Attaches a user-data key/data pair to the specified font object. 
+ * Attaches a user-data key/data pair to the specified font object.
  *
  * Return value: %true if success, %false otherwise
  *
@@ -1875,7 +1890,7 @@ hb_font_set_funcs (hb_font_t         *font,
  * @font_data: (destroy destroy) (scope notified): Data to attach to @font
  * @destroy: (nullable): The function to call when @font_data is not needed anymore
  *
- * Replaces the user data attached to a font, updating the font's 
+ * Replaces the user data attached to a font, updating the font's
  * @destroy callback.
  *
  * Since: 0.9.2
@@ -1949,7 +1964,7 @@ hb_font_get_scale (hb_font_t *font,
  * @x_ppem: Horizontal ppem value to assign
  * @y_ppem: Vertical ppem value to assign
  *
- * Sets the horizontal and vertical pixels-per-em (ppem) of a font. 
+ * Sets the horizontal and vertical pixels-per-em (ppem) of a font.
  *
  * Since: 0.9.2
  **/
@@ -1971,7 +1986,7 @@ hb_font_set_ppem (hb_font_t    *font,
  * @x_ppem: (out): Horizontal ppem value
  * @y_ppem: (out): Vertical ppem value
  *
+ * Fetches the horizontal and vertical pixels-per-em (ppem) of a font.
+ * Fetches the horizontal and vertical points-per-em (ppem) of a font.
  *
  * Since: 0.9.2
  **/
@@ -2015,7 +2030,7 @@ hb_font_set_ptem (hb_font_t *font,
  *
  * Return value: Point size.  A value of zero means "not set."
  *
- * Since: 0.9.2
+ * Since: 1.6.0
  **/
 float
 hb_font_get_ptem (hb_font_t *font)
@@ -2023,6 +2038,49 @@ hb_font_get_ptem (hb_font_t *font)
   return font->ptem;
 }
 
+/**
+ * hb_font_set_synthetic_slant:
+ * @font: #hb_font_t to work upon
+ * @slant: synthetic slant value.
+ *
+ * Sets the "synthetic slant" of a font.  The default is zero.
+ * Synthetic slant is the graphical skew that the renderer
+ * applies to the font at rendering time.
+ *
+ * HarfBuzz needs to know this value to adjust shaping results,
+ * metrics, and style values to match the slanted rendering.
+ *
+ * <note>Note: The slant value is a ratio.  For example, a
+ * 20% slant would be represented as a 0.2 value.</note>
+ *
+ * Since: 3.3.0
+ **/
+HB_EXTERN void
+hb_font_set_synthetic_slant (hb_font_t *font, float slant)
+{
+  if (hb_object_is_immutable (font))
+    return;
+
+  font->slant = slant;
+  font->mults_changed ();
+}
+
+/**
+ * hb_font_get_synthetic_slant:
+ * @font: #hb_font_t to work upon
+ *
+ * Fetches the "synthetic slant" of a font.
+ *
+ * Return value: Synthetic slant.  The default is zero.
+ *
+ * Since: 3.3.0
+ **/
+HB_EXTERN float
+hb_font_get_synthetic_slant (hb_font_t *font)
+{
+  return font->slant;
+}
+
 #ifndef HB_NO_VAR
 /*
  * Variations
@@ -2036,6 +2094,10 @@ hb_font_get_ptem (hb_font_t *font)
  *
  * Applies a list of font-variation settings to a font.
  *
+ * Note that this overrides all existing variations set on @font.
+ * Axes not included in @variations will be effectively set to their
+ * default values.
+ *
  * Since: 1.4.2
  */
 void
@@ -2091,6 +2153,10 @@ hb_font_set_variations (hb_font_t            *font,
  * Applies a list of variation coordinates (in design-space units)
  * to a font.
  *
+ * Note that this overrides all existing variations set on @font.
+ * Axes not included in @coords will be effectively set to their
+ * default values.
+ *
  * Since: 1.4.2
  */
 void
@@ -2154,6 +2220,10 @@ hb_font_set_var_named_instance (hb_font_t *font,
  * Applies a list of variation coordinates (in normalized units)
  * to a font.
  *
+ * Note that this overrides all existing variations set on @font.
+ * Axes not included in @coords will be effectively set to their
+ * default values.
+ *
  * <note>Note: Coordinates should be normalized to 2.14.</note>
  *
  * Since: 1.4.2
@@ -2196,14 +2266,19 @@ hb_font_set_var_coords_normalized (hb_font_t    *font,
 /**
  * hb_font_get_var_coords_normalized:
  * @font: #hb_font_t to work upon
- * @length: Number of coordinates retrieved
+ * @length: (out): Number of coordinates retrieved
  *
  * Fetches the list of normalized variation coordinates currently
  * set on a font.
  *
+ * Note that this returned array may only contain values for some
+ * (or none) of the axes; omitted axes effectively have zero values.
+ *
  * Return value is valid as long as variation coordinates of the font
  * are not modified.
  *
+ * Return value: coordinates array
+ *
  * Since: 1.4.2
  */
 const int *
@@ -2216,18 +2291,24 @@ hb_font_get_var_coords_normalized (hb_font_t    *font,
   return font->coords;
 }
 
-#ifdef HB_EXPERIMENTAL_API
 /**
  * hb_font_get_var_coords_design:
  * @font: #hb_font_t to work upon
- * @length: (out): number of coordinates
+ * @length: (out): Number of coordinates retrieved
+ *
+ * Fetches the list of variation coordinates (in design-space units) currently
+ * set on a font.
+ *
+ * Note that this returned array may only contain values for some
+ * (or none) of the axes; omitted axes effectively have their default
+ * values.
  *
  * Return value is valid as long as variation coordinates of the font
  * are not modified.
  *
  * Return value: coordinates array
  *
- * Since: EXPERIMENTAL
+ * Since: 3.3.0
  */
 const float *
 hb_font_get_var_coords_design (hb_font_t *font,
@@ -2239,7 +2320,6 @@ hb_font_get_var_coords_design (hb_font_t *font,
   return font->design_coords;
 }
 #endif
-#endif
 
 #ifndef HB_DISABLE_DEPRECATED
 /*
diff --git a/thirdparty/harfbuzz/src/hb-font.h b/thirdparty/harfbuzz/src/hb-font.h
index 15dc126523..a3bbb2e37b 100644
--- a/thirdparty/harfbuzz/src/hb-font.h
+++ b/thirdparty/harfbuzz/src/hb-font.h
@@ -1024,6 +1024,12 @@ HB_EXTERN float
 hb_font_get_ptem (hb_font_t *font);
 
 HB_EXTERN void
+hb_font_set_synthetic_slant (hb_font_t *font, float slant);
+
+HB_EXTERN float
+hb_font_get_synthetic_slant (hb_font_t *font);
+
+HB_EXTERN void
 hb_font_set_variations (hb_font_t *font,
 			const hb_variation_t *variations,
 			unsigned int variations_length);
@@ -1033,11 +1039,9 @@ hb_font_set_var_coords_design (hb_font_t *font,
 			       const float *coords,
 			       unsigned int coords_length);
 
-#ifdef HB_EXPERIMENTAL_API
 HB_EXTERN const float *
 hb_font_get_var_coords_design (hb_font_t *font,
 			       unsigned int *length);
-#endif
 
 HB_EXTERN void
 hb_font_set_var_coords_normalized (hb_font_t *font,
diff --git a/thirdparty/harfbuzz/src/hb-font.hh b/thirdparty/harfbuzz/src/hb-font.hh
index 1b7f445e8b..0d73589e8c 100644
--- a/thirdparty/harfbuzz/src/hb-font.hh
+++ b/thirdparty/harfbuzz/src/hb-font.hh
@@ -109,6 +109,8 @@ struct hb_font_t
 
   int32_t x_scale;
   int32_t y_scale;
+  float slant;
+  float slant_xy;
   int64_t x_mult;
   int64_t y_mult;
 
@@ -617,6 +619,7 @@ struct hb_font_t
     signed upem = face->get_upem ();
     x_mult = ((int64_t) x_scale << 16) / upem;
     y_mult = ((int64_t) y_scale << 16) / upem;
+    slant_xy = y_scale ? slant * x_scale / y_scale : 0.f;
   }
 
   hb_position_t em_mult (int16_t v, int64_t mult)
diff --git a/thirdparty/harfbuzz/src/hb-graphite2.cc b/thirdparty/harfbuzz/src/hb-graphite2.cc
index 42420ac0b0..63dc18b466 100644
--- a/thirdparty/harfbuzz/src/hb-graphite2.cc
+++ b/thirdparty/harfbuzz/src/hb-graphite2.cc
@@ -439,7 +439,8 @@ _hb_graphite2_shape (hb_shape_plan_t    *shape_plan HB_UNUSED,
   if (feats) gr_featureval_destroy (feats);
   gr_seg_destroy (seg);
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
   return true;
 }
diff --git a/thirdparty/harfbuzz/src/hb-iter.hh b/thirdparty/harfbuzz/src/hb-iter.hh
index ad2e45e3c5..43a3098f65 100644
--- a/thirdparty/harfbuzz/src/hb-iter.hh
+++ b/thirdparty/harfbuzz/src/hb-iter.hh
@@ -90,8 +90,8 @@ struct hb_iter_t
    * it will be returning pointer to temporary rvalue.
    * TODO Use a wrapper return type to fix for non-reference type. */
   template <typename T = item_t,
-	    hb_enable_if (hb_is_reference (T))>
-  hb_remove_reference<item_t>* operator -> () const { return hb_addressof (**thiz()); }
+	    hb_enable_if (std::is_reference<T>::value)>
+  hb_remove_reference<item_t>* operator -> () const { return std::addressof (**thiz()); }
   item_t operator * () const { return thiz()->__item__ (); }
   item_t operator * () { return thiz()->__item__ (); }
   item_t operator [] (unsigned i) const { return thiz()->__item_at__ (i); }
@@ -289,7 +289,7 @@ struct hb_is_source_of
 {
   private:
   template <typename Iter2 = Iter,
-	    hb_enable_if (hb_is_convertible (typename Iter2::item_t, hb_add_lvalue_reference<hb_add_const<Item>>))>
+	    hb_enable_if (hb_is_convertible (typename Iter2::item_t, hb_add_lvalue_reference<const Item>))>
   static hb_true_type impl (hb_priority<2>);
   template <typename Iter2 = Iter>
   static auto impl (hb_priority<1>) -> decltype (hb_declval (Iter2) >> hb_declval (Item &), hb_true_type ());
diff --git a/thirdparty/harfbuzz/src/hb-kern.hh b/thirdparty/harfbuzz/src/hb-kern.hh
index 3f952fe7fc..9ea945caed 100644
--- a/thirdparty/harfbuzz/src/hb-kern.hh
+++ b/thirdparty/harfbuzz/src/hb-kern.hh
@@ -49,6 +49,10 @@ struct hb_kern_machine_t
 	     hb_mask_t    kern_mask,
 	     bool         scale = true) const
   {
+    if (!buffer->message (font, "start kern"))
+      return;
+
+    buffer->unsafe_to_concat ();
     OT::hb_ot_apply_context_t c (1, font, buffer);
     c.set_lookup_mask (kern_mask);
     c.set_lookup_props (OT::LookupFlag::IgnoreMarks);
@@ -67,7 +71,8 @@ struct hb_kern_machine_t
       }
 
       skippy_iter.reset (idx, 1);
-      if (!skippy_iter.next ())
+      unsigned unsafe_to;
+      if (!skippy_iter.next (&unsafe_to))
       {
 	idx++;
 	continue;
@@ -125,6 +130,8 @@ struct hb_kern_machine_t
     skip:
       idx = skippy_iter.idx;
     }
+
+    (void) buffer->message (font, "end kern");
   }
 
   const Driver &driver;
diff --git a/thirdparty/harfbuzz/src/hb-machinery.hh b/thirdparty/harfbuzz/src/hb-machinery.hh
index 010c2570d7..5046ac1933 100644
--- a/thirdparty/harfbuzz/src/hb-machinery.hh
+++ b/thirdparty/harfbuzz/src/hb-machinery.hh
@@ -244,19 +244,19 @@ struct hb_lazy_loader_t : hb_data_wrapper_t<Data, WheresData>
   {
     Stored *p = (Stored *) hb_calloc (1, sizeof (Stored));
     if (likely (p))
-      p->init (data);
+      p = new (p) Stored (data);
     return p;
   }
   static Stored *create ()
   {
     Stored *p = (Stored *) hb_calloc (1, sizeof (Stored));
     if (likely (p))
-      p->init ();
+      p = new (p) Stored ();
     return p;
   }
   static void destroy (Stored *p)
   {
-    p->fini ();
+    p->~Stored ();
     hb_free (p);
   }
 
diff --git a/thirdparty/harfbuzz/src/hb-map.hh b/thirdparty/harfbuzz/src/hb-map.hh
index 793dcf22ca..9341637eac 100644
--- a/thirdparty/harfbuzz/src/hb-map.hh
+++ b/thirdparty/harfbuzz/src/hb-map.hh
@@ -37,13 +37,10 @@
 template <typename K, typename V,
 	  typename k_invalid_t = K,
 	  typename v_invalid_t = V,
-	  k_invalid_t kINVALID = hb_is_pointer (K) ? 0 : std::is_signed<K>::value ? hb_int_min (K) : (K) -1,
-	  v_invalid_t vINVALID = hb_is_pointer (V) ? 0 : std::is_signed<V>::value ? hb_int_min (V) : (V) -1>
+	  k_invalid_t kINVALID = std::is_pointer<K>::value ? 0 : std::is_signed<K>::value ? hb_int_min (K) : (K) -1,
+	  v_invalid_t vINVALID = std::is_pointer<V>::value ? 0 : std::is_signed<V>::value ? hb_int_min (V) : (V) -1>
 struct hb_hashmap_t
 {
-  static constexpr K INVALID_KEY   = kINVALID;
-  static constexpr V INVALID_VALUE = vINVALID;
-
   hb_hashmap_t ()  { init (); }
   ~hb_hashmap_t () { fini (); }
 
@@ -64,24 +61,40 @@ struct hb_hashmap_t
     hb_copy (o, *this);
   }
 
-  static_assert (std::is_trivially_copyable<K>::value, "");
-  static_assert (std::is_trivially_copyable<V>::value, "");
-  static_assert (std::is_trivially_destructible<K>::value, "");
-  static_assert (std::is_trivially_destructible<V>::value, "");
-
   struct item_t
   {
     K key;
     V value;
     uint32_t hash;
 
-    void clear () { key = kINVALID; value = vINVALID; hash = 0; }
+    void clear ()
+    {
+      new (std::addressof (key)) K ();
+      key = hb_coerce<K> (kINVALID);
+      new (std::addressof (value)) V ();
+      value = hb_coerce<V> (vINVALID);
+      hash = 0;
+    }
 
     bool operator == (const K &o) { return hb_deref (key) == hb_deref (o); }
     bool operator == (const item_t &o) { return *this == o.key; }
-    bool is_unused () const    { return key == kINVALID; }
-    bool is_tombstone () const { return key != kINVALID && value == vINVALID; }
-    bool is_real () const { return key != kINVALID && value != vINVALID; }
+    bool is_unused () const
+    {
+      const K inv = hb_coerce<K> (kINVALID);
+      return key == inv;
+    }
+    bool is_tombstone () const
+    {
+      const K kinv = hb_coerce<K> (kINVALID);
+      const V vinv = hb_coerce<V> (vINVALID);
+      return key != kinv && value == vinv;
+    }
+    bool is_real () const
+    {
+      const K kinv = hb_coerce<K> (kINVALID);
+      const V vinv = hb_coerce<V> (vINVALID);
+      return key != kinv && value != vinv;
+    }
     hb_pair_t<K, V> get_pair() const { return hb_pair_t<K, V> (key, value); }
   };
 
@@ -118,8 +131,13 @@ struct hb_hashmap_t
   }
   void fini_shallow ()
   {
-    hb_free (items);
-    items = nullptr;
+    if (likely (items)) {
+      unsigned size = mask + 1;
+      for (unsigned i = 0; i < size; i++)
+        items[i].~item_t ();
+      hb_free (items);
+      items = nullptr;
+    }
     population = occupancy = 0;
   }
   void fini ()
@@ -163,10 +181,15 @@ struct hb_hashmap_t
     /* Insert back old items. */
     if (old_items)
       for (unsigned int i = 0; i < old_size; i++)
+      {
 	if (old_items[i].is_real ())
+	{
 	  set_with_hash (old_items[i].key,
 			 old_items[i].hash,
 			 std::move (old_items[i].value));
+	}
+	old_items[i].~item_t ();
+      }
 
     hb_free (old_items);
 
@@ -178,22 +201,22 @@ struct hb_hashmap_t
 
   V get (K key) const
   {
-    if (unlikely (!items)) return vINVALID;
+    if (unlikely (!items)) return hb_coerce<V> (vINVALID);
     unsigned int i = bucket_for (key);
-    return items[i].is_real () && items[i] == key ? items[i].value : vINVALID;
+    return items[i].is_real () && items[i] == key ? items[i].value : hb_coerce<V> (vINVALID);
   }
 
-  void del (K key) { set (key, vINVALID); }
+  void del (K key) { set (key, hb_coerce<V> (vINVALID)); }
 
   /* Has interface. */
-  static constexpr V SENTINEL = vINVALID;
   typedef V value_t;
   value_t operator [] (K k) const { return get (k); }
   bool has (K k, V *vp = nullptr) const
   {
     V v = (*this)[k];
     if (vp) *vp = v;
-    return v != SENTINEL;
+    const V vinv = hb_coerce<V> (vINVALID);
+    return v != vinv;
   }
   /* Projection. */
   V operator () (K k) const { return get (k); }
@@ -248,11 +271,13 @@ struct hb_hashmap_t
   bool set_with_hash (K key, uint32_t hash, VV&& value)
   {
     if (unlikely (!successful)) return false;
-    if (unlikely (key == kINVALID)) return true;
+    const K kinv = hb_coerce<K> (kINVALID);
+    if (unlikely (key == kinv)) return true;
     if (unlikely ((occupancy + occupancy / 2) >= mask && !resize ())) return false;
     unsigned int i = bucket_for_hash (key, hash);
 
-    if (value == vINVALID && items[i].key != key)
+    const V vinv = hb_coerce<V> (vINVALID);
+    if (value == vinv && items[i].key != key)
       return true; /* Trying to delete non-existent key. */
 
     if (!items[i].is_unused ())
diff --git a/thirdparty/harfbuzz/src/hb-meta.hh b/thirdparty/harfbuzz/src/hb-meta.hh
index 0ea5774a9f..3fea5d995e 100644
--- a/thirdparty/harfbuzz/src/hb-meta.hh
+++ b/thirdparty/harfbuzz/src/hb-meta.hh
@@ -29,6 +29,7 @@
 
 #include "hb.hh"
 
+#include <memory>
 #include <type_traits>
 #include <utility>
 
@@ -85,30 +86,13 @@ template <>             struct hb_priority<0> {};
 template <typename T> struct hb_type_identity_t { typedef T type; };
 template <typename T> using hb_type_identity = typename hb_type_identity_t<T>::type;
 
-struct
-{
-  template <typename T> constexpr T*
-  operator () (T& arg) const
-  {
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wcast-align"
-    /* https://en.cppreference.com/w/cpp/memory/addressof */
-    return reinterpret_cast<T*> (
-	     &const_cast<char&> (
-		reinterpret_cast<const volatile char&> (arg)));
-#pragma GCC diagnostic pop
-  }
-}
-HB_FUNCOBJ (hb_addressof);
-
 template <typename T> static inline T hb_declval ();
 #define hb_declval(T) (hb_declval<T> ())
 
 template <typename T> struct hb_match_const		: hb_type_identity_t<T>, hb_false_type	{};
 template <typename T> struct hb_match_const<const T>	: hb_type_identity_t<T>, hb_true_type	{};
 template <typename T> using hb_remove_const = typename hb_match_const<T>::type;
-template <typename T> using hb_add_const = const T;
-#define hb_is_const(T) hb_match_const<T>::value
+
 template <typename T> struct hb_match_reference		: hb_type_identity_t<T>, hb_false_type	{};
 template <typename T> struct hb_match_reference<T &>	: hb_type_identity_t<T>, hb_true_type	{};
 template <typename T> struct hb_match_reference<T &&>	: hb_type_identity_t<T>, hb_true_type	{};
@@ -119,14 +103,13 @@ template <typename T> using hb_add_lvalue_reference = decltype (_hb_try_add_lval
 template <typename T> auto _hb_try_add_rvalue_reference (hb_priority<1>) -> hb_type_identity<T&&>;
 template <typename T> auto _hb_try_add_rvalue_reference (hb_priority<0>) -> hb_type_identity<T>;
 template <typename T> using hb_add_rvalue_reference = decltype (_hb_try_add_rvalue_reference<T> (hb_prioritize));
-#define hb_is_reference(T) hb_match_reference<T>::value
+
 template <typename T> struct hb_match_pointer		: hb_type_identity_t<T>, hb_false_type	{};
 template <typename T> struct hb_match_pointer<T *>	: hb_type_identity_t<T>, hb_true_type	{};
 template <typename T> using hb_remove_pointer = typename hb_match_pointer<T>::type;
 template <typename T> auto _hb_try_add_pointer (hb_priority<1>) -> hb_type_identity<hb_remove_reference<T>*>;
 template <typename T> auto _hb_try_add_pointer (hb_priority<1>) -> hb_type_identity<T>;
 template <typename T> using hb_add_pointer = decltype (_hb_try_add_pointer<T> (hb_prioritize));
-#define hb_is_pointer(T) hb_match_pointer<T>::value
 
 
 /* TODO Add feature-parity to std::decay. */
@@ -137,8 +120,8 @@ template <typename T> using hb_decay = hb_remove_const<hb_remove_reference<T>>;
 template <typename From, typename To>
 using hb_is_cr_convertible = hb_bool_constant<
   hb_is_same (hb_decay<From>, hb_decay<To>) &&
-  (!hb_is_const (From) || hb_is_const (To)) &&
-  (!hb_is_reference (To) || hb_is_const (To) || hb_is_reference (To))
+  (!std::is_const<From>::value || std::is_const<To>::value) &&
+  (!std::is_reference<To>::value || std::is_const<To>::value || std::is_reference<To>::value)
 >;
 #define hb_is_cr_convertible(From,To) hb_is_cr_convertible<From, To>::value
 
@@ -153,16 +136,6 @@ struct
 }
 HB_FUNCOBJ (hb_deref);
 
-struct
-{
-  template <typename T> constexpr auto
-  operator () (T&& v) const HB_AUTO_RETURN (std::forward<T> (v))
-
-  template <typename T> constexpr auto
-  operator () (T& v) const HB_AUTO_RETURN (hb_addressof (v))
-}
-HB_FUNCOBJ (hb_ref);
-
 template <typename T>
 struct hb_reference_wrapper
 {
@@ -176,7 +149,7 @@ struct hb_reference_wrapper
 template <typename T>
 struct hb_reference_wrapper<T&>
 {
-  hb_reference_wrapper (T& v) : v (hb_addressof (v)) {}
+  hb_reference_wrapper (T& v) : v (std::addressof (v)) {}
   bool operator == (const hb_reference_wrapper& o) const { return v == o.v; }
   bool operator != (const hb_reference_wrapper& o) const { return v != o.v; }
   operator T& () const { return *v; }
diff --git a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.cc b/thirdparty/harfbuzz/src/hb-ms-feature-ranges.cc
deleted file mode 100644
index 6d09b252d8..0000000000
--- a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright © 2011,2012,2013  Google, Inc.
- * Copyright © 2021  Khaled Hosny
- *
- *  This is part of HarfBuzz, a text shaping library.
- *
- * Permission is hereby granted, without written agreement and without
- * license or royalty fees, to use, copy, modify, and distribute this
- * software and its documentation for any purpose, provided that the
- * above copyright notice and the following two paragraphs appear in
- * all copies of this software.
- *
- * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
- * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
- * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
- * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
- * DAMAGE.
- *
- * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
- * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
- * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
- * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
- *
- * Google Author(s): Behdad Esfahbod
- */
-
-#include "hb-ms-feature-ranges.hh"
-
-bool
-hb_ms_setup_features (const hb_feature_t                *features,
-		      unsigned int                       num_features,
-		      hb_vector_t<hb_ms_feature_t>      &feature_records, /* OUT */
-		      hb_vector_t<hb_ms_range_record_t> &range_records /* OUT */)
-{
-  feature_records.shrink(0);
-  range_records.shrink(0);
-
-  /* Sort features by start/end events. */
-  hb_vector_t<hb_ms_feature_event_t> feature_events;
-  for (unsigned int i = 0; i < num_features; i++)
-  {
-    hb_ms_active_feature_t feature;
-    feature.fea.tag_le = hb_uint32_swap (features[i].tag);
-    feature.fea.value = features[i].value;
-    feature.order = i;
-
-    hb_ms_feature_event_t *event;
-
-    event = feature_events.push ();
-    event->index = features[i].start;
-    event->start = true;
-    event->feature = feature;
-
-    event = feature_events.push ();
-    event->index = features[i].end;
-    event->start = false;
-    event->feature = feature;
-  }
-  feature_events.qsort ();
-  /* Add a strategic final event. */
-  {
-    hb_ms_active_feature_t feature;
-    feature.fea.tag_le = 0;
-    feature.fea.value = 0;
-    feature.order = num_features + 1;
-
-    auto *event = feature_events.push ();
-    event->index = 0; /* This value does magic. */
-    event->start = false;
-    event->feature = feature;
-  }
-
-  /* Scan events and save features for each range. */
-  hb_vector_t<hb_ms_active_feature_t> active_features;
-  unsigned int last_index = 0;
-  for (unsigned int i = 0; i < feature_events.length; i++)
-  {
-    auto *event = &feature_events[i];
-
-    if (event->index != last_index)
-    {
-      /* Save a snapshot of active features and the range. */
-      auto *range = range_records.push ();
-      auto offset = feature_records.length;
-
-      active_features.qsort ();
-      for (unsigned int j = 0; j < active_features.length; j++)
-      {
-        if (!j || active_features[j].fea.tag_le != feature_records[feature_records.length - 1].tag_le)
-        {
-          feature_records.push (active_features[j].fea);
-        }
-        else
-        {
-          /* Overrides value for existing feature. */
-          feature_records[feature_records.length - 1].value = active_features[j].fea.value;
-        }
-      }
-
-      /* Will convert to pointer after all is ready, since feature_records.array
-       * may move as we grow it. */
-      range->features.features = reinterpret_cast<hb_ms_feature_t *> (offset);
-      range->features.num_features = feature_records.length - offset;
-      range->index_first = last_index;
-      range->index_last  = event->index - 1;
-
-      last_index = event->index;
-    }
-
-    if (event->start)
-    {
-      active_features.push (event->feature);
-    }
-    else
-    {
-      auto *feature = active_features.find (&event->feature);
-      if (feature)
-        active_features.remove (feature - active_features.arrayZ);
-    }
-  }
-
-  if (!range_records.length) /* No active feature found. */
-    num_features = 0;
-
-  /* Fixup the pointers. */
-  for (unsigned int i = 0; i < range_records.length; i++)
-  {
-    auto *range = &range_records[i];
-    range->features.features = (hb_ms_feature_t *) feature_records + reinterpret_cast<uintptr_t> (range->features.features);
-  }
-
-  return !!num_features;
-}
-
-void
-hb_ms_make_feature_ranges (hb_vector_t<hb_ms_feature_t>      &feature_records,
-			   hb_vector_t<hb_ms_range_record_t> &range_records,
-			   unsigned int                       chars_offset,
-			   unsigned int                       chars_len,
-			   uint16_t                          *log_clusters,
-			   hb_vector_t<hb_ms_features_t*>    &range_features, /* OUT */
-			   hb_vector_t<uint32_t>             &range_counts /* OUT */)
-{
-  range_features.shrink (0);
-  range_counts.shrink (0);
-
-  auto *last_range = &range_records[0];
-  for (unsigned int i = chars_offset; i < chars_len; i++)
-  {
-    auto *range = last_range;
-    while (log_clusters[i] < range->index_first)
-      range--;
-    while (log_clusters[i] > range->index_last)
-      range++;
-    if (!range_features.length ||
-        &range->features != range_features[range_features.length - 1])
-    {
-      auto **features = range_features.push ();
-      auto *c = range_counts.push ();
-      if (unlikely (!features || !c))
-      {
-        range_features.shrink (0);
-        range_counts.shrink (0);
-        break;
-      }
-      *features = &range->features;
-      *c = 1;
-    }
-    else
-    {
-      range_counts[range_counts.length - 1]++;
-    }
-
-    last_range = range;
-  }
-}
diff --git a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh b/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh
index 401d1e1d97..d40fdeaa82 100644
--- a/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh
+++ b/thirdparty/harfbuzz/src/hb-ms-feature-ranges.hh
@@ -52,8 +52,8 @@ struct hb_ms_active_feature_t {
 	   a->fea.value < b->fea.value ? -1 : a->fea.value > b->fea.value ? 1 :
 	   0;
   }
-  bool operator== (const hb_ms_active_feature_t *f)
-  { return cmp (this, f) == 0; }
+  bool operator== (const hb_ms_active_feature_t& f) const
+  { return cmp (this, &f) == 0; }
 };
 
 struct hb_ms_feature_event_t {
@@ -77,20 +77,153 @@ struct hb_ms_range_record_t {
   unsigned int index_last;  /* == end - 1 */
 };
 
-HB_INTERNAL bool
+static inline bool
 hb_ms_setup_features (const hb_feature_t                *features,
 		      unsigned int                       num_features,
 		      hb_vector_t<hb_ms_feature_t>      &feature_records, /* OUT */
-		      hb_vector_t<hb_ms_range_record_t> &range_records /* OUT */);
+		      hb_vector_t<hb_ms_range_record_t> &range_records /* OUT */)
+{
+  feature_records.shrink(0);
+  range_records.shrink(0);
 
+  /* Sort features by start/end events. */
+  hb_vector_t<hb_ms_feature_event_t> feature_events;
+  for (unsigned int i = 0; i < num_features; i++)
+  {
+    hb_ms_active_feature_t feature;
+    feature.fea.tag_le = hb_uint32_swap (features[i].tag);
+    feature.fea.value = features[i].value;
+    feature.order = i;
+
+    hb_ms_feature_event_t *event;
+
+    event = feature_events.push ();
+    event->index = features[i].start;
+    event->start = true;
+    event->feature = feature;
+
+    event = feature_events.push ();
+    event->index = features[i].end;
+    event->start = false;
+    event->feature = feature;
+  }
+  feature_events.qsort ();
+  /* Add a strategic final event. */
+  {
+    hb_ms_active_feature_t feature;
+    feature.fea.tag_le = 0;
+    feature.fea.value = 0;
+    feature.order = num_features + 1;
+
+    auto *event = feature_events.push ();
+    event->index = 0; /* This value does magic. */
+    event->start = false;
+    event->feature = feature;
+  }
+
+  /* Scan events and save features for each range. */
+  hb_vector_t<hb_ms_active_feature_t> active_features;
+  unsigned int last_index = 0;
+  for (unsigned int i = 0; i < feature_events.length; i++)
+  {
+    auto *event = &feature_events[i];
+
+    if (event->index != last_index)
+    {
+      /* Save a snapshot of active features and the range. */
+      auto *range = range_records.push ();
+      auto offset = feature_records.length;
+
+      active_features.qsort ();
+      for (unsigned int j = 0; j < active_features.length; j++)
+      {
+        if (!j || active_features[j].fea.tag_le != feature_records[feature_records.length - 1].tag_le)
+        {
+          feature_records.push (active_features[j].fea);
+        }
+        else
+        {
+          /* Overrides value for existing feature. */
+          feature_records[feature_records.length - 1].value = active_features[j].fea.value;
+        }
+      }
+
+      /* Will convert to pointer after all is ready, since feature_records.array
+       * may move as we grow it. */
+      range->features.features = reinterpret_cast<hb_ms_feature_t *> (offset);
+      range->features.num_features = feature_records.length - offset;
+      range->index_first = last_index;
+      range->index_last  = event->index - 1;
 
-HB_INTERNAL void
+      last_index = event->index;
+    }
+
+    if (event->start)
+    {
+      active_features.push (event->feature);
+    }
+    else
+    {
+      auto *feature = active_features.lsearch (event->feature);
+      if (feature)
+        active_features.remove (feature - active_features.arrayZ);
+    }
+  }
+
+  if (!range_records.length) /* No active feature found. */
+    num_features = 0;
+
+  /* Fixup the pointers. */
+  for (unsigned int i = 0; i < range_records.length; i++)
+  {
+    auto *range = &range_records[i];
+    range->features.features = (hb_ms_feature_t *) feature_records + reinterpret_cast<uintptr_t> (range->features.features);
+  }
+
+  return !!num_features;
+}
+
+static inline void
 hb_ms_make_feature_ranges (hb_vector_t<hb_ms_feature_t>      &feature_records,
 			   hb_vector_t<hb_ms_range_record_t> &range_records,
 			   unsigned int                       chars_offset,
 			   unsigned int                       chars_len,
 			   uint16_t                          *log_clusters,
 			   hb_vector_t<hb_ms_features_t*>    &range_features, /* OUT */
-			   hb_vector_t<uint32_t>             &range_counts /* OUT */);
+			   hb_vector_t<uint32_t>             &range_counts /* OUT */)
+{
+  range_features.shrink (0);
+  range_counts.shrink (0);
+
+  auto *last_range = &range_records[0];
+  for (unsigned int i = chars_offset; i < chars_len; i++)
+  {
+    auto *range = last_range;
+    while (log_clusters[i] < range->index_first)
+      range--;
+    while (log_clusters[i] > range->index_last)
+      range++;
+    if (!range_features.length ||
+        &range->features != range_features[range_features.length - 1])
+    {
+      auto **features = range_features.push ();
+      auto *c = range_counts.push ();
+      if (unlikely (!features || !c))
+      {
+        range_features.shrink (0);
+        range_counts.shrink (0);
+        break;
+      }
+      *features = &range->features;
+      *c = 1;
+    }
+    else
+    {
+      range_counts[range_counts.length - 1]++;
+    }
+
+    last_range = range;
+  }
+}
 
 #endif /* HB_MS_FEATURE_RANGES_HH */
diff --git a/thirdparty/harfbuzz/src/hb-object.hh b/thirdparty/harfbuzz/src/hb-object.hh
index 0e15cb12c4..4b5bc32ade 100644
--- a/thirdparty/harfbuzz/src/hb-object.hh
+++ b/thirdparty/harfbuzz/src/hb-object.hh
@@ -53,7 +53,7 @@ struct hb_lockable_set_t
   item_t *replace_or_insert (T v, lock_t &l, bool replace)
   {
     l.lock ();
-    item_t *item = items.find (v);
+    item_t *item = items.lsearch (v);
     if (item) {
       if (replace) {
 	item_t old = *item;
@@ -76,7 +76,7 @@ struct hb_lockable_set_t
   void remove (T v, lock_t &l)
   {
     l.lock ();
-    item_t *item = items.find (v);
+    item_t *item = items.lsearch (v);
     if (item)
     {
       item_t old = *item;
@@ -93,7 +93,7 @@ struct hb_lockable_set_t
   bool find (T v, item_t *i, lock_t &l)
   {
     l.lock ();
-    item_t *item = items.find (v);
+    item_t *item = items.lsearch (v);
     if (item)
       *i = *item;
     l.unlock ();
diff --git a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh
index 180c87cb89..c102c15173 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cff-common.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cff-common.hh
@@ -68,8 +68,6 @@ struct code_pair_t
 typedef hb_vector_t<unsigned char> str_buff_t;
 struct str_buff_vec_t : hb_vector_t<str_buff_t>
 {
-  void fini () { SUPER::fini_deep (); }
-
   unsigned int total_size () const
   {
     unsigned int size = 0;
diff --git a/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh b/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh
index 5dd183e3a0..6fb59315c9 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cff1-table.hh
@@ -1144,8 +1144,8 @@ struct cff1
     {
       sc.end_processing ();
       topDict.fini ();
-      fontDicts.fini_deep ();
-      privateDicts.fini_deep ();
+      fontDicts.fini ();
+      privateDicts.fini ();
       hb_blob_destroy (blob);
       blob = nullptr;
     }
@@ -1245,32 +1245,32 @@ struct cff1
     }
 
     protected:
-    hb_blob_t	           *blob;
+    hb_blob_t	           *blob = nullptr;
     hb_sanitize_context_t   sc;
 
     public:
-    const Encoding	    *encoding;
-    const Charset	    *charset;
-    const CFF1NameIndex     *nameIndex;
-    const CFF1TopDictIndex  *topDictIndex;
-    const CFF1StringIndex   *stringIndex;
-    const CFF1Subrs	    *globalSubrs;
-    const CFF1CharStrings   *charStrings;
-    const CFF1FDArray       *fdArray;
-    const CFF1FDSelect      *fdSelect;
-    unsigned int	     fdCount;
+    const Encoding	    *encoding = nullptr;
+    const Charset	    *charset = nullptr;
+    const CFF1NameIndex     *nameIndex = nullptr;
+    const CFF1TopDictIndex  *topDictIndex = nullptr;
+    const CFF1StringIndex   *stringIndex = nullptr;
+    const CFF1Subrs	    *globalSubrs = nullptr;
+    const CFF1CharStrings   *charStrings = nullptr;
+    const CFF1FDArray       *fdArray = nullptr;
+    const CFF1FDSelect      *fdSelect = nullptr;
+    unsigned int	     fdCount = 0;
 
     cff1_top_dict_values_t   topDict;
     hb_vector_t<cff1_font_dict_values_t>
 			     fontDicts;
     hb_vector_t<PRIVDICTVAL> privateDicts;
 
-    unsigned int	     num_glyphs;
+    unsigned int	     num_glyphs = 0;
   };
 
   struct accelerator_t : accelerator_templ_t<cff1_private_dict_opset_t, cff1_private_dict_values_t>
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       SUPER::init (face);
 
@@ -1295,8 +1295,7 @@ struct cff1
       }
       glyph_names.qsort ();
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       glyph_names.fini ();
 
@@ -1398,7 +1397,10 @@ struct cff1
   DEFINE_SIZE_STATIC (4);
 };
 
-struct cff1_accelerator_t : cff1::accelerator_t {};
+struct cff1_accelerator_t : cff1::accelerator_t {
+  cff1_accelerator_t (hb_face_t *face) : cff1::accelerator_t (face) {}
+};
+
 } /* namespace OT */
 
 #endif /* HB_OT_CFF1_TABLE_HH */
diff --git a/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh b/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh
index 829217feaa..6e1b01c8fe 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cff2-table.hh
@@ -397,7 +397,7 @@ struct cff2
   template <typename PRIVOPSET, typename PRIVDICTVAL>
   struct accelerator_templ_t
   {
-    void init (hb_face_t *face)
+    accelerator_templ_t (hb_face_t *face)
     {
       topDict.init ();
       fontDicts.init ();
@@ -412,15 +412,15 @@ struct cff2
       const OT::cff2 *cff2 = this->blob->template as<OT::cff2> ();
 
       if (cff2 == &Null (OT::cff2))
-      { fini (); return; }
+        goto fail;
 
       { /* parse top dict */
 	byte_str_t topDictStr (cff2 + cff2->topDict, cff2->topDictSize);
-	if (unlikely (!topDictStr.sanitize (&sc))) { fini (); return; }
+	if (unlikely (!topDictStr.sanitize (&sc))) goto fail;
 	cff2_top_dict_interpreter_t top_interp;
 	top_interp.env.init (topDictStr);
 	topDict.init ();
-	if (unlikely (!top_interp.interpret (topDict))) { fini (); return; }
+	if (unlikely (!top_interp.interpret (topDict))) goto fail;
       }
 
       globalSubrs = &StructAtOffset<CFF2Subrs> (cff2, cff2->topDict + cff2->topDictSize);
@@ -434,49 +434,55 @@ struct cff2
 	  (globalSubrs == &Null (CFF2Subrs)) || unlikely (!globalSubrs->sanitize (&sc)) ||
 	  (fdArray == &Null (CFF2FDArray)) || unlikely (!fdArray->sanitize (&sc)) ||
 	  (((fdSelect != &Null (CFF2FDSelect)) && unlikely (!fdSelect->sanitize (&sc, fdArray->count)))))
-      { fini (); return; }
+        goto fail;
 
       num_glyphs = charStrings->count;
       if (num_glyphs != sc.get_num_glyphs ())
-      { fini (); return; }
+        goto fail;
 
       fdCount = fdArray->count;
       if (!privateDicts.resize (fdCount))
-      { fini (); return; }
+        goto fail;
 
       /* parse font dicts and gather private dicts */
       for (unsigned int i = 0; i < fdCount; i++)
       {
 	const byte_str_t fontDictStr = (*fdArray)[i];
-	if (unlikely (!fontDictStr.sanitize (&sc))) { fini (); return; }
+	if (unlikely (!fontDictStr.sanitize (&sc))) goto fail;
 	cff2_font_dict_values_t  *font;
 	cff2_font_dict_interpreter_t font_interp;
 	font_interp.env.init (fontDictStr);
 	font = fontDicts.push ();
-	if (unlikely (font == &Crap (cff2_font_dict_values_t))) { fini (); return; }
+	if (unlikely (font == &Crap (cff2_font_dict_values_t))) goto fail;
 	font->init ();
-	if (unlikely (!font_interp.interpret (*font))) { fini (); return; }
+	if (unlikely (!font_interp.interpret (*font))) goto fail;
 
 	const byte_str_t privDictStr (StructAtOffsetOrNull<UnsizedByteStr> (cff2, font->privateDictInfo.offset), font->privateDictInfo.size);
-	if (unlikely (!privDictStr.sanitize (&sc))) { fini (); return; }
+	if (unlikely (!privDictStr.sanitize (&sc))) goto fail;
 	dict_interpreter_t<PRIVOPSET, PRIVDICTVAL, cff2_priv_dict_interp_env_t>  priv_interp;
 	priv_interp.env.init(privDictStr);
 	privateDicts[i].init ();
-	if (unlikely (!priv_interp.interpret (privateDicts[i]))) { fini (); return; }
+	if (unlikely (!priv_interp.interpret (privateDicts[i]))) goto fail;
 
 	privateDicts[i].localSubrs = &StructAtOffsetOrNull<CFF2Subrs> (&privDictStr[0], privateDicts[i].subrsOffset);
 	if (privateDicts[i].localSubrs != &Null (CFF2Subrs) &&
 	  unlikely (!privateDicts[i].localSubrs->sanitize (&sc)))
-	{ fini (); return; }
+	  goto fail;
       }
-    }
 
-    void fini ()
+
+      return;
+
+      fail:
+        _fini ();
+    }
+    ~accelerator_templ_t () { _fini (); }
+    void _fini ()
     {
       sc.end_processing ();
       topDict.fini ();
-      fontDicts.fini_deep ();
-      privateDicts.fini_deep ();
+      fontDicts.fini ();
+      privateDicts.fini ();
       hb_blob_destroy (blob);
       blob = nullptr;
     }
@@ -484,26 +490,28 @@ struct cff2
     bool is_valid () const { return blob; }
 
     protected:
-    hb_blob_t			*blob;
+    hb_blob_t			*blob = nullptr;
     hb_sanitize_context_t	sc;
 
     public:
     cff2_top_dict_values_t	topDict;
-    const CFF2Subrs		*globalSubrs;
-    const CFF2VariationStore	*varStore;
-    const CFF2CharStrings	*charStrings;
-    const CFF2FDArray		*fdArray;
-    const CFF2FDSelect		*fdSelect;
-    unsigned int		fdCount;
+    const CFF2Subrs		*globalSubrs = nullptr;
+    const CFF2VariationStore	*varStore = nullptr;
+    const CFF2CharStrings	*charStrings = nullptr;
+    const CFF2FDArray		*fdArray = nullptr;
+    const CFF2FDSelect		*fdSelect = nullptr;
+    unsigned int		fdCount = 0;
 
     hb_vector_t<cff2_font_dict_values_t>     fontDicts;
     hb_vector_t<PRIVDICTVAL>  privateDicts;
 
-    unsigned int	      num_glyphs;
+    unsigned int	      num_glyphs = 0;
   };
 
   struct accelerator_t : accelerator_templ_t<cff2_private_dict_opset_t, cff2_private_dict_values_t>
   {
+    accelerator_t (hb_face_t *face) : accelerator_templ_t (face) {}
+
     HB_INTERNAL bool get_extents (hb_font_t *font,
 				  hb_codepoint_t glyph,
 				  hb_glyph_extents_t *extents) const;
@@ -525,7 +533,10 @@ struct cff2
   DEFINE_SIZE_STATIC (5);
 };
 
-struct cff2_accelerator_t : cff2::accelerator_t {};
+struct cff2_accelerator_t : cff2::accelerator_t {
+  cff2_accelerator_t (hb_face_t *face) : cff2::accelerator_t (face) {}
+};
+
 } /* namespace OT */
 
 #endif /* HB_OT_CFF2_TABLE_HH */
diff --git a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh
index d837adc788..fde57cdc5b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-cmap-table.hh
@@ -369,7 +369,6 @@ struct CmapSubtableFormat4
   {
     accelerator_t () {}
     accelerator_t (const CmapSubtableFormat4 *subtable) { init (subtable); }
-    ~accelerator_t () { fini (); }
 
     void init (const CmapSubtableFormat4 *subtable)
     {
@@ -381,7 +380,6 @@ struct CmapSubtableFormat4
       glyphIdArray = idRangeOffset + segCount;
       glyphIdArrayLength = (subtable->length - 16 - 8 * segCount) / 2;
     }
-    void fini () {}
 
     bool get_glyph (hb_codepoint_t codepoint, hb_codepoint_t *glyph) const
     {
@@ -1607,7 +1605,7 @@ struct cmap
       unsigned format = (this + _.subtable).u.format;
       if (format == 12) has_format12 = true;
 
-      const EncodingRecord *table = hb_addressof (_);
+      const EncodingRecord *table = std::addressof (_);
       if      (_.platformID == 0 && _.encodingID ==  3) unicode_bmp = table;
       else if (_.platformID == 0 && _.encodingID ==  4) unicode_ucs4 = table;
       else if (_.platformID == 3 && _.encodingID ==  1) ms_bmp = table;
@@ -1665,7 +1663,7 @@ struct cmap
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       this->table = hb_sanitize_context_t ().reference_table<cmap> (face);
       bool symbol;
@@ -1700,8 +1698,7 @@ struct cmap
 	}
       }
     }
-
-    void fini () { this->table.destroy (); }
+    ~accelerator_t () { this->table.destroy (); }
 
     bool get_nominal_glyph (hb_codepoint_t  unicode,
 			    hb_codepoint_t *glyph) const
@@ -1863,7 +1860,9 @@ struct cmap
   DEFINE_SIZE_ARRAY (4, encodingRecord);
 };
 
-struct cmap_accelerator_t : cmap::accelerator_t {};
+struct cmap_accelerator_t : cmap::accelerator_t {
+  cmap_accelerator_t (hb_face_t *face) : cmap::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh
index 14459914ee..23fa56c4f6 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-cbdt-table.hh
@@ -360,6 +360,16 @@ struct IndexSubtable
 
 struct IndexSubtableRecord
 {
+  /* XXX Remove this and fix by not inserting it into vector. */
+  IndexSubtableRecord& operator = (const IndexSubtableRecord &o)
+  {
+    firstGlyphIndex = o.firstGlyphIndex;
+    lastGlyphIndex = o.lastGlyphIndex;
+    offsetToSubtable = (unsigned) o.offsetToSubtable;
+    assert (offsetToSubtable.is_null ());
+    return *this;
+  }
+
   bool sanitize (hb_sanitize_context_t *c, const void *base) const
   {
     TRACE_SANITIZE (this);
@@ -809,15 +819,14 @@ struct CBDT
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
-      cblc = hb_sanitize_context_t ().reference_table<CBLC> (face);
-      cbdt = hb_sanitize_context_t ().reference_table<CBDT> (face);
+      this->cblc = hb_sanitize_context_t ().reference_table<CBLC> (face);
+      this->cbdt = hb_sanitize_context_t ().reference_table<CBDT> (face);
 
       upem = hb_face_get_upem (face);
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       this->cblc.destroy ();
       this->cbdt.destroy ();
@@ -978,7 +987,10 @@ CBLC::subset (hb_subset_context_t *c) const
   return_trace (CBLC::sink_cbdt (c, &cbdt_prime));
 }
 
-struct CBDT_accelerator_t : CBDT::accelerator_t {};
+struct CBDT_accelerator_t : CBDT::accelerator_t {
+  CBDT_accelerator_t (hb_face_t *face) : CBDT::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh
index 008422d089..dac755c02c 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-colr-table.hh
@@ -71,7 +71,7 @@ struct hb_colrv1_closure_context_t :
   bool paint_visited (const void *paint)
   {
     hb_codepoint_t delta = (hb_codepoint_t) ((uintptr_t) paint - (uintptr_t) base);
-     if (visited_paint.has (delta))
+    if (visited_paint.in_error() || visited_paint.has (delta))
       return true;
 
     visited_paint.add (delta);
@@ -1270,13 +1270,9 @@ struct COLR
 
   struct accelerator_t
   {
-    accelerator_t () {}
-    ~accelerator_t () { fini (); }
-
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { colr = hb_sanitize_context_t ().reference_table<COLR> (face); }
-
-    void fini () { this->colr.destroy (); }
+    ~accelerator_t () { this->colr.destroy (); }
 
     bool is_valid () { return colr.get_blob ()->length; }
 
@@ -1535,6 +1531,10 @@ struct COLR
   DEFINE_SIZE_MIN (14);
 };
 
+struct COLR_accelerator_t : COLR::accelerator_t {
+  COLR_accelerator_t (hb_face_t *face) : COLR::accelerator_t (face) {}
+};
+
 } /* namespace OT */
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh b/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh
index ca85ba6ad6..fbaf2ec26b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-colrv1-closure.hh
@@ -43,7 +43,7 @@ HB_INTERNAL void PaintColrLayers::closurev1 (hb_colrv1_closure_context_t* c) con
   const LayerList &paint_offset_lists = c->get_colr_table ()->get_layerList ();
   for (unsigned i = firstLayerIndex; i < firstLayerIndex + numLayers; i++)
   {
-    const Paint &paint = hb_addressof (paint_offset_lists) + paint_offset_lists[i];
+    const Paint &paint = std::addressof (paint_offset_lists) + paint_offset_lists[i];
     paint.dispatch (c);
   }
 }
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh
index d2911f19e6..9741ebd450 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-sbix-table.hh
@@ -202,12 +202,12 @@ struct sbix
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       table = hb_sanitize_context_t ().reference_table<sbix> (face);
       num_glyphs = face->get_num_glyphs ();
     }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     bool has_data () const { return table->has_data (); }
 
@@ -407,7 +407,10 @@ struct sbix
   DEFINE_SIZE_ARRAY (8, strikes);
 };
 
-struct sbix_accelerator_t : sbix::accelerator_t {};
+struct sbix_accelerator_t : sbix::accelerator_t {
+  sbix_accelerator_t (hb_face_t *face) : sbix::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh b/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh
index e022ef43b7..fc649f1006 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-color-svg-table.hh
@@ -79,9 +79,9 @@ struct SVG
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { table = hb_sanitize_context_t ().reference_table<SVG> (face); }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     hb_blob_t *reference_blob_for_glyph (hb_codepoint_t glyph_id) const
     {
@@ -116,7 +116,9 @@ struct SVG
   DEFINE_SIZE_STATIC (10);
 };
 
-struct SVG_accelerator_t : SVG::accelerator_t {};
+struct SVG_accelerator_t : SVG::accelerator_t {
+  SVG_accelerator_t (hb_face_t *face) : SVG::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-color.cc b/thirdparty/harfbuzz/src/hb-ot-color.cc
index 4170b71317..16077765bd 100644
--- a/thirdparty/harfbuzz/src/hb-ot-color.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-color.cc
@@ -90,15 +90,15 @@ hb_ot_color_palette_get_count (hb_face_t *face)
 /**
  * hb_ot_color_palette_get_name_id:
  * @face: #hb_face_t to work upon
- * @palette_index: The index of the color palette 
+ * @palette_index: The index of the color palette
  *
  * Fetches the `name` table Name ID that provides display names for
- * a `CPAL` color palette. 
+ * a `CPAL` color palette.
  *
  * Palette display names can be generic (e.g., "Default") or provide
  * specific, themed names (e.g., "Spring", "Summer", "Fall", and "Winter").
  *
- * Return value: the Named ID found for the palette. 
+ * Return value: the Name ID found for the palette.
  * If the requested palette has no name the result is #HB_OT_NAME_ID_INVALID.
  *
  * Since: 2.1.0
@@ -116,7 +116,7 @@ hb_ot_color_palette_get_name_id (hb_face_t *face,
  * @color_index: The index of the color
  *
  * Fetches the `name` table Name ID that provides display names for
- * the specificed color in a face's `CPAL` color palette. 
+ * the specified color in a face's `CPAL` color palette.
  *
  * Display names can be generic (e.g., "Background") or specific
  * (e.g., "Eye color").
@@ -256,6 +256,8 @@ hb_ot_color_has_svg (hb_face_t *face)
  *
  * Fetches the SVG document for a glyph. The blob may be either plain text or gzip-encoded.
  *
+ * If the glyph has no SVG document, the singleton empty blob is returned.
+ *
  * Return value: (transfer full): An #hb_blob_t containing the SVG document of the glyph, if available
  *
  * Since: 2.1.0
@@ -296,6 +298,8 @@ hb_ot_color_has_png (hb_face_t *face)
  * as input. To get an optimally sized PNG blob, the UPEM value must be set on the @font
  * object. If UPEM is unset, the blob returned will be the largest PNG available.
  *
+ * If the glyph has no PNG image, the singleton empty blob is returned.
+ *
  * Return value: (transfer full): An #hb_blob_t containing the PNG image for the glyph, if available
  *
  * Since: 2.1.0
diff --git a/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh b/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh
index 6aa34295c7..9bac30fff3 100644
--- a/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-glyf-table.hh
@@ -207,8 +207,7 @@ struct glyf
   _populate_subset_glyphs (const hb_subset_plan_t   *plan,
 			   hb_vector_t<SubsetGlyph> *glyphs /* OUT */) const
   {
-    OT::glyf::accelerator_t glyf;
-    glyf.init (plan->source);
+    OT::glyf::accelerator_t glyf (plan->source);
 
     + hb_range (plan->num_output_glyphs ())
     | hb_map ([&] (hb_codepoint_t new_gid)
@@ -233,8 +232,6 @@ struct glyf
 	      })
     | hb_sink (glyphs)
     ;
-
-    glyf.fini ();
   }
 
   static bool
@@ -595,7 +592,7 @@ struct glyf
         if (unlikely (!header.numberOfContours)) return;
 
         unsigned flags_offset = length (instructions_length ());
-        if (unlikely (length (flags_offset + 1) > bytes.length)) return;
+        if (unlikely (flags_offset + 1 > bytes.length)) return;
 
 	HBUINT8 &first_flag = (HBUINT8 &) StructAtOffset<HBUINT16> (&bytes, flags_offset);
         first_flag = (uint8_t) first_flag | FLAG_OVERLAP_SIMPLE;
@@ -920,7 +917,7 @@ struct glyf
 
   struct accelerator_t
   {
-    void init (hb_face_t *face_)
+    accelerator_t (hb_face_t *face_)
     {
       short_offset = false;
       num_glyphs = 0;
@@ -953,8 +950,7 @@ struct glyf
       num_glyphs = hb_max (1u, loca_table.get_length () / (short_offset ? 2 : 4)) - 1;
       num_glyphs = hb_min (num_glyphs, face->get_num_glyphs ());
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       loca_table.destroy ();
       glyf_table.destroy ();
@@ -1358,7 +1354,10 @@ struct glyf
 			 * defining it _MIN instead. */
 };
 
-struct glyf_accelerator_t : glyf::accelerator_t {};
+struct glyf_accelerator_t : glyf::accelerator_t {
+  glyf_accelerator_t (hb_face_t *face) : glyf::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh b/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh
index 7d2d2d3eb8..36bffa70a5 100644
--- a/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-hmtx-table.hh
@@ -127,8 +127,7 @@ struct hmtxvmtx
     T *table_prime = c->serializer->start_embed <T> ();
     if (unlikely (!table_prime)) return_trace (false);
 
-    accelerator_t _mtx;
-    _mtx.init (c->plan->source);
+    accelerator_t _mtx (c->plan->source);
     unsigned num_advances = _mtx.num_advances_for_subset (c->plan);
 
     auto it =
@@ -144,8 +143,6 @@ struct hmtxvmtx
 
     table_prime->serialize (c->serializer, it, num_advances);
 
-    _mtx.fini ();
-
     if (unlikely (c->serializer->in_error ()))
       return_trace (false);
 
@@ -160,8 +157,8 @@ struct hmtxvmtx
   {
     friend struct hmtxvmtx;
 
-    void init (hb_face_t *face,
-	       unsigned int default_advance_ = 0)
+    accelerator_t (hb_face_t *face,
+		   unsigned int default_advance_ = 0)
     {
       default_advance = default_advance_ ? default_advance_ : hb_face_get_upem (face);
 
@@ -193,8 +190,7 @@ struct hmtxvmtx
 
       var_table = hb_sanitize_context_t ().reference_table<HVARVVAR> (face, T::variationsTag);
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       table.destroy ();
       var_table.destroy ();
@@ -338,8 +334,12 @@ struct vmtx : hmtxvmtx<vmtx, vhea> {
   static constexpr bool is_horizontal = false;
 };
 
-struct hmtx_accelerator_t : hmtx::accelerator_t {};
-struct vmtx_accelerator_t : vmtx::accelerator_t {};
+struct hmtx_accelerator_t : hmtx::accelerator_t {
+  hmtx_accelerator_t (hb_face_t *face) : hmtx::accelerator_t (face) {}
+};
+struct vmtx_accelerator_t : vmtx::accelerator_t {
+  vmtx_accelerator_t (hb_face_t *face) : vmtx::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh
index 4fb1893435..60a1906155 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-common.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-common.hh
@@ -128,7 +128,7 @@ struct hb_prune_langsys_context_t
   bool visited (const T *p, hb_set_t &visited_set)
   {
     hb_codepoint_t delta = (hb_codepoint_t) ((uintptr_t) p - (uintptr_t) table);
-     if (visited_set.has (delta))
+    if (visited_set.in_error () || visited_set.has (delta))
       return true;
 
     visited_set.add (delta);
@@ -655,7 +655,6 @@ struct LangSys
   void collect_features (hb_prune_langsys_context_t *c) const
   {
     if (!has_required_feature () && !get_feature_count ()) return;
-    if (c->visitedLangsys (this)) return;
     if (has_required_feature () &&
         c->duplicate_feature_map->has (reqFeatureIndex))
       c->new_feature_indexes->add (get_required_feature_index ());
@@ -750,11 +749,15 @@ struct Script
     {
       //only collect features from non-redundant langsys
       const LangSys& d = get_default_lang_sys ();
-      d.collect_features (c);
+      if (!c->visitedLangsys (&d)) {
+        d.collect_features (c);
+      }
 
       for (auto _ : + hb_zip (langSys, hb_range (langsys_count)))
       {
+
         const LangSys& l = this+_.first.offset;
+        if (c->visitedLangsys (&l)) continue;
         if (l.compare (d, c->duplicate_feature_map)) continue;
 
         l.collect_features (c);
@@ -766,6 +769,7 @@ struct Script
       for (auto _ : + hb_zip (langSys, hb_range (langsys_count)))
       {
         const LangSys& l = this+_.first.offset;
+        if (c->visitedLangsys (&l)) continue;
         l.collect_features (c);
         c->script_langsys_map->get (script_index)->add (_.second);
       }
@@ -845,7 +849,7 @@ struct FeatureParamsSize
     if (unlikely (!c->check_struct (this))) return_trace (false);
 
     /* This subtable has some "history", if you will.  Some earlier versions of
-     * Adobe tools calculated the offset of the FeatureParams sutable from the
+     * Adobe tools calculated the offset of the FeatureParams subtable from the
      * beginning of the FeatureList table!  Now, that is dealt with in the
      * Feature implementation.  But we still need to be able to tell junk from
      * real data.  Note: We don't check that the nameID actually exists.
@@ -2926,8 +2930,6 @@ struct VariationStore
 
     hb_vector_t<hb_inc_bimap_t> inner_maps;
     inner_maps.resize ((unsigned) dataSets.len);
-    for (unsigned i = 0; i < inner_maps.length; i++)
-      inner_maps[i].init ();
 
     for (unsigned idx : c->plan->layout_variation_indices->iter ())
     {
@@ -2935,18 +2937,11 @@ struct VariationStore
       uint16_t minor = idx & 0xFFFF;
 
       if (major >= inner_maps.length)
-      {
-	for (unsigned i = 0; i < inner_maps.length; i++)
-	  inner_maps[i].fini ();
 	return_trace (false);
-      }
       inner_maps[major].add (minor);
     }
     varstore_prime->serialize (c->serializer, this, inner_maps.as_array ());
 
-    for (unsigned i = 0; i < inner_maps.length; i++)
-      inner_maps[i].fini ();
-
     return_trace (
         !c->serializer->in_error()
         && varstore_prime->dataSets);
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh
index aea644f3e1..a76d644c4b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gdef-table.hh
@@ -585,17 +585,16 @@ struct GDEF
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
-      this->table = hb_sanitize_context_t ().reference_table<GDEF> (face);
-      if (unlikely (this->table->is_blocklisted (this->table.get_blob (), face)))
+      table = hb_sanitize_context_t ().reference_table<GDEF> (face);
+      if (unlikely (table->is_blocklisted (table.get_blob (), face)))
       {
-	hb_blob_destroy (this->table.get_blob ());
-	this->table = hb_blob_get_empty ();
+	hb_blob_destroy (table.get_blob ());
+	table = hb_blob_get_empty ();
       }
     }
-
-    void fini () { this->table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     hb_blob_ptr_t<GDEF> table;
   };
@@ -715,7 +714,9 @@ struct GDEF
   DEFINE_SIZE_MIN (12);
 };
 
-struct GDEF_accelerator_t : GDEF::accelerator_t {};
+struct GDEF_accelerator_t : GDEF::accelerator_t {
+  GDEF_accelerator_t (hb_face_t *face) : GDEF::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh
index 6db3e08940..e28c951f3f 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gpos-table.hh
@@ -706,7 +706,7 @@ struct MarkArray : Array16Of<MarkRecord>	/* Array of MarkRecords--in Coverage or
 
     float mark_x, mark_y, base_x, base_y;
 
-    buffer->unsafe_to_break (glyph_pos, buffer->idx);
+    buffer->unsafe_to_break (glyph_pos, buffer->idx + 1);
     mark_anchor.get_anchor (c, buffer->cur().codepoint, &mark_x, &mark_y);
     glyph_anchor.get_anchor (c, buffer->info[glyph_pos].codepoint, &base_x, &base_y);
 
@@ -1235,6 +1235,7 @@ struct PairSet
       buffer->idx = pos;
       return_trace (true);
     }
+    buffer->unsafe_to_concat (buffer->idx, pos + 1);
     return_trace (false);
   }
 
@@ -1362,7 +1363,12 @@ struct PairPosFormat1
 
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
-    if (!skippy_iter.next ()) return_trace (false);
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      buffer->unsafe_to_concat (buffer->idx, unsafe_to);
+      return_trace (false);
+    }
 
     return_trace ((this+pairSet[index]).apply (c, valueFormat, skippy_iter.idx));
   }
@@ -1555,7 +1561,12 @@ struct PairPosFormat2
 
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
-    if (!skippy_iter.next ()) return_trace (false);
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      buffer->unsafe_to_concat (buffer->idx, unsafe_to);
+      return_trace (false);
+    }
 
     unsigned int len1 = valueFormat1.get_len ();
     unsigned int len2 = valueFormat2.get_len ();
@@ -1563,13 +1574,81 @@ struct PairPosFormat2
 
     unsigned int klass1 = (this+classDef1).get_class (buffer->cur().codepoint);
     unsigned int klass2 = (this+classDef2).get_class (buffer->info[skippy_iter.idx].codepoint);
-    if (unlikely (klass1 >= class1Count || klass2 >= class2Count)) return_trace (false);
+    if (unlikely (klass1 >= class1Count || klass2 >= class2Count))
+    {
+      buffer->unsafe_to_concat (buffer->idx, skippy_iter.idx + 1);
+      return_trace (false);
+    }
 
     const Value *v = &values[record_len * (klass1 * class2Count + klass2)];
-    bool applied_first = valueFormat1.apply_value (c, this, v, buffer->cur_pos());
-    bool applied_second = valueFormat2.apply_value (c, this, v + len1, buffer->pos[skippy_iter.idx]);
+
+    bool applied_first = false, applied_second = false;
+
+
+    /* Isolate simple kerning and apply it half to each side.
+     * Results in better cursor positioning / underline drawing. */
+    {
+      if (!len2)
+      {
+	const hb_direction_t dir = buffer->props.direction;
+	const bool horizontal = HB_DIRECTION_IS_HORIZONTAL (dir);
+	const bool backward = HB_DIRECTION_IS_BACKWARD (dir);
+	unsigned mask = horizontal ? ValueFormat::xAdvance : ValueFormat::yAdvance;
+	if (backward)
+	  mask |= mask >> 2; /* Add eg. xPlacement in RTL. */
+	/* Add Devices. */
+	mask |= mask << 4;
+
+	if (valueFormat1 & ~mask)
+	  goto bail;
+
+	/* Is simple kern. Apply value on an empty position slot,
+	 * then split it between sides. */
+
+	hb_glyph_position_t pos{};
+	if (valueFormat1.apply_value (c, this, v, pos))
+	{
+	  hb_position_t *src  = &pos.x_advance;
+	  hb_position_t *dst1 = &buffer->cur_pos().x_advance;
+	  hb_position_t *dst2 = &buffer->pos[skippy_iter.idx].x_advance;
+	  unsigned i = horizontal ? 0 : 1;
+
+	  hb_position_t kern  = src[i];
+	  hb_position_t kern1 = kern >> 1;
+	  hb_position_t kern2 = kern - kern1;
+
+	  if (!backward)
+	  {
+	    dst1[i] += kern1;
+	    dst2[i] += kern2;
+	    dst2[i + 2] += kern2;
+	  }
+	  else
+	  {
+	    dst1[i] += kern1;
+	    dst1[i + 2] += src[i + 2] - kern2;
+	    dst2[i] += kern2;
+	  }
+
+	  applied_first = applied_second = kern != 0;
+	  goto success;
+	}
+	goto boring;
+      }
+    }
+    bail:
+
+
+    applied_first = valueFormat1.apply_value (c, this, v, buffer->cur_pos());
+    applied_second = valueFormat2.apply_value (c, this, v + len1, buffer->pos[skippy_iter.idx]);
+
+    success:
     if (applied_first || applied_second)
       buffer->unsafe_to_break (buffer->idx, skippy_iter.idx + 1);
+    else
+    boring:
+      buffer->unsafe_to_concat (buffer->idx, skippy_iter.idx + 1);
+
 
     buffer->idx = skippy_iter.idx;
     if (len2)
@@ -1799,10 +1878,19 @@ struct CursivePosFormat1
 
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
-    if (!skippy_iter.prev ()) return_trace (false);
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+      return_trace (false);
+    }
 
     const EntryExitRecord &prev_record = entryExitRecord[(this+coverage).get_coverage  (buffer->info[skippy_iter.idx].codepoint)];
-    if (!prev_record.exitAnchor) return_trace (false);
+    if (!prev_record.exitAnchor)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     unsigned int i = skippy_iter.idx;
     unsigned int j = buffer->idx;
@@ -2066,7 +2154,13 @@ struct MarkBasePosFormat1
     skippy_iter.reset (buffer->idx, 1);
     skippy_iter.set_lookup_props (LookupFlag::IgnoreMarks);
     do {
-      if (!skippy_iter.prev ()) return_trace (false);
+      unsigned unsafe_from;
+      if (!skippy_iter.prev (&unsafe_from))
+      {
+	buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+	return_trace (false);
+      }
+
       /* We only want to attach to the first of a MultipleSubst sequence.
        * https://github.com/harfbuzz/harfbuzz/issues/740
        * Reject others...
@@ -2089,7 +2183,11 @@ struct MarkBasePosFormat1
     //if (!_hb_glyph_info_is_base_glyph (&buffer->info[skippy_iter.idx])) { return_trace (false); }
 
     unsigned int base_index = (this+baseCoverage).get_coverage  (buffer->info[skippy_iter.idx].codepoint);
-    if (base_index == NOT_COVERED) return_trace (false);
+    if (base_index == NOT_COVERED)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     return_trace ((this+markArray).apply (c, mark_index, base_index, this+baseArray, classCount, skippy_iter.idx));
   }
@@ -2320,21 +2418,34 @@ struct MarkLigPosFormat1
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
     skippy_iter.set_lookup_props (LookupFlag::IgnoreMarks);
-    if (!skippy_iter.prev ()) return_trace (false);
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+      return_trace (false);
+    }
 
     /* Checking that matched glyph is actually a ligature by GDEF is too strong; disabled */
     //if (!_hb_glyph_info_is_ligature (&buffer->info[skippy_iter.idx])) { return_trace (false); }
 
     unsigned int j = skippy_iter.idx;
     unsigned int lig_index = (this+ligatureCoverage).get_coverage  (buffer->info[j].codepoint);
-    if (lig_index == NOT_COVERED) return_trace (false);
+    if (lig_index == NOT_COVERED)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     const LigatureArray& lig_array = this+ligatureArray;
     const LigatureAttach& lig_attach = lig_array[lig_index];
 
     /* Find component to attach to */
     unsigned int comp_count = lig_attach.rows;
-    if (unlikely (!comp_count)) return_trace (false);
+    if (unlikely (!comp_count))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     /* We must now check whether the ligature ID of the current mark glyph
      * is identical to the ligature ID of the found ligature.  If yes, we
@@ -2517,9 +2628,18 @@ struct MarkMarkPosFormat1
     hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_input;
     skippy_iter.reset (buffer->idx, 1);
     skippy_iter.set_lookup_props (c->lookup_props & ~LookupFlag::IgnoreFlags);
-    if (!skippy_iter.prev ()) return_trace (false);
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (unsafe_from, buffer->idx + 1);
+      return_trace (false);
+    }
 
-    if (!_hb_glyph_info_is_mark (&buffer->info[skippy_iter.idx])) { return_trace (false); }
+    if (!_hb_glyph_info_is_mark (&buffer->info[skippy_iter.idx]))
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     unsigned int j = skippy_iter.idx;
 
@@ -2544,11 +2664,16 @@ struct MarkMarkPosFormat1
     }
 
     /* Didn't match. */
+    buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
     return_trace (false);
 
     good:
     unsigned int mark2_index = (this+mark2Coverage).get_coverage  (buffer->info[j].codepoint);
-    if (mark2_index == NOT_COVERED) return_trace (false);
+    if (mark2_index == NOT_COVERED)
+    {
+      buffer->unsafe_to_concat_from_outbuffer (skippy_iter.idx, buffer->idx + 1);
+      return_trace (false);
+    }
 
     return_trace ((this+mark1Array).apply (c, mark1_index, mark2_index, this+mark2Array, classCount, j));
   }
@@ -2951,7 +3076,7 @@ GPOS::position_finish_advances (hb_font_t *font HB_UNUSED, hb_buffer_t *buffer H
 }
 
 void
-GPOS::position_finish_offsets (hb_font_t *font HB_UNUSED, hb_buffer_t *buffer)
+GPOS::position_finish_offsets (hb_font_t *font, hb_buffer_t *buffer)
 {
   _hb_buffer_assert_gsubgpos_vars (buffer);
 
@@ -2961,12 +3086,21 @@ GPOS::position_finish_offsets (hb_font_t *font HB_UNUSED, hb_buffer_t *buffer)
 
   /* Handle attachments */
   if (buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT)
-    for (unsigned int i = 0; i < len; i++)
+    for (unsigned i = 0; i < len; i++)
       propagate_attachment_offsets (pos, len, i, direction);
+
+  if (unlikely (font->slant))
+  {
+    for (unsigned i = 0; i < len; i++)
+      if (unlikely (pos[i].y_offset))
+        pos[i].x_offset += _hb_roundf (font->slant_xy * pos[i].y_offset);
+  }
 }
 
 
-struct GPOS_accelerator_t : GPOS::accelerator_t {};
+struct GPOS_accelerator_t : GPOS::accelerator_t {
+  GPOS_accelerator_t (hb_face_t *face) : GPOS::accelerator_t (face) {}
+};
 
 
 /* Out-of-class implementation for methods recursing */
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh
index b7ce30135e..0b0bc547bd 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gsub-table.hh
@@ -826,22 +826,25 @@ struct Ligature
 
     unsigned int total_component_count = 0;
 
-    unsigned int match_length = 0;
+    unsigned int match_end = 0;
     unsigned int match_positions[HB_MAX_CONTEXT_LENGTH];
 
     if (likely (!match_input (c, count,
 			      &component[1],
 			      match_glyph,
 			      nullptr,
-			      &match_length,
+			      &match_end,
 			      match_positions,
 			      &total_component_count)))
+    {
+      c->buffer->unsafe_to_concat (c->buffer->idx, match_end);
       return_trace (false);
+    }
 
     ligate_input (c,
 		  count,
 		  match_positions,
-		  match_length,
+		  match_end,
 		  ligGlyph,
 		  total_component_count);
 
@@ -1296,7 +1299,7 @@ struct ReverseChainSingleSubstFormat1
 	match_lookahead (c,
 			 lookahead.len, (HBUINT16 *) lookahead.arrayZ,
 			 match_coverage, this,
-			 1, &end_index))
+			 c->buffer->idx + 1, &end_index))
     {
       c->buffer->unsafe_to_break_from_outbuffer (start_index, end_index);
       c->replace_glyph_inplace (substitute[index]);
@@ -1305,8 +1308,11 @@ struct ReverseChainSingleSubstFormat1
        * calls us through a Context lookup. */
       return_trace (true);
     }
-
-    return_trace (false);
+    else
+    {
+      c->buffer->unsafe_to_concat_from_outbuffer (start_index, end_index);
+      return_trace (false);
+    }
   }
 
   template<typename Iterator,
@@ -1739,7 +1745,9 @@ struct GSUB : GSUBGPOS
 };
 
 
-struct GSUB_accelerator_t : GSUB::accelerator_t {};
+struct GSUB_accelerator_t : GSUB::accelerator_t {
+  GSUB_accelerator_t (hb_face_t *face) : GSUB::accelerator_t (face) {}
+};
 
 
 /* Out-of-class implementation for methods recursing */
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh
index 191d3bebc5..65de131f85 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout-gsubgpos.hh
@@ -125,24 +125,31 @@ struct hb_closure_context_t :
     hb_set_t *covered_glyph_set = done_lookups_glyph_set->get (lookup_index);
     if (unlikely (covered_glyph_set->in_error ()))
       return true;
-    if (parent_active_glyphs ()->is_subset (*covered_glyph_set))
+    if (parent_active_glyphs ().is_subset (*covered_glyph_set))
       return true;
 
-    hb_set_union (covered_glyph_set, parent_active_glyphs ());
+    covered_glyph_set->union_ (parent_active_glyphs ());
     return false;
   }
 
-  hb_set_t* parent_active_glyphs ()
+  const hb_set_t& previous_parent_active_glyphs () {
+    if (active_glyphs_stack.length <= 1)
+      return *glyphs;
+
+    return active_glyphs_stack[active_glyphs_stack.length - 2];
+  }
+
+  const hb_set_t& parent_active_glyphs ()
   {
-    if (active_glyphs_stack.length < 1)
-      return glyphs;
+    if (!active_glyphs_stack)
+      return *glyphs;
 
     return active_glyphs_stack.tail ();
   }
 
-  void push_cur_active_glyphs (hb_set_t* cur_active_glyph_set)
+  hb_set_t& push_cur_active_glyphs ()
   {
-    active_glyphs_stack.push (cur_active_glyph_set);
+    return *active_glyphs_stack.push ();
   }
 
   bool pop_cur_done_glyphs ()
@@ -156,29 +163,24 @@ struct hb_closure_context_t :
 
   hb_face_t *face;
   hb_set_t *glyphs;
-  hb_set_t *cur_intersected_glyphs;
   hb_set_t output[1];
-  hb_vector_t<hb_set_t *> active_glyphs_stack;
+  hb_vector_t<hb_set_t> active_glyphs_stack;
   recurse_func_t recurse_func;
   unsigned int nesting_level_left;
 
   hb_closure_context_t (hb_face_t *face_,
 			hb_set_t *glyphs_,
-			hb_set_t *cur_intersected_glyphs_,
 			hb_map_t *done_lookups_glyph_count_,
 			hb_hashmap_t<unsigned, hb_set_t *> *done_lookups_glyph_set_,
 			unsigned int nesting_level_left_ = HB_MAX_NESTING_LEVEL) :
 			  face (face_),
 			  glyphs (glyphs_),
-			  cur_intersected_glyphs (cur_intersected_glyphs_),
 			  recurse_func (nullptr),
 			  nesting_level_left (nesting_level_left_),
 			  done_lookups_glyph_count (done_lookups_glyph_count_),
 			  done_lookups_glyph_set (done_lookups_glyph_set_),
 			  lookup_count (0)
-  {
-    push_cur_active_glyphs (glyphs_);
-  }
+  {}
 
   ~hb_closure_context_t () { flush (); }
 
@@ -186,11 +188,11 @@ struct hb_closure_context_t :
 
   void flush ()
   {
-    hb_set_del_range (output, face->get_num_glyphs (), HB_SET_VALUE_INVALID);	/* Remove invalid glyphs. */
-    hb_set_union (glyphs, output);
-    hb_set_clear (output);
+    output->del_range (face->get_num_glyphs (), HB_SET_VALUE_INVALID);	/* Remove invalid glyphs. */
+    glyphs->union_ (*output);
+    output->clear ();
     active_glyphs_stack.pop ();
-    active_glyphs_stack.fini ();
+    active_glyphs_stack.reset ();
   }
 
   private:
@@ -520,7 +522,7 @@ struct hb_ot_apply_context_t :
     may_skip (const hb_glyph_info_t &info) const
     { return matcher.may_skip (c, info); }
 
-    bool next ()
+    bool next (unsigned *unsafe_to = nullptr)
     {
       assert (num_items > 0);
       while (idx + num_items < end)
@@ -543,11 +545,17 @@ struct hb_ot_apply_context_t :
 	}
 
 	if (skip == matcher_t::SKIP_NO)
+	{
+	  if (unsafe_to)
+	    *unsafe_to = idx + 1;
 	  return false;
+	}
       }
+      if (unsafe_to)
+        *unsafe_to = end;
       return false;
     }
-    bool prev ()
+    bool prev (unsigned *unsafe_from = nullptr)
     {
       assert (num_items > 0);
       while (idx > num_items - 1)
@@ -570,8 +578,14 @@ struct hb_ot_apply_context_t :
 	}
 
 	if (skip == matcher_t::SKIP_NO)
+	{
+	  if (unsafe_from)
+	    *unsafe_from = hb_max (1u, idx) - 1u;
 	  return false;
+	}
       }
+      if (unsafe_from)
+        *unsafe_from = 0;
       return false;
     }
 
@@ -712,53 +726,60 @@ struct hb_ot_apply_context_t :
     return true;
   }
 
-  void _set_glyph_props (hb_codepoint_t glyph_index,
+  void _set_glyph_class (hb_codepoint_t glyph_index,
 			  unsigned int class_guess = 0,
 			  bool ligature = false,
 			  bool component = false) const
   {
-    unsigned int add_in = _hb_glyph_info_get_glyph_props (&buffer->cur()) &
-			  HB_OT_LAYOUT_GLYPH_PROPS_PRESERVE;
-    add_in |= HB_OT_LAYOUT_GLYPH_PROPS_SUBSTITUTED;
+    unsigned int props = _hb_glyph_info_get_glyph_props (&buffer->cur());
+    props |= HB_OT_LAYOUT_GLYPH_PROPS_SUBSTITUTED;
     if (ligature)
     {
-      add_in |= HB_OT_LAYOUT_GLYPH_PROPS_LIGATED;
+      props |= HB_OT_LAYOUT_GLYPH_PROPS_LIGATED;
       /* In the only place that the MULTIPLIED bit is used, Uniscribe
        * seems to only care about the "last" transformation between
        * Ligature and Multiple substitutions.  Ie. if you ligate, expand,
        * and ligate again, it forgives the multiplication and acts as
        * if only ligation happened.  As such, clear MULTIPLIED bit.
        */
-      add_in &= ~HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
+      props &= ~HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
     }
     if (component)
-      add_in |= HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
+      props |= HB_OT_LAYOUT_GLYPH_PROPS_MULTIPLIED;
     if (likely (has_glyph_classes))
-      _hb_glyph_info_set_glyph_props (&buffer->cur(), add_in | gdef.get_glyph_props (glyph_index));
+    {
+      props &= HB_OT_LAYOUT_GLYPH_PROPS_PRESERVE;
+      _hb_glyph_info_set_glyph_props (&buffer->cur(), props | gdef.get_glyph_props (glyph_index));
+    }
     else if (class_guess)
-      _hb_glyph_info_set_glyph_props (&buffer->cur(), add_in | class_guess);
+    {
+      props &= HB_OT_LAYOUT_GLYPH_PROPS_PRESERVE;
+      _hb_glyph_info_set_glyph_props (&buffer->cur(), props | class_guess);
+    }
+    else
+      _hb_glyph_info_set_glyph_props (&buffer->cur(), props);
   }
 
   void replace_glyph (hb_codepoint_t glyph_index) const
   {
-    _set_glyph_props (glyph_index);
+    _set_glyph_class (glyph_index);
     (void) buffer->replace_glyph (glyph_index);
   }
   void replace_glyph_inplace (hb_codepoint_t glyph_index) const
   {
-    _set_glyph_props (glyph_index);
+    _set_glyph_class (glyph_index);
     buffer->cur().codepoint = glyph_index;
   }
   void replace_glyph_with_ligature (hb_codepoint_t glyph_index,
 				    unsigned int class_guess) const
   {
-    _set_glyph_props (glyph_index, class_guess, true);
+    _set_glyph_class (glyph_index, class_guess, true);
     (void) buffer->replace_glyph (glyph_index);
   }
   void output_glyph_for_component (hb_codepoint_t glyph_index,
 				   unsigned int class_guess) const
   {
-    _set_glyph_props (glyph_index, class_guess, false, true);
+    _set_glyph_class (glyph_index, class_guess, false, true);
     (void) buffer->output_glyph (glyph_index);
   }
 };
@@ -948,7 +969,7 @@ static inline bool match_input (hb_ot_apply_context_t *c,
 				const HBUINT16 input[], /* Array of input values--start with second glyph */
 				match_func_t match_func,
 				const void *match_data,
-				unsigned int *end_offset,
+				unsigned int *end_position,
 				unsigned int match_positions[HB_MAX_CONTEXT_LENGTH],
 				unsigned int *p_total_component_count = nullptr)
 {
@@ -1001,7 +1022,12 @@ static inline bool match_input (hb_ot_apply_context_t *c,
   match_positions[0] = buffer->idx;
   for (unsigned int i = 1; i < count; i++)
   {
-    if (!skippy_iter.next ()) return_trace (false);
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      *end_position = unsafe_to;
+      return_trace (false);
+    }
 
     match_positions[i] = skippy_iter.idx;
 
@@ -1055,7 +1081,7 @@ static inline bool match_input (hb_ot_apply_context_t *c,
     total_component_count += _hb_glyph_info_get_lig_num_comps (&buffer->info[skippy_iter.idx]);
   }
 
-  *end_offset = skippy_iter.idx - buffer->idx + 1;
+  *end_position = skippy_iter.idx + 1;
 
   if (p_total_component_count)
     *p_total_component_count = total_component_count;
@@ -1065,7 +1091,7 @@ static inline bool match_input (hb_ot_apply_context_t *c,
 static inline bool ligate_input (hb_ot_apply_context_t *c,
 				 unsigned int count, /* Including the first glyph */
 				 const unsigned int match_positions[HB_MAX_CONTEXT_LENGTH], /* Including the first glyph */
-				 unsigned int match_length,
+				 unsigned int match_end,
 				 hb_codepoint_t lig_glyph,
 				 unsigned int total_component_count)
 {
@@ -1073,7 +1099,7 @@ static inline bool ligate_input (hb_ot_apply_context_t *c,
 
   hb_buffer_t *buffer = c->buffer;
 
-  buffer->merge_clusters (buffer->idx, buffer->idx + match_length);
+  buffer->merge_clusters (buffer->idx, match_end);
 
   /* - If a base and one or more marks ligate, consider that as a base, NOT
    *   ligature, such that all following marks can still attach to it.
@@ -1190,11 +1216,16 @@ static inline bool match_backtrack (hb_ot_apply_context_t *c,
   skippy_iter.set_match_func (match_func, match_data, backtrack);
 
   for (unsigned int i = 0; i < count; i++)
-    if (!skippy_iter.prev ())
+  {
+    unsigned unsafe_from;
+    if (!skippy_iter.prev (&unsafe_from))
+    {
+      *match_start = unsafe_from;
       return_trace (false);
+    }
+  }
 
   *match_start = skippy_iter.idx;
-
   return_trace (true);
 }
 
@@ -1203,21 +1234,26 @@ static inline bool match_lookahead (hb_ot_apply_context_t *c,
 				    const HBUINT16 lookahead[],
 				    match_func_t match_func,
 				    const void *match_data,
-				    unsigned int offset,
+				    unsigned int start_index,
 				    unsigned int *end_index)
 {
   TRACE_APPLY (nullptr);
 
   hb_ot_apply_context_t::skipping_iterator_t &skippy_iter = c->iter_context;
-  skippy_iter.reset (c->buffer->idx + offset - 1, count);
+  skippy_iter.reset (start_index - 1, count);
   skippy_iter.set_match_func (match_func, match_data, lookahead);
 
   for (unsigned int i = 0; i < count; i++)
-    if (!skippy_iter.next ())
+  {
+    unsigned unsafe_to;
+    if (!skippy_iter.next (&unsafe_to))
+    {
+      *end_index = unsafe_to;
       return_trace (false);
+    }
+  }
 
   *end_index = skippy_iter.idx + 1;
-
   return_trace (true);
 }
 
@@ -1284,22 +1320,23 @@ static void context_closure_recurse_lookups (hb_closure_context_t *c,
     unsigned seqIndex = lookupRecord[i].sequenceIndex;
     if (seqIndex >= inputCount) continue;
 
-    hb_set_t *pos_glyphs = nullptr;
+    bool has_pos_glyphs = false;
+    hb_set_t pos_glyphs;
 
     if (hb_set_is_empty (covered_seq_indicies) || !hb_set_has (covered_seq_indicies, seqIndex))
     {
-      pos_glyphs = hb_set_create ();
+      has_pos_glyphs = true;
       if (seqIndex == 0)
       {
         switch (context_format) {
         case ContextFormat::SimpleContext:
-          pos_glyphs->add (value);
+          pos_glyphs.add (value);
           break;
         case ContextFormat::ClassBasedContext:
-          intersected_glyphs_func (c->cur_intersected_glyphs, data, value, pos_glyphs);
+          intersected_glyphs_func (&c->parent_active_glyphs (), data, value, &pos_glyphs);
           break;
         case ContextFormat::CoverageBasedContext:
-          hb_set_set (pos_glyphs, c->cur_intersected_glyphs);
+          pos_glyphs.set (c->parent_active_glyphs ());
           break;
         }
       }
@@ -1313,12 +1350,16 @@ static void context_closure_recurse_lookups (hb_closure_context_t *c,
           input_value = input[seqIndex - 1];
         }
 
-        intersected_glyphs_func (c->glyphs, input_data, input_value, pos_glyphs);
+        intersected_glyphs_func (c->glyphs, input_data, input_value, &pos_glyphs);
       }
     }
 
-    hb_set_add (covered_seq_indicies, seqIndex);
-    c->push_cur_active_glyphs (pos_glyphs ? pos_glyphs : c->glyphs);
+    covered_seq_indicies->add (seqIndex);
+    if (has_pos_glyphs) {
+      c->push_cur_active_glyphs () = pos_glyphs;
+    } else {
+      c->push_cur_active_glyphs ().set (*c->glyphs);
+    }
 
     unsigned endIndex = inputCount;
     if (context_format == ContextFormat::CoverageBasedContext)
@@ -1327,8 +1368,6 @@ static void context_closure_recurse_lookups (hb_closure_context_t *c,
     c->recurse (lookupRecord[i].lookupListIndex, covered_seq_indicies, seqIndex, endIndex);
 
     c->pop_cur_done_glyphs ();
-    if (pos_glyphs)
-      hb_set_destroy (pos_glyphs);
   }
 
   hb_set_destroy (covered_seq_indicies);
@@ -1343,15 +1382,13 @@ static inline void recurse_lookups (context_t *c,
     c->recurse (lookupRecord[i].lookupListIndex);
 }
 
-static inline bool apply_lookup (hb_ot_apply_context_t *c,
+static inline void apply_lookup (hb_ot_apply_context_t *c,
 				 unsigned int count, /* Including the first glyph */
 				 unsigned int match_positions[HB_MAX_CONTEXT_LENGTH], /* Including the first glyph */
 				 unsigned int lookupCount,
 				 const LookupRecord lookupRecord[], /* Array of LookupRecords--in design order */
-				 unsigned int match_length)
+				 unsigned int match_end)
 {
-  TRACE_APPLY (nullptr);
-
   hb_buffer_t *buffer = c->buffer;
   int end;
 
@@ -1359,7 +1396,7 @@ static inline bool apply_lookup (hb_ot_apply_context_t *c,
    * Adjust. */
   {
     unsigned int bl = buffer->backtrack_len ();
-    end = bl + match_length;
+    end = bl + match_end - buffer->idx;
 
     int delta = bl - buffer->idx;
     /* Convert positions to new indexing. */
@@ -1461,8 +1498,6 @@ static inline bool apply_lookup (hb_ot_apply_context_t *c,
   }
 
   (void) buffer->move_to (end);
-
-  return_trace (true);
 }
 
 
@@ -1550,17 +1585,25 @@ static inline bool context_apply_lookup (hb_ot_apply_context_t *c,
 					 const LookupRecord lookupRecord[],
 					 ContextApplyLookupContext &lookup_context)
 {
-  unsigned int match_length = 0;
-  unsigned int match_positions[HB_MAX_CONTEXT_LENGTH];
-  return match_input (c,
-		      inputCount, input,
-		      lookup_context.funcs.match, lookup_context.match_data,
-		      &match_length, match_positions)
-      && (c->buffer->unsafe_to_break (c->buffer->idx, c->buffer->idx + match_length),
-	  apply_lookup (c,
-		       inputCount, match_positions,
-		       lookupCount, lookupRecord,
-		       match_length));
+  unsigned match_end = 0;
+  unsigned match_positions[HB_MAX_CONTEXT_LENGTH];
+  if (match_input (c,
+		   inputCount, input,
+		   lookup_context.funcs.match, lookup_context.match_data,
+		   &match_end, match_positions))
+  {
+    c->buffer->unsafe_to_break (c->buffer->idx, match_end);
+    apply_lookup (c,
+		  inputCount, match_positions,
+		  lookupCount, lookupRecord,
+		  match_end);
+    return true;
+  }
+  else
+  {
+    c->buffer->unsafe_to_concat (c->buffer->idx, match_end);
+    return false;
+  }
 }
 
 struct Rule
@@ -1828,8 +1871,9 @@ struct ContextFormat1
 
   void closure (hb_closure_context_t *c) const
   {
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
 
     struct ContextClosureLookupContext lookup_context = {
       {intersects_glyph, intersected_glyph},
@@ -1838,10 +1882,14 @@ struct ContextFormat1
     };
 
     + hb_zip (this+coverage, hb_range ((unsigned) ruleSet.len))
-    | hb_filter (c->parent_active_glyphs (), hb_first)
+    | hb_filter ([&] (hb_codepoint_t _) {
+      return c->previous_parent_active_glyphs ().has (_);
+    }, hb_first)
     | hb_map ([&](const hb_pair_t<hb_codepoint_t, unsigned> _) { return hb_pair_t<unsigned, const RuleSet&> (_.first, this+ruleSet[_.second]); })
     | hb_apply ([&] (const hb_pair_t<unsigned, const RuleSet&>& _) { _.second.closure (c, _.first, lookup_context); })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -1989,8 +2037,9 @@ struct ContextFormat2
     if (!(this+coverage).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
 
     const ClassDef &class_def = this+classDef;
 
@@ -2000,10 +2049,9 @@ struct ContextFormat2
       &class_def
     };
 
-    return
     + hb_enumerate (ruleSet)
     | hb_filter ([&] (unsigned _)
-		 { return class_def.intersects_class (c->cur_intersected_glyphs, _); },
+    { return class_def.intersects_class (&c->parent_active_glyphs (), _); },
 		 hb_first)
     | hb_apply ([&] (const hb_pair_t<unsigned, const Offset16To<RuleSet>&> _)
                 {
@@ -2011,6 +2059,8 @@ struct ContextFormat2
                   rule_set.closure (c, _.first, lookup_context);
                 })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -2183,8 +2233,10 @@ struct ContextFormat3
     if (!(this+coverageZ[0]).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
+
 
     const LookupRecord *lookupRecord = &StructAfter<LookupRecord> (coverageZ.as_array (glyphCount));
     struct ContextClosureLookupContext lookup_context = {
@@ -2196,6 +2248,8 @@ struct ContextFormat3
 			    glyphCount, (const HBUINT16 *) (coverageZ.arrayZ + 1),
 			    lookupCount, lookupRecord,
 			    0, lookup_context);
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -2452,25 +2506,38 @@ static inline bool chain_context_apply_lookup (hb_ot_apply_context_t *c,
 					       const LookupRecord lookupRecord[],
 					       ChainContextApplyLookupContext &lookup_context)
 {
-  unsigned int start_index = 0, match_length = 0, end_index = 0;
-  unsigned int match_positions[HB_MAX_CONTEXT_LENGTH];
-  return match_input (c,
-		      inputCount, input,
-		      lookup_context.funcs.match, lookup_context.match_data[1],
-		      &match_length, match_positions)
-      && match_backtrack (c,
-			  backtrackCount, backtrack,
-			  lookup_context.funcs.match, lookup_context.match_data[0],
-			  &start_index)
-      && match_lookahead (c,
-			  lookaheadCount, lookahead,
-			  lookup_context.funcs.match, lookup_context.match_data[2],
-			  match_length, &end_index)
-      && (c->buffer->unsafe_to_break_from_outbuffer (start_index, end_index),
-	  apply_lookup (c,
-			inputCount, match_positions,
-			lookupCount, lookupRecord,
-			match_length));
+  unsigned end_index = c->buffer->idx;
+  unsigned match_end = 0;
+  unsigned match_positions[HB_MAX_CONTEXT_LENGTH];
+  if (!(match_input (c,
+		     inputCount, input,
+		     lookup_context.funcs.match, lookup_context.match_data[1],
+		     &match_end, match_positions) && (end_index = match_end)
+       && match_lookahead (c,
+			   lookaheadCount, lookahead,
+			   lookup_context.funcs.match, lookup_context.match_data[2],
+			   match_end, &end_index)))
+  {
+    c->buffer->unsafe_to_concat (c->buffer->idx, end_index);
+    return false;
+  }
+
+  unsigned start_index = c->buffer->out_len;
+  if (!match_backtrack (c,
+			backtrackCount, backtrack,
+			lookup_context.funcs.match, lookup_context.match_data[0],
+			&start_index))
+  {
+    c->buffer->unsafe_to_concat_from_outbuffer (start_index, end_index);
+    return false;
+  }
+
+  c->buffer->unsafe_to_break_from_outbuffer (start_index, end_index);
+  apply_lookup (c,
+		inputCount, match_positions,
+		lookupCount, lookupRecord,
+		match_end);
+  return true;
 }
 
 struct ChainRule
@@ -2802,8 +2869,9 @@ struct ChainContextFormat1
 
   void closure (hb_closure_context_t *c) const
   {
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
 
     struct ChainContextClosureLookupContext lookup_context = {
       {intersects_glyph, intersected_glyph},
@@ -2812,10 +2880,14 @@ struct ChainContextFormat1
     };
 
     + hb_zip (this+coverage, hb_range ((unsigned) ruleSet.len))
-    | hb_filter (c->parent_active_glyphs (), hb_first)
+    | hb_filter ([&] (hb_codepoint_t _) {
+      return c->previous_parent_active_glyphs ().has (_);
+    }, hb_first)
     | hb_map ([&](const hb_pair_t<hb_codepoint_t, unsigned> _) { return hb_pair_t<unsigned, const ChainRuleSet&> (_.first, this+ruleSet[_.second]); })
     | hb_apply ([&] (const hb_pair_t<unsigned, const ChainRuleSet&>& _) { _.second.closure (c, _.first, lookup_context); })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -2964,8 +3036,10 @@ struct ChainContextFormat2
     if (!(this+coverage).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
+
 
     const ClassDef &backtrack_class_def = this+backtrackClassDef;
     const ClassDef &input_class_def = this+inputClassDef;
@@ -2979,10 +3053,9 @@ struct ChainContextFormat2
        &lookahead_class_def}
     };
 
-    return
     + hb_enumerate (ruleSet)
     | hb_filter ([&] (unsigned _)
-		 { return input_class_def.intersects_class (c->cur_intersected_glyphs, _); },
+    { return input_class_def.intersects_class (&c->parent_active_glyphs (), _); },
 		 hb_first)
     | hb_apply ([&] (const hb_pair_t<unsigned, const Offset16To<ChainRuleSet>&> _)
                 {
@@ -2990,6 +3063,8 @@ struct ChainContextFormat2
                   chainrule_set.closure (c, _.first, lookup_context);
                 })
     ;
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -3216,8 +3291,10 @@ struct ChainContextFormat3
     if (!(this+input[0]).intersects (c->glyphs))
       return;
 
-    c->cur_intersected_glyphs->clear ();
-    get_coverage ().intersected_coverage_glyphs (c->parent_active_glyphs (), c->cur_intersected_glyphs);
+    hb_set_t* cur_active_glyphs = &c->push_cur_active_glyphs ();
+    get_coverage ().intersected_coverage_glyphs (&c->previous_parent_active_glyphs (),
+                                                 cur_active_glyphs);
+
 
     const Array16OfOffset16To<Coverage> &lookahead = StructAfter<Array16OfOffset16To<Coverage>> (input);
     const Array16Of<LookupRecord> &lookup = StructAfter<Array16Of<LookupRecord>> (lookahead);
@@ -3232,6 +3309,8 @@ struct ChainContextFormat3
 				  lookahead.len, (const HBUINT16 *) lookahead.arrayZ,
 				  lookup.len, lookup.arrayZ,
 				  0, lookup_context);
+
+    c->pop_cur_done_glyphs ();
   }
 
   void closure_lookups (hb_closure_lookups_context_t *c) const
@@ -3706,7 +3785,7 @@ struct GSUBGPOS
     for (unsigned i : feature_indices->iter ())
     {
       hb_tag_t t = get_feature_tag (i);
-      if (t == unique_features.INVALID_KEY) continue;
+      if (t == HB_MAP_VALUE_INVALID) continue;
       if (!unique_features.has (t))
       {
         hb_set_t* indices = hb_set_create ();
@@ -3839,7 +3918,7 @@ struct GSUBGPOS
   template <typename T>
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       this->table = hb_sanitize_context_t ().reference_table<T> (face);
       if (unlikely (this->table->is_blocklisted (this->table.get_blob (), face)))
@@ -3861,8 +3940,7 @@ struct GSUBGPOS
       for (unsigned int i = 0; i < this->lookup_count; i++)
 	this->accels[i].init (table->get_lookup (i));
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
       for (unsigned int i = 0; i < this->lookup_count; i++)
 	this->accels[i].fini ();
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.cc b/thirdparty/harfbuzz/src/hb-ot-layout.cc
index 60733648c1..a599eea6e9 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-layout.cc
@@ -1491,10 +1491,9 @@ hb_ot_layout_lookup_substitute_closure (hb_face_t    *face,
 					unsigned int  lookup_index,
 					hb_set_t     *glyphs /* OUT */)
 {
-  hb_set_t cur_intersected_glyphs;
   hb_map_t done_lookups_glyph_count;
   hb_hashmap_t<unsigned, hb_set_t *> done_lookups_glyph_set;
-  OT::hb_closure_context_t c (face, glyphs, &cur_intersected_glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
+  OT::hb_closure_context_t c (face, glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
 
   const OT::SubstLookup& l = face->table.GSUB->table->get_lookup (lookup_index);
 
@@ -1520,10 +1519,9 @@ hb_ot_layout_lookups_substitute_closure (hb_face_t      *face,
 					 const hb_set_t *lookups,
 					 hb_set_t       *glyphs /* OUT */)
 {
-  hb_set_t cur_intersected_glyphs;
   hb_map_t done_lookups_glyph_count;
   hb_hashmap_t<unsigned, hb_set_t *> done_lookups_glyph_set;
-  OT::hb_closure_context_t c (face, glyphs, &cur_intersected_glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
+  OT::hb_closure_context_t c (face, glyphs, &done_lookups_glyph_count, &done_lookups_glyph_set);
   const OT::GSUB& gsub = *face->table.GSUB->table;
 
   unsigned int iteration_count = 0;
@@ -1890,7 +1888,7 @@ apply_string (OT::hb_ot_apply_context_t *c,
     apply_forward (c, accel);
 
     if (!Proxy::inplace)
-      buffer->swap_buffers ();
+      buffer->sync ();
   }
   else
   {
diff --git a/thirdparty/harfbuzz/src/hb-ot-layout.hh b/thirdparty/harfbuzz/src/hb-ot-layout.hh
index 2c825e0c81..ede8f007db 100644
--- a/thirdparty/harfbuzz/src/hb-ot-layout.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-layout.hh
@@ -482,10 +482,9 @@ _hb_glyph_info_get_lig_num_comps (const hb_glyph_info_t *info)
 }
 
 static inline uint8_t
-_hb_allocate_lig_id (hb_buffer_t *buffer) {
+_hb_allocate_lig_id (hb_buffer_t *buffer)
+{
   uint8_t lig_id = buffer->next_serial () & 0x07;
-  if (unlikely (!lig_id))
-    lig_id = _hb_allocate_lig_id (buffer); /* in case of overflow */
   return lig_id;
 }
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-meta-table.hh b/thirdparty/harfbuzz/src/hb-ot-meta-table.hh
index e31447f8fc..93e64c5327 100644
--- a/thirdparty/harfbuzz/src/hb-ot-meta-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-meta-table.hh
@@ -71,9 +71,9 @@ struct meta
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { table = hb_sanitize_context_t ().reference_table<meta> (face); }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     hb_blob_t *reference_entry (hb_tag_t tag) const
     { return table->dataMaps.lsearch (tag).reference_entry (table.get_blob ()); }
@@ -119,7 +119,9 @@ struct meta
   DEFINE_SIZE_ARRAY (16, dataMaps);
 };
 
-struct meta_accelerator_t : meta::accelerator_t {};
+struct meta_accelerator_t : meta::accelerator_t {
+  meta_accelerator_t (hb_face_t *face) : meta::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-metrics.cc b/thirdparty/harfbuzz/src/hb-ot-metrics.cc
index dbd4a1ffbe..103808cf91 100644
--- a/thirdparty/harfbuzz/src/hb-ot-metrics.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-metrics.cc
@@ -160,9 +160,50 @@ hb_ot_metrics_get_position (hb_font_t           *font,
     (position && (*position = font->em_scalef_y (face->table.TABLE->ATTR + GET_VAR)), true))
   case HB_OT_METRICS_TAG_HORIZONTAL_CLIPPING_ASCENT:  return GET_METRIC_Y (OS2, usWinAscent);
   case HB_OT_METRICS_TAG_HORIZONTAL_CLIPPING_DESCENT: return GET_METRIC_Y (OS2, usWinDescent);
-  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RISE:       return GET_METRIC_Y (hhea, caretSlopeRise);
-  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RUN:        return GET_METRIC_X (hhea, caretSlopeRun);
+
+  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RISE:
+  case HB_OT_METRICS_TAG_HORIZONTAL_CARET_RUN:
+  {
+    unsigned mult = 1u;
+
+    if (font->slant)
+    {
+      unsigned rise = face->table.hhea->caretSlopeRise;
+      unsigned upem = face->get_upem ();
+      mult = (rise && rise < upem) ? hb_min (upem / rise, 256u) : 1u;
+    }
+
+    if (metrics_tag == HB_OT_METRICS_TAG_HORIZONTAL_CARET_RISE)
+    {
+      bool ret = GET_METRIC_Y (hhea, caretSlopeRise);
+
+      if (position)
+	*position *= mult;
+
+      return ret;
+    }
+    else
+    {
+      hb_position_t rise = 0;
+
+      if (font->slant && position && GET_METRIC_Y (hhea, caretSlopeRise))
+	rise = *position;
+
+      bool ret = GET_METRIC_X (hhea, caretSlopeRun);
+
+      if (position)
+      {
+	*position *= mult;
+
+	if (font->slant)
+	  *position += _hb_roundf (mult * font->slant_xy * rise);
+      }
+
+      return ret;
+    }
+  }
   case HB_OT_METRICS_TAG_HORIZONTAL_CARET_OFFSET:     return GET_METRIC_X (hhea, caretOffset);
+
 #ifndef HB_NO_VERTICAL
   case HB_OT_METRICS_TAG_VERTICAL_CARET_RISE:         return GET_METRIC_X (vhea, caretSlopeRise);
   case HB_OT_METRICS_TAG_VERTICAL_CARET_RUN:          return GET_METRIC_Y (vhea, caretSlopeRun);
diff --git a/thirdparty/harfbuzz/src/hb-ot-name-table.hh b/thirdparty/harfbuzz/src/hb-ot-name-table.hh
index c17bb4abb8..d52367e9b1 100644
--- a/thirdparty/harfbuzz/src/hb-ot-name-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-name-table.hh
@@ -256,7 +256,7 @@ struct name
     })
     ;
 
-    name_prime->serialize (c->serializer, it, hb_addressof (this + stringOffset));
+    name_prime->serialize (c->serializer, it, std::addressof (this + stringOffset));
     return_trace (name_prime->count);
   }
 
@@ -279,7 +279,7 @@ struct name
 
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     {
       this->table = hb_sanitize_context_t ().reference_table<name> (face);
       assert (this->table.get_length () >= this->table->stringOffset);
@@ -288,7 +288,6 @@ struct name
       const hb_array_t<const NameRecord> all_names (this->table->nameRecordZ.arrayZ,
 						    this->table->count);
 
-      this->names.init ();
       this->names.alloc (all_names.length);
 
       for (unsigned int i = 0; i < all_names.length; i++)
@@ -318,10 +317,8 @@ struct name
       }
       this->names.resize (j);
     }
-
-    void fini ()
+    ~accelerator_t ()
     {
-      this->names.fini ();
       this->table.destroy ();
     }
 
@@ -373,7 +370,9 @@ struct name
 #undef entry_index
 #undef entry_score
 
-struct name_accelerator_t : name::accelerator_t {};
+struct name_accelerator_t : name::accelerator_t {
+  name_accelerator_t (hb_face_t *face) : name::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh b/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh
index 504de2de74..0f3cd8e24f 100644
--- a/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-post-table-v2subset.hh
@@ -76,8 +76,7 @@ HB_INTERNAL bool postV2Tail::subset (hb_subset_context_t *c) const
   hb_map_t old_new_index_map, old_gid_new_index_map;
   unsigned i = 0;
 
-  post::accelerator_t _post;
-  _post.init (c->plan->source);
+  post::accelerator_t _post (c->plan->source);
 
   hb_hashmap_t<hb_bytes_t, unsigned, std::nullptr_t, unsigned, nullptr, (unsigned)-1> glyph_name_to_new_index;
   for (hb_codepoint_t new_gid = 0; new_gid < num_glyphs; new_gid++)
@@ -128,9 +127,7 @@ HB_INTERNAL bool postV2Tail::subset (hb_subset_context_t *c) const
                             })
   ;
 
-  bool ret = serialize (c->serializer, index_iter, &_post);
-  _post.fini ();
-  return_trace (ret);
+  return_trace (serialize (c->serializer, index_iter, &_post));
 }
 
 } /* namespace OT */
diff --git a/thirdparty/harfbuzz/src/hb-ot-post-table.hh b/thirdparty/harfbuzz/src/hb-ot-post-table.hh
index 39de671707..a4844e94bc 100644
--- a/thirdparty/harfbuzz/src/hb-ot-post-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-post-table.hh
@@ -111,10 +111,9 @@ struct post
   struct accelerator_t
   {
     friend struct postV2Tail;
-    void init (hb_face_t *face)
-    {
-      index_to_offset.init ();
 
+    accelerator_t (hb_face_t *face)
+    {
       table = hb_sanitize_context_t ().reference_table<post> (face);
       unsigned int table_length = table.get_length ();
 
@@ -132,9 +131,8 @@ struct post
 	   data += 1 + *data)
 	index_to_offset.push (data - pool);
     }
-    void fini ()
+    ~accelerator_t ()
     {
-      index_to_offset.fini ();
       hb_free (gids_sorted_by_name.get ());
       table.destroy ();
     }
@@ -254,9 +252,9 @@ struct post
 
     private:
     uint32_t version;
-    const Array16Of<HBUINT16> *glyphNameIndex;
+    const Array16Of<HBUINT16> *glyphNameIndex = nullptr;
     hb_vector_t<uint32_t> index_to_offset;
-    const uint8_t *pool;
+    const uint8_t *pool = nullptr;
     hb_atomic_ptr_t<uint16_t *> gids_sorted_by_name;
   };
 
@@ -307,7 +305,10 @@ struct post
   DEFINE_SIZE_MIN (32);
 };
 
-struct post_accelerator_t : post::accelerator_t {};
+struct post_accelerator_t : post::accelerator_t {
+  post_accelerator_t (hb_face_t *face) : post::accelerator_t (face) {}
+};
+
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh
index 41e3dd38ab..429974d05b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic-win1256.hh
@@ -87,6 +87,8 @@
 
 #define OT_GLYPHID /* GlyphID */ \
 	OT_UINT16
+/* Shorthand. */
+#define G	OT_GLYPHID
 
 #define OT_UARRAY(Name, Items) \
 	OT_LABEL_START(Name) \
@@ -183,8 +185,6 @@
 	Tag \
 	OT_OFFSET(manifest, Name)
 
-/* Shorthand. */
-#define G	OT_GLYPHID
 
 /*
  * Table Start
@@ -300,14 +300,40 @@ OT_TABLE_END
 /*
  * Clean up
  */
+
+#undef MANIFEST
+#undef MANIFEST_LOOKUP
+
 #undef OT_TABLE_START
 #undef OT_TABLE_END
 #undef OT_LABEL_START
 #undef OT_LABEL_END
 #undef OT_UINT8
 #undef OT_UINT16
-#undef OT_DISTANCE
 #undef OT_COUNT
+#undef OT_DISTANCE
+
+#undef OT_LABEL
+#undef OT_LIST
+
+#undef OT_TAG
+#undef OT_OFFSET
+#undef OT_GLYPHID
+#undef G
+#undef OT_UARRAY
+#undef OT_UHEADLESSARRAY
+
+#undef OT_LOOKUP_FLAG_IGNORE_MARKS
+#undef OT_LOOKUP
+#undef OT_SUBLOOKUP
+#undef OT_COVERAGE1
+#undef OT_LOOKUP_TYPE_SUBST_SINGLE
+#undef OT_LOOKUP_TYPE_SUBST_LIGATURE
+#undef OT_SUBLOOKUP_SINGLE_SUBST_FORMAT2
+#undef OT_SUBLOOKUP_LIGATURE_SUBST_FORMAT1
+#undef OT_LIGATURE_SET
+#undef OT_LIGATURE
+
 
 /*
  * Include a second time to get the table data...
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc
index 222c5d6b71..2298aa92f2 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-arabic.cc
@@ -321,6 +321,20 @@ arabic_joining (hb_buffer_t *buffer)
       info[prev].arabic_shaping_action() = entry->prev_action;
       buffer->unsafe_to_break (prev, i + 1);
     }
+    else
+    {
+      if (prev == UINT_MAX)
+      {
+        if (this_type >= JOINING_TYPE_R)
+	  buffer->unsafe_to_concat_from_outbuffer (0, i + 1);
+      }
+      else
+      {
+	if (this_type >= JOINING_TYPE_R ||
+	    (2 <= state && state <= 5) /* States that have a possible prev_action. */)
+	  buffer->unsafe_to_concat (prev, i + 1);
+      }
+    }
 
     info[i].arabic_shaping_action() = entry->curr_action;
 
@@ -337,7 +351,14 @@ arabic_joining (hb_buffer_t *buffer)
 
     const arabic_state_table_entry *entry = &arabic_state_table[state][this_type];
     if (entry->prev_action != NONE && prev != UINT_MAX)
+    {
       info[prev].arabic_shaping_action() = entry->prev_action;
+      buffer->unsafe_to_break (prev, buffer->len);
+    }
+    else if (2 <= state && state <= 5) /* States that have a possible prev_action. */
+    {
+      buffer->unsafe_to_concat (prev, buffer->len);
+    }
     break;
   }
 }
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc
index 0d84a76b85..3bc9e9b961 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-hangul.cc
@@ -140,7 +140,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED,
    *
    *   - LV can be precomposed, or decomposed.  Lets call those
    *     <LV> and <L,V>,
-   *   - LVT can be fully precomposed, partically precomposed, or
+   *   - LVT can be fully precomposed, partially precomposed, or
    *     fully decomposed.  Ie. <LVT>, <LV,T>, or <L,V,T>.
    *
    * The composition / decomposition is mechanical.  However, not
@@ -392,7 +392,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED,
      */
     (void) buffer->next_glyph ();
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 static void
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc
index 5a08f878dc..76092c7f38 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-syllabic.cc
@@ -96,7 +96,7 @@ hb_syllabic_insert_dotted_circles (hb_font_t *font,
     else
       (void) buffer->next_glyph ();
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc
index 4c3068173b..a1e27a83be 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc
@@ -364,7 +364,7 @@ preprocess_text_thai (const hb_ot_shape_plan_t *plan,
 	buffer->merge_out_clusters (start - 1, end);
     }
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 
   /* If font has Thai GSUB, we are done. */
   if (plan->props.script == HB_SCRIPT_THAI && !plan->map.found_script[0])
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc b/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc
index 045731dfb4..d2cca105a4 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-complex-vowel-constraints.cc
@@ -435,7 +435,7 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,
     default:
       break;
   }
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc b/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc
index eb1bc79768..671f30327f 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-fallback.cc
@@ -446,6 +446,9 @@ _hb_ot_shape_fallback_mark_position (const hb_ot_shape_plan_t *plan,
   return;
 #endif
 
+  if (!buffer->message (font, "start fallback mark"))
+    return;
+
   _hb_buffer_assert_gsubgpos_vars (buffer);
 
   unsigned int start = 0;
@@ -457,6 +460,8 @@ _hb_ot_shape_fallback_mark_position (const hb_ot_shape_plan_t *plan,
       start = i;
     }
   position_cluster (plan, font, buffer, start, count, adjust_offsets_when_zeroing);
+
+  (void) buffer->message (font, "end fallback mark");
 }
 
 
@@ -492,6 +497,9 @@ _hb_ot_shape_fallback_kern (const hb_ot_shape_plan_t *plan,
 #endif
 
 #ifndef HB_DISABLE_DEPRECATED
+  if (!buffer->message (font, "start fallback kern"))
+    return;
+
   if (HB_DIRECTION_IS_HORIZONTAL (buffer->props.direction) ?
       !font->has_glyph_h_kerning_func () :
       !font->has_glyph_v_kerning_func ())
@@ -508,6 +516,8 @@ _hb_ot_shape_fallback_kern (const hb_ot_shape_plan_t *plan,
 
   if (reverse)
     buffer->reverse ();
+
+  (void) buffer->message (font, "end fallback kern");
 #endif
 }
 
@@ -525,6 +535,15 @@ _hb_ot_shape_fallback_spaces (const hb_ot_shape_plan_t *plan HB_UNUSED,
   for (unsigned int i = 0; i < count; i++)
     if (_hb_glyph_info_is_unicode_space (&info[i]) && !_hb_glyph_info_ligated (&info[i]))
     {
+      /* If font had no ASCII space and we used the invisible glyph, give it a 1/4 EM default advance. */
+      if (buffer->invisible && info[i].codepoint == buffer->invisible)
+      {
+        if (horizontal)
+	  pos[i].x_advance = +font->x_scale / 4;
+        else
+	  pos[i].y_advance = -font->y_scale / 4;
+      }
+
       hb_unicode_funcs_t::space_t space_type = _hb_glyph_info_get_unicode_space_fallback_type (&info[i]);
       hb_codepoint_t glyph;
       typedef hb_unicode_funcs_t t;
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc b/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc
index 839cc9122c..aa5a8eeaa3 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape-normalize.cc
@@ -193,7 +193,8 @@ decompose_current_character (const hb_ot_shape_normalize_context_t *c, bool shor
   {
     hb_codepoint_t space_glyph;
     hb_unicode_funcs_t::space_t space_type = buffer->unicode->space_fallback_type (u);
-    if (space_type != hb_unicode_funcs_t::NOT_SPACE && c->font->get_nominal_glyph (0x0020u, &space_glyph))
+    if (space_type != hb_unicode_funcs_t::NOT_SPACE &&
+	(c->font->get_nominal_glyph (0x0020, &space_glyph) || (space_glyph = buffer->invisible)))
     {
       _hb_glyph_info_set_unicode_space_fallback_type (&buffer->cur(), space_type);
       next_char (buffer, space_glyph);
@@ -374,7 +375,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
       decompose_multi_char_cluster (&c, end, always_short_circuit);
     }
     while (buffer->idx < count && buffer->successful);
-    buffer->swap_buffers ();
+    buffer->sync ();
   }
 
 
@@ -477,7 +478,7 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
       if (info_cc (buffer->prev()) == 0)
 	starter = buffer->out_len - 1;
     }
-    buffer->swap_buffers ();
+    buffer->sync ();
   }
 }
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-shape.cc b/thirdparty/harfbuzz/src/hb-ot-shape.cc
index 4dde3520d8..4bd8aaf03b 100644
--- a/thirdparty/harfbuzz/src/hb-ot-shape.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-shape.cc
@@ -566,7 +566,7 @@ hb_insert_dotted_circle (hb_buffer_t *buffer, hb_font_t *font)
   info.mask = buffer->cur().mask;
   (void) buffer->output_info (info);
 
-  buffer->swap_buffers ();
+  buffer->sync ();
 }
 
 static void
@@ -1034,7 +1034,7 @@ hb_ot_position_complex (const hb_ot_shape_context_t *c)
    * hanging over the next glyph after the final reordering.
    *
    * Note: If fallback positinoing happens, we don't care about
-   * this as it will be overriden.
+   * this as it will be overridden.
    */
   bool adjust_offsets_when_zeroing = c->plan->adjust_mark_positioning_when_zeroing &&
 				     HB_DIRECTION_IS_FORWARD (c->buffer->props.direction);
@@ -1120,7 +1120,7 @@ hb_propagate_flags (hb_buffer_t *buffer)
   /* Propagate cluster-level glyph flags to be the same on all cluster glyphs.
    * Simplifies using them. */
 
-  if (!(buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK))
+  if (!(buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_GLYPH_FLAGS))
     return;
 
   hb_glyph_info_t *info = buffer->info;
@@ -1129,11 +1129,7 @@ hb_propagate_flags (hb_buffer_t *buffer)
   {
     unsigned int mask = 0;
     for (unsigned int i = start; i < end; i++)
-      if (info[i].mask & HB_GLYPH_FLAG_UNSAFE_TO_BREAK)
-      {
-	 mask = HB_GLYPH_FLAG_UNSAFE_TO_BREAK;
-	 break;
-      }
+      mask |= info[i].mask & HB_GLYPH_FLAG_DEFINED;
     if (mask)
       for (unsigned int i = start; i < end; i++)
 	info[i].mask |= mask;
@@ -1145,18 +1141,7 @@ hb_propagate_flags (hb_buffer_t *buffer)
 static void
 hb_ot_shape_internal (hb_ot_shape_context_t *c)
 {
-  c->buffer->deallocate_var_all ();
-  c->buffer->scratch_flags = HB_BUFFER_SCRATCH_FLAG_DEFAULT;
-  if (likely (!hb_unsigned_mul_overflows (c->buffer->len, HB_BUFFER_MAX_LEN_FACTOR)))
-  {
-    c->buffer->max_len = hb_max (c->buffer->len * HB_BUFFER_MAX_LEN_FACTOR,
-				 (unsigned) HB_BUFFER_MAX_LEN_MIN);
-  }
-  if (likely (!hb_unsigned_mul_overflows (c->buffer->len, HB_BUFFER_MAX_OPS_FACTOR)))
-  {
-    c->buffer->max_ops = hb_max (c->buffer->len * HB_BUFFER_MAX_OPS_FACTOR,
-				 (unsigned) HB_BUFFER_MAX_OPS_MIN);
-  }
+  c->buffer->enter ();
 
   /* Save the original direction, we use it later. */
   c->target_direction = c->buffer->props.direction;
@@ -1188,9 +1173,7 @@ hb_ot_shape_internal (hb_ot_shape_context_t *c)
 
   c->buffer->props.direction = c->target_direction;
 
-  c->buffer->max_len = HB_BUFFER_MAX_LEN_DEFAULT;
-  c->buffer->max_ops = HB_BUFFER_MAX_OPS_DEFAULT;
-  c->buffer->deallocate_var_all ();
+  c->buffer->leave ();
 }
 
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh
index 2c6316df4f..61d2814e93 100644
--- a/thirdparty/harfbuzz/src/hb-ot-tag-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-tag-table.hh
@@ -6,8 +6,8 @@
  *
  * on files with these headers:
  *
- * <meta name="updated_at" content="2021-12-09 12:01 AM" />
- * File-Date: 2021-08-06
+ * <meta name="updated_at" content="2022-01-28 10:00 PM" />
+ * File-Date: 2021-12-29
  */
 
 #ifndef HB_OT_TAG_TABLE_HH
@@ -66,7 +66,7 @@ static const LangTag ot_languages[] = {
   {"an",	HB_TAG('A','R','G',' ')},	/* Aragonese */
 /*{"ang",	HB_TAG('A','N','G',' ')},*/	/* Old English (ca. 450-1100) -> Anglo-Saxon */
   {"aoa",	HB_TAG('C','P','P',' ')},	/* Angolar -> Creoles */
-  {"apa",	HB_TAG('A','T','H',' ')},	/* Apache [family] -> Athapaskan */
+  {"apa",	HB_TAG('A','T','H',' ')},	/* Apache [collection] -> Athapaskan */
   {"apc",	HB_TAG('A','R','A',' ')},	/* North Levantine Arabic -> Arabic */
   {"apd",	HB_TAG('A','R','A',' ')},	/* Sudanese Arabic -> Arabic */
   {"apj",	HB_TAG('A','T','H',' ')},	/* Jicarilla Apache -> Athapaskan */
@@ -86,7 +86,7 @@ static const LangTag ot_languages[] = {
   {"arz",	HB_TAG('A','R','A',' ')},	/* Egyptian Arabic -> Arabic */
   {"as",	HB_TAG('A','S','M',' ')},	/* Assamese */
 /*{"ast",	HB_TAG('A','S','T',' ')},*/	/* Asturian */
-/*{"ath",	HB_TAG('A','T','H',' ')},*/	/* Athapascan [family] -> Athapaskan */
+/*{"ath",	HB_TAG('A','T','H',' ')},*/	/* Athapascan [collection] -> Athapaskan */
   {"atj",	HB_TAG('R','C','R',' ')},	/* Atikamekw -> R-Cree */
   {"atv",	HB_TAG('A','L','T',' ')},	/* Northern Altai -> Altai */
   {"auj",	HB_TAG('B','B','R',' ')},	/* Awjilah -> Berber */
@@ -110,10 +110,10 @@ static const LangTag ot_languages[] = {
   {"azn",	HB_TAG('N','A','H',' ')},	/* Western Durango Nahuatl -> Nahuatl */
   {"azz",	HB_TAG('N','A','H',' ')},	/* Highland Puebla Nahuatl -> Nahuatl */
   {"ba",	HB_TAG('B','S','H',' ')},	/* Bashkir */
-  {"bad",	HB_TAG('B','A','D','0')},	/* Banda [family] */
+  {"bad",	HB_TAG('B','A','D','0')},	/* Banda [collection] */
   {"bag",	HB_TAG_NONE	       },	/* Tuki != Baghelkhandi */
   {"bah",	HB_TAG('C','P','P',' ')},	/* Bahamas Creole English -> Creoles */
-  {"bai",	HB_TAG('B','M','L',' ')},	/* Bamileke [family] */
+  {"bai",	HB_TAG('B','M','L',' ')},	/* Bamileke [collection] */
   {"bal",	HB_TAG('B','L','I',' ')},	/* Baluchi [macrolanguage] */
 /*{"ban",	HB_TAG('B','A','N',' ')},*/	/* Balinese */
 /*{"bar",	HB_TAG('B','A','R',' ')},*/	/* Bavarian */
@@ -135,7 +135,7 @@ static const LangTag ot_languages[] = {
   {"bea",	HB_TAG('A','T','H',' ')},	/* Beaver -> Athapaskan */
   {"beb",	HB_TAG('B','T','I',' ')},	/* Bebele -> Beti */
 /*{"bem",	HB_TAG('B','E','M',' ')},*/	/* Bemba (Zambia) */
-  {"ber",	HB_TAG('B','B','R',' ')},	/* Berber [family] */
+  {"ber",	HB_TAG('B','B','R',' ')},	/* Berber [collection] */
   {"bew",	HB_TAG('C','P','P',' ')},	/* Betawi -> Creoles */
   {"bfl",	HB_TAG('B','A','D','0')},	/* Banda-Ndélé -> Banda */
   {"bfq",	HB_TAG('B','A','D',' ')},	/* Badaga */
@@ -203,7 +203,7 @@ static const LangTag ot_languages[] = {
   {"btd",	HB_TAG('B','T','K',' ')},	/* Batak Dairi -> Batak */
   {"bti",	HB_TAG_NONE	       },	/* Burate != Beti */
   {"btj",	HB_TAG('M','L','Y',' ')},	/* Bacanese Malay -> Malay */
-/*{"btk",	HB_TAG('B','T','K',' ')},*/	/* Batak [family] */
+/*{"btk",	HB_TAG('B','T','K',' ')},*/	/* Batak [collection] */
   {"btm",	HB_TAG('B','T','M',' ')},	/* Batak Mandailing */
   {"btm",	HB_TAG('B','T','K',' ')},	/* Batak Mandailing -> Batak */
   {"bto",	HB_TAG('B','I','K',' ')},	/* Rinconada Bikol -> Bikol */
@@ -256,6 +256,8 @@ static const LangTag ot_languages[] = {
   {"chh",	HB_TAG_NONE	       },	/* Chinook != Chattisgarhi */
   {"chj",	HB_TAG('C','C','H','N')},	/* Ojitlán Chinantec -> Chinantec */
   {"chk",	HB_TAG('C','H','K','0')},	/* Chuukese */
+  {"chm",	HB_TAG('H','M','A',' ')},	/* Mari (Russia) [macrolanguage] -> High Mari */
+  {"chm",	HB_TAG('L','M','A',' ')},	/* Mari (Russia) [macrolanguage] -> Low Mari */
   {"chn",	HB_TAG('C','P','P',' ')},	/* Chinook jargon -> Creoles */
 /*{"cho",	HB_TAG('C','H','O',' ')},*/	/* Choctaw */
   {"chp",	HB_TAG('C','H','P',' ')},	/* Chipewyan */
@@ -297,10 +299,10 @@ static const LangTag ot_languages[] = {
 /*{"cop",	HB_TAG('C','O','P',' ')},*/	/* Coptic */
   {"coq",	HB_TAG('A','T','H',' ')},	/* Coquille -> Athapaskan */
   {"cpa",	HB_TAG('C','C','H','N')},	/* Palantla Chinantec -> Chinantec */
-  {"cpe",	HB_TAG('C','P','P',' ')},	/* English-based creoles and pidgins [family] -> Creoles */
-  {"cpf",	HB_TAG('C','P','P',' ')},	/* French-based creoles and pidgins [family] -> Creoles */
+  {"cpe",	HB_TAG('C','P','P',' ')},	/* English-based creoles and pidgins [collection] -> Creoles */
+  {"cpf",	HB_TAG('C','P','P',' ')},	/* French-based creoles and pidgins [collection] -> Creoles */
   {"cpi",	HB_TAG('C','P','P',' ')},	/* Chinese Pidgin English -> Creoles */
-/*{"cpp",	HB_TAG('C','P','P',' ')},*/	/* Portuguese-based creoles and pidgins [family] -> Creoles */
+/*{"cpp",	HB_TAG('C','P','P',' ')},*/	/* Portuguese-based creoles and pidgins [collection] -> Creoles */
   {"cpx",	HB_TAG('Z','H','S',' ')},	/* Pu-Xian Chinese -> Chinese, Simplified */
   {"cqd",	HB_TAG('H','M','N',' ')},	/* Chuanqiandian Cluster Miao -> Hmong */
   {"cqu",	HB_TAG('Q','U','H',' ')},	/* Chilean Quechua (retired code) -> Quechua (Bolivia) */
@@ -320,7 +322,7 @@ static const LangTag ot_languages[] = {
   {"crm",	HB_TAG('M','C','R',' ')},	/* Moose Cree */
   {"crm",	HB_TAG('L','C','R',' ')},	/* Moose Cree -> L-Cree */
   {"crm",	HB_TAG('C','R','E',' ')},	/* Moose Cree -> Cree */
-  {"crp",	HB_TAG('C','P','P',' ')},	/* Creoles and pidgins [family] -> Creoles */
+  {"crp",	HB_TAG('C','P','P',' ')},	/* Creoles and pidgins [collection] -> Creoles */
   {"crr",	HB_TAG_NONE	       },	/* Carolina Algonquian != Carrier */
   {"crs",	HB_TAG('C','P','P',' ')},	/* Seselwa Creole French -> Creoles */
   {"crt",	HB_TAG_NONE	       },	/* Iyojwa'ja Chorote != Crimean Tatar */
@@ -431,7 +433,7 @@ static const LangTag ot_languages[] = {
   {"et",	HB_TAG('E','T','I',' ')},	/* Estonian [macrolanguage] */
   {"eto",	HB_TAG('B','T','I',' ')},	/* Eton (Cameroon) -> Beti */
   {"eu",	HB_TAG('E','U','Q',' ')},	/* Basque */
-  {"euq",	HB_TAG_NONE	       },	/* Basque [family] != Basque */
+  {"euq",	HB_TAG_NONE	       },	/* Basque [collection] != Basque */
   {"eve",	HB_TAG('E','V','N',' ')},	/* Even */
   {"evn",	HB_TAG('E','V','K',' ')},	/* Evenki */
   {"ewo",	HB_TAG('B','T','I',' ')},	/* Ewondo -> Beti */
@@ -620,10 +622,11 @@ static const LangTag ot_languages[] = {
   {"ijc",	HB_TAG('I','J','O',' ')},	/* Izon -> Ijo */
   {"ije",	HB_TAG('I','J','O',' ')},	/* Biseni -> Ijo */
   {"ijn",	HB_TAG('I','J','O',' ')},	/* Kalabari -> Ijo */
-/*{"ijo",	HB_TAG('I','J','O',' ')},*/	/* Ijo [family] */
+/*{"ijo",	HB_TAG('I','J','O',' ')},*/	/* Ijo [collection] */
   {"ijs",	HB_TAG('I','J','O',' ')},	/* Southeast Ijo -> Ijo */
   {"ik",	HB_TAG('I','P','K',' ')},	/* Inupiaq [macrolanguage] -> Inupiat */
   {"ike",	HB_TAG('I','N','U',' ')},	/* Eastern Canadian Inuktitut -> Inuktitut */
+  {"ike",	HB_TAG('I','N','U','K')},	/* Eastern Canadian Inuktitut -> Nunavik Inuktitut */
   {"ikt",	HB_TAG('I','N','U',' ')},	/* Inuinnaqtun -> Inuktitut */
 /*{"ilo",	HB_TAG('I','L','O',' ')},*/	/* Iloko -> Ilokano */
   {"in",	HB_TAG('I','N','D',' ')},	/* Indonesian (retired code) */
@@ -638,6 +641,7 @@ static const LangTag ot_languages[] = {
   {"it",	HB_TAG('I','T','A',' ')},	/* Italian */
   {"itz",	HB_TAG('M','Y','N',' ')},	/* Itzá -> Mayan */
   {"iu",	HB_TAG('I','N','U',' ')},	/* Inuktitut [macrolanguage] */
+  {"iu",	HB_TAG('I','N','U','K')},	/* Inuktitut [macrolanguage] -> Nunavik Inuktitut */
   {"iw",	HB_TAG('I','W','R',' ')},	/* Hebrew (retired code) */
   {"ixl",	HB_TAG('M','Y','N',' ')},	/* Ixil -> Mayan */
   {"ja",	HB_TAG('J','A','N',' ')},	/* Japanese */
@@ -667,7 +671,7 @@ static const LangTag ot_languages[] = {
   {"kab",	HB_TAG('B','B','R',' ')},	/* Kabyle -> Berber */
   {"kac",	HB_TAG_NONE	       },	/* Kachin != Kachchi */
   {"kam",	HB_TAG('K','M','B',' ')},	/* Kamba (Kenya) */
-  {"kar",	HB_TAG('K','R','N',' ')},	/* Karen [family] */
+  {"kar",	HB_TAG('K','R','N',' ')},	/* Karen [collection] */
 /*{"kaw",	HB_TAG('K','A','W',' ')},*/	/* Kawi (Old Javanese) */
   {"kbd",	HB_TAG('K','A','B',' ')},	/* Kabardian */
   {"kby",	HB_TAG('K','N','R',' ')},	/* Manga Kanuri -> Kanuri */
@@ -876,7 +880,7 @@ static const LangTag ot_languages[] = {
   {"mam",	HB_TAG('M','A','M',' ')},	/* Mam */
   {"mam",	HB_TAG('M','Y','N',' ')},	/* Mam -> Mayan */
   {"man",	HB_TAG('M','N','K',' ')},	/* Mandingo [macrolanguage] -> Maninka */
-  {"map",	HB_TAG_NONE	       },	/* Austronesian [family] != Mapudungun */
+  {"map",	HB_TAG_NONE	       },	/* Austronesian [collection] != Mapudungun */
   {"maw",	HB_TAG_NONE	       },	/* Mampruli != Marwari */
   {"max",	HB_TAG('M','L','Y',' ')},	/* North Moluccan Malay -> Malay */
   {"max",	HB_TAG('C','P','P',' ')},	/* North Moluccan Malay -> Creoles */
@@ -936,6 +940,7 @@ static const LangTag ot_languages[] = {
   {"mnw",	HB_TAG('M','O','N','T')},	/* Mon -> Thailand Mon */
   {"mnx",	HB_TAG_NONE	       },	/* Manikion != Manx */
   {"mo",	HB_TAG('M','O','L',' ')},	/* Moldavian (retired code) */
+  {"mo",	HB_TAG('R','O','M',' ')},	/* Moldavian (retired code) -> Romanian */
   {"mod",	HB_TAG('C','P','P',' ')},	/* Mobilian -> Creoles */
 /*{"moh",	HB_TAG('M','O','H',' ')},*/	/* Mohawk */
   {"mok",	HB_TAG_NONE	       },	/* Morori != Moksha */
@@ -958,7 +963,7 @@ static const LangTag ot_languages[] = {
   {"mts",	HB_TAG_NONE	       },	/* Yora != Maltese */
   {"mud",	HB_TAG('C','P','P',' ')},	/* Mednyj Aleut -> Creoles */
   {"mui",	HB_TAG('M','L','Y',' ')},	/* Musi -> Malay */
-  {"mun",	HB_TAG_NONE	       },	/* Munda [family] != Mundari */
+  {"mun",	HB_TAG_NONE	       },	/* Munda [collection] != Mundari */
   {"mup",	HB_TAG('R','A','J',' ')},	/* Malvi -> Rajasthani */
   {"muq",	HB_TAG('H','M','N',' ')},	/* Eastern Xiangxi Miao -> Hmong */
 /*{"mus",	HB_TAG('M','U','S',' ')},*/	/* Creek -> Muscogee */
@@ -973,7 +978,7 @@ static const LangTag ot_languages[] = {
   {"mww",	HB_TAG('H','M','N',' ')},	/* Hmong Daw -> Hmong */
   {"my",	HB_TAG('B','R','M',' ')},	/* Burmese */
   {"mym",	HB_TAG('M','E','N',' ')},	/* Me’en */
-/*{"myn",	HB_TAG('M','Y','N',' ')},*/	/* Mayan [family] */
+/*{"myn",	HB_TAG('M','Y','N',' ')},*/	/* Mayan [collection] */
   {"myq",	HB_TAG('M','N','K',' ')},	/* Forest Maninka (retired code) -> Maninka */
   {"myv",	HB_TAG('E','R','Z',' ')},	/* Erzya */
   {"mzb",	HB_TAG('B','B','R',' ')},	/* Tumzabt -> Berber */
@@ -982,7 +987,7 @@ static const LangTag ot_languages[] = {
   {"na",	HB_TAG('N','A','U',' ')},	/* Nauru -> Nauruan */
   {"nag",	HB_TAG('N','A','G',' ')},	/* Naga Pidgin -> Naga-Assamese */
   {"nag",	HB_TAG('C','P','P',' ')},	/* Naga Pidgin -> Creoles */
-/*{"nah",	HB_TAG('N','A','H',' ')},*/	/* Nahuatl [family] */
+/*{"nah",	HB_TAG('N','A','H',' ')},*/	/* Nahuatl [collection] */
   {"nan",	HB_TAG('Z','H','S',' ')},	/* Min Nan Chinese -> Chinese, Simplified */
 /*{"nap",	HB_TAG('N','A','P',' ')},*/	/* Neapolitan */
   {"nas",	HB_TAG_NONE	       },	/* Naasioi != Naskapi */
@@ -1039,7 +1044,6 @@ static const LangTag ot_languages[] = {
   {"nln",	HB_TAG('N','A','H',' ')},	/* Durango Nahuatl (retired code) -> Nahuatl */
   {"nlv",	HB_TAG('N','A','H',' ')},	/* Orizaba Nahuatl -> Nahuatl */
   {"nn",	HB_TAG('N','Y','N',' ')},	/* Norwegian Nynorsk (Nynorsk, Norwegian) */
-  {"nn",	HB_TAG('N','O','R',' ')},	/* Norwegian Nynorsk -> Norwegian */
   {"nnh",	HB_TAG('B','M','L',' ')},	/* Ngiemboon -> Bamileke */
   {"nnz",	HB_TAG('B','M','L',' ')},	/* Nda'nda' -> Bamileke */
   {"no",	HB_TAG('N','O','R',' ')},	/* Norwegian [macrolanguage] */
@@ -1093,7 +1097,7 @@ static const LangTag ot_languages[] = {
   {"otw",	HB_TAG('O','J','B',' ')},	/* Ottawa -> Ojibway */
   {"oua",	HB_TAG('B','B','R',' ')},	/* Tagargrent -> Berber */
   {"pa",	HB_TAG('P','A','N',' ')},	/* Punjabi */
-  {"paa",	HB_TAG_NONE	       },	/* Papuan [family] != Palestinian Aramaic */
+  {"paa",	HB_TAG_NONE	       },	/* Papuan [collection] != Palestinian Aramaic */
 /*{"pag",	HB_TAG('P','A','G',' ')},*/	/* Pangasinan */
   {"pal",	HB_TAG_NONE	       },	/* Pahlavi != Pali */
 /*{"pam",	HB_TAG('P','A','M',' ')},*/	/* Pampanga -> Pampangan */
@@ -1308,6 +1312,9 @@ static const LangTag ot_languages[] = {
   {"sgo",	HB_TAG_NONE	       },	/* Songa (retired code) != Sango */
 /*{"sgs",	HB_TAG('S','G','S',' ')},*/	/* Samogitian */
   {"sgw",	HB_TAG('C','H','G',' ')},	/* Sebat Bet Gurage -> Chaha Gurage */
+  {"sh",	HB_TAG('B','O','S',' ')},	/* Serbo-Croatian [macrolanguage] -> Bosnian */
+  {"sh",	HB_TAG('H','R','V',' ')},	/* Serbo-Croatian [macrolanguage] -> Croatian */
+  {"sh",	HB_TAG('S','R','B',' ')},	/* Serbo-Croatian [macrolanguage] -> Serbian */
   {"shi",	HB_TAG('S','H','I',' ')},	/* Tachelhit */
   {"shi",	HB_TAG('B','B','R',' ')},	/* Tachelhit -> Berber */
   {"shl",	HB_TAG('Q','I','N',' ')},	/* Shendu -> Chin */
@@ -1329,7 +1336,7 @@ static const LangTag ot_languages[] = {
   {"skw",	HB_TAG('C','P','P',' ')},	/* Skepi Creole Dutch -> Creoles */
   {"sky",	HB_TAG_NONE	       },	/* Sikaiana != Slovak */
   {"sl",	HB_TAG('S','L','V',' ')},	/* Slovenian */
-  {"sla",	HB_TAG_NONE	       },	/* Slavic [family] != Slavey */
+  {"sla",	HB_TAG_NONE	       },	/* Slavic [collection] != Slavey */
   {"sm",	HB_TAG('S','M','O',' ')},	/* Samoan */
   {"sma",	HB_TAG('S','S','M',' ')},	/* Southern Sami */
   {"smj",	HB_TAG('L','S','M',' ')},	/* Lule Sami */
@@ -1451,7 +1458,7 @@ static const LangTag ot_languages[] = {
   {"tpi",	HB_TAG('C','P','P',' ')},	/* Tok Pisin -> Creoles */
   {"tr",	HB_TAG('T','R','K',' ')},	/* Turkish */
   {"trf",	HB_TAG('C','P','P',' ')},	/* Trinidadian Creole English -> Creoles */
-  {"trk",	HB_TAG_NONE	       },	/* Turkic [family] != Turkish */
+  {"trk",	HB_TAG_NONE	       },	/* Turkic [collection] != Turkish */
   {"tru",	HB_TAG('T','U','A',' ')},	/* Turoyo -> Turoyo Aramaic */
   {"tru",	HB_TAG('S','Y','R',' ')},	/* Turoyo -> Syriac */
   {"ts",	HB_TAG('T','S','G',' ')},	/* Tsonga */
@@ -1593,7 +1600,7 @@ static const LangTag ot_languages[] = {
   {"zlq",	HB_TAG('Z','H','A',' ')},	/* Liuqian Zhuang -> Zhuang */
   {"zmi",	HB_TAG('M','L','Y',' ')},	/* Negeri Sembilan Malay -> Malay */
   {"zmz",	HB_TAG('B','A','D','0')},	/* Mbandja -> Banda */
-  {"znd",	HB_TAG_NONE	       },	/* Zande [family] != Zande */
+  {"znd",	HB_TAG_NONE	       },	/* Zande [collection] != Zande */
   {"zne",	HB_TAG('Z','N','D',' ')},	/* Zande */
   {"zom",	HB_TAG('Q','I','N',' ')},	/* Zou -> Chin */
   {"zqe",	HB_TAG('Z','H','A',' ')},	/* Qiubei Zhuang -> Zhuang */
@@ -2607,14 +2614,8 @@ hb_ot_tags_from_complex_language (const char   *lang_str,
     if (0 == strcmp (&lang_str[1], "o-nyn"))
     {
       /* Norwegian Nynorsk (retired code) */
-      unsigned int i;
-      hb_tag_t possible_tags[] = {
-	HB_TAG('N','Y','N',' '),  /* Norwegian Nynorsk (Nynorsk, Norwegian) */
-	HB_TAG('N','O','R',' '),  /* Norwegian */
-      };
-      for (i = 0; i < 2 && i < *count; i++)
-	tags[i] = possible_tags[i];
-      *count = i;
+      tags[0] = HB_TAG('N','Y','N',' ');  /* Norwegian Nynorsk (Nynorsk, Norwegian) */
+      *count = 1;
       return true;
     }
     break;
@@ -2623,8 +2624,14 @@ hb_ot_tags_from_complex_language (const char   *lang_str,
 	&& subtag_matches (lang_str, limit, "-md"))
     {
       /* Romanian; Moldova */
-      tags[0] = HB_TAG('M','O','L',' ');  /* Moldavian */
-      *count = 1;
+      unsigned int i;
+      hb_tag_t possible_tags[] = {
+	HB_TAG('M','O','L',' '),  /* Moldavian */
+	HB_TAG('R','O','M',' '),  /* Romanian */
+      };
+      for (i = 0; i < 2 && i < *count; i++)
+	tags[i] = possible_tags[i];
+      *count = i;
       return true;
     }
     break;
@@ -2813,15 +2820,15 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('A','R','K',' '):  /* Rakhine */
     return hb_language_from_string ("rki", -1);  /* Rakhine */
   case HB_TAG('A','T','H',' '):  /* Athapaskan */
-    return hb_language_from_string ("ath", -1);  /* Athapascan [family] */
+    return hb_language_from_string ("ath", -1);  /* Athapascan [collection] */
   case HB_TAG('B','B','R',' '):  /* Berber */
-    return hb_language_from_string ("ber", -1);  /* Berber [family] */
+    return hb_language_from_string ("ber", -1);  /* Berber [collection] */
   case HB_TAG('B','I','K',' '):  /* Bikol */
     return hb_language_from_string ("bik", -1);  /* Bikol [macrolanguage] */
   case HB_TAG('B','T','K',' '):  /* Batak */
-    return hb_language_from_string ("btk", -1);  /* Batak [family] */
+    return hb_language_from_string ("btk", -1);  /* Batak [collection] */
   case HB_TAG('C','P','P',' '):  /* Creoles */
-    return hb_language_from_string ("crp", -1);  /* Creoles and pidgins [family] */
+    return hb_language_from_string ("crp", -1);  /* Creoles and pidgins [collection] */
   case HB_TAG('C','R','R',' '):  /* Carrier */
     return hb_language_from_string ("crx", -1);  /* Carrier */
   case HB_TAG('D','G','R',' '):  /* Dogri (macrolanguage) */
@@ -2838,6 +2845,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
     return hb_language_from_string ("fa", -1);  /* Persian [macrolanguage] */
   case HB_TAG('G','O','N',' '):  /* Gondi */
     return hb_language_from_string ("gon", -1);  /* Gondi [macrolanguage] */
+  case HB_TAG('H','M','A',' '):  /* High Mari */
+    return hb_language_from_string ("mrj", -1);  /* Western Mari */
   case HB_TAG('H','M','N',' '):  /* Hmong */
     return hb_language_from_string ("hmn", -1);  /* Hmong [macrolanguage] */
   case HB_TAG('H','N','D',' '):  /* Hindko */
@@ -2847,7 +2856,7 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('I','B','A',' '):  /* Iban */
     return hb_language_from_string ("iba", -1);  /* Iban */
   case HB_TAG('I','J','O',' '):  /* Ijo */
-    return hb_language_from_string ("ijo", -1);  /* Ijo [family] */
+    return hb_language_from_string ("ijo", -1);  /* Ijo [collection] */
   case HB_TAG('I','N','U',' '):  /* Inuktitut */
     return hb_language_from_string ("iu", -1);  /* Inuktitut [macrolanguage] */
   case HB_TAG('I','P','K',' '):  /* Inupiat */
@@ -2873,11 +2882,13 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('K','P','L',' '):  /* Kpelle */
     return hb_language_from_string ("kpe", -1);  /* Kpelle [macrolanguage] */
   case HB_TAG('K','R','N',' '):  /* Karen */
-    return hb_language_from_string ("kar", -1);  /* Karen [family] */
+    return hb_language_from_string ("kar", -1);  /* Karen [collection] */
   case HB_TAG('K','U','I',' '):  /* Kui */
     return hb_language_from_string ("uki", -1);  /* Kui (India) */
   case HB_TAG('K','U','R',' '):  /* Kurdish */
     return hb_language_from_string ("ku", -1);  /* Kurdish [macrolanguage] */
+  case HB_TAG('L','M','A',' '):  /* Low Mari */
+    return hb_language_from_string ("mhr", -1);  /* Eastern Mari */
   case HB_TAG('L','U','H',' '):  /* Luyia */
     return hb_language_from_string ("luy", -1);  /* Luyia [macrolanguage] */
   case HB_TAG('L','V','I',' '):  /* Latvian */
@@ -2897,9 +2908,9 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
   case HB_TAG('M','O','N','T'):  /* Thailand Mon */
     return hb_language_from_string ("mnw-TH", -1);  /* Mon; Thailand */
   case HB_TAG('M','Y','N',' '):  /* Mayan */
-    return hb_language_from_string ("myn", -1);  /* Mayan [family] */
+    return hb_language_from_string ("myn", -1);  /* Mayan [collection] */
   case HB_TAG('N','A','H',' '):  /* Nahuatl */
-    return hb_language_from_string ("nah", -1);  /* Nahuatl [family] */
+    return hb_language_from_string ("nah", -1);  /* Nahuatl [collection] */
   case HB_TAG('N','E','P',' '):  /* Nepali */
     return hb_language_from_string ("ne", -1);  /* Nepali [macrolanguage] */
   case HB_TAG('N','I','S',' '):  /* Nisi */
@@ -2926,6 +2937,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
     return hb_language_from_string ("qwh", -1);  /* Huaylas Ancash Quechua */
   case HB_TAG('R','A','J',' '):  /* Rajasthani */
     return hb_language_from_string ("raj", -1);  /* Rajasthani [macrolanguage] */
+  case HB_TAG('R','O','M',' '):  /* Romanian */
+    return hb_language_from_string ("ro", -1);  /* Romanian */
   case HB_TAG('R','O','Y',' '):  /* Romany */
     return hb_language_from_string ("rom", -1);  /* Romany [macrolanguage] */
   case HB_TAG('S','Q','I',' '):  /* Albanian */
diff --git a/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh
index 05f289db26..e066558683 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-var-fvar-table.hh
@@ -263,7 +263,7 @@ struct fvar
     if (coords_length && *coords_length)
     {
       hb_array_t<const HBFixed> instanceCoords = instance->get_coordinates (axisCount)
-							 .sub_array (0, *coords_length);
+							 .sub_array (0, coords_length);
       for (unsigned int i = 0; i < instanceCoords.length; i++)
 	coords[i] = instanceCoords.arrayZ[i].to_float ();
     }
diff --git a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh
index 49b5532d40..539213c339 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-var-gvar-table.hh
@@ -399,7 +399,7 @@ struct gvar
 				  get_offset (glyphCount) - get_offset (0)));
   }
 
-  /* GlyphVariationData not sanitized here; must be checked while accessing each glyph varation data */
+  /* GlyphVariationData not sanitized here; must be checked while accessing each glyph variation data */
   bool sanitize (hb_sanitize_context_t *c) const
   { return sanitize_shallow (c); }
 
@@ -498,9 +498,9 @@ struct gvar
   public:
   struct accelerator_t
   {
-    void init (hb_face_t *face)
+    accelerator_t (hb_face_t *face)
     { table = hb_sanitize_context_t ().reference_table<gvar> (face); }
-    void fini () { table.destroy (); }
+    ~accelerator_t () { table.destroy (); }
 
     private:
     struct x_getter { static float get (const contour_point_t &p) { return p.x; } };
@@ -698,7 +698,9 @@ no_more_gaps:
   DEFINE_SIZE_MIN (20);
 };
 
-struct gvar_accelerator_t : gvar::accelerator_t {};
+struct gvar_accelerator_t : gvar::accelerator_t {
+  gvar_accelerator_t (hb_face_t *face) : gvar::accelerator_t (face) {}
+};
 
 } /* namespace OT */
 
diff --git a/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh b/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh
index 074b6a3785..e9d90352f0 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh
+++ b/thirdparty/harfbuzz/src/hb-ot-var-hvar-table.hh
@@ -177,9 +177,6 @@ struct hvarvvar_subset_plan_t
 
     inner_maps.resize (var_store->get_sub_table_count ());
 
-    for (unsigned int i = 0; i < inner_maps.length; i++)
-      inner_maps[i].init ();
-
     if (unlikely (!index_map_plans.length || !inner_sets.length || !inner_maps.length)) return;
 
     bool retain_adv_map = false;
@@ -229,8 +226,8 @@ struct hvarvvar_subset_plan_t
     for (unsigned int i = 0; i < inner_sets.length; i++)
       hb_set_destroy (inner_sets[i]);
     hb_set_destroy (adv_set);
-    inner_maps.fini_deep ();
-    index_map_plans.fini_deep ();
+    inner_maps.fini ();
+    index_map_plans.fini ();
   }
 
   hb_inc_bimap_t outer_map;
diff --git a/thirdparty/harfbuzz/src/hb-ot-var.cc b/thirdparty/harfbuzz/src/hb-ot-var.cc
index 6b42b45cd9..0376e26b4a 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var.cc
+++ b/thirdparty/harfbuzz/src/hb-ot-var.cc
@@ -303,6 +303,9 @@ hb_ot_var_normalize_variations (hb_face_t            *face,
  * values for the axis are mapped to the interval [-1,1], with the default
  * axis value mapped to 0.
  *
+ * The normalized values have 14 bits of fixed-point sub-integer precision as per
+ * the OpenType specification.
+ *
  * Any additional scaling defined in the face's `avar` table is also
  * applied, as described at https://docs.microsoft.com/en-us/typography/opentype/spec/avar
  *
diff --git a/thirdparty/harfbuzz/src/hb-ot-var.h b/thirdparty/harfbuzz/src/hb-ot-var.h
index ce201d3b4f..05147cc25e 100644
--- a/thirdparty/harfbuzz/src/hb-ot-var.h
+++ b/thirdparty/harfbuzz/src/hb-ot-var.h
@@ -109,7 +109,7 @@ typedef enum { /*< flags >*/
  * @tag: The #hb_tag_t tag identifying the design variation of the axis
  * @name_id: The `name` table Name ID that provides display names for the axis
  * @flags: The #hb_ot_var_axis_flags_t flags for the axis
- * @min_value: The mininum value on the variation axis that the font covers
+ * @min_value: The minimum value on the variation axis that the font covers
  * @default_value: The position on the variation axis corresponding to the font's defaults
  * @max_value: The maximum value on the variation axis that the font covers
  * 
diff --git a/thirdparty/harfbuzz/src/hb-repacker.hh b/thirdparty/harfbuzz/src/hb-repacker.hh
index 5c46b4cccc..b1726d8beb 100644
--- a/thirdparty/harfbuzz/src/hb-repacker.hh
+++ b/thirdparty/harfbuzz/src/hb-repacker.hh
@@ -42,26 +42,13 @@ struct graph_t
 {
   struct vertex_t
   {
-    vertex_t () :
-        distance (0),
-        space (0),
-        parents (),
-        start (0),
-        end (0),
-        priority(0) {}
-
-    void fini () {
-      obj.fini ();
-      parents.fini ();
-    }
-
     hb_serialize_context_t::object_t obj;
-    int64_t distance;
-    int64_t space;
+    int64_t distance = 0 ;
+    int64_t space = 0 ;
     hb_vector_t<unsigned> parents;
-    unsigned start;
-    unsigned end;
-    unsigned priority;
+    unsigned start = 0;
+    unsigned end = 0;
+    unsigned priority = 0;
 
     bool is_shared () const
     {
@@ -186,7 +173,7 @@ struct graph_t
 
   ~graph_t ()
   {
-    vertices_.fini_deep ();
+    vertices_.fini ();
   }
 
   bool in_error () const
@@ -309,7 +296,7 @@ struct graph_t
     remap_all_obj_indices (id_map, &sorted_graph);
 
     hb_swap (vertices_, sorted_graph);
-    sorted_graph.fini_deep ();
+    sorted_graph.fini ();
   }
 
   /*
@@ -369,7 +356,7 @@ struct graph_t
     remap_all_obj_indices (id_map, &sorted_graph);
 
     hb_swap (vertices_, sorted_graph);
-    sorted_graph.fini_deep ();
+    sorted_graph.fini ();
   }
 
   /*
@@ -402,11 +389,15 @@ struct graph_t
     while (roots)
     {
       unsigned next = HB_SET_VALUE_INVALID;
+      if (unlikely (!check_success (!roots.in_error ()))) break;
       if (!roots.next (&next)) break;
 
       hb_set_t connected_roots;
       find_connected_nodes (next, roots, visited, connected_roots);
+      if (unlikely (!check_success (!connected_roots.in_error ()))) break;
+
       isolate_subgraph (connected_roots);
+      if (unlikely (!check_success (!connected_roots.in_error ()))) break;
 
       unsigned next_space = this->next_space ();
       num_roots_for_space_.push (0);
@@ -423,6 +414,8 @@ struct graph_t
       //                into the 32 bit space as needed, instead of using isolation.
     }
 
+
+
     return true;
   }
 
@@ -865,7 +858,7 @@ struct graph_t
     // Redundant ones are filtered out later on by the visited set.
     // According to https://www3.cs.stonybrook.edu/~rezaul/papers/TR-07-54.pdf
     // for practical performance this is faster then using a more advanced queue
-    // (such as a fibonaacci queue) with a fast decrease priority.
+    // (such as a fibonacci queue) with a fast decrease priority.
     for (unsigned i = 0; i < vertices_.length; i++)
     {
       if (i == vertices_.length - 1)
@@ -1074,6 +1067,7 @@ struct graph_t
                              hb_set_t& visited,
                              hb_set_t& connected)
   {
+    if (unlikely (!check_success (!visited.in_error ()))) return;
     if (visited.has (start_idx)) return;
     visited.add (start_idx);
 
diff --git a/thirdparty/harfbuzz/src/hb-serialize.hh b/thirdparty/harfbuzz/src/hb-serialize.hh
index 823c0be8b5..6615f033c5 100644
--- a/thirdparty/harfbuzz/src/hb-serialize.hh
+++ b/thirdparty/harfbuzz/src/hb-serialize.hh
@@ -279,7 +279,7 @@ struct hb_serialize_context_t
     object_pool.release (obj);
   }
 
-  /* Set share to false when an object is unlikely sharable with others
+  /* Set share to false when an object is unlikely to be shareable with others
    * so not worth an attempt, or a contiguous table is serialized as
    * multiple consecutive objects in the reverse order so can't be shared.
    */
@@ -381,7 +381,7 @@ struct hb_serialize_context_t
   // Adding a virtual link from object a to object b will ensure that object b is always packed after
   // object a in the final serialized order.
   //
-  // This is useful in certain situtations where there needs to be a specific ordering in the
+  // This is useful in certain situations where there needs to be a specific ordering in the
   // final serialization. Such as when platform bugs require certain orderings, or to provide
   //  guidance to the repacker for better offset overflow resolution.
   void add_virtual_link (objidx_t objidx)
@@ -510,7 +510,7 @@ struct hb_serialize_context_t
   { return reinterpret_cast<Type *> (this->head); }
   template <typename Type>
   Type *start_embed (const Type &obj) const
-  { return start_embed (hb_addressof (obj)); }
+  { return start_embed (std::addressof (obj)); }
 
   bool err (hb_serialize_error_t err_type)
   {
@@ -548,7 +548,7 @@ struct hb_serialize_context_t
   }
   template <typename Type>
   Type *embed (const Type &obj)
-  { return embed (hb_addressof (obj)); }
+  { return embed (std::addressof (obj)); }
 
   template <typename Type, typename ...Ts> auto
   _copy (const Type &src, hb_priority<1>, Ts&&... ds) HB_RETURN
@@ -595,19 +595,19 @@ struct hb_serialize_context_t
   }
   template <typename Type>
   Type *extend_size (Type &obj, size_t size)
-  { return extend_size (hb_addressof (obj), size); }
+  { return extend_size (std::addressof (obj), size); }
 
   template <typename Type>
   Type *extend_min (Type *obj) { return extend_size (obj, obj->min_size); }
   template <typename Type>
-  Type *extend_min (Type &obj) { return extend_min (hb_addressof (obj)); }
+  Type *extend_min (Type &obj) { return extend_min (std::addressof (obj)); }
 
   template <typename Type, typename ...Ts>
   Type *extend (Type *obj, Ts&&... ds)
   { return extend_size (obj, obj->get_size (std::forward<Ts> (ds)...)); }
   template <typename Type, typename ...Ts>
   Type *extend (Type &obj, Ts&&... ds)
-  { return extend (hb_addressof (obj), std::forward<Ts> (ds)...); }
+  { return extend (std::addressof (obj), std::forward<Ts> (ds)...); }
 
   /* Output routines. */
   hb_bytes_t copy_bytes () const
diff --git a/thirdparty/harfbuzz/src/hb-style.cc b/thirdparty/harfbuzz/src/hb-style.cc
index f1b44cea53..c0c5c4832c 100644
--- a/thirdparty/harfbuzz/src/hb-style.cc
+++ b/thirdparty/harfbuzz/src/hb-style.cc
@@ -48,13 +48,12 @@ _hb_angle_to_ratio (float a)
 {
   return tanf (a * float (M_PI / 180.));
 }
-#if 0
+
 static inline float
 _hb_ratio_to_angle (float r)
 {
   return atanf (r) * float (180. / M_PI);
 }
-#endif
 
 /**
  * hb_style_get_value:
@@ -73,7 +72,8 @@ float
 hb_style_get_value (hb_font_t *font, hb_style_tag_t style_tag)
 {
   if (unlikely (style_tag == HB_STYLE_TAG_SLANT_RATIO))
-    return _hb_angle_to_ratio (hb_style_get_value (font, HB_STYLE_TAG_SLANT_ANGLE));
+    return _hb_angle_to_ratio (hb_style_get_value (font, HB_STYLE_TAG_SLANT_ANGLE))
+	 + font->slant;
 
   hb_face_t *face = font->face;
 
@@ -109,7 +109,14 @@ hb_style_get_value (hb_font_t *font, hb_style_tag_t style_tag)
 	   : 12.f;
   }
   case HB_STYLE_TAG_SLANT_ANGLE:
-    return face->table.post->table->italicAngle.to_float ();
+  {
+    float angle = face->table.post->table->italicAngle.to_float ();
+
+    if (font->slant)
+      angle = _hb_ratio_to_angle (font->slant + _hb_angle_to_ratio (angle));
+
+    return angle;
+  }
   case HB_STYLE_TAG_WIDTH:
     return face->table.OS2->has_data ()
 	   ? face->table.OS2->get_width ()
diff --git a/thirdparty/harfbuzz/src/hb-subset-cff-common.hh b/thirdparty/harfbuzz/src/hb-subset-cff-common.hh
index 7fd96ca86d..18657705fa 100644
--- a/thirdparty/harfbuzz/src/hb-subset-cff-common.hh
+++ b/thirdparty/harfbuzz/src/hb-subset-cff-common.hh
@@ -275,60 +275,36 @@ struct subr_flattener_t
 
 struct subr_closures_t
 {
-  subr_closures_t () : valid (false), global_closure (nullptr)
-  { local_closures.init (); }
-
-  void init (unsigned int fd_count)
+  subr_closures_t (unsigned int fd_count) : valid (false), global_closure (), local_closures ()
   {
     valid = true;
-    global_closure = hb_set_create ();
-    if (global_closure == hb_set_get_empty ())
-      valid = false;
     if (!local_closures.resize (fd_count))
       valid = false;
-
-    for (unsigned int i = 0; i < local_closures.length; i++)
-    {
-      local_closures[i] = hb_set_create ();
-      if (local_closures[i] == hb_set_get_empty ())
-	valid = false;
-    }
-  }
-
-  void fini ()
-  {
-    hb_set_destroy (global_closure);
-    for (unsigned int i = 0; i < local_closures.length; i++)
-      hb_set_destroy (local_closures[i]);
-    local_closures.fini ();
   }
 
   void reset ()
   {
-    hb_set_clear (global_closure);
+    global_closure.clear();
     for (unsigned int i = 0; i < local_closures.length; i++)
-      hb_set_clear (local_closures[i]);
+      local_closures[i].clear();
   }
 
   bool is_valid () const { return valid; }
   bool  valid;
-  hb_set_t  *global_closure;
-  hb_vector_t<hb_set_t *> local_closures;
+  hb_set_t  global_closure;
+  hb_vector_t<hb_set_t> local_closures;
 };
 
 struct parsed_cs_op_t : op_str_t
 {
   void init (unsigned int subr_num_ = 0)
   {
-    op_str_t::init ();
     subr_num = subr_num_;
     drop_flag = false;
     keep_flag = false;
     skip_flag = false;
   }
 
-  void fini () { op_str_t::fini (); }
-
   bool for_drop () const { return drop_flag; }
   void set_drop ()       { if (!for_keep ()) drop_flag = true; }
 
@@ -416,16 +392,6 @@ struct parsed_cs_str_t : parsed_values_t<parsed_cs_op_t>
 
 struct parsed_cs_str_vec_t : hb_vector_t<parsed_cs_str_t>
 {
-  void init (unsigned int len_ = 0)
-  {
-    SUPER::init ();
-    if (unlikely (!resize (len_)))
-      return;
-    for (unsigned int i = 0; i < length; i++)
-      (*this)[i].init ();
-  }
-  void fini () { SUPER::fini_deep (); }
-
   private:
   typedef hb_vector_t<parsed_cs_str_t> SUPER;
 };
@@ -496,7 +462,7 @@ struct subr_subset_param_t
 
 struct subr_remap_t : hb_inc_bimap_t
 {
-  void create (hb_set_t *closure)
+  void create (const hb_set_t *closure)
   {
     /* create a remapping of subroutine numbers from old to new.
      * no optimization based on usage counts. fonttools doesn't appear doing that either.
@@ -526,19 +492,9 @@ struct subr_remap_t : hb_inc_bimap_t
 
 struct subr_remaps_t
 {
-  subr_remaps_t ()
+  subr_remaps_t (unsigned int fdCount)
   {
-    global_remap.init ();
-    local_remaps.init ();
-  }
-
-  ~subr_remaps_t () { fini (); }
-
-  void init (unsigned int fdCount)
-  {
-    if (unlikely (!local_remaps.resize (fdCount))) return;
-    for (unsigned int i = 0; i < fdCount; i++)
-      local_remaps[i].init ();
+    local_remaps.resize (fdCount);
   }
 
   bool in_error()
@@ -548,15 +504,9 @@ struct subr_remaps_t
 
   void create (subr_closures_t& closures)
   {
-    global_remap.create (closures.global_closure);
+    global_remap.create (&closures.global_closure);
     for (unsigned int i = 0; i < local_remaps.length; i++)
-      local_remaps[i].create (closures.local_closures[i]);
-  }
-
-  void fini ()
-  {
-    global_remap.fini ();
-    local_remaps.fini_deep ();
+      local_remaps[i].create (&closures.local_closures[i]);
   }
 
   subr_remap_t	       global_remap;
@@ -567,21 +517,8 @@ template <typename SUBSETTER, typename SUBRS, typename ACC, typename ENV, typena
 struct subr_subsetter_t
 {
   subr_subsetter_t (ACC &acc_, const hb_subset_plan_t *plan_)
-    : acc (acc_), plan (plan_)
-  {
-    parsed_charstrings.init ();
-    parsed_global_subrs.init ();
-    parsed_local_subrs.init ();
-  }
-
-  ~subr_subsetter_t ()
-  {
-    closures.fini ();
-    remaps.fini ();
-    parsed_charstrings.fini_deep ();
-    parsed_global_subrs.fini_deep ();
-    parsed_local_subrs.fini_deep ();
-  }
+      : acc (acc_), plan (plan_), closures(acc_.fdCount), remaps(acc_.fdCount)
+  {}
 
   /* Subroutine subsetting with --no-desubroutinize runs in phases:
    *
@@ -599,11 +536,8 @@ struct subr_subsetter_t
    */
   bool subset (void)
   {
-    closures.init (acc.fdCount);
-    remaps.init (acc.fdCount);
-
-    parsed_charstrings.init (plan->num_output_glyphs ());
-    parsed_global_subrs.init (acc.globalSubrs->count);
+    parsed_charstrings.resize (plan->num_output_glyphs ());
+    parsed_global_subrs.resize (acc.globalSubrs->count);
 
     if (unlikely (remaps.in_error()
                   || parsed_charstrings.in_error ()
@@ -615,7 +549,7 @@ struct subr_subsetter_t
 
     for (unsigned int i = 0; i < acc.fdCount; i++)
     {
-      parsed_local_subrs[i].init (acc.privateDicts[i].localSubrs->count);
+      parsed_local_subrs[i].resize (acc.privateDicts[i].localSubrs->count);
       if (unlikely (parsed_local_subrs[i].in_error ())) return false;
     }
     if (unlikely (!closures.valid))
@@ -638,7 +572,7 @@ struct subr_subsetter_t
       subr_subset_param_t  param;
       param.init (&parsed_charstrings[i],
 		  &parsed_global_subrs,  &parsed_local_subrs[fd],
-		  closures.global_closure, closures.local_closures[fd],
+		  &closures.global_closure, &closures.local_closures[fd],
 		  plan->flags & HB_SUBSET_FLAGS_NO_HINTING);
 
       if (unlikely (!interp.interpret (param)))
@@ -662,7 +596,7 @@ struct subr_subsetter_t
 	subr_subset_param_t  param;
 	param.init (&parsed_charstrings[i],
 		    &parsed_global_subrs,  &parsed_local_subrs[fd],
-		    closures.global_closure, closures.local_closures[fd],
+		    &closures.global_closure, &closures.local_closures[fd],
                     plan->flags & HB_SUBSET_FLAGS_NO_HINTING);
 
 	drop_hints_param_t  drop;
@@ -687,7 +621,7 @@ struct subr_subsetter_t
 	subr_subset_param_t  param;
 	param.init (&parsed_charstrings[i],
 		    &parsed_global_subrs,  &parsed_local_subrs[fd],
-		    closures.global_closure, closures.local_closures[fd],
+		    &closures.global_closure, &closures.local_closures[fd],
                     plan->flags & HB_SUBSET_FLAGS_NO_HINTING);
 	collect_subr_refs_in_str (parsed_charstrings[i], param);
       }
diff --git a/thirdparty/harfbuzz/src/hb-subset-cff1.cc b/thirdparty/harfbuzz/src/hb-subset-cff1.cc
index b4e24122c9..35fecd67bc 100644
--- a/thirdparty/harfbuzz/src/hb-subset-cff1.cc
+++ b/thirdparty/harfbuzz/src/hb-subset-cff1.cc
@@ -362,43 +362,11 @@ struct cff1_subr_subsetter_t : subr_subsetter_t<cff1_subr_subsetter_t, CFF1Subrs
 
 struct cff_subset_plan {
   cff_subset_plan ()
-    : info (),
-      orig_fdcount (0),
-      subset_fdcount (1),
-      subset_fdselect_format (0),
-      drop_hints (false),
-      desubroutinize(false)
   {
-    topdict_mod.init ();
-    subset_fdselect_ranges.init ();
-    fdmap.init ();
-    subset_charstrings.init ();
-    subset_globalsubrs.init ();
-    subset_localsubrs.init ();
-    fontdicts_mod.init ();
-    subset_enc_code_ranges.init ();
-    subset_enc_supp_codes.init ();
-    subset_charset_ranges.init ();
-    sidmap.init ();
     for (unsigned int i = 0; i < name_dict_values_t::ValCount; i++)
       topDictModSIDs[i] = CFF_UNDEF_SID;
   }
 
-  ~cff_subset_plan ()
-  {
-    topdict_mod.fini ();
-    subset_fdselect_ranges.fini ();
-    fdmap.fini ();
-    subset_charstrings.fini_deep ();
-    subset_globalsubrs.fini_deep ();
-    subset_localsubrs.fini_deep ();
-    fontdicts_mod.fini ();
-    subset_enc_code_ranges.fini ();
-    subset_enc_supp_codes.fini ();
-    subset_charset_ranges.fini ();
-    sidmap.fini ();
-  }
-
   void plan_subset_encoding (const OT::cff1::accelerator_subset_t &acc, hb_subset_plan_t *plan)
   {
     const Encoding *encoding = acc.encoding;
@@ -672,9 +640,9 @@ struct cff_subset_plan {
   cff1_sub_table_info_t		info;
 
   unsigned int    num_glyphs;
-  unsigned int    orig_fdcount;
-  unsigned int    subset_fdcount;
-  unsigned int    subset_fdselect_format;
+  unsigned int    orig_fdcount = 0;
+  unsigned int    subset_fdcount = 1;
+  unsigned int    subset_fdselect_format = 0;
   hb_vector_t<code_pair_t>   subset_fdselect_ranges;
 
   /* font dict index remap table from fullset FDArray to subset FDArray.
@@ -686,7 +654,7 @@ struct cff_subset_plan {
   hb_vector_t<str_buff_vec_t>	subset_localsubrs;
   hb_vector_t<cff1_font_dict_values_mod_t>  fontdicts_mod;
 
-  bool		drop_hints;
+  bool		drop_hints = false;
 
   bool		gid_renum;
   bool		subset_encoding;
@@ -702,7 +670,7 @@ struct cff_subset_plan {
   remap_sid_t	sidmap;
   unsigned int	topDictModSIDs[name_dict_values_t::ValCount];
 
-  bool		desubroutinize;
+  bool		desubroutinize = false;
 };
 
 static bool _serialize_cff1 (hb_serialize_context_t *c,
diff --git a/thirdparty/harfbuzz/src/hb-subset-cff2.cc b/thirdparty/harfbuzz/src/hb-subset-cff2.cc
index 896ae64016..92dd6b1d2c 100644
--- a/thirdparty/harfbuzz/src/hb-subset-cff2.cc
+++ b/thirdparty/harfbuzz/src/hb-subset-cff2.cc
@@ -233,29 +233,6 @@ struct cff2_subr_subsetter_t : subr_subsetter_t<cff2_subr_subsetter_t, CFF2Subrs
 };
 
 struct cff2_subset_plan {
-  cff2_subset_plan ()
-    : orig_fdcount (0),
-      subset_fdcount(1),
-      subset_fdselect_size (0),
-      subset_fdselect_format (0),
-      drop_hints (false),
-      desubroutinize (false)
-  {
-    subset_fdselect_ranges.init ();
-    fdmap.init ();
-    subset_charstrings.init ();
-    subset_globalsubrs.init ();
-    subset_localsubrs.init ();
-  }
-
-  ~cff2_subset_plan ()
-  {
-    subset_fdselect_ranges.fini ();
-    fdmap.fini ();
-    subset_charstrings.fini_deep ();
-    subset_globalsubrs.fini_deep ();
-    subset_localsubrs.fini_deep ();
-  }
 
   bool create (const OT::cff2::accelerator_subset_t &acc,
 	      hb_subset_plan_t *plan)
@@ -320,10 +297,10 @@ struct cff2_subset_plan {
 
   cff2_sub_table_info_t info;
 
-  unsigned int    orig_fdcount;
-  unsigned int    subset_fdcount;
-  unsigned int	  subset_fdselect_size;
-  unsigned int    subset_fdselect_format;
+  unsigned int    orig_fdcount = 0;
+  unsigned int    subset_fdcount = 1;
+  unsigned int	  subset_fdselect_size = 0;
+  unsigned int    subset_fdselect_format = 0;
   hb_vector_t<code_pair_t>   subset_fdselect_ranges;
 
   hb_inc_bimap_t   fdmap;
@@ -332,8 +309,8 @@ struct cff2_subset_plan {
   str_buff_vec_t	    subset_globalsubrs;
   hb_vector_t<str_buff_vec_t> subset_localsubrs;
 
-  bool	    drop_hints;
-  bool	    desubroutinize;
+  bool	    drop_hints = false;
+  bool	    desubroutinize = false;
 };
 
 static bool _serialize_cff2 (hb_serialize_context_t *c,
@@ -473,12 +450,8 @@ _hb_subset_cff2 (const OT::cff2::accelerator_subset_t  &acc,
 bool
 hb_subset_cff2 (hb_subset_context_t *c)
 {
-  OT::cff2::accelerator_subset_t acc;
-  acc.init (c->plan->source);
-  bool result = likely (acc.is_valid ()) && _hb_subset_cff2 (acc, c);
-  acc.fini ();
-
-  return result;
+  OT::cff2::accelerator_subset_t acc (c->plan->source);
+  return acc.is_valid () && _hb_subset_cff2 (acc, c);
 }
 
 #endif
diff --git a/thirdparty/harfbuzz/src/hb-subset-plan.cc b/thirdparty/harfbuzz/src/hb-subset-plan.cc
index 883ab82093..af4fcb8137 100644
--- a/thirdparty/harfbuzz/src/hb-subset-plan.cc
+++ b/thirdparty/harfbuzz/src/hb-subset-plan.cc
@@ -228,10 +228,8 @@ _cmap_closure (hb_face_t	   *face,
 	       const hb_set_t	   *unicodes,
 	       hb_set_t		   *glyphset)
 {
-  OT::cmap::accelerator_t cmap;
-  cmap.init (face);
+  OT::cmap::accelerator_t cmap (face);
   cmap.table->closure_glyphs (unicodes, glyphset);
-  cmap.fini ();
 }
 
 static void _colr_closure (hb_face_t *face,
@@ -239,8 +237,7 @@ static void _colr_closure (hb_face_t *face,
                            hb_map_t *palettes_map,
                            hb_set_t *glyphs_colred)
 {
-  OT::COLR::accelerator_t colr;
-  colr.init (face);
+  OT::COLR::accelerator_t colr (face);
   if (!colr.is_valid ()) return;
 
   unsigned iteration_count = 0;
@@ -263,7 +260,6 @@ static void _colr_closure (hb_face_t *face,
   colr.closure_V0palette_indices (glyphs_colred, &palette_indices);
   _remap_indexes (&layer_indices, layers_map);
   _remap_palette_indexes (&palette_indices, palettes_map);
-  colr.fini ();
 }
 
 static inline void
@@ -294,8 +290,7 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
                               const hb_set_t *glyphs,
                               hb_subset_plan_t *plan)
 {
-  OT::cmap::accelerator_t cmap;
-  cmap.init (plan->source);
+  OT::cmap::accelerator_t cmap (plan->source);
 
   constexpr static const int size_threshold = 4096;
 
@@ -343,8 +338,6 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
 
   + plan->codepoint_to_glyph->keys ()   | hb_sink (plan->unicodes);
   + plan->codepoint_to_glyph->values () | hb_sink (plan->_glyphset_gsub);
-
-  cmap.fini ();
 }
 
 static void
@@ -353,13 +346,9 @@ _populate_gids_to_retain (hb_subset_plan_t* plan,
 			  bool close_over_gpos,
 			  bool close_over_gdef)
 {
-  OT::glyf::accelerator_t glyf;
-#ifndef HB_NO_SUBSET_CFF
-  OT::cff1::accelerator_t cff;
-#endif
-  glyf.init (plan->source);
+  OT::glyf::accelerator_t glyf (plan->source);
 #ifndef HB_NO_SUBSET_CFF
-  cff.init (plan->source);
+  OT::cff1::accelerator_t cff (plan->source);
 #endif
 
   plan->_glyphset_gsub->add (0); // Not-def
@@ -419,11 +408,6 @@ _populate_gids_to_retain (hb_subset_plan_t* plan,
 				       plan->layout_variation_indices,
 				       plan->layout_variation_idx_map);
 #endif
-
-#ifndef HB_NO_SUBSET_CFF
-  cff.fini ();
-#endif
-  glyf.fini ();
 }
 
 static void
diff --git a/thirdparty/harfbuzz/src/hb-uniscribe.cc b/thirdparty/harfbuzz/src/hb-uniscribe.cc
index 0e5a114f7d..50f71ce9ce 100644
--- a/thirdparty/harfbuzz/src/hb-uniscribe.cc
+++ b/thirdparty/harfbuzz/src/hb-uniscribe.cc
@@ -878,7 +878,8 @@ retry:
   if (backward)
     hb_buffer_reverse (buffer);
 
-  buffer->clear_glyph_flags (HB_GLYPH_FLAG_UNSAFE_TO_BREAK);
+  buffer->clear_glyph_flags ();
+  buffer->unsafe_to_break ();
 
   /* Wow, done! */
   return true;
diff --git a/thirdparty/harfbuzz/src/hb-vector.hh b/thirdparty/harfbuzz/src/hb-vector.hh
index b0a1e5e966..6c7d32e49d 100644
--- a/thirdparty/harfbuzz/src/hb-vector.hh
+++ b/thirdparty/harfbuzz/src/hb-vector.hh
@@ -32,11 +32,14 @@
 #include "hb-null.hh"
 
 
-template <typename Type>
-struct hb_vector_t
+template <typename Type,
+	  bool sorted=false>
+struct hb_vector_t : std::conditional<sorted, hb_vector_t<Type, false>, hb_empty_t>::type
 {
   typedef Type item_t;
   static constexpr unsigned item_size = hb_static_size (Type);
+  using array_t = typename std::conditional<sorted, hb_sorted_array_t<Type>, hb_array_t<Type>>::type;
+  using c_array_t = typename std::conditional<sorted, hb_sorted_array_t<const Type>, hb_array_t<const Type>>::type;
 
   hb_vector_t () = default;
   hb_vector_t (std::initializer_list<Type> lst) : hb_vector_t ()
@@ -82,16 +85,10 @@ struct hb_vector_t
 
   void fini ()
   {
+    shrink_vector (0);
     hb_free (arrayZ);
     init ();
   }
-  void fini_deep ()
-  {
-    unsigned int count = length;
-    for (unsigned int i = 0; i < count; i++)
-      arrayZ[i].fini ();
-    fini ();
-  }
 
   void reset ()
   {
@@ -152,24 +149,24 @@ struct hb_vector_t
   template <typename T>
   hb_vector_t& operator << (T&& v) { push (std::forward<T> (v)); return *this; }
 
-  hb_array_t<      Type> as_array ()       { return hb_array (arrayZ, length); }
-  hb_array_t<const Type> as_array () const { return hb_array (arrayZ, length); }
+  array_t   as_array ()       { return hb_array (arrayZ, length); }
+  c_array_t as_array () const { return hb_array (arrayZ, length); }
 
   /* Iterator. */
-  typedef hb_array_t<const Type>   iter_t;
-  typedef hb_array_t<      Type> writer_t;
+  typedef c_array_t   iter_t;
+  typedef array_t   writer_t;
     iter_t   iter () const { return as_array (); }
   writer_t writer ()       { return as_array (); }
   operator   iter_t () const { return   iter (); }
   operator writer_t ()       { return writer (); }
 
-  hb_array_t<const Type> sub_array (unsigned int start_offset, unsigned int count) const
+  c_array_t sub_array (unsigned int start_offset, unsigned int count) const
   { return as_array ().sub_array (start_offset, count); }
-  hb_array_t<const Type> sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */) const
+  c_array_t sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */) const
   { return as_array ().sub_array (start_offset, count); }
-  hb_array_t<Type> sub_array (unsigned int start_offset, unsigned int count)
+  array_t sub_array (unsigned int start_offset, unsigned int count)
   { return as_array ().sub_array (start_offset, count); }
-  hb_array_t<Type> sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */)
+  array_t sub_array (unsigned int start_offset, unsigned int *count = nullptr /* IN/OUT */)
   { return as_array ().sub_array (start_offset, count); }
 
   hb_sorted_array_t<Type> as_sorted_array ()
@@ -192,6 +189,7 @@ struct hb_vector_t
   template <typename T>
   Type *push (T&& v)
   {
+    /* TODO Emplace? */
     Type *p = push ();
     if (p == &Crap (Type))
       // If push failed to allocate then don't copy v, since this may cause
@@ -204,6 +202,92 @@ struct hb_vector_t
 
   bool in_error () const { return allocated < 0; }
 
+  template <typename T = Type, /* realloc_vector, trivially copy-assignable Type: resize the buffer with hb_realloc. */
+	    hb_enable_if (std::is_trivially_copy_assignable<T>::value)>
+  Type *
+  realloc_vector (unsigned new_allocated)
+  {
+    return (Type *) hb_realloc (arrayZ, new_allocated * sizeof (Type)); /* returns the new buffer; arrayZ itself is not assigned here */
+  }
+  template <typename T = Type, /* realloc_vector, other Types: allocate a fresh buffer and move the elements across one by one. */
+	    hb_enable_if (!std::is_trivially_copy_assignable<T>::value)>
+  Type *
+  realloc_vector (unsigned new_allocated)
+  {
+    Type *new_array = (Type *) hb_malloc (new_allocated * sizeof (Type));
+    if (likely (new_array))
+    {
+      for (unsigned i = 0; i < length; i++) /* default-construct the first `length` slots of the new buffer... */
+	new (std::addressof (new_array[i])) Type ();
+      for (unsigned i = 0; i < (unsigned) length; i++) /* ...so they can be move-assigned from the old elements */
+	new_array[i] = std::move (arrayZ[i]);
+      unsigned old_length = length;
+      shrink_vector (0); /* run shrink_vector over the old array; this leaves length at 0 */
+      length = old_length; /* restore the element count, which now describes new_array */
+      hb_free (arrayZ);
+    }
+    return new_array; /* null when the allocation failed; arrayZ is left as it was in that case */
+  }
+
+  template <typename T = Type, /* grow_vector, Types that are trivially constructible or not default-constructible: zero-fill the new tail. */
+	    hb_enable_if (std::is_trivially_constructible<T>::value ||
+			  !std::is_default_constructible<T>::value)>
+  void
+  grow_vector (unsigned size)
+  {
+    memset (arrayZ + length, 0, (size - length) * sizeof (*arrayZ)); /* zero the slots in [length, size) */
+    length = size;
+  }
+  template <typename T = Type, /* grow_vector, non-trivially but default-constructible Types: construct each new slot in place. */
+	    hb_enable_if (!std::is_trivially_constructible<T>::value &&
+			   std::is_default_constructible<T>::value)>
+  void
+  grow_vector (unsigned size)
+  {
+    while (length < size)
+    {
+      length++;
+      new (std::addressof (arrayZ[length - 1])) Type (); /* placement-new into the slot just added to the length */
+    }
+  }
+
+  template <typename T = Type, /* shrink_vector, trivially destructible Type: no destructors to run. */
+	    hb_enable_if (std::is_trivially_destructible<T>::value)>
+  void
+  shrink_vector (unsigned size)
+  {
+    length = size; /* unconditional store; the callers visible in this file pass size <= length */
+  }
+  template <typename T = Type, /* shrink_vector, other Types: destroy elements from the back until `size` remain. */
+	    hb_enable_if (!std::is_trivially_destructible<T>::value)>
+  void
+  shrink_vector (unsigned size)
+  {
+    while ((unsigned) length > size)
+    {
+      arrayZ[(unsigned) length - 1].~Type (); /* explicit destructor call on the current last element */
+      length--;
+    }
+  }
+
+  template <typename T = Type, /* shift_down_vector, trivially copy-assignable Type: move elements [i, length) one slot down with memmove. */
+	    hb_enable_if (std::is_trivially_copy_assignable<T>::value)>
+  void
+  shift_down_vector (unsigned i)
+  {
+    memmove (static_cast<void *> (&arrayZ[i - 1]), /* destination starts at i - 1, so i must be >= 1; length is not changed here */
+	     static_cast<void *> (&arrayZ[i]),
+	     (length - i) * sizeof (Type));
+  }
+  template <typename T = Type, /* shift_down_vector, other Types: same shift, done with element-wise move-assignment. */
+	    hb_enable_if (!std::is_trivially_copy_assignable<T>::value)>
+  void
+  shift_down_vector (unsigned i)
+  {
+    for (; i < length; i++)
+      arrayZ[i - 1] = std::move (arrayZ[i]); /* the last slot is left moved-from; length is not changed here */
+  }
+
   /* Allocate for size but don't adjust length. */
   bool alloc (unsigned int size)
   {
@@ -225,7 +309,7 @@ struct hb_vector_t
       (new_allocated < (unsigned) allocated) ||
       hb_unsigned_mul_overflows (new_allocated, sizeof (Type));
     if (likely (!overflows))
-      new_array = (Type *) hb_realloc (arrayZ, new_allocated * sizeof (Type));
+      new_array = realloc_vector (new_allocated);
 
     if (unlikely (!new_array))
     {
@@ -246,7 +330,9 @@ struct hb_vector_t
       return false;
 
     if (size > length)
-      memset (arrayZ + length, 0, (size - length) * sizeof (*arrayZ));
+      grow_vector (size);
+    else if (size < length)
+      shrink_vector (size);
 
     length = size;
     return true;
@@ -255,48 +341,38 @@ struct hb_vector_t
   Type pop ()
   {
     if (!length) return Null (Type);
-    return std::move (arrayZ[--length]); /* Does this move actually work? */
+    Type v = std::move (arrayZ[length - 1]); /* move the last element into a local before touching its slot */
+    arrayZ[length - 1].~Type (); /* destroy the moved-from slot explicitly; the buffer itself is kept */
+    length--;
+    return v;
   }
 
   void remove (unsigned int i)
   {
     if (unlikely (i >= length))
       return;
-    memmove (static_cast<void *> (&arrayZ[i]),
-	     static_cast<void *> (&arrayZ[i + 1]),
-	     (length - i - 1) * sizeof (Type));
+    shift_down_vector (i + 1); /* shift [i + 1, length) down over slot i while slot i is still a live object (move-assignment target) */
+    arrayZ[length - 1].~Type (); /* then destroy the vacated last slot, so no element is assigned after destruction or left undestroyed */
     length--;
   }
 
   void shrink (int size_)
   {
     unsigned int size = size_ < 0 ? 0u : (unsigned int) size_;
-     if (size < length)
-       length = size;
-  }
+    if (size >= length)
+      return;
 
-  template <typename T>
-  Type *find (T v)
-  {
-    for (unsigned int i = 0; i < length; i++)
-      if (arrayZ[i] == v)
-	return &arrayZ[i];
-    return nullptr;
-  }
-  template <typename T>
-  const Type *find (T v) const
-  {
-    for (unsigned int i = 0; i < length; i++)
-      if (arrayZ[i] == v)
-	return &arrayZ[i];
-    return nullptr;
+    shrink_vector (size);
   }
 
+
+  /* Sorting API. */
   void qsort (int (*cmp)(const void*, const void*))
   { as_array ().qsort (cmp); }
   void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1)
   { as_array ().qsort (start, end); }
 
+  /* Unsorted search API. */
   template <typename T>
   Type *lsearch (const T &x, Type *not_found = nullptr)
   { return as_array ().lsearch (x, not_found); }
@@ -306,47 +382,25 @@ struct hb_vector_t
   template <typename T>
   bool lfind (const T &x, unsigned *pos = nullptr) const
   { return as_array ().lfind (x, pos); }
-};
 
-template <typename Type>
-struct hb_sorted_vector_t : hb_vector_t<Type>
-{
-  hb_sorted_vector_t () = default;
-  ~hb_sorted_vector_t () = default;
-  hb_sorted_vector_t (hb_sorted_vector_t& o) = default;
-  hb_sorted_vector_t (hb_sorted_vector_t &&o) = default;
-  hb_sorted_vector_t (std::initializer_list<Type> lst) : hb_vector_t<Type> (lst) {}
-  template <typename Iterable,
-	    hb_requires (hb_is_iterable (Iterable))>
-  hb_sorted_vector_t (const Iterable &o) : hb_vector_t<Type> (o) {}
-  hb_sorted_vector_t& operator = (const hb_sorted_vector_t &o) = default;
-  hb_sorted_vector_t& operator = (hb_sorted_vector_t &&o) = default;
-  friend void swap (hb_sorted_vector_t& a, hb_sorted_vector_t& b)
-  { hb_swap ((hb_vector_t<Type>&) (a), (hb_vector_t<Type>&) (b)); }
-
-  hb_sorted_array_t<      Type> as_array ()       { return hb_sorted_array (this->arrayZ, this->length); }
-  hb_sorted_array_t<const Type> as_array () const { return hb_sorted_array (this->arrayZ, this->length); }
-
-  /* Iterator. */
-  typedef hb_sorted_array_t<const Type> const_iter_t;
-  typedef hb_sorted_array_t<      Type>       iter_t;
-  const_iter_t  iter () const { return as_array (); }
-  const_iter_t citer () const { return as_array (); }
-	iter_t  iter ()       { return as_array (); }
-  operator       iter_t ()       { return iter (); }
-  operator const_iter_t () const { return iter (); }
-
-  template <typename T>
+  /* Sorted search API. */
+  template <typename T,
+	    bool Sorted=sorted, hb_enable_if (Sorted)>
   Type *bsearch (const T &x, Type *not_found = nullptr)
   { return as_array ().bsearch (x, not_found); }
-  template <typename T>
+  template <typename T,
+	    bool Sorted=sorted, hb_enable_if (Sorted)>
   const Type *bsearch (const T &x, const Type *not_found = nullptr) const
   { return as_array ().bsearch (x, not_found); }
-  template <typename T>
+  template <typename T,
+	    bool Sorted=sorted, hb_enable_if (Sorted)>
   bool bfind (const T &x, unsigned int *i = nullptr,
 	      hb_not_found_t not_found = HB_NOT_FOUND_DONT_STORE,
 	      unsigned int to_store = (unsigned int) -1) const
   { return as_array ().bfind (x, i, not_found, to_store); }
 };
 
+template <typename Type>
+using hb_sorted_vector_t = hb_vector_t<Type, true>;
+
 #endif /* HB_VECTOR_HH */
diff --git a/thirdparty/harfbuzz/src/hb-version.h b/thirdparty/harfbuzz/src/hb-version.h
index 52b124b745..91ccb3dcde 100644
--- a/thirdparty/harfbuzz/src/hb-version.h
+++ b/thirdparty/harfbuzz/src/hb-version.h
@@ -47,20 +47,20 @@ HB_BEGIN_DECLS
  *
  * The minor component of the library version available at compile-time.
  */
-#define HB_VERSION_MINOR 2
+#define HB_VERSION_MINOR 3
 /**
  * HB_VERSION_MICRO:
  *
  * The micro component of the library version available at compile-time.
  */
-#define HB_VERSION_MICRO 0
+#define HB_VERSION_MICRO 1
 
 /**
  * HB_VERSION_STRING:
  *
  * A string literal containing the library version available at compile-time.
  */
-#define HB_VERSION_STRING "3.2.0"
+#define HB_VERSION_STRING "3.3.1"
 
 /**
  * HB_VERSION_ATLEAST:
diff --git a/thirdparty/libwebp/AUTHORS b/thirdparty/libwebp/AUTHORS
index 30abde0326..8307c2099d 100644
--- a/thirdparty/libwebp/AUTHORS
+++ b/thirdparty/libwebp/AUTHORS
@@ -32,6 +32,7 @@ Contributors:
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Roberto Alanis (alanisbaez at google dot com)
 - Sam Clegg (sbc at chromium dot org)
 - Scott Hancher (seh at google dot com)
 - Scott LaVarnway (slavarnway at google dot com)
diff --git a/thirdparty/libwebp/src/dec/vp8_dec.c b/thirdparty/libwebp/src/dec/vp8_dec.c
index 5f405e4c2a..2003935ec4 100644
--- a/thirdparty/libwebp/src/dec/vp8_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8_dec.c
@@ -403,7 +403,7 @@ static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+// See section 13-2: https://datatracker.ietf.org/doc/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
   int v;
   if (!VP8GetBit(br, p[3], "coeffs")) {
diff --git a/thirdparty/libwebp/src/dec/vp8i_dec.h b/thirdparty/libwebp/src/dec/vp8i_dec.h
index 20526a87c4..9af22f8cc6 100644
--- a/thirdparty/libwebp/src/dec/vp8i_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8i_dec.h
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 1
 #define DEC_MIN_VERSION 2
-#define DEC_REV_VERSION 1
+#define DEC_REV_VERSION 2
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
diff --git a/thirdparty/libwebp/src/dec/vp8l_dec.c b/thirdparty/libwebp/src/dec/vp8l_dec.c
index 73c3b54fff..78db014030 100644
--- a/thirdparty/libwebp/src/dec/vp8l_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8l_dec.c
@@ -84,7 +84,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // to 256 (green component values) + 24 (length prefix values)
 // + color_cache_size (between 0 and 2048).
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
-// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+// https://github.com/madler/zlib/blob/v1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
 static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 654,
diff --git a/thirdparty/libwebp/src/demux/anim_decode.c b/thirdparty/libwebp/src/demux/anim_decode.c
index 2bf4dcffe0..e077ffb536 100644
--- a/thirdparty/libwebp/src/demux/anim_decode.c
+++ b/thirdparty/libwebp/src/demux/anim_decode.c
@@ -23,6 +23,14 @@
 
 #define NUM_CHANNELS 4
 
+// Channel extraction from a uint32_t representation of a uint8_t RGBA/BGRA
+// buffer.
+#ifdef WORDS_BIGENDIAN
+#define CHANNEL_SHIFT(i) (24 - (i) * 8)
+#else
+#define CHANNEL_SHIFT(i) ((i) * 8)
+#endif
+
 typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
 static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels);
@@ -209,35 +217,35 @@ static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
   const uint8_t dst_channel = (dst >> shift) & 0xff;
   const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
   assert(blend_unscaled < (1ULL << 32) / scale);
-  return (blend_unscaled * scale) >> 24;
+  return (blend_unscaled * scale) >> CHANNEL_SHIFT(3);
 }
 
 // Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
 static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
 
   if (src_a == 0) {
     return dst;
   } else {
-    const uint8_t dst_a = (dst >> 24) & 0xff;
+    const uint8_t dst_a = (dst >> CHANNEL_SHIFT(3)) & 0xff;
     // This is the approximate integer arithmetic for the actual formula:
     // dst_factor_a = (dst_a * (255 - src_a)) / 255.
     const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
     const uint8_t blend_a = src_a + dst_factor_a;
     const uint32_t scale = (1UL << 24) / blend_a;
 
-    const uint8_t blend_r =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
-    const uint8_t blend_g =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
-    const uint8_t blend_b =
-        BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
+    const uint8_t blend_r = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(0));
+    const uint8_t blend_g = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(1));
+    const uint8_t blend_b = BlendChannelNonPremult(
+        src, src_a, dst, dst_factor_a, scale, CHANNEL_SHIFT(2));
     assert(src_a + dst_factor_a < 256);
 
-    return (blend_r << 0) |
-           (blend_g << 8) |
-           (blend_b << 16) |
-           ((uint32_t)blend_a << 24);
+    return ((uint32_t)blend_r << CHANNEL_SHIFT(0)) |
+           ((uint32_t)blend_g << CHANNEL_SHIFT(1)) |
+           ((uint32_t)blend_b << CHANNEL_SHIFT(2)) |
+           ((uint32_t)blend_a << CHANNEL_SHIFT(3));
   }
 }
 
@@ -247,7 +255,7 @@ static void BlendPixelRowNonPremult(uint32_t* const src,
                                     const uint32_t* const dst, int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelNonPremult(src[i], dst[i]);
     }
@@ -264,7 +272,7 @@ static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
 
 // Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
 static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
-  const uint8_t src_a = (src >> 24) & 0xff;
+  const uint8_t src_a = (src >> CHANNEL_SHIFT(3)) & 0xff;
   return src + ChannelwiseMultiply(dst, 256 - src_a);
 }
 
@@ -274,7 +282,7 @@ static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
                                  int num_pixels) {
   int i;
   for (i = 0; i < num_pixels; ++i) {
-    const uint8_t src_alpha = (src[i] >> 24) & 0xff;
+    const uint8_t src_alpha = (src[i] >> CHANNEL_SHIFT(3)) & 0xff;
     if (src_alpha != 0xff) {
       src[i] = BlendPixelPremult(src[i], dst[i]);
     }
diff --git a/thirdparty/libwebp/src/demux/demux.c b/thirdparty/libwebp/src/demux/demux.c
index 547a7725de..f04a2b8450 100644
--- a/thirdparty/libwebp/src/demux/demux.c
+++ b/thirdparty/libwebp/src/demux/demux.c
@@ -25,7 +25,7 @@
 
 #define DMUX_MAJ_VERSION 1
 #define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 1
+#define DMUX_REV_VERSION 2
 
 typedef struct {
   size_t start_;        // start location of the data
diff --git a/thirdparty/libwebp/src/dsp/dsp.h b/thirdparty/libwebp/src/dsp/dsp.h
index 513e159bb3..c4f57e4d5b 100644
--- a/thirdparty/libwebp/src/dsp/dsp.h
+++ b/thirdparty/libwebp/src/dsp/dsp.h
@@ -119,7 +119,12 @@ extern "C" {
 #define WEBP_USE_NEON
 #endif
 
-#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+// Note: ARM64 is supported in Visual Studio 2017, but requires the direct
+// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in
+// arm_neon.h.
+#if defined(_MSC_VER) && \
+  ((_MSC_VER >= 1700 && defined(_M_ARM)) || \
+   (_MSC_VER >= 1920 && defined(_M_ARM64)))
 #define WEBP_USE_NEON
 #define WEBP_USE_INTRINSICS
 #endif
diff --git a/thirdparty/libwebp/src/dsp/enc_neon.c b/thirdparty/libwebp/src/dsp/enc_neon.c
index 43bf1245c5..601962ba76 100644
--- a/thirdparty/libwebp/src/dsp/enc_neon.c
+++ b/thirdparty/libwebp/src/dsp/enc_neon.c
@@ -9,7 +9,7 @@
 //
 // ARM NEON version of speed-critical encoding functions.
 //
-// adapted from libvpx (http://www.webmproject.org/code/)
+// adapted from libvpx (https://www.webmproject.org/code/)
 
 #include "src/dsp/dsp.h"
 
diff --git a/thirdparty/libwebp/src/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c
index d8bbb02b35..84a54296fd 100644
--- a/thirdparty/libwebp/src/dsp/lossless.c
+++ b/thirdparty/libwebp/src/dsp/lossless.c
@@ -107,63 +107,77 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
 
-uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)top;
-  return left;
+  return *left;
 }
-uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[0];
 }
-uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[1];
 }
-uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   (void)left;
   return top[-1];
 }
-uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average3(*left, top[0], top[1]);
   return pred;
 }
-uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top) {
+  const uint32_t pred = Average2(*left, top[0]);
   return pred;
 }
-uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Average4(*left, top[-1], top[0], top[1]);
   return pred;
 }
-uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], *left, top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(*left, top[0], top[-1]);
   return pred;
 }
-uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(*left, top[0], top[-1]);
   return pred;
 }
 
diff --git a/thirdparty/libwebp/src/dsp/lossless.h b/thirdparty/libwebp/src/dsp/lossless.h
index ebd316d1ed..c26c6bca07 100644
--- a/thirdparty/libwebp/src/dsp/lossless.h
+++ b/thirdparty/libwebp/src/dsp/lossless.h
@@ -28,23 +28,38 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Decoding
 
-typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
+                                      const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
 
-uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top);
-uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor2_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor3_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor4_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor5_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor6_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor7_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor8_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor9_C(const uint32_t* const left,
+                          const uint32_t* const top);
+uint32_t VP8LPredictor10_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor11_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor12_C(const uint32_t* const left,
+                           const uint32_t* const top);
+uint32_t VP8LPredictor13_C(const uint32_t* const left,
+                           const uint32_t* const top);
 
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
diff --git a/thirdparty/libwebp/src/dsp/lossless_common.h b/thirdparty/libwebp/src/dsp/lossless_common.h
index 96a106f9ee..6a2f736b5e 100644
--- a/thirdparty/libwebp/src/dsp/lossless_common.h
+++ b/thirdparty/libwebp/src/dsp/lossless_common.h
@@ -179,7 +179,7 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
   int x;                                                             \
   assert(upper != NULL);                                             \
   for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x);        \
+    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);       \
     out[x] = VP8LAddPixels(in[x], pred);                             \
   }                                                                  \
 }
diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c
index c3e8537ade..1580631e38 100644
--- a/thirdparty/libwebp/src/dsp/lossless_enc.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc.c
@@ -745,7 +745,7 @@ static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
   assert(upper != NULL);                                                   \
   for (x = 0; x < num_pixels; ++x) {                                       \
     const uint32_t pred =                                                  \
-        VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x);              \
+        VP8LPredictor##PREDICTOR_I##_C(&in[x - 1], upper + x);             \
     out[x] = VP8LSubPixels(in[x], pred);                                   \
   }                                                                        \
 }
diff --git a/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 9888854d57..bfe5ea6b38 100644
--- a/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -188,46 +188,51 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return Average2(Average2(a0, a1), Average2(a2, a3));
 }
 
-static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average3(left, top[0], top[1]);
+static uint32_t Predictor5_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average3(*left, top[0], top[1]);
 }
 
-static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[-1]);
+static uint32_t Predictor6_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[-1]);
 }
 
-static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
-  return Average2(left, top[0]);
+static uint32_t Predictor7_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
+  return Average2(*left, top[0]);
 }
 
-static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[-1], top[0]);
 }
 
-static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_MIPSdspR2(const uint32_t* const left,
+                                     const uint32_t* const top) {
   (void)left;
   return Average2(top[0], top[1]);
 }
 
-static uint32_t Predictor10_MIPSdspR2(uint32_t left,
+static uint32_t Predictor10_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Average4(left, top[-1], top[0], top[1]);
+  return Average4(*left, top[-1], top[0], top[1]);
 }
 
-static uint32_t Predictor11_MIPSdspR2(uint32_t left,
+static uint32_t Predictor11_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return Select(top[0], left, top[-1]);
+  return Select(top[0], *left, top[-1]);
 }
 
-static uint32_t Predictor12_MIPSdspR2(uint32_t left,
+static uint32_t Predictor12_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractFull(left, top[0], top[-1]);
+  return ClampedAddSubtractFull(*left, top[0], top[-1]);
 }
 
-static uint32_t Predictor13_MIPSdspR2(uint32_t left,
+static uint32_t Predictor13_MIPSdspR2(const uint32_t* const left,
                                       const uint32_t* const top) {
-  return ClampedAddSubtractHalf(left, top[0], top[-1]);
+  return ClampedAddSubtractHalf(*left, top[0], top[-1]);
 }
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
diff --git a/thirdparty/libwebp/src/dsp/lossless_neon.c b/thirdparty/libwebp/src/dsp/lossless_neon.c
index 76a1b6f873..89e3e013a0 100644
--- a/thirdparty/libwebp/src/dsp/lossless_neon.c
+++ b/thirdparty/libwebp/src/dsp/lossless_neon.c
@@ -188,17 +188,21 @@ static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
   return avg;
 }
 
-static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
-  return Average3_NEON(left, top[0], top[1]);
+static uint32_t Predictor5_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average3_NEON(*left, top[0], top[1]);
 }
-static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[-1]);
+static uint32_t Predictor6_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[-1]);
 }
-static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
-  return Average2_NEON(left, top[0]);
+static uint32_t Predictor7_NEON(const uint32_t* const left,
+                                const uint32_t* const top) {
+  return Average2_NEON(*left, top[0]);
 }
-static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
-  return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+static uint32_t Predictor13_NEON(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  return ClampedAddSubtractHalf_NEON(*left, top[0], top[-1]);
 }
 
 // Batch versions of those functions.
diff --git a/thirdparty/libwebp/src/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c
index 3a0eb440db..396cb0bdfc 100644
--- a/thirdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c
@@ -138,42 +138,51 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
   return output;
 }
 
-static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
+static uint32_t Predictor5_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average3_SSE2(*left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[-1]);
+static uint32_t Predictor6_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2_SSE2(left, top[0]);
+static uint32_t Predictor7_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
+  const uint32_t pred = Average2_SSE2(*left, top[0]);
   return pred;
 }
-static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(const uint32_t* const left,
+                                const uint32_t* const top) {
   const uint32_t pred = Average2_SSE2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
+static uint32_t Predictor10_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Average4_SSE2(*left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
+static uint32_t Predictor11_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = Select_SSE2(top[0], *left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor12_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull_SSE2(*left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
+static uint32_t Predictor13_SSE2(const uint32_t* const left,
+                                 const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf_SSE2(*left, top[0], top[-1]);
   return pred;
 }
 
diff --git a/thirdparty/libwebp/src/dsp/msa_macro.h b/thirdparty/libwebp/src/dsp/msa_macro.h
index de026a1d9e..51f6c643ab 100644
--- a/thirdparty/libwebp/src/dsp/msa_macro.h
+++ b/thirdparty/libwebp/src/dsp/msa_macro.h
@@ -14,6 +14,10 @@
 #ifndef WEBP_DSP_MSA_MACRO_H_
 #define WEBP_DSP_MSA_MACRO_H_
 
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
 #include <stdint.h>
 #include <msa.h>
 
@@ -1389,4 +1393,5 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
 } while (0)
 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 
+#endif  // WEBP_USE_MSA
 #endif  // WEBP_DSP_MSA_MACRO_H_
diff --git a/thirdparty/libwebp/src/dsp/neon.h b/thirdparty/libwebp/src/dsp/neon.h
index aa1dea1301..c591f9b9a7 100644
--- a/thirdparty/libwebp/src/dsp/neon.h
+++ b/thirdparty/libwebp/src/dsp/neon.h
@@ -12,10 +12,12 @@
 #ifndef WEBP_DSP_NEON_H_
 #define WEBP_DSP_NEON_H_
 
-#include <arm_neon.h>
-
 #include "src/dsp/dsp.h"
 
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
 // Right now, some intrinsics functions seem slower, so we disable them
 // everywhere except newer clang/gcc or aarch64 where the inline assembly is
 // incompatible.
@@ -98,4 +100,5 @@ static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
 } while (0)
 #endif
 
+#endif  // WEBP_USE_NEON
 #endif  // WEBP_DSP_NEON_H_
diff --git a/thirdparty/libwebp/src/dsp/yuv.h b/thirdparty/libwebp/src/dsp/yuv.h
index c12be1d094..66a397d117 100644
--- a/thirdparty/libwebp/src/dsp/yuv.h
+++ b/thirdparty/libwebp/src/dsp/yuv.h
@@ -10,7 +10,7 @@
 // inline YUV<->RGB conversion function
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
-// More information at: http://en.wikipedia.org/wiki/YCbCr
+// More information at: https://en.wikipedia.org/wiki/YCbCr
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
diff --git a/thirdparty/libwebp/src/enc/frame_enc.c b/thirdparty/libwebp/src/enc/frame_enc.c
index af538d83ba..b93d9e5b99 100644
--- a/thirdparty/libwebp/src/enc/frame_enc.c
+++ b/thirdparty/libwebp/src/enc/frame_enc.c
@@ -778,6 +778,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
   // Roughly refresh the proba eight times per pass
   int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
   int num_pass_left = enc->config_->pass;
+  int remaining_progress = 40;  // percents
   const int do_search = enc->do_search_;
   VP8EncIterator it;
   VP8EncProba* const proba = &enc->proba_;
@@ -805,6 +806,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     uint64_t size_p0 = 0;
     uint64_t distortion = 0;
     int cnt = max_count;
+    // The final number of passes is not trivial to know in advance.
+    const int pass_progress = remaining_progress / (2 + num_pass_left);
+    remaining_progress -= pass_progress;
     VP8IteratorInit(enc, &it);
     SetLoopParams(enc, stats.q);
     if (is_last_pass) {
@@ -832,7 +836,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
         StoreSideInfo(&it);
         VP8StoreFilterStats(&it);
         VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
+        ok = VP8IteratorProgress(&it, pass_progress);
       }
       VP8IteratorSaveBoundary(&it);
     } while (ok && VP8IteratorNext(&it));
@@ -878,7 +882,8 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
                        (const uint8_t*)proba->coeffs_, 1);
   }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + remaining_progress,
+                                &enc->percent_);
   return PostLoopFinalize(&it, ok);
 }
 
diff --git a/thirdparty/libwebp/src/enc/predictor_enc.c b/thirdparty/libwebp/src/enc/predictor_enc.c
index 2e6762ea0d..2b5c767280 100644
--- a/thirdparty/libwebp/src/enc/predictor_enc.c
+++ b/thirdparty/libwebp/src/enc/predictor_enc.c
@@ -249,7 +249,7 @@ static WEBP_INLINE void GetResidual(
       } else if (x == 0) {
         predict = upper_row[x];  // Top.
       } else {
-        predict = pred_func(current_row[x - 1], upper_row + x);
+        predict = pred_func(&current_row[x - 1], upper_row + x);
       }
 #if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
diff --git a/thirdparty/libwebp/src/enc/quant_enc.c b/thirdparty/libwebp/src/enc/quant_enc.c
index 01eb565c7f..6cede28ab4 100644
--- a/thirdparty/libwebp/src/enc/quant_enc.c
+++ b/thirdparty/libwebp/src/enc/quant_enc.c
@@ -585,6 +585,9 @@ static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
   return rate * lambda + RD_DISTO_MULT * distortion;
 }
 
+// Coefficient type.
+enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
+
 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                 int16_t in[16], int16_t out[16],
                                 int ctx0, int coeff_type,
@@ -593,7 +596,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
   CostArrayPtr const costs =
       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
-  const int first = (coeff_type == 0) ? 1 : 0;
+  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
   Node nodes[16][NUM_NODES];
   ScoreState score_states[2][NUM_NODES];
   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
@@ -657,16 +660,17 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
-      int level = level0 + m;
+      const int level = level0 + m;
       const int ctx = (level > 2) ? 2 : level;
       const int band = VP8EncBands[n + 1];
       score_t base_score;
-      score_t best_cur_score = MAX_COST;
-      int best_prev = 0;   // default, in case
+      score_t best_cur_score;
+      int best_prev;
+      score_t cost, score;
 
-      ss_cur[m].score = MAX_COST;
       ss_cur[m].costs = costs[n + 1][ctx];
       if (level < 0 || level > thresh_level) {
+        ss_cur[m].score = MAX_COST;
         // Node is dead.
         continue;
       }
@@ -682,18 +686,24 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       }
 
       // Inspect all possible non-dead predecessors. Retain only the best one.
-      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
+      // The base_score is added to all scores so it is only added for the final
+      // value after the loop.
+      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
+      best_cur_score =
+          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
+      best_prev = -MIN_DELTA;
+      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
         // eliminated since their score can't be better than the current best.
-        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
+        cost = VP8LevelCost(ss_prev[p].costs, level);
         // Examine node assuming it's a non-terminal one.
-        const score_t score =
-            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
         if (score < best_cur_score) {
           best_cur_score = score;
           best_prev = p;
         }
       }
+      best_cur_score += base_score;
       // Store best finding in current node.
       cur->sign = sign;
       cur->level = level;
@@ -701,11 +711,11 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       ss_cur[m].score = best_cur_score;
 
       // Now, record best terminal node (and thus best entry in the graph).
-      if (level != 0) {
+      if (level != 0 && best_cur_score < best_score) {
         const score_t last_pos_cost =
             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
-        const score_t score = best_cur_score + last_pos_score;
+        score = best_cur_score + last_pos_score;
         if (score < best_score) {
           best_score = score;
           best_path[0] = n;                     // best eob position
@@ -717,10 +727,16 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
   }
 
   // Fresh start
-  memset(in + first, 0, (16 - first) * sizeof(*in));
-  memset(out + first, 0, (16 - first) * sizeof(*out));
+  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
+  if (coeff_type == TYPE_I16_AC) {
+    memset(in + 1, 0, 15 * sizeof(*in));
+    memset(out + 1, 0, 15 * sizeof(*out));
+  } else {
+    memset(in, 0, 16 * sizeof(*in));
+    memset(out, 0, 16 * sizeof(*out));
+  }
   if (best_path[0] == -1) {
-    return 0;   // skip!
+    return 0;  // skip!
   }
 
   {
@@ -775,9 +791,9 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     for (y = 0, n = 0; y < 4; ++y) {
       for (x = 0; x < 4; ++x, ++n) {
         const int ctx = it->top_nz_[x] + it->left_nz_[y];
-        const int non_zero =
-            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
-                                 &dqm->y1_, dqm->lambda_trellis_i16_);
+        const int non_zero = TrellisQuantizeBlock(
+            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
+            dqm->lambda_trellis_i16_);
         it->top_nz_[x] = it->left_nz_[y] = non_zero;
         rd->y_ac_levels[n][0] = 0;
         nz |= non_zero << n;
@@ -818,7 +834,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   if (DO_TRELLIS_I4 && it->do_trellis_) {
     const int x = it->i4_ & 3, y = it->i4_ >> 2;
     const int ctx = it->top_nz_[x] + it->left_nz_[y];
-    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
+    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                               dqm->lambda_trellis_i4_);
   } else {
     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
@@ -927,9 +943,9 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
       for (y = 0; y < 2; ++y) {
         for (x = 0; x < 2; ++x, ++n) {
           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-          const int non_zero =
-              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
-                                   &dqm->uv_, dqm->lambda_trellis_uv_);
+          const int non_zero = TrellisQuantizeBlock(
+              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
+              dqm->lambda_trellis_uv_);
           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
           nz |= non_zero << n;
         }
diff --git a/thirdparty/libwebp/src/enc/vp8i_enc.h b/thirdparty/libwebp/src/enc/vp8i_enc.h
index 67e9509367..b4bba08f27 100644
--- a/thirdparty/libwebp/src/enc/vp8i_enc.h
+++ b/thirdparty/libwebp/src/enc/vp8i_enc.h
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 1
 #define ENC_MIN_VERSION 2
-#define ENC_REV_VERSION 1
+#define ENC_REV_VERSION 2
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
diff --git a/thirdparty/libwebp/src/mux/muxi.h b/thirdparty/libwebp/src/mux/muxi.h
index 330da66754..d9bf9b3770 100644
--- a/thirdparty/libwebp/src/mux/muxi.h
+++ b/thirdparty/libwebp/src/mux/muxi.h
@@ -29,7 +29,7 @@ extern "C" {
 
 #define MUX_MAJ_VERSION 1
 #define MUX_MIN_VERSION 2
-#define MUX_REV_VERSION 1
+#define MUX_REV_VERSION 2
 
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
diff --git a/thirdparty/libwebp/src/utils/huffman_encode_utils.c b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
index fd7a47d8f7..585db91951 100644
--- a/thirdparty/libwebp/src/utils/huffman_encode_utils.c
+++ b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -161,7 +161,7 @@ static void SetBitDepths(const HuffmanTree* const tree,
 // especially when population counts are longer than 2**tree_limit, but
 // we are not planning to use this with extremely long blocks.
 //
-// See http://en.wikipedia.org/wiki/Huffman_coding
+// See https://en.wikipedia.org/wiki/Huffman_coding
 static void GenerateOptimalTree(const uint32_t* const histogram,
                                 int histogram_size,
                                 HuffmanTree* tree, int tree_depth_limit,
diff --git a/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
index f65b6cdbb6..97e7893704 100644
--- a/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
+++ b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -30,7 +30,7 @@
 
 #define DFIX 4           // extra precision for ordered dithering
 #define DSIZE 4          // dithering size (must be a power of two)
-// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+// cf. https://en.wikipedia.org/wiki/Ordered_dithering
 static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
   {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
   { 12,  4, 14,  6 },
diff --git a/thirdparty/libwebp/src/utils/utils.c b/thirdparty/libwebp/src/utils/utils.c
index 9e464c16ce..a7c3a70fef 100644
--- a/thirdparty/libwebp/src/utils/utils.c
+++ b/thirdparty/libwebp/src/utils/utils.c
@@ -23,7 +23,7 @@
 // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
 // and not multi-thread safe!).
 // An interesting alternative is valgrind's 'massif' tool:
-//    http://valgrind.org/docs/manual/ms-manual.html
+//    https://valgrind.org/docs/manual/ms-manual.html
 // Here is an example command line:
 /*    valgrind --tool=massif --massif-out-file=massif.out \
                --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
diff --git a/thirdparty/libwebp/src/webp/decode.h b/thirdparty/libwebp/src/webp/decode.h
index 44fcd64a84..d98247509a 100644
--- a/thirdparty/libwebp/src/webp/decode.h
+++ b/thirdparty/libwebp/src/webp/decode.h
@@ -85,7 +85,7 @@ WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
 // Upon return, the Y buffer has a stride returned as '*stride', while U and V
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
-// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
+// (*) Also named Y'CbCr. See: https://en.wikipedia.org/wiki/YCbCr
 WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
                                    int* width, int* height,
                                    uint8_t** u, uint8_t** v,
diff --git a/thirdparty/misc/patches/polypartition-godot-types.patch b/thirdparty/misc/patches/polypartition-godot-types.patch
index 782f02e8dc..61737f9fd2 100644
--- a/thirdparty/misc/patches/polypartition-godot-types.patch
+++ b/thirdparty/misc/patches/polypartition-godot-types.patch
@@ -1,19 +1,16 @@
 diff --git a/thirdparty/misc/polypartition.cpp b/thirdparty/misc/polypartition.cpp
-index 3a8a6efa83..5e94793b79 100644
+index 3a8a6efa83..8c5409bf24 100644
 --- a/thirdparty/misc/polypartition.cpp
 +++ b/thirdparty/misc/polypartition.cpp
-@@ -23,10 +23,7 @@
- 
- #include "polypartition.h"
- 
--#include <math.h>
--#include <string.h>
+@@ -26,7 +26,6 @@
+ #include <math.h>
+ #include <string.h>
  #include <algorithm>
 -#include <vector>
  
  TPPLPoly::TPPLPoly() {
    hole = false;
-@@ -186,7 +183,7 @@ int TPPLPartition::Intersects(TPPLPoint &p11, TPPLPoint &p12, TPPLPoint &p21, TP
+@@ -186,7 +185,7 @@ int TPPLPartition::Intersects(TPPLPoint &p11, TPPLPoint &p12, TPPLPoint &p21, TP
  // Removes holes from inpolys by merging them with non-holes.
  int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
    TPPLPolyList polys;
@@ -22,7 +19,7 @@ index 3a8a6efa83..5e94793b79 100644
    long i, i2, holepointindex, polypointindex;
    TPPLPoint holepoint, polypoint, bestpolypoint;
    TPPLPoint linep1, linep2;
-@@ -198,15 +195,15 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -198,15 +197,15 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
  
    // Check for the trivial case of no holes.
    hasholes = false;
@@ -42,7 +39,7 @@ index 3a8a6efa83..5e94793b79 100644
      }
      return 1;
    }
-@@ -216,8 +213,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -216,8 +215,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
    while (1) {
      // Find the hole point with the largest x.
      hasholes = false;
@@ -53,7 +50,7 @@ index 3a8a6efa83..5e94793b79 100644
          continue;
        }
  
-@@ -227,8 +224,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -227,8 +226,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
          holepointindex = 0;
        }
  
@@ -64,7 +61,7 @@ index 3a8a6efa83..5e94793b79 100644
            holeiter = iter;
            holepointindex = i;
          }
-@@ -237,24 +234,24 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -237,24 +236,24 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
      if (!hasholes) {
        break;
      }
@@ -98,7 +95,7 @@ index 3a8a6efa83..5e94793b79 100644
          if (pointfound) {
            v1 = Normalize(polypoint - holepoint);
            v2 = Normalize(bestpolypoint - holepoint);
-@@ -263,13 +260,13 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -263,13 +262,13 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
            }
          }
          pointvisible = true;
@@ -117,7 +114,7 @@ index 3a8a6efa83..5e94793b79 100644
              if (Intersects(holepoint, polypoint, linep1, linep2)) {
                pointvisible = false;
                break;
-@@ -292,18 +289,18 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -292,18 +291,18 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
        return 0;
      }
  
@@ -142,7 +139,7 @@ index 3a8a6efa83..5e94793b79 100644
        i2++;
      }
  
-@@ -312,8 +309,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
+@@ -312,8 +311,8 @@ int TPPLPartition::RemoveHoles(TPPLPolyList *inpolys, TPPLPolyList *outpolys) {
      polys.push_back(newpoly);
    }
  
@@ -153,7 +150,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    return 1;
-@@ -524,13 +521,13 @@ int TPPLPartition::Triangulate_EC(TPPLPoly *poly, TPPLPolyList *triangles) {
+@@ -524,13 +523,13 @@ int TPPLPartition::Triangulate_EC(TPPLPoly *poly, TPPLPolyList *triangles) {
  
  int TPPLPartition::Triangulate_EC(TPPLPolyList *inpolys, TPPLPolyList *triangles) {
    TPPLPolyList outpolys;
@@ -170,7 +167,7 @@ index 3a8a6efa83..5e94793b79 100644
        return 0;
      }
    }
-@@ -543,7 +540,7 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -543,7 +542,7 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
    }
  
    TPPLPolyList triangles;
@@ -179,7 +176,7 @@ index 3a8a6efa83..5e94793b79 100644
    TPPLPoly *poly1 = NULL, *poly2 = NULL;
    TPPLPoly newpoly;
    TPPLPoint d1, d2, p1, p2, p3;
-@@ -578,19 +575,19 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -578,19 +577,19 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
      return 0;
    }
  
@@ -203,7 +200,7 @@ index 3a8a6efa83..5e94793b79 100644
  
          for (i21 = 0; i21 < poly2->GetNumPoints(); i21++) {
            if ((d2.x != poly2->GetPoint(i21).x) || (d2.y != poly2->GetPoint(i21).y)) {
-@@ -660,16 +657,16 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -660,16 +659,16 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
        }
  
        triangles.erase(iter2);
@@ -224,7 +221,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    return 1;
-@@ -677,13 +674,13 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -677,13 +676,13 @@ int TPPLPartition::ConvexPartition_HM(TPPLPoly *poly, TPPLPolyList *parts) {
  
  int TPPLPartition::ConvexPartition_HM(TPPLPolyList *inpolys, TPPLPolyList *parts) {
    TPPLPolyList outpolys;
@@ -241,7 +238,7 @@ index 3a8a6efa83..5e94793b79 100644
        return 0;
      }
    }
-@@ -824,8 +821,8 @@ int TPPLPartition::Triangulate_OPT(TPPLPoly *poly, TPPLPolyList *triangles) {
+@@ -824,8 +823,8 @@ int TPPLPartition::Triangulate_OPT(TPPLPoly *poly, TPPLPolyList *triangles) {
    newdiagonal.index1 = 0;
    newdiagonal.index2 = n - 1;
    diagonals.push_back(newdiagonal);
@@ -252,7 +249,7 @@ index 3a8a6efa83..5e94793b79 100644
      diagonals.pop_front();
      bestvertex = dpstates[diagonal.index2][diagonal.index1].bestvertex;
      if (bestvertex == -1) {
-@@ -873,10 +870,10 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
+@@ -873,10 +872,10 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
      pairs->push_front(newdiagonal);
      dpstates[a][b].weight = w;
    } else {
@@ -265,7 +262,7 @@ index 3a8a6efa83..5e94793b79 100644
        pairs->pop_front();
      }
      pairs->push_front(newdiagonal);
-@@ -885,7 +882,7 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
+@@ -885,7 +884,7 @@ void TPPLPartition::UpdateState(long a, long b, long w, long i, long j, DPState2
  
  void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates) {
    DiagonalList *pairs = NULL;
@@ -274,7 +271,7 @@ index 3a8a6efa83..5e94793b79 100644
    long top;
    long w;
  
-@@ -902,23 +899,23 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
+@@ -902,23 +901,23 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
    }
    if (j - i > 1) {
      pairs = &(dpstates[i][j].pairs);
@@ -305,7 +302,7 @@ index 3a8a6efa83..5e94793b79 100644
        }
      }
    }
-@@ -927,7 +924,7 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
+@@ -927,7 +926,7 @@ void TPPLPartition::TypeA(long i, long j, long k, PartitionVertex *vertices, DPS
  
  void TPPLPartition::TypeB(long i, long j, long k, PartitionVertex *vertices, DPState2 **dpstates) {
    DiagonalList *pairs = NULL;
@@ -314,7 +311,7 @@ index 3a8a6efa83..5e94793b79 100644
    long top;
    long w;
  
-@@ -946,21 +943,21 @@ void TPPLPartition::TypeB(long i, long j, long k, PartitionVertex *vertices, DPS
+@@ -946,21 +945,21 @@ void TPPLPartition::TypeB(long i, long j, long k, PartitionVertex *vertices, DPS
    if (k - j > 1) {
      pairs = &(dpstates[j][k].pairs);
  
@@ -343,7 +340,7 @@ index 3a8a6efa83..5e94793b79 100644
        }
      } else {
        w++;
-@@ -981,11 +978,11 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -981,11 +980,11 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
    DiagonalList diagonals, diagonals2;
    Diagonal diagonal, newdiagonal;
    DiagonalList *pairs = NULL, *pairs2 = NULL;
@@ -358,7 +355,7 @@ index 3a8a6efa83..5e94793b79 100644
    bool ijreal, jkreal;
  
    n = poly->GetNumPoints();
-@@ -1110,35 +1107,35 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1110,35 +1109,35 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
    newdiagonal.index1 = 0;
    newdiagonal.index2 = n - 1;
    diagonals.push_front(newdiagonal);
@@ -403,7 +400,7 @@ index 3a8a6efa83..5e94793b79 100644
                pairs2->pop_back();
              } else {
                break;
-@@ -1153,21 +1150,21 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1153,21 +1152,21 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
          diagonals.push_front(newdiagonal);
        }
      } else {
@@ -431,7 +428,7 @@ index 3a8a6efa83..5e94793b79 100644
                pairs2->pop_front();
              } else {
                break;
-@@ -1197,8 +1194,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1197,8 +1196,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
    newdiagonal.index1 = 0;
    newdiagonal.index2 = n - 1;
    diagonals.push_front(newdiagonal);
@@ -442,7 +439,7 @@ index 3a8a6efa83..5e94793b79 100644
      diagonals.pop_front();
      if ((diagonal.index2 - diagonal.index1) <= 1) {
        continue;
-@@ -1210,8 +1207,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1210,8 +1209,8 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
      indices.push_back(diagonal.index2);
      diagonals2.push_front(diagonal);
  
@@ -453,7 +450,7 @@ index 3a8a6efa83..5e94793b79 100644
        diagonals2.pop_front();
        if ((diagonal.index2 - diagonal.index1) <= 1) {
          continue;
-@@ -1220,16 +1217,16 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1220,16 +1219,16 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
        jkreal = true;
        pairs = &(dpstates[diagonal.index1][diagonal.index2].pairs);
        if (!vertices[diagonal.index1].isConvex) {
@@ -476,7 +473,7 @@ index 3a8a6efa83..5e94793b79 100644
            jkreal = false;
          }
        }
-@@ -1253,11 +1250,12 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1253,11 +1252,12 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
        indices.push_back(j);
      }
  
@@ -492,7 +489,7 @@ index 3a8a6efa83..5e94793b79 100644
        k++;
      }
      parts->push_back(newpoly);
-@@ -1281,7 +1279,7 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
+@@ -1281,7 +1281,7 @@ int TPPLPartition::ConvexPartition_OPT(TPPLPoly *poly, TPPLPolyList *parts) {
  // "Computational Geometry: Algorithms and Applications"
  // by Mark de Berg, Otfried Cheong, Marc van Kreveld, and Mark Overmars.
  int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monotonePolys) {
@@ -501,7 +498,7 @@ index 3a8a6efa83..5e94793b79 100644
    MonotoneVertex *vertices = NULL;
    long i, numvertices, vindex, vindex2, newnumvertices, maxnumvertices;
    long polystartindex, polyendindex;
-@@ -1291,11 +1289,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1291,11 +1291,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
    bool error = false;
  
    numvertices = 0;
@@ -515,7 +512,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    maxnumvertices = numvertices * 3;
-@@ -1303,8 +1298,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1303,8 +1300,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
    newnumvertices = numvertices;
  
    polystartindex = 0;
@@ -526,7 +523,7 @@ index 3a8a6efa83..5e94793b79 100644
      polyendindex = polystartindex + poly->GetNumPoints() - 1;
      for (i = 0; i < poly->GetNumPoints(); i++) {
        vertices[i + polystartindex].p = poly->GetPoint(i);
-@@ -1360,14 +1355,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1360,14 +1357,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
    // Note that while set doesn't actually have to be implemented as
    // a tree, complexity requirements for operations are the same as
    // for the balanced binary search tree.
@@ -546,7 +543,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  
    // For each vertex.
-@@ -1387,13 +1382,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1387,13 +1384,14 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
          newedge.p1 = v->p;
          newedge.p2 = vertices[v->next].p;
          newedge.index = vindex;
@@ -564,7 +561,7 @@ index 3a8a6efa83..5e94793b79 100644
            error = true;
            break;
          }
-@@ -1412,29 +1408,30 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1412,29 +1410,30 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
          newedge.p1 = v->p;
          newedge.p2 = v->p;
          edgeIter = edgeTree.lower_bound(newedge);
@@ -601,7 +598,7 @@ index 3a8a6efa83..5e94793b79 100644
            error = true;
            break;
          }
-@@ -1452,25 +1449,25 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1452,25 +1451,25 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
          newedge.p1 = v->p;
          newedge.p2 = v->p;
          edgeIter = edgeTree.lower_bound(newedge);
@@ -632,7 +629,7 @@ index 3a8a6efa83..5e94793b79 100644
              error = true;
              break;
            }
-@@ -1488,27 +1485,28 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1488,27 +1487,28 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
            newedge.p1 = v2->p;
            newedge.p2 = vertices[v2->next].p;
            newedge.index = vindex2;
@@ -668,7 +665,7 @@ index 3a8a6efa83..5e94793b79 100644
          }
          break;
      }
-@@ -1569,8 +1567,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
+@@ -1569,8 +1569,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
  
  // Adds a diagonal to the doubly-connected list of vertices.
  void TPPLPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, long index1, long index2,
@@ -679,7 +676,7 @@ index 3a8a6efa83..5e94793b79 100644
    long newindex1, newindex2;
  
    newindex1 = *numvertices;
-@@ -1597,14 +1595,14 @@ void TPPLPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, lon
+@@ -1597,14 +1597,14 @@ void TPPLPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, lon
    vertextypes[newindex1] = vertextypes[index1];
    edgeTreeIterators[newindex1] = edgeTreeIterators[index1];
    helpers[newindex1] = helpers[index1];
@@ -698,7 +695,7 @@ index 3a8a6efa83..5e94793b79 100644
    }
  }
  
-@@ -1830,13 +1828,13 @@ int TPPLPartition::TriangulateMonotone(TPPLPoly *inPoly, TPPLPolyList *triangles
+@@ -1830,13 +1830,13 @@ int TPPLPartition::TriangulateMonotone(TPPLPoly *inPoly, TPPLPolyList *triangles
  
  int TPPLPartition::Triangulate_MONO(TPPLPolyList *inpolys, TPPLPolyList *triangles) {
    TPPLPolyList monotone;
diff --git a/thirdparty/misc/polypartition.cpp b/thirdparty/misc/polypartition.cpp
index 5e94793b79..8c5409bf24 100644
--- a/thirdparty/misc/polypartition.cpp
+++ b/thirdparty/misc/polypartition.cpp
@@ -23,6 +23,8 @@
 
 #include "polypartition.h"
 
+#include <math.h>
+#include <string.h>
 #include <algorithm>
 
 TPPLPoly::TPPLPoly() {
diff --git a/thirdparty/thorvg/AUTHORS b/thirdparty/thorvg/AUTHORS
index 66057232b6..ec06c49118 100644
--- a/thirdparty/thorvg/AUTHORS
+++ b/thirdparty/thorvg/AUTHORS
@@ -13,3 +13,5 @@ Pankaj Kumar <pankaj.m1@samsung.com>
 Patryk Kaczmarek <patryk.k@partner.samsung.com>
 Michal Maciola <m.maciola@samsung.com>
 Peter Vullings <peter@projectitis.com>
+K. S. Ernest (iFire) Lee <ernest.lee@chibifire.com>
+Rémi Verschelde <rverschelde@gmail.com>
diff --git a/thirdparty/thorvg/inc/config.h b/thirdparty/thorvg/inc/config.h
index 04a450b1bb..41e8f6dafa 100644
--- a/thirdparty/thorvg/inc/config.h
+++ b/thirdparty/thorvg/inc/config.h
@@ -13,5 +13,5 @@
 
 #define THORVG_JPG_LOADER_SUPPORT 1
 
-#define THORVG_VERSION_STRING "0.7.0"
+#define THORVG_VERSION_STRING "0.7.1"
 #endif
diff --git a/thirdparty/thorvg/patches/thorvg-pr1159-mingw-fix.patch b/thirdparty/thorvg/patches/thorvg-pr1159-mingw-fix.patch
deleted file mode 100644
index a174880306..0000000000
--- a/thirdparty/thorvg/patches/thorvg-pr1159-mingw-fix.patch
+++ /dev/null
@@ -1,73 +0,0 @@
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
-index def8ae169a..cf103774c5 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
-@@ -51,6 +51,7 @@
- 
- #define _USE_MATH_DEFINES       //Math Constants are not defined in Standard C/C++.
- 
-+#include <cstring>
- #include <fstream>
- #include <float.h>
- #include <math.h>
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp
-index 2b62315de8..32685ee620 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgPath.cpp
-@@ -50,6 +50,7 @@
- 
- #define _USE_MATH_DEFINES       //Math Constants are not defined in Standard C/C++.
- 
-+#include <cstring>
- #include <math.h>
- #include <clocale>
- #include <ctype.h>
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
-index 8701fe32b1..ae17634f31 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
-@@ -49,6 +49,7 @@
- */
- 
- 
-+#include <cstring>
- #include <string>
- #include "tvgMath.h"
- #include "tvgSvgLoaderCommon.h"
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp
-index d5b9cdcf7b..9f269b29a2 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgUtil.cpp
-@@ -20,6 +20,7 @@
-  * SOFTWARE.
-  */
- 
-+#include <cstring>
- #include <math.h>
- #include <memory.h>
- #include "tvgSvgUtil.h"
-diff --git a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
-index 2e3d5928d9..1571aa4e25 100644
---- a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
-+++ b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
-@@ -20,6 +20,7 @@
-  * SOFTWARE.
-  */
- 
-+#include <cstring>
- #include <ctype.h>
- #include <string>
- 
-diff --git a/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp b/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp
-index 9450d80e88..9dd57e5a89 100644
---- a/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp
-+++ b/thirdparty/thorvg/src/savers/tvg/tvgTvgSaver.cpp
-@@ -24,6 +24,8 @@
- #include "tvgTvgSaver.h"
- #include "tvgLzw.h"
- 
-+#include <cstring>
-+
- #ifdef _WIN32
-     #include <malloc.h>
- #else
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
index fe22fce017..f9974d9847 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
@@ -84,8 +84,8 @@ bool imagePrepare(SwImage* image, const Matrix* transform, const SwBBox& clipReg
 
     //Fast track: Non-transformed image but just shifted.
     if (image->direct) {
-        image->ox = -static_cast<uint32_t>(round(transform->e13));
-        image->oy = -static_cast<uint32_t>(round(transform->e23));
+        image->ox = -static_cast<int32_t>(round(transform->e13));
+        image->oy = -static_cast<int32_t>(round(transform->e23));
     //Figure out the scale factor by transform matrix
     } else {
         auto scaleX = sqrtf((transform->e11 * transform->e11) + (transform->e21 * transform->e21));
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
index deebed16ee..56bc2f77dc 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRaster.cpp
@@ -481,7 +481,10 @@ static bool _rasterScaledRleRGBAImage(SwSurface* surface, const SwImage* image,
 static bool _scaledRleRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint32_t opacity)
 {
     Matrix itransform;
-    if (transform && !mathInverse(transform, &itransform)) return false;
+
+    if (transform) {
+        if (!mathInverse(transform, &itransform)) return false;
+    } else mathIdentity(&itransform);
 
     auto halfScale = _halfScale(image->scale);
 
@@ -816,7 +819,10 @@ static bool _rasterScaledRGBAImage(SwSurface* surface, const SwImage* image, con
 static bool _scaledRGBAImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint32_t opacity)
 {
     Matrix itransform;
-    if (transform && !mathInverse(transform, &itransform)) return false;
+
+    if (transform) {
+        if (!mathInverse(transform, &itransform)) return false;
+    } else mathIdentity(&itransform);
 
     auto halfScale = _halfScale(image->scale);
 
@@ -1113,12 +1119,12 @@ static bool _rasterTranslucentLinearGradientRle(SwSurface* surface, const SwRleD
         auto dst = &surface->buffer[span->y * surface->stride + span->x];
         fillFetchLinear(fill, buffer, span->y, span->x, span->len);
         if (span->coverage == 255) {
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                *dst = buffer[i] + ALPHA_BLEND(*dst, _ialpha(buffer[i]));
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
             }
         } else {
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                auto tmp = ALPHA_BLEND(buffer[i], span->coverage);
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
                 *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
             }
         }
@@ -1142,8 +1148,8 @@ static bool _rasterSolidLinearGradientRle(SwSurface* surface, const SwRleData* r
         } else {
             fillFetchLinear(fill, buf, span->y, span->x, span->len);
             auto dst = &surface->buffer[span->y * surface->stride + span->x];
-            for (uint32_t i = 0; i < span->len; ++i) {
-                dst[i] = INTERPOLATE(span->coverage, buf[i], dst[i]);
+            for (uint32_t x = 0; x < span->len; ++x) {
+                dst[x] = INTERPOLATE(span->coverage, buf[x], dst[x]);
             }
         }
     }
@@ -1302,12 +1308,12 @@ static bool _rasterTranslucentRadialGradientRle(SwSurface* surface, const SwRleD
         auto dst = &surface->buffer[span->y * surface->stride + span->x];
         fillFetchRadial(fill, buffer, span->y, span->x, span->len);
         if (span->coverage == 255) {
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                *dst = buffer[i] + ALPHA_BLEND(*dst, _ialpha(buffer[i]));
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
             }
         } else {
-           for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                auto tmp = ALPHA_BLEND(buffer[i], span->coverage);
+           for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
                 *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
             }
         }
@@ -1332,8 +1338,8 @@ static bool _rasterSolidRadialGradientRle(SwSurface* surface, const SwRleData* r
         } else {
             fillFetchRadial(fill, buf, span->y, span->x, span->len);
             auto ialpha = 255 - span->coverage;
-            for (uint32_t i = 0; i < span->len; ++i, ++dst) {
-                *dst = ALPHA_BLEND(buf[i], span->coverage) + ALPHA_BLEND(*dst, ialpha);
+            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
+                *dst = ALPHA_BLEND(buf[x], span->coverage) + ALPHA_BLEND(*dst, ialpha);
             }
         }
     }
@@ -1487,7 +1493,7 @@ bool rasterStroke(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint
 bool rasterImage(SwSurface* surface, SwImage* image, const Matrix* transform, const SwBBox& bbox, uint32_t opacity)
 {
     //Verify Boundary
-    if (bbox.max.x < 0 || bbox.max.y < 0 || bbox.min.x >= surface->w || bbox.min.y >= surface->h) return false;
+    if (bbox.max.x < 0 || bbox.max.y < 0 || bbox.min.x >= static_cast<SwCoord>(surface->w) || bbox.min.y >= static_cast<SwCoord>(surface->h)) return false;
 
     //TOOD: switch (image->format)
     //TODO: case: _rasterRGBImage()
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
index 4e8d342137..e96307c874 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRasterTexmapInternal.h
@@ -58,8 +58,8 @@
     y = yStart;
 
     while (y < yEnd) {
-        x1 = _xa;
-        x2 = _xb;
+        x1 = (int32_t)_xa;
+        x2 = (int32_t)_xb;
 
         if (!region) {
             minx = INT32_MAX;
@@ -160,4 +160,4 @@ next:
     xb = _xb;
     ua = _ua;
     va = _va;
-}
-\ No newline at end of file
+}
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp
index 78537e7726..c75e73760e 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwRenderer.cpp
@@ -23,6 +23,7 @@
 #include "tvgSwCommon.h"
 #include "tvgTaskScheduler.h"
 #include "tvgSwRenderer.h"
+#include "tvgMath.h"
 
 /************************************************************************/
 /* Internal Class Implementation                                        */
@@ -594,10 +595,10 @@ void* SwRenderer::prepareCommon(SwTask* task, const RenderTransform* transform,
     task->surface = surface;
     task->mpool = mpool;
     task->flags = flags;
-    task->bbox.min.x = max(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.x));
-    task->bbox.min.y = max(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.y));
-    task->bbox.max.x = min(static_cast<SwCoord>(surface->w), static_cast<SwCoord>(vport.x + vport.w));
-    task->bbox.max.y = min(static_cast<SwCoord>(surface->h), static_cast<SwCoord>(vport.y + vport.h));
+    task->bbox.min.x = mathMax(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.x));
+    task->bbox.min.y = mathMax(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.y));
+    task->bbox.max.x = mathMin(static_cast<SwCoord>(surface->w), static_cast<SwCoord>(vport.x + vport.w));
+    task->bbox.max.y = mathMin(static_cast<SwCoord>(surface->h), static_cast<SwCoord>(vport.y + vport.h));
 
     if (!task->pushed) {
         task->pushed = true;
diff --git a/thirdparty/thorvg/src/lib/tvgMath.h b/thirdparty/thorvg/src/lib/tvgMath.h
index 9e5c915fc3..423fb6eb1b 100644
--- a/thirdparty/thorvg/src/lib/tvgMath.h
+++ b/thirdparty/thorvg/src/lib/tvgMath.h
@@ -29,6 +29,10 @@
 #include "tvgCommon.h"
 
 
+#define mathMin(x, y) (((x) < (y)) ? (x) : (y))  //NOTE: function-like macro, the selected argument is evaluated twice - pass side-effect-free expressions only
+#define mathMax(x, y) (((x) > (y)) ? (x) : (y))  //NOTE: the selected argument is evaluated twice here as well
+
+
 static inline bool mathZero(float a)
 {
     return (fabsf(a) < FLT_EPSILON) ? true : false;
@@ -43,7 +47,7 @@ static inline bool mathEqual(float a, float b)
 
 static inline bool mathRightAngle(const Matrix* m)
 {
-   auto radian = fabsf(atan2(m->e21, m->e11));
+   auto radian = fabsf(atan2f(m->e21, m->e11));  //magnitude of the angle given by (e11, e21); compared against 0, pi/2 and pi below
    if (radian < FLT_EPSILON || mathEqual(radian, float(M_PI_2)) || mathEqual(radian, float(M_PI))) return true;
    return false;
 }
@@ -154,4 +158,4 @@ static inline Matrix mathMultiply(const Matrix* lhs, const Matrix* rhs)
 }
 
 
-#endif //_TVG_MATH_H_
-\ No newline at end of file
+#endif //_TVG_MATH_H_
diff --git a/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp b/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp
index 8846613c6b..f27881da42 100644
--- a/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp
+++ b/thirdparty/thorvg/src/loaders/jpg/tvgJpgLoader.cpp
@@ -47,6 +47,7 @@ JpgLoader::~JpgLoader()
 {
     jpgdDelete(decoder);
     if (freeData) free(data);
+    free(image);
 }
 
 
@@ -128,5 +129,9 @@ unique_ptr<Surface> JpgLoader::bitmap()
 
 void JpgLoader::run(unsigned tid)
 {
+    if (image) {  //a buffer left from an earlier run would otherwise be overwritten (and leaked) by the assignment below
+        free(image);
+        image = nullptr;
+    }
     image = jpgdDecompress(decoder);
 }
 \ No newline at end of file
diff --git a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp
index fa72734ec4..4ccc5788d5 100644
--- a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp
+++ b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.cpp
@@ -1080,7 +1080,9 @@ namespace DCT_Upsample
 // Unconditionally frees all allocated m_blocks.
 void jpeg_decoder::free_all_blocks()
 {
+    delete(m_pStream);
     m_pStream = nullptr;
+
     for (mem_block *b = m_pMem_blocks; b; ) {
         mem_block *n = b->m_pNext;
         free(b);
@@ -2815,7 +2817,6 @@ int jpeg_decoder::begin_decoding()
 jpeg_decoder::~jpeg_decoder()
 {
     free_all_blocks();
-    delete(m_pStream);
 }
 
 
@@ -3025,4 +3026,4 @@ unsigned char* jpgdDecompress(jpeg_decoder* decoder)
         }
     }
     return pImage_data;
-}
-\ No newline at end of file
+}
diff --git a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h
index d32ffd99d4..ca9cb35c32 100644
--- a/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h
+++ b/thirdparty/thorvg/src/loaders/jpg/tvgJpgd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+ * Copyright (c) 2021 - 2022 Samsung Electronics Co., Ltd. All rights reserved.
 
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp b/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp
index c6d95be5ba..3cc08e902b 100644
--- a/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp
+++ b/thirdparty/thorvg/src/loaders/png/tvgPngLoader.cpp
@@ -72,6 +72,7 @@ PngLoader::PngLoader()
 PngLoader::~PngLoader()
 {
     if (freeData) free(data);
+    free(image);  //release the buffer that lodepng_decode() stored in 'image' during run(); free(nullptr) is a no-op
 }
 
 
@@ -121,7 +122,7 @@ bool PngLoader::open(const char* data, uint32_t size, bool copy)
     clear();
 
     lodepng_state_init(&state);
-    
+
     unsigned int width, height;
     if (lodepng_inspect(&width, &height, &state, (unsigned char*)(data), size) > 0) return false;
 
@@ -180,10 +181,14 @@ unique_ptr<Surface> PngLoader::bitmap()
 
 void PngLoader::run(unsigned tid)
 {
+    if (image) {  //release the buffer of an earlier run before lodepng_decode() stores a new one in 'image'
+        free(image);
+        image = nullptr;
+    }  //NOTE(review): the result of lodepng_decode() below is discarded, so 'image' reaches _premultiply() unchecked - confirm it cannot be null there
     auto width = static_cast<unsigned>(w);
     auto height = static_cast<unsigned>(h);
 
     lodepng_decode(&image, &width, &height, &state, data, size);
 
     _premultiply((uint32_t*)(image), width, height);
-}
-\ No newline at end of file
+}
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
index cf103774c5..08b3308165 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
@@ -541,7 +541,7 @@ static void _toColor(const char* str, uint8_t* r, uint8_t* g, uint8_t* b, char**
                 }
             }
         }
-    } else if (len >= 3 && !strncmp(str, "url", 3)) {
+    } else if (ref && len >= 3 && !strncmp(str, "url", 3)) {
         *ref = _idFromUrl((const char*)(str + 3));
     } else {
         //Handle named color
@@ -789,7 +789,7 @@ static bool _attrParseSvgNode(void* data, const char* key, const char* value)
         return simpleXmlParseW3CAttribute(value, _parseStyleAttr, loader);
     }
 #ifdef THORVG_LOG_ENABLED
-    else if ((!strcmp(key, "x") || !strcmp(key, "y")) && fabsf(svgUtilStrtof(value, nullptr)) > FLT_EPSILON ) {
+    else if ((!strcmp(key, "x") || !strcmp(key, "y")) && fabsf(svgUtilStrtof(value, nullptr)) > FLT_EPSILON) {
         TVGLOG("SVG", "Unsupported attributes used [Elements type: Svg][Attribute: %s][Value: %s]", key, value);
     }
 #endif
@@ -1611,6 +1611,7 @@ static bool _attrParseImageNode(void* data, const char* key, const char* value)
     }
 
     if (!strcmp(key, "href") || !strcmp(key, "xlink:href")) {
+        if (image->href && value) free(image->href);
         image->href = _idFromHref(value);
     } else if (!strcmp(key, "id")) {
         if (node->id && value) free(node->id);
@@ -1728,6 +1729,112 @@ error_grad_alloc:
 }
 
 
+static void _styleInherit(SvgStyleProperty* child, const SvgStyleProperty* parent)
+{
+    if (parent == nullptr) return;
+    //Fill in every property the child did not set itself with the parent's value; a null parent leaves the child untouched.
+    if (!child->curColorSet) {  //the color is inherited together with the parent's curColorSet state
+        child->color = parent->color;
+        child->curColorSet = parent->curColorSet;
+    }
+    //Fill: each sub-property is inherited only when its bit is absent from child->fill.flags
+    if (!((int)child->fill.flags & (int)SvgFillFlags::Paint)) {
+        child->fill.paint.color = parent->fill.paint.color;
+        child->fill.paint.none = parent->fill.paint.none;
+        child->fill.paint.curColor = parent->fill.paint.curColor;
+        if (parent->fill.paint.url) child->fill.paint.url = _copyId(parent->fill.paint.url);  //NOTE(review): unlike the stroke branch, the child's url is kept when the parent has none - confirm this is intended
+    }
+    if (!((int)child->fill.flags & (int)SvgFillFlags::Opacity)) {
+        child->fill.opacity = parent->fill.opacity;
+    }
+    if (!((int)child->fill.flags & (int)SvgFillFlags::FillRule)) {
+        child->fill.fillRule = parent->fill.fillRule;
+    }
+    //Stroke: each sub-property is inherited only when its bit is absent from child->stroke.flags
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Paint)) {
+        child->stroke.paint.color = parent->stroke.paint.color;
+        child->stroke.paint.none = parent->stroke.paint.none;
+        child->stroke.paint.curColor = parent->stroke.paint.curColor;
+        child->stroke.paint.url = parent->stroke.paint.url ? _copyId(parent->stroke.paint.url) : nullptr;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Opacity)) {
+        child->stroke.opacity = parent->stroke.opacity;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Width)) {
+        child->stroke.width = parent->stroke.width;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Dash)) {
+        if (parent->stroke.dash.array.count > 0) {  //an empty parent pattern leaves the child's dash array as it is
+            child->stroke.dash.array.clear();
+            child->stroke.dash.array.reserve(parent->stroke.dash.array.count);
+            for (uint32_t i = 0; i < parent->stroke.dash.array.count; ++i) {
+                child->stroke.dash.array.push(parent->stroke.dash.array.data[i]);
+            }
+        }
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Cap)) {
+        child->stroke.cap = parent->stroke.cap;
+    }
+    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Join)) {
+        child->stroke.join = parent->stroke.join;
+    }
+}
+
+
+static void _styleCopy(SvgStyleProperty* to, const SvgStyleProperty* from)
+{
+    if (from == nullptr) return;
+    //Copy only what 'from' set explicitly (tracked by curColorSet and the fill/stroke flag bits); properties 'from' did not set are left as they are in 'to'.
+    if (from->curColorSet) {
+        to->color = from->color;
+        to->curColorSet = true;
+    }
+    //Fill: merge the flag bits of 'from' into 'to' first, then copy each sub-property flagged in 'from'
+    to->fill.flags = (SvgFillFlags)((int)to->fill.flags | (int)from->fill.flags);
+    if (((int)from->fill.flags & (int)SvgFillFlags::Paint)) {
+        to->fill.paint.color = from->fill.paint.color;
+        to->fill.paint.none = from->fill.paint.none;
+        to->fill.paint.curColor = from->fill.paint.curColor;
+        if (from->fill.paint.url) to->fill.paint.url = _copyId(from->fill.paint.url);  //NOTE(review): unlike the stroke branch, the url in 'to' is kept when 'from' has none - confirm this is intended
+    }
+    if (((int)from->fill.flags & (int)SvgFillFlags::Opacity)) {
+        to->fill.opacity = from->fill.opacity;
+    }
+    if (((int)from->fill.flags & (int)SvgFillFlags::FillRule)) {
+        to->fill.fillRule = from->fill.fillRule;
+    }
+    //Stroke: merge the flag bits of 'from' into 'to' first, then copy each sub-property flagged in 'from'
+    to->stroke.flags = (SvgStrokeFlags)((int)to->stroke.flags | (int)from->stroke.flags);
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Paint)) {
+        to->stroke.paint.color = from->stroke.paint.color;
+        to->stroke.paint.none = from->stroke.paint.none;
+        to->stroke.paint.curColor = from->stroke.paint.curColor;
+        to->stroke.paint.url = from->stroke.paint.url ? _copyId(from->stroke.paint.url) : nullptr;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Opacity)) {
+        to->stroke.opacity = from->stroke.opacity;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Width)) {
+        to->stroke.width = from->stroke.width;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Dash)) {
+        if (from->stroke.dash.array.count > 0) {  //an empty source pattern leaves the dash array of 'to' as it is
+            to->stroke.dash.array.clear();
+            to->stroke.dash.array.reserve(from->stroke.dash.array.count);
+            for (uint32_t i = 0; i < from->stroke.dash.array.count; ++i) {
+                to->stroke.dash.array.push(from->stroke.dash.array.data[i]);
+            }
+        }
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Cap)) {
+        to->stroke.cap = from->stroke.cap;
+    }
+    if (((int)from->stroke.flags & (int)SvgStrokeFlags::Join)) {
+        to->stroke.join = from->stroke.join;
+    }
+}
+
+
 static void _copyAttr(SvgNode* to, const SvgNode* from)
 {
     //Copy matrix attribute
@@ -1736,7 +1843,8 @@ static void _copyAttr(SvgNode* to, const SvgNode* from)
         if (to->transform) *to->transform = *from->transform;
     }
     //Copy style attribute
-    *to->style = *from->style;
+    _styleCopy(to->style, from->style);
+    to->style->flags = (SvgStyleFlags)((int)to->style->flags | (int)from->style->flags);
     if (from->style->fill.paint.url) to->style->fill.paint.url = strdup(from->style->fill.paint.url);
     if (from->style->stroke.paint.url) to->style->stroke.paint.url = strdup(from->style->stroke.paint.url);
     if (from->style->clipPath.url) to->style->clipPath.url = strdup(from->style->clipPath.url);
@@ -1780,15 +1888,17 @@ static void _copyAttr(SvgNode* to, const SvgNode* from)
             break;
         }
         case SvgNodeType::Polygon: {
-            to->node.polygon.pointsCount = from->node.polygon.pointsCount;
-            to->node.polygon.points = (float*)malloc(to->node.polygon.pointsCount * sizeof(float));
-            memcpy(to->node.polygon.points, from->node.polygon.points, to->node.polygon.pointsCount * sizeof(float));
+            if ((to->node.polygon.pointsCount = from->node.polygon.pointsCount)) {
+                to->node.polygon.points = (float*)malloc(to->node.polygon.pointsCount * sizeof(float));
+                memcpy(to->node.polygon.points, from->node.polygon.points, to->node.polygon.pointsCount * sizeof(float));
+            }
             break;
         }
         case SvgNodeType::Polyline: {
-            to->node.polyline.pointsCount = from->node.polyline.pointsCount;
-            to->node.polyline.points = (float*)malloc(to->node.polyline.pointsCount * sizeof(float));
-            memcpy(to->node.polyline.points, from->node.polyline.points, to->node.polyline.pointsCount * sizeof(float));
+            if ((to->node.polyline.pointsCount = from->node.polyline.pointsCount)) {
+                to->node.polyline.points = (float*)malloc(to->node.polyline.pointsCount * sizeof(float));
+                memcpy(to->node.polyline.points, from->node.polyline.points, to->node.polyline.pointsCount * sizeof(float));
+            }
             break;
         }
         case SvgNodeType::Image: {
@@ -1806,35 +1916,45 @@ static void _copyAttr(SvgNode* to, const SvgNode* from)
 }
 
 
-static void _cloneNode(SvgNode* from, SvgNode* parent)
+static void _cloneNode(SvgNode* from, SvgNode* parent, int depth)
 {
+    /* Guard against malformed SVG input that would make the clone recurse without end.
+       The limit of 8192 levels is an arbitrary value; a limit derived from experiments is still needed. */
+    if (depth == 8192) {
+        TVGERR("SVG", "Infinite recursive call - stopped after %d calls! Svg file may be incorrectly formatted.", depth);
+        return;
+    }
+
     SvgNode* newNode;
-    if (!from || !parent) return;
+    if (!from || !parent || from == parent) return;  //nothing to clone, or the node would be cloned into itself
 
     newNode = _createNode(parent, from->type);
-
     if (!newNode) return;
 
+    _styleInherit(newNode->style, parent->style);  //runs before _copyAttr(), so the attributes copied from 'from' are applied on top of the inherited style
     _copyAttr(newNode, from);
 
     auto child = from->child.data;
     for (uint32_t i = 0; i < from->child.count; ++i, ++child) {
-        _cloneNode(*child, newNode);
+        _cloneNode(*child, newNode, depth + 1);  //each nesting level counts towards the depth limit checked on entry
     }
 }
 
 
-static void _postponeCloneNode(SvgLoaderData* loader, SvgNode *node, char* id) {
+static void _postponeCloneNode(SvgLoaderData* loader, SvgNode *node, char* id)
+{  //queue the (node, id) pair in loader->cloneNodes; the clone itself is performed later
     loader->cloneNodes.push({node, id});
 }
 
 
-static void _clonePostponedNodes(Array<SvgNodeIdPair>* cloneNodes) {
+static void _clonePostponedNodes(Array<SvgNodeIdPair>* cloneNodes, SvgNode* doc)
+{  //resolve every queued (node, id) pair: look the id up, clone the match under the node, then free the id string
     for (uint32_t i = 0; i < cloneNodes->count; ++i) {
         auto nodeIdPair = cloneNodes->data[i];
         auto defs = _getDefsNode(nodeIdPair.node);
         auto nodeFrom = _findChildById(defs, nodeIdPair.id);
-        _cloneNode(nodeFrom, nodeIdPair.node);
+        if (!nodeFrom) nodeFrom = _findChildById(doc, nodeIdPair.id);  //not found under defs: fall back to a lookup starting at 'doc'
+        _cloneNode(nodeFrom, nodeIdPair.node, 0);  //a null nodeFrom is rejected inside _cloneNode(); depth starts at 0
         free(nodeIdPair.id);
     }
 }
@@ -1875,7 +1995,7 @@ static bool _attrParseUseNode(void* data, const char* key, const char* value)
         defs = _getDefsNode(node);
         nodeFrom = _findChildById(defs, id);
         if (nodeFrom) {
-            _cloneNode(nodeFrom, node);
+            _cloneNode(nodeFrom, node, 0);
             free(id);
         } else {
             //some svg export software include <defs> element at the end of the file
@@ -1883,10 +2003,6 @@ static bool _attrParseUseNode(void* data, const char* key, const char* value)
             //after the whole file is parsed
             _postponeCloneNode(loader, node, id);
         }
-    } else if (!strcmp(key, "clip-path")) {
-        _handleClipPathAttr(loader, node, value);
-    } else if (!strcmp(key, "mask")) {
-        _handleMaskAttr(loader, node, value);
     } else {
         return _attrParseGNode(data, key, value);
     }
@@ -2081,10 +2197,12 @@ static bool _attrParseRadialGradientNode(void* data, const char* key, const char
     }
 
     if (!strcmp(key, "id")) {
+        if (grad->id && value) free(grad->id);
         grad->id = _copyId(value);
     } else if (!strcmp(key, "spreadMethod")) {
         grad->spread = _parseSpreadValue(value);
     } else if (!strcmp(key, "href") || !strcmp(key, "xlink:href")) {
+        if (grad->ref && value) free(grad->ref);
         grad->ref = _idFromHref(value);
     } else if (!strcmp(key, "gradientUnits") && !strcmp(value, "userSpaceOnUse")) {
         grad->userSpace = true;
@@ -2269,10 +2387,12 @@ static bool _attrParseLinearGradientNode(void* data, const char* key, const char
     }
 
     if (!strcmp(key, "id")) {
+        if (grad->id && value) free(grad->id);
         grad->id = _copyId(value);
     } else if (!strcmp(key, "spreadMethod")) {
         grad->spread = _parseSpreadValue(value);
     } else if (!strcmp(key, "href") || !strcmp(key, "xlink:href")) {
+        if (grad->ref && value) free(grad->ref);
         grad->ref = _idFromHref(value);
     } else if (!strcmp(key, "gradientUnits") && !strcmp(value, "userSpaceOnUse")) {
         grad->userSpace = true;
@@ -2408,6 +2528,7 @@ static void _svgLoaderParserXmlOpen(SvgLoaderData* loader, const char* content,
 
     if ((method = _findGroupFactory(tagName))) {
         //Group
+        if (empty) return;
         if (!loader->doc) {
             if (strcmp(tagName, "svg")) return; //Not a valid svg document
             node = method(loader, nullptr, attrs, attrsLength);
@@ -2493,59 +2614,8 @@ static bool _svgLoaderParser(void* data, SimpleXMLType type, const char* content
 }
 
 
-static void _styleInherit(SvgStyleProperty* child, const SvgStyleProperty* parent)
+static void _inefficientNodeCheck(TVG_UNUSED SvgNode* node)
 {
-    if (parent == nullptr) return;
-    //Inherit the property of parent if not present in child.
-    //Fill
-    if (!((int)child->fill.flags & (int)SvgFillFlags::Paint)) {
-        child->fill.paint.color = parent->fill.paint.color;
-        child->fill.paint.none = parent->fill.paint.none;
-        child->fill.paint.curColor = parent->fill.paint.curColor;
-        if (parent->fill.paint.url) child->fill.paint.url = _copyId(parent->fill.paint.url);
-    } else if (child->fill.paint.curColor && !child->curColorSet) {
-        child->color = parent->color;
-    }
-    if (!((int)child->fill.flags & (int)SvgFillFlags::Opacity)) {
-        child->fill.opacity = parent->fill.opacity;
-    }
-    if (!((int)child->fill.flags & (int)SvgFillFlags::FillRule)) {
-        child->fill.fillRule = parent->fill.fillRule;
-    }
-    //Stroke
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Paint)) {
-        child->stroke.paint.color = parent->stroke.paint.color;
-        child->stroke.paint.none = parent->stroke.paint.none;
-        child->stroke.paint.curColor = parent->stroke.paint.curColor;
-        child->stroke.paint.url = parent->stroke.paint.url ? _copyId(parent->stroke.paint.url) : nullptr;
-    } else if (child->stroke.paint.curColor && !child->curColorSet) {
-        child->color = parent->color;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Opacity)) {
-        child->stroke.opacity = parent->stroke.opacity;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Width)) {
-        child->stroke.width = parent->stroke.width;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Dash)) {
-        if (parent->stroke.dash.array.count > 0) {
-            child->stroke.dash.array.clear();
-            child->stroke.dash.array.reserve(parent->stroke.dash.array.count);
-            for (uint32_t i = 0; i < parent->stroke.dash.array.count; ++i) {
-                child->stroke.dash.array.push(parent->stroke.dash.array.data[i]);
-            }
-        }
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Cap)) {
-        child->stroke.cap = parent->stroke.cap;
-    }
-    if (!((int)child->stroke.flags & (int)SvgStrokeFlags::Join)) {
-        child->stroke.join = parent->stroke.join;
-    }
-}
-
-
-static void _inefficientNodeCheck(TVG_UNUSED SvgNode* node){
 #ifdef THORVG_LOG_ENABLED
     auto type = simpleXmlNodeTypeToString(node->type);
 
@@ -2838,14 +2908,14 @@ void SvgLoader::run(unsigned tid)
     if (loaderData.doc) {
         _updateStyle(loaderData.doc, nullptr);
         auto defs = loaderData.doc->node.doc.defs;
-        if (defs) _updateGradient(loaderData.doc, &defs->node.defs.gradients);
-
-        if (loaderData.gradients.count > 0) _updateGradient(loaderData.doc, &loaderData.gradients);
 
         _updateComposite(loaderData.doc, loaderData.doc);
         if (defs) _updateComposite(loaderData.doc, defs);
 
-        if (loaderData.cloneNodes.count > 0) _clonePostponedNodes(&loaderData.cloneNodes);
+        if (loaderData.cloneNodes.count > 0) _clonePostponedNodes(&loaderData.cloneNodes, loaderData.doc);
+
+        if (loaderData.gradients.count > 0) _updateGradient(loaderData.doc, &loaderData.gradients);
+        if (defs) _updateGradient(loaderData.doc, &defs->node.defs.gradients);
     }
     root = svgSceneBuild(loaderData.doc, vx, vy, vw, vh, w, h, preserveAspect, svgPath);
 }
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
index 1571aa4e25..ee199da231 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
+++ b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
@@ -220,15 +220,15 @@ static SimpleXMLType _getXMLType(const char* itr, const char* itrEnd, size_t &to
         if ((itr + sizeof("<!DOCTYPE>") - 1 < itrEnd) && (!memcmp(itr + 2, "DOCTYPE", sizeof("DOCTYPE") - 1)) && ((itr[2 + sizeof("DOCTYPE") - 1] == '>') || (isspace((unsigned char)itr[2 + sizeof("DOCTYPE") - 1])))) {
             toff = sizeof("!DOCTYPE") - 1;
             return SimpleXMLType::Doctype;
-        } else if (itr + sizeof("<!>") - 1 < itrEnd) {
-            toff = sizeof("!") - 1;
-            return SimpleXMLType::DoctypeChild;
         } else if ((itr + sizeof("<![CDATA[]]>") - 1 < itrEnd) && (!memcmp(itr + 2, "[CDATA[", sizeof("[CDATA[") - 1))) {
             toff = sizeof("![CDATA[") - 1;
             return SimpleXMLType::CData;
         } else if ((itr + sizeof("<!---->") - 1 < itrEnd) && (!memcmp(itr + 2, "--", sizeof("--") - 1))) {
             toff = sizeof("!--") - 1;
             return SimpleXMLType::Comment;
+        } else if (itr + sizeof("<!>") - 1 < itrEnd) {
+            toff = sizeof("!") - 1;
+            return SimpleXMLType::DoctypeChild;
         }
         return SimpleXMLType::Open;
     }
diff --git a/thirdparty/thorvg/update-thorvg.sh b/thirdparty/thorvg/update-thorvg.sh
index c200131eba..ce3d5eed1c 100755
--- a/thirdparty/thorvg/update-thorvg.sh
+++ b/thirdparty/thorvg/update-thorvg.sh
@@ -1,4 +1,4 @@
-VERSION=0.7.0
+VERSION=0.7.1
 rm -rf AUTHORS inc LICENSE src *.zip
 curl -L -O https://github.com/Samsung/thorvg/archive/refs/tags/v$VERSION.zip
 bsdtar --strip-components=1 -xvf *.zip