/*
Convection Texture Tools
Copyright (c) 2018-2019 Eric Lasota

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

-------------------------------------------------------------------------------------

Portions based on DirectX Texture Library (DirectXTex)

Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.

http://go.microsoft.com/fwlink/?LinkId=248926
*/
#include "ConvectionKernels_Config.h"

#if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)

#include "ConvectionKernels_BC67.h"

#include "ConvectionKernels_AggregatedError.h"
#include "ConvectionKernels_BCCommon.h"
#include "ConvectionKernels_BC7_Prio.h"
#include "ConvectionKernels_BC7_SingleColor.h"
#include "ConvectionKernels_BC6H_IO.h"
#include "ConvectionKernels_EndpointRefiner.h"
#include "ConvectionKernels_EndpointSelector.h"
#include "ConvectionKernels_IndexSelectorHDR.h"
#include "ConvectionKernels_ParallelMath.h"
#include "ConvectionKernels_UnfinishedEndpoints.h"

namespace cvtt
{
    namespace Internal
    {
        namespace BC67
        {
            typedef ParallelMath::Float MFloat;
            typedef ParallelMath::UInt15 MUInt15;

            struct WorkInfo
            {
                MUInt15 m_mode;
                MFloat m_error;
                MUInt15 m_ep[3][2][4];
                MUInt15 m_indexes[16];
                MUInt15 m_indexes2[16];

                union
                {
                    MUInt15 m_partition;
                    struct IndexSelectorAndRotation
                    {
                        MUInt15 m_indexSelector;
                        MUInt15 m_rotation;
                    } m_isr;
                } m_u;
            };
        }

        namespace BC7Data
        {
            enum AlphaMode
            {
                AlphaMode_Combined,
                AlphaMode_Separate,
                AlphaMode_None,
            };

            enum PBitMode
            {
                PBitMode_PerEndpoint,
                PBitMode_PerSubset,
                PBitMode_None
            };

            struct BC7ModeInfo
            {
                PBitMode m_pBitMode;
                AlphaMode m_alphaMode;
                int m_rgbBits;
                int m_alphaBits;
                int m_partitionBits;
                int m_numSubsets;
                int m_indexBits;
                int m_alphaIndexBits;
                bool m_hasIndexSelector;
            };

            BC7ModeInfo g_modes[] =
            {
                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)

                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
            };

            const int g_weight2[] = { 0, 21, 43, 64 };
            const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
            const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };

            const int *g_weightTables[] =
            {
                NULL,
                NULL,
                g_weight2,
                g_weight3,
                g_weight4
            };

            struct BC6HModeInfo
            {
                uint16_t m_modeID;
                bool m_partitioned;
                bool m_transformed;
                int m_aPrec;
                int m_bPrec[3];
            };

            // [partitioned][precision]
            bool g_hdrModesExistForPrecision[2][17] =
            {
                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
            };

            BC6HModeInfo g_hdrModes[] =
            {
                { 0x00, true,  true,  10,{ 5, 5, 5 } },
                { 0x01, true,  true,  7,{ 6, 6, 6 } },
                { 0x02, true,  true,  11,{ 5, 4, 4 } },
                { 0x06, true,  true,  11,{ 4, 5, 4 } },
                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
                { 0x12, true,  true,  8,{ 6, 5, 5 } },
                { 0x16, true,  true,  8,{ 5, 6, 5 } },
                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
                { 0x03, false, false, 10,{ 10, 10, 10 } },
                { 0x07, false, true,  11,{ 9, 9, 9 } },
                { 0x0b, false, true,  12,{ 8, 8, 8 } },
                { 0x0f, false, true,  16,{ 4, 4, 4 } },
            };

            const int g_maxHDRPrecision = 16;

            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);

            static uint16_t g_partitionMap[64] =
            {
                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
                0xC800, 0xFFEC, 0xFE80, 0xE800,
                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
                0xF710, 0x008E, 0x7100, 0x08CE,
                0x008C, 0x7310, 0x3100, 0x8CCE,
                0x088C, 0x3110, 0x6666, 0x366C,
                0x17E8, 0x0FF0, 0x718E, 0x399C,
                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
                0x3c3c, 0x55aa, 0x9696, 0xa55a,
                0x73ce, 0x13c8, 0x324c, 0x3bdc,
                0x6996, 0xc33c, 0x9966, 0x660,
                0x272, 0x4e4, 0x4e40, 0x2720,
                0xc936, 0x936c, 0x39c6, 0x639c,
                0x9336, 0x9cc6, 0x817e, 0xe718,
                0xccf0, 0xfcc, 0x7744, 0xee22,
            };

            static uint32_t g_partitionMap2[64] =
            {
                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
            };

            static int g_fixupIndexes2[64] =
            {
                15,15,15,15,
                15,15,15,15,
                15,15,15,15,
                15,15,15,15,
                15, 2, 8, 2,
                2, 8, 8,15,
                2, 8, 2, 2,
                8, 8, 2, 2,

                15,15, 6, 8,
                2, 8,15,15,
                2, 8, 2, 2,
                2,15,15, 6,
                6, 2, 6, 8,
                15,15, 2, 2,
                15,15,15,15,
                15, 2, 2,15,
            };

            static int g_fixupIndexes3[64][2] =
            {
                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },

                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
            };

            static const unsigned char g_fragments[] =
            {
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
                0, 1, 2, 3,  // 16, 4
                0, 1, 4,  // 20, 3
                0, 1, 2, 4,  // 23, 4
                2, 3, 7,  // 27, 3
                1, 2, 3, 7,  // 30, 4
                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
                0, 1, 4, 8,  // 42, 4
                0, 1, 2, 4, 5, 8,  // 46, 6
                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
                1, 4, 5, 6, 9,  // 60, 5
                2, 5, 6, 7, 10,  // 65, 5
                5, 6, 9, 10,  // 70, 4
                2, 3, 7, 11,  // 74, 4
                1, 2, 3, 6, 7, 11,  // 78, 6
                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
                0, 4, 8, 12,  // 128, 4
                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
                3, 6, 7, 8, 9, 12,  // 158, 6
                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
                5, 8, 9, 10, 13,  // 192, 5
                8, 12, 13,  // 197, 3
                4, 8, 12, 13,  // 200, 4
                2, 3, 6, 9, 12, 13,  // 204, 6
                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
                6, 9, 10, 11, 14,  // 317, 5
                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
                8, 12, 13, 14,  // 418, 4
                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
                4, 8, 9, 12, 13, 14,  // 430, 6
                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
                3, 7, 11, 15,  // 490, 4
                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
                0, 4, 5, 10, 11, 15,  // 502, 6
                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
                0, 1, 5, 10, 14, 15,  // 669, 6
                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
                11, 14, 15,  // 691, 3
                7, 11, 14, 15,  // 694, 4
                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
                11, 13, 14, 15,  // 792, 4
                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
                7, 10, 11, 13, 14, 15,  // 814, 6
                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
                12, 13, 14, 15,  // 848, 4
                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
                0, 2,  // 1040, 2
                1, 3,  // 1042, 2
                0, 1, 4, 5,  // 1044, 4
                0, 1, 2, 4, 5,  // 1048, 5
                2, 3, 6,  // 1053, 3
                0, 2, 4, 6,  // 1056, 4
                1, 2, 5, 6,  // 1060, 4
                0, 1, 2, 3, 5, 6,  // 1064, 6
                0, 1, 2, 4, 5, 6,  // 1070, 6
                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
                0, 3, 4, 7,  // 1083, 4
                0, 1, 2, 3, 4, 7,  // 1087, 6
                1, 3, 5, 7,  // 1093, 4
                2, 3, 6, 7,  // 1097, 4
                1, 2, 3, 6, 7,  // 1101, 5
                1, 2, 3, 5, 6, 7,  // 1106, 6
                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
                4, 5, 6, 7,  // 1119, 4
                0, 8,  // 1123, 2
                0, 1, 4, 5, 8,  // 1125, 5
                0, 1, 8, 9,  // 1130, 4
                4, 5, 8, 9,  // 1134, 4
                0, 1, 4, 5, 8, 9,  // 1138, 6
                2, 6, 8, 9,  // 1144, 4
                6, 7, 8, 9,  // 1148, 4
                0, 2, 4, 6, 8, 10,  // 1152, 6
                1, 2, 5, 6, 9, 10,  // 1158, 6
                0, 3, 4, 7, 9, 10,  // 1164, 6
                0, 1, 2, 8, 9, 10,  // 1170, 6
                4, 5, 6, 8, 9, 10,  // 1176, 6
                3, 11,  // 1182, 2
                2, 3, 6, 7, 11,  // 1184, 5
                0, 3, 8, 11,  // 1189, 4
                0, 3, 4, 7, 8, 11,  // 1193, 6
                1, 3, 5, 7, 9, 11,  // 1199, 6
                2, 3, 10, 11,  // 1205, 4
                1, 5, 10, 11,  // 1209, 4
                4, 5, 10, 11,  // 1213, 4
                6, 7, 10, 11,  // 1217, 4
                2, 3, 6, 7, 10, 11,  // 1221, 6
                1, 2, 3, 9, 10, 11,  // 1227, 6
                5, 6, 7, 9, 10, 11,  // 1233, 6
                8, 9, 10, 11,  // 1239, 4
                4, 12,  // 1243, 2
                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
                8, 9, 12,  // 1253, 3
                0, 4, 5, 8, 9, 12,  // 1256, 6
                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
                1, 5, 9, 13,  // 1276, 4
                6, 7, 9, 13,  // 1280, 4
                1, 4, 7, 10, 13,  // 1284, 5
                1, 6, 8, 11, 13,  // 1289, 5
                0, 1, 12, 13,  // 1294, 4
                4, 5, 12, 13,  // 1298, 4
                0, 1, 6, 7, 12, 13,  // 1302, 6
                0, 1, 4, 8, 12, 13,  // 1308, 6
                8, 9, 12, 13,  // 1314, 4
                4, 8, 9, 12, 13,  // 1318, 5
                4, 5, 8, 9, 12, 13,  // 1323, 6
                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
                0, 1, 6, 10, 12, 13,  // 1336, 6
                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
                0, 1, 10, 11, 12, 13,  // 1349, 6
                2, 4, 7, 9, 14,  // 1355, 5
                4, 5, 10, 14,  // 1360, 4
                2, 6, 10, 14,  // 1364, 4
                2, 5, 8, 11, 14,  // 1368, 5
                0, 2, 12, 14,  // 1373, 4
                8, 10, 12, 14,  // 1377, 4
                4, 6, 8, 10, 12, 14,  // 1381, 6
                13, 14,  // 1387, 2
                9, 10, 13, 14,  // 1389, 4
                5, 6, 9, 10, 13, 14,  // 1393, 6
                0, 1, 2, 12, 13, 14,  // 1399, 6
                4, 5, 6, 12, 13, 14,  // 1405, 6
                8, 9, 12, 13, 14,  // 1411, 5
                8, 9, 10, 12, 13, 14,  // 1416, 6
                7, 15,  // 1422, 2
                0, 5, 10, 15,  // 1424, 4
                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
                10, 11, 15,  // 1436, 3
                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
                3, 6, 7, 10, 11, 15,  // 1446, 6
                12, 15,  // 1452, 2
                0, 3, 12, 15,  // 1454, 4
                4, 7, 12, 15,  // 1458, 4
                0, 3, 6, 9, 12, 15,  // 1462, 6
                0, 3, 5, 10, 12, 15,  // 1468, 6
                8, 11, 12, 15,  // 1474, 4
                5, 6, 8, 11, 12, 15,  // 1478, 6
                4, 7, 8, 11, 12, 15,  // 1484, 6
                1, 3, 13, 15,  // 1490, 4
                9, 11, 13, 15,  // 1494, 4
                5, 7, 9, 11, 13, 15,  // 1498, 6
                2, 3, 14, 15,  // 1504, 4
                2, 3, 4, 5, 14, 15,  // 1508, 6
                6, 7, 14, 15,  // 1514, 4
                2, 3, 5, 9, 14, 15,  // 1518, 6
                2, 3, 8, 9, 14, 15,  // 1524, 6
                10, 14, 15,  // 1530, 3
                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
                2, 3, 7, 11, 14, 15,  // 1540, 6
                10, 11, 14, 15,  // 1546, 4
                7, 10, 11, 14, 15,  // 1550, 5
                6, 7, 10, 11, 14, 15,  // 1555, 6
                1, 2, 3, 13, 14, 15,  // 1561, 6
                5, 6, 7, 13, 14, 15,  // 1567, 6
                10, 11, 13, 14, 15,  // 1573, 5
                9, 10, 11, 13, 14, 15,  // 1578, 6
                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
                9, 10, 12, 13, 14, 15,  // 1592, 6
                8, 11, 12, 13, 14, 15,  // 1598, 6
                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
            };
            static const int g_shapeRanges[][2] =
            {
                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
                { 1604, 8 },
            };
            static const int g_shapes1[][2] =
            {
                { 0, 16 }
            };
            static const int g_shapes2[64][2] =
            {
                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
            };
            static const int g_shapes3[64][3] =
            {
                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
            };

            static const int g_shapeList1[] =
            {
                0,
            };

            static const int g_shapeList2[] =
            {
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
                122, 123, 124, 125, 126, 127, 128,
            };

            static const int g_shapeList12[] =
            {
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
                121, 122, 123, 124, 125, 126, 127, 128,
            };

            static const int g_shapeList3[] =
            {
                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
                235, 236, 237, 238, 239, 240, 241, 242,
            };

            static const int g_shapeList3Short[] =
            {
                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
                233, 237, 240,
            };

            static const int g_shapeListAll[] =
            {
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
                242,
            };

            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);
        }

        struct PackingVector
        {
            uint32_t m_vector[4];
            int m_offset;

            void Init()
            {
                for (int i = 0; i < 4; i++)
                    m_vector[i] = 0;

                m_offset = 0;
            }

            void InitPacked(const uint32_t *v, int bits)
            {
                for (int b = 0; b < bits; b += 32)
                    m_vector[b / 32] = v[b / 32];

                m_offset = bits;
            }

            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
            {
                int vOffset = m_offset >> 5;
                int bitOffset = m_offset & 0x1f;

                m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);

                int overflowBits = bitOffset + bits - 32;
                if (overflowBits > 0)
                    m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));

                m_offset += bits;
            }

            inline void Flush(uint8_t* output)
            {
                assert(m_offset == 128);

                for (int v = 0; v < 4; v++)
                {
                    uint32_t chunk = m_vector[v];
                    for (int b = 0; b < 4; b++)
                        output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
                }
            }
        };


        struct UnpackingVector
        {
            uint32_t m_vector[4];

            void Init(const uint8_t *bytes)
            {
                for (int i = 0; i < 4; i++)
                    m_vector[i] = 0;

                for (int b = 0; b < 16; b++)
                    m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
            }

            inline void UnpackStart(uint32_t *v, int bits)
            {
                for (int b = 0; b < bits; b += 32)
                    v[b / 32] = m_vector[b / 32];

                int entriesShifted = bits / 32;
                int carry = bits % 32;

                for (int i = entriesShifted; i < 4; i++)
                    m_vector[i - entriesShifted] = m_vector[i];

                int entriesRemaining = 4 - entriesShifted;
                if (carry)
                {
                    uint32_t bitMask = (1 << carry) - 1;
                    for (int i = 0; i < 4; i++)
                    {
                        m_vector[i] >>= carry;
                        if (i != 3)
                            m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - carry);
                    }
                }
            }

            inline ParallelMath::ScalarUInt16 Unpack(int bits)
            {
                uint32_t bitMask = (1 << bits) - 1;

                ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);

                for (int i = 0; i < 4; i++)
                {
                    m_vector[i] >>= bits;
                    if (i != 3)
                        m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
                }

                return result;
            }
        };

        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
        {
            if (isSigned)
            {
                ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
                return (v * 32.0f + offset) / 31.0f;
            }
            else
                return (v * 64.0f + 30.0f) / 31.0f;
        }

        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
        {
#ifdef CVTT_ENABLE_ASSERTS
            for (int i = 0; i < ParallelMath::ParallelSize; i++)
                assert(ParallelMath::Extract(v, i) != -32768)
#endif

                ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));

            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));

            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
        }

        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
        {
            return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
        }

        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
        {
            for (int epi = 0; epi < 2; epi++)
            {
                for (int ch = 0; ch < 3; ch++)
                {
                    if (isSigned)
                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
                    else
                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
                }
            }
        }

        struct SinglePlaneTemporaries
        {
            UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
            UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];

            ParallelMath::UInt15 fragmentBestIndexes[BC7Data::g_numFragments];
            ParallelMath::UInt15 shapeBestEP[BC7Data::g_numShapesAll][2][4];
            ParallelMath::Float shapeBestError[BC7Data::g_numShapesAll];
        };
    }
}

void cvtt::Internal::BC7Computer::TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
{
    ParallelMath::RoundTowardNearestForScope roundingMode;

    float tf[2];
    Util::ComputeTweakFactors(tweak, range, tf);

    MFloat base = ParallelMath::ToFloat(original[0]);
    MFloat offs = ParallelMath::ToFloat(original[1]) - base;

    result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
    result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
}

void cvtt::Internal::BC7Computer::Quantize(MUInt15* color, int bits, int channels)
{
    for (int ch = 0; ch < channels; ch++)
        color[ch] = ParallelMath::RightShift(((color[ch] << bits) - color[ch]) + ParallelMath::MakeUInt15(127 + (1 << (7 - bits))), 8);
}

void cvtt::Internal::BC7Computer::QuantizeP(MUInt15* color, int bits, uint16_t p, int channels)
{
    int16_t addend;
    if (p)
        addend = ((1 << (8 - bits)) - 1);
    else
        addend = 255;

    for (int ch = 0; ch < channels; ch++)
    {
        MUInt16 ch16 = ParallelMath::LosslessCast<MUInt16>::Cast(color[ch]);
        ch16 = ParallelMath::RightShift((ch16 << (bits + 1)) - ch16 + addend, 9);
        ch16 = (ch16 << 1) | ParallelMath::MakeUInt16(p);
        color[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ch16);
    }
}

void cvtt::Internal::BC7Computer::Unquantize(MUInt15* color, int bits, int channels)
{
    for (int ch = 0; ch < channels; ch++)
    {
        MUInt15 clr = color[ch];
        clr = clr << (8 - bits);
        color[ch] = clr | ParallelMath::RightShift(clr, bits);
    }
}

void cvtt::Internal::BC7Computer::CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2])
{
    for (int j = 0; j < 2; j++)
    {
        QuantizeP(ep[j], 4, p[j], 3);
        Unquantize(ep[j], 5, 3);
        ep[j][3] = ParallelMath::MakeUInt15(255);
    }
}

void cvtt::Internal::BC7Computer::CompressEndpoints1(MUInt15 ep[2][4], uint16_t p)
{
    for (int j = 0; j < 2; j++)
    {
        QuantizeP(ep[j], 6, p, 3);
        Unquantize(ep[j], 7, 3);
        ep[j][3] = ParallelMath::MakeUInt15(255);
    }
}

void cvtt::Internal::BC7Computer::CompressEndpoints2(MUInt15 ep[2][4])
{
    for (int j = 0; j < 2; j++)
    {
        Quantize(ep[j], 5, 3);
        Unquantize(ep[j], 5, 3);
        ep[j][3] = ParallelMath::MakeUInt15(255);
    }
}

void cvtt::Internal::BC7Computer::CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2])
{
    for (int j = 0; j < 2; j++)
    {
        QuantizeP(ep[j], 7, p[j], 3);
        ep[j][3] = ParallelMath::MakeUInt15(255);
    }
}

void cvtt::Internal::BC7Computer::CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2])
{
    for (int j = 0; j < 2; j++)
    {
        Quantize(epRGB[j], 5, 3);
        Unquantize(epRGB[j], 5, 3);

        Quantize(epA + j, 6, 1);
        Unquantize(epA + j, 6, 1);
    }
}

void cvtt::Internal::BC7Computer::CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2])
{
    for (int j = 0; j < 2; j++)
    {
        Quantize(epRGB[j], 7, 3);
        Unquantize(epRGB[j], 7, 3);
    }

    // Alpha is full precision
    (void)epA;
}

void cvtt::Internal::BC7Computer::CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2])
{
    for (int j = 0; j < 2; j++)
        QuantizeP(ep[j], 7, p[j], 4);
}

void cvtt::Internal::BC7Computer::CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2])
{
    for (int j = 0; j < 2; j++)
    {
        QuantizeP(ep[j], 5, p[j], 4);
        Unquantize(ep[j], 6, 4);
    }
}

void cvtt::Internal::BC7Computer::TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
{
    MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);

    MUInt15 intAverage[4];
    for (int ch = 0; ch < 4; ch++)
        intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);

    MUInt15 eps[2][4];
    MUInt15 reconstructed[4];
    MUInt15 index = ParallelMath::MakeUInt15(0);

    for (int epi = 0; epi < 2; epi++)
    {
        for (int ch = 0; ch < 3; ch++)
            eps[epi][ch] = ParallelMath::MakeUInt15(0);
        eps[epi][3] = ParallelMath::MakeUInt15(255);
    }

    for (int ch = 0; ch < 3; ch++)
        reconstructed[ch] = ParallelMath::MakeUInt15(0);
    reconstructed[3] = ParallelMath::MakeUInt15(255);

    // Depending on the target index and parity bits, there are multiple valid solid colors.
    // We want to find the one closest to the actual average.
    MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX);
    for (int t = 0; t < numTables; t++)
    {
        const cvtt::Tables::BC7SC::Table& table = *(tables[t]);

        ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];

        MUInt15 candidateReconstructed[4];
        MUInt15 candidateEPs[2][4];

        for (int i = 0; i < ParallelMath::ParallelSize; i++)
        {
            for (int ch = 0; ch < numRealChannels; ch++)
            {
                ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
                assert(avgValue >= 0 && avgValue <= 255);

                const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];

                ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
                ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
                ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
            }
        }

        MFloat avgError = ParallelMath::MakeFloatZero();
        for (int ch = 0; ch < numRealChannels; ch++)
        {
            MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
            avgError = avgError + delta * delta * channelWeightsSq[ch];
        }

        ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
        better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations

        if (ParallelMath::AnySet(better))
        {
            ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);

            MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);

            ParallelMath::ConditionalSet(index, better, candidateIndex);

            for (int ch = 0; ch < numRealChannels; ch++)
                ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);

            for (int epi = 0; epi < 2; epi++)
                for (int ch = 0; ch < numRealChannels; ch++)
                    ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
        }
    }

    AggregatedError<4> aggError;
    for (int pxi = 0; pxi < shapeLength; pxi++)
    {
        int px = fragmentStart[pxi];

        BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
    }

    MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;

    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
    if (ParallelMath::AnySet(better))
    {
        shapeBestError = ParallelMath::Min(shapeBestError, error);
        for (int epi = 0; epi < 2; epi++)
        {
            for (int ch = 0; ch < numRealChannels; ch++)
                ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
        }

        for (int pxi = 0; pxi < shapeLength; pxi++)
            ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
    }
}

void cvtt::Internal::BC7Computer::TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
{
    if (numRefineRounds < 1)
        numRefineRounds = 1;

    float channelWeightsSq[4];

    for (int ch = 0; ch < 4; ch++)
        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];

    SinglePlaneTemporaries temps;

    MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
    MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
    ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
    for (int px = 0; px < 16; px++)
    {
        MUInt15 a = pixels[px][3];
        maxAlpha = ParallelMath::Max(maxAlpha, a);
        minAlpha = ParallelMath::Min(minAlpha, a);

        isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
    }

    ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
    ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);

    bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);

    // Try RGB modes if any block has a min alpha 251 or higher
    bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));

    // Try mode 7 if any block has alpha.
    // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
    // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
    // situations, and only by at most 1 unit of error per pixel.
    bool allowMode7 = anyBlockHasAlpha || (encodingPlan.mode7RGBPartitionEnabled != 0);

    MFloat preWeightedPixels[16][4];

    BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);

    // Get initial RGB endpoints
    if (allowRGBModes)
    {
        const uint8_t *shapeList = encodingPlan.rgbShapeList;
        int numShapesToEvaluate = encodingPlan.rgbNumShapesToEvaluate;

        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
        {
            int shape = shapeList[shapeIter];

            int shapeStart = BC7Data::g_shapeRanges[shape][0];
            int shapeSize = BC7Data::g_shapeRanges[shape][1];

            EndpointSelector<3, 8> epSelector;

            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
            {
                for (int spx = 0; spx < shapeSize; spx++)
                {
                    int px = BC7Data::g_fragments[shapeStart + spx];
                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
                }
                epSelector.FinishPass(epPass);
            }
            temps.unfinishedRGB[shape] = epSelector.GetEndpoints(channelWeights);
        }
    }

    // Get initial RGBA endpoints
    {
        const uint8_t *shapeList = encodingPlan.rgbaShapeList;
        int numShapesToEvaluate = encodingPlan.rgbaNumShapesToEvaluate;

        for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
        {
            int shape = shapeList[shapeIter];

            if (anyBlockHasAlpha || !allowRGBModes)
            {
                int shapeStart = BC7Data::g_shapeRanges[shape][0];
                int shapeSize = BC7Data::g_shapeRanges[shape][1];

                EndpointSelector<4, 8> epSelector;

                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
                {
                    for (int spx = 0; spx < shapeSize; spx++)
                    {
                        int px = BC7Data::g_fragments[shapeStart + spx];
                        epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
                    }
                    epSelector.FinishPass(epPass);
                }
                temps.unfinishedRGBA[shape] = epSelector.GetEndpoints(channelWeights);
            }
            else
            {
                temps.unfinishedRGBA[shape] = temps.unfinishedRGB[shape].ExpandTo<4>(255);
            }
        }
    }

    for (uint16_t mode = 0; mode <= 7; mode++)
    {
        if (mode == 4 || mode == 5)
            continue;

        if (mode < 4 && !allowRGBModes)
            continue;

        if (mode == 7 && !allowMode7)
            continue;

        uint64_t partitionEnabledBits = 0;
        switch (mode)
        {
        case 0:
            partitionEnabledBits = encodingPlan.mode0PartitionEnabled;
            break;
        case 1:
            partitionEnabledBits = encodingPlan.mode1PartitionEnabled;
            break;
        case 2:
            partitionEnabledBits = encodingPlan.mode2PartitionEnabled;
            break;
        case 3:
            partitionEnabledBits = encodingPlan.mode3PartitionEnabled;
            break;
        case 6:
            partitionEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
            break;
        case 7:
            if (anyBlockHasAlpha)
                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
            else
                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
            break;
        default:
            break;
        }

        bool isRGB = (mode < 4);

        unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
        int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
        int indexPrec = BC7Data::g_modes[mode].m_indexBits;

        int parityBitMax = 1;
        if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
            parityBitMax = 4;
        else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
            parityBitMax = 2;

        int numRealChannels = isRGB ? 3 : 4;

        int numShapes;
        const int *shapeList;

        if (numSubsets == 1)
        {
            numShapes = BC7Data::g_numShapes1;
            shapeList = BC7Data::g_shapeList1;
        }
        else if (numSubsets == 2)
        {
            numShapes = BC7Data::g_numShapes2;
            shapeList = BC7Data::g_shapeList2;
        }
        else
        {
            assert(numSubsets == 3);
            if (numPartitions == 16)
            {
                numShapes = BC7Data::g_numShapes3Short;
                shapeList = BC7Data::g_shapeList3Short;
            }
            else
            {
                assert(numPartitions == 64);
                numShapes = BC7Data::g_numShapes3;
                shapeList = BC7Data::g_shapeList3;
            }
        }

        for (int slot = 0; slot < BC7Data::g_numShapesAll; slot++)
            temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);

        for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
        {
            int shape = shapeList[shapeIter];

            int numTweakRounds = 0;
            if (isRGB)
                numTweakRounds = encodingPlan.seedPointsForShapeRGB[shape];
            else
                numTweakRounds = encodingPlan.seedPointsForShapeRGBA[shape];

            if (numTweakRounds == 0)
                continue;

            if (numTweakRounds > MaxTweakRounds)
                numTweakRounds = MaxTweakRounds;

            int shapeStart = BC7Data::g_shapeRanges[shape][0];
            int shapeLength = BC7Data::g_shapeRanges[shape][1];

            AggregatedError<1> alphaAggError;
            if (isRGB && anyBlockHasAlpha)
            {
                MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };

                for (int pxi = 0; pxi < shapeLength; pxi++)
                {
                    int px = BC7Data::g_fragments[shapeStart + pxi];
                    MUInt15 original[1] = { pixels[px][3] };
                    BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
                }
            }

            float alphaWeightsSq[1] = { channelWeightsSq[3] };
            MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);

            MUInt15 tweakBaseEP[MaxTweakRounds][2][4];

            for (int tweak = 0; tweak < numTweakRounds; tweak++)
            {
                if (isRGB)
                {
                    temps.unfinishedRGB[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
                    tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
                }
                else
                {
                    temps.unfinishedRGBA[shape].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
                }
            }

            ParallelMath::Int16CompFlag punchThroughInvalid[4];
            for (int pIter = 0; pIter < parityBitMax; pIter++)
            {
                punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);

                if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
                {
                    // Modes 6 and 7 have parity bits that affect alpha
                    if (pIter == 0)
                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
                    else if (pIter == parityBitMax - 1)
                        punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
                    else
                        punchThroughInvalid[pIter] = isPunchThrough;
                }
            }

            for (int pIter = 0; pIter < parityBitMax; pIter++)
            {
                if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
                    continue;

                bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);

                for (int tweak = 0; tweak < numTweakRounds; tweak++)
                {
                    uint16_t p[2];
                    p[0] = (pIter & 1);
                    p[1] = ((pIter >> 1) & 1);

                    MUInt15 ep[2][4];

                    for (int epi = 0; epi < 2; epi++)
                        for (int ch = 0; ch < 4; ch++)
                            ep[epi][ch] = tweakBaseEP[tweak][epi][ch];

                    for (int refine = 0; refine < numRefineRounds; refine++)
                    {
                        switch (mode)
                        {
                        case 0:
                            CompressEndpoints0(ep, p);
                            break;
                        case 1:
                            CompressEndpoints1(ep, p[0]);
                            break;
                        case 2:
                            CompressEndpoints2(ep);
                            break;
                        case 3:
                            CompressEndpoints3(ep, p);
                            break;
                        case 6:
                            CompressEndpoints6(ep, p);
                            break;
                        case 7:
                            CompressEndpoints7(ep, p);
                            break;
                        default:
                            assert(false);
                            break;
                        };

                        MFloat shapeError = ParallelMath::MakeFloatZero();

                        IndexSelector<4> indexSelector;
                        indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);

                        EndpointRefiner<4> epRefiner;
                        epRefiner.Init(1 << indexPrec, channelWeights);

                        MUInt15 indexes[16];

                        AggregatedError<4> aggError;
                        for (int pxi = 0; pxi < shapeLength; pxi++)
                        {
                            int px = BC7Data::g_fragments[shapeStart + pxi];

                            MUInt15 index;
                            MUInt15 reconstructed[4];

                            index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
                            indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);

                            if (flags & cvtt::Flags::BC7_FastIndexing)
                                BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
                            else
                            {
                                MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);

                                MUInt15 altIndexes[2];
                                altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
                                altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));

                                for (int ii = 0; ii < 2; ii++)
                                {
                                    indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);

                                    MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
                                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
                                    error = ParallelMath::Min(error, altError);
                                    ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
                                }

                                shapeError = shapeError + error;
                            }

                            if (refine != numRefineRounds - 1)
                                epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);

                            indexes[pxi] = index;
                        }

                        if (flags & cvtt::Flags::BC7_FastIndexing)
                            shapeError = aggError.Finalize(flags, channelWeightsSq);

                        if (isRGB)
                            shapeError = shapeError + staticAlphaError;

                        ParallelMath::FloatCompFlag shapeErrorBetter;
                        ParallelMath::Int16CompFlag shapeErrorBetter16;

                        shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shape]);
                        shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);

                        if (ParallelMath::AnySet(shapeErrorBetter16))
                        {
                            bool punchThroughOK = true;
                            if (needPunchThroughCheck)
                            {
                                shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
                                shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);

                                if (!ParallelMath::AnySet(shapeErrorBetter16))
                                    punchThroughOK = false;
                            }

                            if (punchThroughOK)
                            {
                                ParallelMath::ConditionalSet(temps.shapeBestError[shape], shapeErrorBetter, shapeError);
                                for (int epi = 0; epi < 2; epi++)
                                    for (int ch = 0; ch < numRealChannels; ch++)
                                        ParallelMath::ConditionalSet(temps.shapeBestEP[shape][epi][ch], shapeErrorBetter16, ep[epi][ch]);

                                for (int pxi = 0; pxi < shapeLength; pxi++)
                                    ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
                            }
                        }

                        if (refine != numRefineRounds - 1)
                            epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
                    } // refine
                } // tweak
            } // p

            if (flags & cvtt::Flags::BC7_TrySingleColor)
            {
                MUInt15 total[4];
                for (int ch = 0; ch < 4; ch++)
                    total[ch] = ParallelMath::MakeUInt15(0);

                for (int pxi = 0; pxi < shapeLength; pxi++)
                {
                    int px = BC7Data::g_fragments[shapeStart + pxi];
                    for (int ch = 0; ch < 4; ch++)
                        total[ch] = total[ch] + pixels[pxi][ch];
                }

                MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
                MFloat average[4];
                for (int ch = 0; ch < 4; ch++)
                    average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;

                const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
                MFloat &shapeBestError = temps.shapeBestError[shape];
                MUInt15 (&shapeBestEP)[2][4] = temps.shapeBestEP[shape];
                MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;

                const cvtt::Tables::BC7SC::Table **scTables = NULL;
                int numSCTables = 0;

                const cvtt::Tables::BC7SC::Table *tables0[] =
                {
                    &cvtt::Tables::BC7SC::g_mode0_p00_i1,
                    &cvtt::Tables::BC7SC::g_mode0_p00_i2,
                    &cvtt::Tables::BC7SC::g_mode0_p00_i3,
                    &cvtt::Tables::BC7SC::g_mode0_p01_i1,
                    &cvtt::Tables::BC7SC::g_mode0_p01_i2,
                    &cvtt::Tables::BC7SC::g_mode0_p01_i3,
                    &cvtt::Tables::BC7SC::g_mode0_p10_i1,
                    &cvtt::Tables::BC7SC::g_mode0_p10_i2,
                    &cvtt::Tables::BC7SC::g_mode0_p10_i3,
                    &cvtt::Tables::BC7SC::g_mode0_p11_i1,
                    &cvtt::Tables::BC7SC::g_mode0_p11_i2,
                    &cvtt::Tables::BC7SC::g_mode0_p11_i3,
                };

                const cvtt::Tables::BC7SC::Table *tables1[] =
                {
                    &cvtt::Tables::BC7SC::g_mode1_p0_i1,
                    &cvtt::Tables::BC7SC::g_mode1_p0_i2,
                    &cvtt::Tables::BC7SC::g_mode1_p0_i3,
                    &cvtt::Tables::BC7SC::g_mode1_p1_i1,
                    &cvtt::Tables::BC7SC::g_mode1_p1_i2,
                    &cvtt::Tables::BC7SC::g_mode1_p1_i3,
                };

                const cvtt::Tables::BC7SC::Table *tables2[] =
                {
                    &cvtt::Tables::BC7SC::g_mode2,
                };

                const cvtt::Tables::BC7SC::Table *tables3[] =
                {
                    &cvtt::Tables::BC7SC::g_mode3_p0,
                    &cvtt::Tables::BC7SC::g_mode3_p1,
                };

                const cvtt::Tables::BC7SC::Table *tables6[] =
                {
                    &cvtt::Tables::BC7SC::g_mode6_p0_i1,
                    &cvtt::Tables::BC7SC::g_mode6_p0_i2,
                    &cvtt::Tables::BC7SC::g_mode6_p0_i3,
                    &cvtt::Tables::BC7SC::g_mode6_p0_i4,
                    &cvtt::Tables::BC7SC::g_mode6_p0_i5,
                    &cvtt::Tables::BC7SC::g_mode6_p0_i6,
                    &cvtt::Tables::BC7SC::g_mode6_p0_i7,
                    &cvtt::Tables::BC7SC::g_mode6_p1_i1,
                    &cvtt::Tables::BC7SC::g_mode6_p1_i2,
                    &cvtt::Tables::BC7SC::g_mode6_p1_i3,
                    &cvtt::Tables::BC7SC::g_mode6_p1_i4,
                    &cvtt::Tables::BC7SC::g_mode6_p1_i5,
                    &cvtt::Tables::BC7SC::g_mode6_p1_i6,
                    &cvtt::Tables::BC7SC::g_mode6_p1_i7,
                };

                const cvtt::Tables::BC7SC::Table *tables7[] =
                {
                    &cvtt::Tables::BC7SC::g_mode7_p00,
                    &cvtt::Tables::BC7SC::g_mode7_p01,
                    &cvtt::Tables::BC7SC::g_mode7_p10,
                    &cvtt::Tables::BC7SC::g_mode7_p11,
                };

                switch (mode)
                {
                case 0:
                {
                    scTables = tables0;
                    numSCTables = sizeof(tables0) / sizeof(tables0[0]);
                }
                break;
                case 1:
                {
                    scTables = tables1;
                    numSCTables = sizeof(tables1) / sizeof(tables1[0]);
                }
                break;
                case 2:
                {

                    scTables = tables2;
                    numSCTables = sizeof(tables2) / sizeof(tables2[0]);
                }
                break;
                case 3:
                {
                    scTables = tables3;
                    numSCTables = sizeof(tables3) / sizeof(tables3[0]);
                }
                break;
                case 6:
                {
                    scTables = tables6;
                    numSCTables = sizeof(tables6) / sizeof(tables6[0]);
                }
                break;
                case 7:
                {
                    scTables = tables7;
                    numSCTables = sizeof(tables7) / sizeof(tables7[0]);
                }
                break;
                default:
                    assert(false);
                    break;
                }

                TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
            }
        } // shapeIter

        uint64_t partitionsEnabledBits = 0xffffffffffffffffULL;

        switch (mode)
        {
        case 0:
            partitionsEnabledBits = encodingPlan.mode0PartitionEnabled;
            break;
        case 1:
            partitionsEnabledBits = encodingPlan.mode1PartitionEnabled;
            break;
        case 2:
            partitionsEnabledBits = encodingPlan.mode2PartitionEnabled;
            break;
        case 3:
            partitionsEnabledBits = encodingPlan.mode3PartitionEnabled;
            break;
        case 6:
            partitionsEnabledBits = encodingPlan.mode6Enabled ? 1 : 0;
            break;
        case 7:
            if (anyBlockHasAlpha)
                partitionEnabledBits = encodingPlan.mode7RGBAPartitionEnabled;
            else
                partitionEnabledBits = encodingPlan.mode7RGBPartitionEnabled;
            break;
        default:
            break;
        };

        for (uint16_t partition = 0; partition < numPartitions; partition++)
        {
            if (((partitionsEnabledBits >> partition) & 1) == 0)
                continue;

            const int *partitionShapes;
            if (numSubsets == 1)
                partitionShapes = BC7Data::g_shapes1[partition];
            else if (numSubsets == 2)
                partitionShapes = BC7Data::g_shapes2[partition];
            else
            {
                assert(numSubsets == 3);
                partitionShapes = BC7Data::g_shapes3[partition];
            }

            MFloat totalError = ParallelMath::MakeFloatZero();
            for (int subset = 0; subset < numSubsets; subset++)
                totalError = totalError + temps.shapeBestError[partitionShapes[subset]];

            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);

            if (mode == 7 && anyBlockHasAlpha)
            {
                // Some lanes could be better, but we filter them out to ensure consistency with scalar
                bool isRGBAllowedForThisPartition = (((encodingPlan.mode7RGBPartitionEnabled >> partition) & 1) != 0);

                if (!isRGBAllowedForThisPartition)
                {
                    errorBetter16 = (errorBetter16 & blockHasNonMaxAlpha);
                    errorBetter = ParallelMath::Int16FlagToFloat(errorBetter16);
                }
            }

            if (ParallelMath::AnySet(errorBetter16))
            {
                for (int subset = 0; subset < numSubsets; subset++)
                {
                    int shape = partitionShapes[subset];
                    int shapeStart = BC7Data::g_shapeRanges[shape][0];
                    int shapeLength = BC7Data::g_shapeRanges[shape][1];

                    for (int epi = 0; epi < 2; epi++)
                        for (int ch = 0; ch < 4; ch++)
                            ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shape][epi][ch]);

                    for (int pxi = 0; pxi < shapeLength; pxi++)
                    {
                        int px = BC7Data::g_fragments[shapeStart + pxi];
                        ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
                    }
                }

                ParallelMath::ConditionalSet(work.m_error, errorBetter, totalError);
                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
                ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
            }
        }
    }
}

void cvtt::Internal::BC7Computer::TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds, BC67::WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
{
    // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
    // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
    // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
    // - Separate alpha channel, then weighted RGB
    // - Alpha+2 other channels, then the independent channel
    if (numRefineRounds < 1)
        numRefineRounds = 1;

    float channelWeightsSq[4];
    for (int ch = 0; ch < 4; ch++)
        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];

    for (uint16_t mode = 4; mode <= 5; mode++)
    {
        int numSP[2] = { 0, 0 };

        for (uint16_t rotation = 0; rotation < 4; rotation++)
        {
            if (mode == 4)
            {
                numSP[0] = encodingPlan.mode4SP[rotation][0];
                numSP[1] = encodingPlan.mode4SP[rotation][1];
            }
            else
                numSP[0] = numSP[1] = encodingPlan.mode5SP[rotation];

            if (numSP[0] == 0 && numSP[1] == 0)
                continue;

            int alphaChannel = (rotation + 3) & 3;
            int redChannel = (rotation == 1) ? 3 : 0;
            int greenChannel = (rotation == 2) ? 3 : 1;
            int blueChannel = (rotation == 3) ? 3 : 2;

            MUInt15 rotatedRGB[16][3];
            MFloat floatRotatedRGB[16][3];

            for (int px = 0; px < 16; px++)
            {
                rotatedRGB[px][0] = pixels[px][redChannel];
                rotatedRGB[px][1] = pixels[px][greenChannel];
                rotatedRGB[px][2] = pixels[px][blueChannel];

                for (int ch = 0; ch < 3; ch++)
                    floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
            }

            uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;

            float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
            float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
            float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
            float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };

            float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error

            MFloat preWeightedRotatedRGB[16][3];
            BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);

            for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
            {
                int numTweakRounds = numSP[indexSelector];

                if (numTweakRounds <= 0)
                    continue;

                if (numTweakRounds > MaxTweakRounds)
                    numTweakRounds = MaxTweakRounds;

                EndpointSelector<3, 8> rgbSelector;

                for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
                {
                    for (int px = 0; px < 16; px++)
                        rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));

                    rgbSelector.FinishPass(epPass);
                }

                MUInt15 alphaRange[2];

                alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
                for (int px = 1; px < 16; px++)
                {
                    alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
                    alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
                }

                int rgbPrec = 0;
                int alphaPrec = 0;

                if (mode == 4)
                {
                    rgbPrec = indexSelector ? 3 : 2;
                    alphaPrec = indexSelector ? 2 : 3;
                }
                else
                    rgbPrec = alphaPrec = 2;

                UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);

                MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
                MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);

                MUInt15 bestRGBIndexes[16];
                MUInt15 bestAlphaIndexes[16];
                MUInt15 bestEP[2][4];

                for (int px = 0; px < 16; px++)
                    bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);

                for (int tweak = 0; tweak < numTweakRounds; tweak++)
                {
                    MUInt15 rgbEP[2][3];
                    MUInt15 alphaEP[2];

                    unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);

                    TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);

                    for (int refine = 0; refine < numRefineRounds; refine++)
                    {
                        if (mode == 4)
                            CompressEndpoints4(rgbEP, alphaEP);
                        else
                            CompressEndpoints5(rgbEP, alphaEP);


                        IndexSelector<1> alphaIndexSelector;
                        IndexSelector<3> rgbIndexSelector;

                        {
                            MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
                            alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
                        }
                        rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);

                        EndpointRefiner<3> rgbRefiner;
                        EndpointRefiner<1> alphaRefiner;

                        rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
                        alphaRefiner.Init(1 << alphaPrec, uniformWeight);

                        MFloat errorRGB = ParallelMath::MakeFloatZero();
                        MFloat errorA = ParallelMath::MakeFloatZero();

                        MUInt15 rgbIndexes[16];
                        MUInt15 alphaIndexes[16];

                        AggregatedError<3> rgbAggError;
                        AggregatedError<1> alphaAggError;

                        for (int px = 0; px < 16; px++)
                        {
                            MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
                            MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);

                            MUInt15 reconstructedRGB[3];
                            MUInt15 reconstructedAlpha[1];

                            rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
                            alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);

                            if (flags & cvtt::Flags::BC7_FastIndexing)
                            {
                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
                            }
                            else
                            {
                                AggregatedError<3> baseRGBAggError;
                                AggregatedError<1> baseAlphaAggError;

                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);

                                MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
                                MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);

                                MUInt15 altRGBIndexes[2];
                                MUInt15 altAlphaIndexes[2];

                                altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
                                altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));

                                altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
                                altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));

                                for (int ii = 0; ii < 2; ii++)
                                {
                                    rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
                                    alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);

                                    AggregatedError<3> altRGBAggError;
                                    AggregatedError<1> altAlphaAggError;

                                    BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
                                    BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);

                                    MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
                                    MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);

                                    ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
                                    ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));

                                    rgbError = ParallelMath::Min(altRGBError, rgbError);
                                    alphaError = ParallelMath::Min(altAlphaError, alphaError);

                                    ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
                                    ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
                                }

                                errorRGB = errorRGB + rgbError;
                                errorA = errorA + alphaError;
                            }

                            if (refine != numRefineRounds - 1)
                            {
                                rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
                                alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
                            }

                            if (flags & Flags::BC7_FastIndexing)
                            {
                                errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
                                errorA = alphaAggError.Finalize(flags, rotatedAlphaWeightSq);
                            }

                            rgbIndexes[px] = rgbIndex;
                            alphaIndexes[px] = alphaIndex;
                        }

                        ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
                        ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);

                        ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
                        ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);

                        if (ParallelMath::AnySet(rgbBetterInt16))
                        {
                            bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);

                            for (int px = 0; px < 16; px++)
                                ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);

                            for (int ep = 0; ep < 2; ep++)
                            {
                                for (int ch = 0; ch < 3; ch++)
                                    ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
                            }
                        }

                        if (ParallelMath::AnySet(alphaBetterInt16))
                        {
                            bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);

                            for (int px = 0; px < 16; px++)
                                ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);

                            for (int ep = 0; ep < 2; ep++)
                                ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
                        }

                        if (refine != numRefineRounds - 1)
                        {
                            rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);

                            MUInt15 alphaEPTemp[2][1];
                            alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);

                            for (int i = 0; i < 2; i++)
                                alphaEP[i] = alphaEPTemp[i][0];
                        }
                    }	// refine
                } // tweak

                MFloat combinedError = bestRGBError + bestAlphaError;

                ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
                ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);

                work.m_error = ParallelMath::Min(combinedError, work.m_error);

                ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
                ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
                ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));

                for (int px = 0; px < 16; px++)
                {
                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
                    ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
                }

                for (int ep = 0; ep < 2; ep++)
                    for (int ch = 0; ch < 4; ch++)
                        ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
            }
        }
    }
}

template<class T>
void cvtt::Internal::BC7Computer::Swap(T& a, T& b)
{
    T temp = a;
    a = b;
    b = temp;
}

void cvtt::Internal::BC7Computer::Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], const BC7EncodingPlan &encodingPlan, int numRefineRounds)
{
    MUInt15 pixels[16][4];
    MFloat floatPixels[16][4];

    for (int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 4; ch++)
            ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
    }

    for (int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 4; ch++)
            floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
    }

    BC67::WorkInfo work;
    memset(&work, 0, sizeof(work));

    work.m_error = ParallelMath::MakeFloat(FLT_MAX);

    {
        ParallelMath::RoundTowardNearestForScope rtn;
        TrySinglePlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
        TryDualPlane(flags, pixels, floatPixels, channelWeights, encodingPlan, numRefineRounds, work, &rtn);
    }

    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        PackingVector pv;
        pv.Init();

        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
        ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);

        const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];

        ParallelMath::ScalarUInt16 indexes[16];
        ParallelMath::ScalarUInt16 indexes2[16];
        ParallelMath::ScalarUInt16 endPoints[3][2][4];

        for (int i = 0; i < 16; i++)
        {
            indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
            if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
                indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
        }

        for (int subset = 0; subset < 3; subset++)
        {
            for (int ep = 0; ep < 2; ep++)
            {
                for (int ch = 0; ch < 4; ch++)
                    endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
            }
        }

        int fixups[3] = { 0, 0, 0 };

        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
        {
            bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
            bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);

            if (flipRGB)
            {
                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
                for (int px = 0; px < 16; px++)
                    indexes[px] = highIndex - indexes[px];
            }

            if (flipAlpha)
            {
                uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
                for (int px = 0; px < 16; px++)
                    indexes2[px] = highIndex - indexes2[px];
            }

            if (indexSelector)
                Swap(flipRGB, flipAlpha);

            if (flipRGB)
            {
                for (int ch = 0; ch < 3; ch++)
                    Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
            }
            if (flipAlpha)
                Swap(endPoints[0][0][3], endPoints[0][1][3]);

        }
        else
        {
            if (modeInfo.m_numSubsets == 2)
                fixups[1] = BC7Data::g_fixupIndexes2[partition];
            else if (modeInfo.m_numSubsets == 3)
            {
                fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
                fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
            }

            bool flip[3] = { false, false, false };
            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
                flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);

            if (flip[0] || flip[1] || flip[2])
            {
                uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
                for (int px = 0; px < 16; px++)
                {
                    int subset = 0;
                    if (modeInfo.m_numSubsets == 2)
                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
                    else if (modeInfo.m_numSubsets == 3)
                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;

                    if (flip[subset])
                        indexes[px] = highIndex - indexes[px];
                }

                int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
                {
                    if (flip[subset])
                        for (int ch = 0; ch < maxCH; ch++)
                            Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
                }
            }
        }

        pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);

        if (modeInfo.m_partitionBits)
            pv.Pack(partition, modeInfo.m_partitionBits);

        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
        {
            ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
            pv.Pack(rotation, 2);
        }

        if (modeInfo.m_hasIndexSelector)
            pv.Pack(indexSelector, 1);

        // Encode RGB
        for (int ch = 0; ch < 3; ch++)
        {
            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
            {
                for (int ep = 0; ep < 2; ep++)
                {
                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
                    epPart >>= (8 - modeInfo.m_rgbBits);

                    pv.Pack(epPart, modeInfo.m_rgbBits);
                }
            }
        }

        // Encode alpha
        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
        {
            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
            {
                for (int ep = 0; ep < 2; ep++)
                {
                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
                    epPart >>= (8 - modeInfo.m_alphaBits);

                    pv.Pack(epPart, modeInfo.m_alphaBits);
                }
            }
        }

        // Encode parity bits
        if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
        {
            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
            {
                ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
                epPart >>= (7 - modeInfo.m_rgbBits);
                epPart &= 1;

                pv.Pack(epPart, 1);
            }
        }
        else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
        {
            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
            {
                for (int ep = 0; ep < 2; ep++)
                {
                    ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
                    epPart >>= (7 - modeInfo.m_rgbBits);
                    epPart &= 1;

                    pv.Pack(epPart, 1);
                }
            }
        }

        // Encode indexes
        for (int px = 0; px < 16; px++)
        {
            int bits = modeInfo.m_indexBits;
            if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
                bits--;

            pv.Pack(indexes[px], bits);
        }

        // Encode secondary indexes
        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
        {
            for (int px = 0; px < 16; px++)
            {
                int bits = modeInfo.m_alphaIndexBits;
                if (px == 0)
                    bits--;

                pv.Pack(indexes2[px], bits);
            }
        }

        pv.Flush(packedBlocks);

        packedBlocks += 16;
    }
}

void cvtt::Internal::BC7Computer::UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
{
    UnpackingVector pv;
    pv.Init(packedBlock);

    int mode = 8;
    for (int i = 0; i < 8; i++)
    {
        if (pv.Unpack(1) == 1)
        {
            mode = i;
            break;
        }
    }

    if (mode > 7)
    {
        for (int px = 0; px < 16; px++)
            for (int ch = 0; ch < 4; ch++)
                output.m_pixels[px][ch] = 0;

        return;
    }

    const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];

    int partition = 0;
    if (modeInfo.m_partitionBits)
        partition = pv.Unpack(modeInfo.m_partitionBits);

    int rotation = 0;
    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
        rotation = pv.Unpack(2);

    int indexSelector = 0;
    if (modeInfo.m_hasIndexSelector)
        indexSelector = pv.Unpack(1);

    // Resolve fixups
    int fixups[3] = { 0, 0, 0 };

    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
    {
        if (modeInfo.m_numSubsets == 2)
            fixups[1] = BC7Data::g_fixupIndexes2[partition];
        else if (modeInfo.m_numSubsets == 3)
        {
            fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
            fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
        }
    }

    int endPoints[3][2][4];

    // Decode RGB
    for (int ch = 0; ch < 3; ch++)
    {
        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
        {
            for (int ep = 0; ep < 2; ep++)
                endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
        }
    }

    // Decode alpha
    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
    {
        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
        {
            for (int ep = 0; ep < 2; ep++)
                endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
        }
    }
    else
    {
        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
        {
            for (int ep = 0; ep < 2; ep++)
                endPoints[subset][ep][3] = 255;
        }
    }

    int parityBits = 0;

    // Decode parity bits
    if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
    {
        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
        {
            int p = pv.Unpack(1);

            for (int ep = 0; ep < 2; ep++)
            {
                for (int ch = 0; ch < 3; ch++)
                    endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);

                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
                    endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
            }
        }

        parityBits = 1;
    }
    else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
    {
        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
        {
            for (int ep = 0; ep < 2; ep++)
            {
                int p = pv.Unpack(1);

                for (int ch = 0; ch < 3; ch++)
                    endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);

                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
                    endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
            }
        }

        parityBits = 1;
    }

    // Fill endpoint bits
    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
    {
        for (int ep = 0; ep < 2; ep++)
        {
            for (int ch = 0; ch < 3; ch++)
                endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));

            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
                endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
        }
    }

    int indexes[16];
    int indexes2[16];

    // Decode indexes
    for (int px = 0; px < 16; px++)
    {
        int bits = modeInfo.m_indexBits;
        if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
            bits--;

        indexes[px] = pv.Unpack(bits);
    }

    // Decode secondary indexes
    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
    {
        for (int px = 0; px < 16; px++)
        {
            int bits = modeInfo.m_alphaIndexBits;
            if (px == 0)
                bits--;

            indexes2[px] = pv.Unpack(bits);
        }
    }
    else
    {
        for (int px = 0; px < 16; px++)
            indexes2[px] = 0;
    }

    const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
    const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];

    // Decode each pixel
    for (int px = 0; px < 16; px++)
    {
        int rgbWeight = 0;
        int alphaWeight = 0;

        int rgbIndex = indexes[px];

        rgbWeight = rgbWeights[indexes[px]];

        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
            alphaWeight = rgbWeight;
        else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
            alphaWeight = alphaWeights[indexes2[px]];

        if (indexSelector == 1)
        {
            int temp = rgbWeight;
            rgbWeight = alphaWeight;
            alphaWeight = temp;
        }

        int pixel[4] = { 0, 0, 0, 255 };

        int subset = 0;

        if (modeInfo.m_numSubsets == 2)
            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
        else if (modeInfo.m_numSubsets == 3)
            subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;

        for (int ch = 0; ch < 3; ch++)
            pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;

        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
            pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;

        if (rotation != 0)
        {
            int ch = rotation - 1;
            int temp = pixel[ch];
            pixel[ch] = pixel[3];
            pixel[3] = temp;
        }

        for (int ch = 0; ch < 4; ch++)
            output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
    }
}

cvtt::ParallelMath::SInt16 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
{
    assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
    assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));

    // Expand to full range
    ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
    MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));

    absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);

    MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);

    return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
}

cvtt::ParallelMath::UInt15 cvtt::Internal::BC6HComputer::QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
{
    MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
    return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
}

void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
{
    MSInt16 zero = ParallelMath::MakeSInt16(0);

    ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
    MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));

    MSInt16 unq;
    MUInt15 absUnq;

    if (precision >= 16)
    {
        unq = comp;
        absUnq = absComp;
    }
    else
    {
        MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
        ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
        ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);

        absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
        ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
        ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));

        unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
    }

    outUnquantized = unq;

    MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));

    outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
}

void cvtt::Internal::BC6HComputer::UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
{
    MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
    if (precision < 15)
    {
        MUInt15 zero = ParallelMath::MakeUInt15(0);
        MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));

        ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
        ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);

        unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));

        ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
        ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
    }

    outUnquantized = unq;
    outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
}

void cvtt::Internal::BC6HComputer::QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
{
    MSInt16 unquantizedEP[2][3];
    MSInt16 finishedUnquantizedEP[2][3];

    {
        ParallelMath::RoundUpForScope ru;

        for (int epi = 0; epi < 2; epi++)
        {
            for (int ch = 0; ch < 3; ch++)
            {
                MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
                UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
                quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
            }
        }
    }

    indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
    indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);

    MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);

    MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);

    ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);

    if (ParallelMath::AnySet(invert))
    {
        ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));

        indexSelector.ConditionalInvert(invert);

        for (int ch = 0; ch < 3; ch++)
        {
            MAInt16 firstEP = quantizedEndPoints[0][ch];
            MAInt16 secondEP = quantizedEndPoints[1][ch];

            quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
            quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
        }
    }

    indexes[fixupIndex] = index;
}

void cvtt::Internal::BC6HComputer::QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
{
    MUInt16 unquantizedEP[2][3];
    MUInt16 finishedUnquantizedEP[2][3];

    {
        ParallelMath::RoundUpForScope ru;

        for (int epi = 0; epi < 2; epi++)
        {
            for (int ch = 0; ch < 3; ch++)
            {
                MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
                UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
                quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
            }
        }
    }

    indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
    indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);

    MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);

    MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);

    ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);

    if (ParallelMath::AnySet(invert))
    {
        ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));

        indexSelector.ConditionalInvert(invert);

        for (int ch = 0; ch < 3; ch++)
        {
            MAInt16 firstEP = quantizedEndPoints[0][ch];
            MAInt16 secondEP = quantizedEndPoints[1][ch];

            quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
            quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
        }
    }

    indexes[fixupIndex] = index;
}

void cvtt::Internal::BC6HComputer::EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
{
    ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);

    MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));

    for (int ch = 0; ch < 3; ch++)
    {
        outEncodedEPs[0][0][ch] = ep0[0][ch];
        outEncodedEPs[0][1][ch] = ep0[1][ch];
        outEncodedEPs[1][0][ch] = ep1[0][ch];
        outEncodedEPs[1][1][ch] = ep1[1][ch];

        if (isTransformed)
        {
            for (int subset = 0; subset < 2; subset++)
            {
                for (int epi = 0; epi < 2; epi++)
                {
                    if (epi == 0 && subset == 0)
                        continue;

                    MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);

                    MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);

                    outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);

                    MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
                    allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
                }
            }
        }

        if (!ParallelMath::AnySet(allLegal))
            break;
    }

    outIsLegal = allLegal;
}

void cvtt::Internal::BC6HComputer::EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
{
    ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);

    MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));

    for (int ch = 0; ch < 3; ch++)
    {
        outEncodedEPs[0][ch] = ep[0][ch];
        outEncodedEPs[1][ch] = ep[1][ch];

        if (isTransformed)
        {
            MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);

            MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);

            outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);

            MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
            allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
        }
    }

    outIsLegal = allLegal;
}

void cvtt::Internal::BC6HComputer::Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
{
    if (numTweakRounds < 1)
        numTweakRounds = 1;
    else if (numTweakRounds > MaxTweakRounds)
        numTweakRounds = MaxTweakRounds;

    if (numRefineRounds < 1)
        numRefineRounds = 1;
    else if (numRefineRounds > MaxRefineRounds)
        numRefineRounds = MaxRefineRounds;

    bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
    float channelWeightsSq[3];

    ParallelMath::RoundTowardNearestForScope rtn;

    MSInt16 pixels[16][3];
    MFloat floatPixels2CL[16][3];
    MFloat floatPixelsLinearWeighted[16][3];

    MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);

    for (int ch = 0; ch < 3; ch++)
        channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];

    for (int px = 0; px < 16; px++)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            MSInt16 pixelValue;
            ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);

            // Convert from sign+magnitude to 2CL
            if (isSigned)
            {
                ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
                MSInt16 magnitude = (pixelValue & low15Bits);
                ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
            }
            else
                pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));

            pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));

            pixels[px][ch] = pixelValue;
            floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
            floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
        }
    }

    MFloat preWeightedPixels[16][3];

    BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);

    MAInt16 bestEndPoints[2][2][3];
    MUInt15 bestIndexes[16];
    MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
    MUInt15 bestMode = ParallelMath::MakeUInt15(0);
    MUInt15 bestPartition = ParallelMath::MakeUInt15(0);

    for (int px = 0; px < 16; px++)
        bestIndexes[px] = ParallelMath::MakeUInt15(0);

    for (int subset = 0; subset < 2; subset++)
        for (int epi = 0; epi < 2; epi++)
            for (int ch = 0; ch < 3; ch++)
                bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);

    UnfinishedEndpoints<3> partitionedUFEP[32][2];
    UnfinishedEndpoints<3> singleUFEP;

    // Generate UFEP for partitions
    for (int p = 0; p < 32; p++)
    {
        int partitionMask = BC7Data::g_partitionMap[p];

        EndpointSelector<3, 8> epSelectors[2];

        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
        {
            for (int px = 0; px < 16; px++)
            {
                int subset = (partitionMask >> px) & 1;
                epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
            }

            for (int subset = 0; subset < 2; subset++)
                epSelectors[subset].FinishPass(pass);
        }

        for (int subset = 0; subset < 2; subset++)
            partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
    }

    // Generate UFEP for single
    {
        EndpointSelector<3, 8> epSelector;

        for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
        {
            for (int px = 0; px < 16; px++)
                epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));

            epSelector.FinishPass(pass);
        }

        singleUFEP = epSelector.GetEndpoints(channelWeights);
    }

    for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
    {
        bool partitioned = (partitionedInt == 1);

        for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
        {
            if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
                continue;

            int numPartitions = partitioned ? 32 : 1;
            int numSubsets = partitioned ? 2 : 1;
            int indexBits = partitioned ? 3 : 4;
            int indexRange = (1 << indexBits);

            for (int p = 0; p < numPartitions; p++)
            {
                int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;

                const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;

                MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
                MUInt15 metaIndexes[MaxMetaRounds][16];
                MFloat metaError[MaxMetaRounds][2];

                bool roundValid[MaxMetaRounds][2];

                for (int r = 0; r < MaxMetaRounds; r++)
                    for (int subset = 0; subset < 2; subset++)
                        roundValid[r][subset] = true;

                for (int subset = 0; subset < numSubsets; subset++)
                {
                    for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
                    {
                        EndpointRefiner<3> refiners[2];

                        bool abortRemainingRefines = false;
                        for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
                        {
                            int metaRound = tweak * MaxRefineRounds + refinePass;

                            if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
                                abortRemainingRefines = true;

                            if (abortRemainingRefines)
                            {
                                roundValid[metaRound][subset] = false;
                                continue;
                            }

                            MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
                            MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];

                            MSInt16 endPointsColorSpace[2][3];

                            if (refinePass == 0)
                            {
                                UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;

                                if (isSigned)
                                    ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
                                else
                                    ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
                            }
                            else
                                refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);

                            refiners[subset].Init(indexRange, channelWeights);

                            int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];

                            IndexSelectorHDR<3> indexSelector;
                            if (isSigned)
                                QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
                            else
                                QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);

                            if (metaRound > 0)
                            {
                                ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);

                                for (int prevRound = 0; prevRound < metaRound; prevRound++)
                                {
                                    MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];

                                    ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);

                                    for (int epi = 0; epi < 2; epi++)
                                        for (int ch = 0; ch < 3; ch++)
                                            same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));

                                    anySame = (anySame | same);
                                    if (ParallelMath::AllSet(anySame))
                                        break;
                                }

                                if (ParallelMath::AllSet(anySame))
                                {
                                    roundValid[metaRound][subset] = false;
                                    continue;
                                }
                            }

                            MFloat subsetError = ParallelMath::MakeFloatZero();

                            {
                                for (int px = 0; px < 16; px++)
                                {
                                    if (subset != ((partitionMask >> px) & 1))
                                        continue;

                                    MUInt15 index;
                                    if (px == fixupIndex)
                                        index = mrIndexes[px];
                                    else
                                    {
                                        index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
                                        mrIndexes[px] = index;
                                    }

                                    MSInt16 reconstructed[3];
                                    if (isSigned)
                                        indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
                                    else
                                        indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);

                                    subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));

                                    if (refinePass != numRefineRounds - 1)
                                        refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
                                }
                            }

                            metaError[metaRound][subset] = subsetError;
                        }
                    }
                }

                // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
                int numMeta1 = partitioned ? MaxMetaRounds : 1;
                for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
                {
                    if (!roundValid[meta0][0])
                        continue;

                    for (int meta1 = 0; meta1 < numMeta1; meta1++)
                    {
                        MFloat combinedError = metaError[meta0][0];
                        if (partitioned)
                        {
                            if (!roundValid[meta1][1])
                                continue;

                            combinedError = combinedError + metaError[meta1][1];
                        }

                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
                        if (!ParallelMath::AnySet(errorBetter))
                            continue;

                        ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);

                        // Figure out if this is encodable
                        for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
                        {
                            const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];

                            if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
                                continue;

                            MAInt16 encodedEPs[2][2][3];
                            ParallelMath::Int16CompFlag isLegal;
                            if (partitioned)
                                EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
                            else
                                EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);

                            ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
                            if (!ParallelMath::AnySet(isLegalAndBetter))
                                continue;

                            ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);

                            ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
                            ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
                            ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));

                            for (int subset = 0; subset < numSubsets; subset++)
                            {
                                for (int epi = 0; epi < 2; epi++)
                                {
                                    for (int ch = 0; ch < 3; ch++)
                                        ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
                                }
                            }

                            for (int px = 0; px < 16; px++)
                            {
                                int subset = ((partitionMask >> px) & 1);
                                if (subset == 0)
                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
                                else
                                    ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
                            }

                            needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
                            if (!ParallelMath::AnySet(needsCommit))
                                break;
                        }
                    }
                }
            }
        }
    }

    // At this point, everything should be set
    for (int block = 0; block < ParallelMath::ParallelSize; block++)
    {
        ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
        ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
        int32_t eps[2][2][3];
        ParallelMath::ScalarUInt16 indexes[16];

        const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];

        BC6H_IO::WriteFunc_t writeFunc = BC6H_IO::g_writeFuncs[mode];

        const int headerBits = modeInfo.m_partitioned ? 82 : 65;

        for (int subset = 0; subset < 2; subset++)
        {
            for (int epi = 0; epi < 2; epi++)
            {
                for (int ch = 0; ch < 3; ch++)
                    eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
            }
        }

        for (int px = 0; px < 16; px++)
            indexes[px] = ParallelMath::Extract(bestIndexes[px], block);

        uint16_t modeID = modeInfo.m_modeID;

        PackingVector pv;

        {
            uint32_t header[3];
            writeFunc(header, modeID, partition,
                eps[0][0][0], eps[0][1][0], eps[1][0][0], eps[1][1][0],
                eps[0][0][1], eps[0][1][1], eps[1][0][1], eps[1][1][1],
                eps[0][0][2], eps[0][1][2], eps[1][0][2], eps[1][1][2]
            );

            pv.InitPacked(header, headerBits);
        }

        int fixupIndex1 = 0;
        int indexBits = 4;
        if (modeInfo.m_partitioned)
        {
            fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
            indexBits = 3;
        }

        for (int px = 0; px < 16; px++)
        {
            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
            if (px == 0 || px == fixupIndex1)
                pv.Pack(index, indexBits - 1);
            else
                pv.Pack(index, indexBits);
        }

        pv.Flush(packedBlocks + 16 * block);
    }
}

void cvtt::Internal::BC6HComputer::SignExtendSingle(int &v, int bits)
{
    if (v & (1 << (bits - 1)))
        v |= -(1 << bits);
}

void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
{
    UnpackingVector pv;
    pv.Init(pBC);

    int numModeBits = 2;
    int modeBits = pv.Unpack(2);
    if (modeBits != 0 && modeBits != 1)
    {
        modeBits |= pv.Unpack(3) << 2;
        numModeBits += 3;
    }

    int mode = -1;
    for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
    {
        if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
        {
            mode = possibleMode;
            break;
        }
    }

    if (mode < 0)
    {
        for (int px = 0; px < 16; px++)
        {
            for (int ch = 0; ch < 3; ch++)
                output.m_pixels[px][ch] = 0;
            output.m_pixels[px][3] = 0x3c00;	// 1.0
        }
        return;
    }

    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
    const int headerBits = modeInfo.m_partitioned ? 82 : 65;
    const BC6H_IO::ReadFunc_t readFunc = BC6H_IO::g_readFuncs[mode];

    uint16_t partition = 0;
    int32_t eps[2][2][3];

    for (int subset = 0; subset < 2; subset++)
        for (int epi = 0; epi < 2; epi++)
            for (int ch = 0; ch < 3; ch++)
                eps[subset][epi][ch] = 0;

    {
        uint32_t header[3];
        uint16_t codedEPs[2][2][3];
        pv.UnpackStart(header, headerBits);

        readFunc(header, partition,
            codedEPs[0][0][0], codedEPs[0][1][0], codedEPs[1][0][0], codedEPs[1][1][0],
            codedEPs[0][0][1], codedEPs[0][1][1], codedEPs[1][0][1], codedEPs[1][1][1],
            codedEPs[0][0][2], codedEPs[0][1][2], codedEPs[1][0][2], codedEPs[1][1][2]
        );

        for (int subset = 0; subset < 2; subset++)
            for (int epi = 0; epi < 2; epi++)
                for (int ch = 0; ch < 3; ch++)
                    eps[subset][epi][ch] = codedEPs[subset][epi][ch];
    }

    uint16_t modeID = modeInfo.m_modeID;

    int fixupIndex1 = 0;
    int indexBits = 4;
    int numSubsets = 1;
    if (modeInfo.m_partitioned)
    {
        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
        indexBits = 3;
        numSubsets = 2;
    }

    int indexes[16];
    for (int px = 0; px < 16; px++)
    {
        if (px == 0 || px == fixupIndex1)
            indexes[px] = pv.Unpack(indexBits - 1);
        else
            indexes[px] = pv.Unpack(indexBits);
    }

    if (modeInfo.m_partitioned)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            if (isSigned)
                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
            if (modeInfo.m_transformed || isSigned)
            {
                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
                SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
                SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
            }
        }
    }
    else
    {
        for (int ch = 0; ch < 3; ch++)
        {
            if (isSigned)
                SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
            if (modeInfo.m_transformed || isSigned)
                SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
        }
    }

    int aPrec = modeInfo.m_aPrec;

    if (modeInfo.m_transformed)
    {
        for (int ch = 0; ch < 3; ch++)
        {
            int wrapMask = (1 << aPrec) - 1;

            eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
            if (isSigned)
                SignExtendSingle(eps[0][1][ch], aPrec);

            if (modeInfo.m_partitioned)
            {
                eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
                eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);

                if (isSigned)
                {
                    SignExtendSingle(eps[1][0][ch], aPrec);
                    SignExtendSingle(eps[1][1][ch], aPrec);
                }
            }
        }
    }

    // Unquantize endpoints
    for (int subset = 0; subset < numSubsets; subset++)
    {
        for (int epi = 0; epi < 2; epi++)
        {
            for (int ch = 0; ch < 3; ch++)
            {
                int &v = eps[subset][epi][ch];

                if (isSigned)
                {
                    if (aPrec >= 16)
                    {
                        // Nothing
                    }
                    else
                    {
                        bool s = false;
                        int comp = v;
                        if (v < 0)
                        {
                            s = true;
                            comp = -comp;
                        }

                        int unq = 0;
                        if (comp == 0)
                            unq = 0;
                        else if (comp >= ((1 << (aPrec - 1)) - 1))
                            unq = 0x7fff;
                        else
                            unq = ((comp << 15) + 0x4000) >> (aPrec - 1);

                        if (s)
                            unq = -unq;

                        v = unq;
                    }
                }
                else
                {
                    if (aPrec >= 15)
                    {
                        // Nothing
                    }
                    else if (v == 0)
                    {
                        // Nothing
                    }
                    else if (v == ((1 << aPrec) - 1))
                        v = 0xffff;
                    else
                        v = ((v << 16) + 0x8000) >> aPrec;
                }
            }
        }
    }

    const int *weights = BC7Data::g_weightTables[indexBits];

    for (int px = 0; px < 16; px++)
    {
        int subset = 0;
        if (modeInfo.m_partitioned)
            subset = (BC7Data::g_partitionMap[partition] >> px) & 1;

        int w = weights[indexes[px]];
        for (int ch = 0; ch < 3; ch++)
        {
            int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;

            if (isSigned)
            {
                if (comp < 0)
                    comp = -(((-comp) * 31) >> 5);
                else
                    comp = (comp * 31) >> 5;

                int s = 0;
                if (comp < 0)
                {
                    s = 0x8000;
                    comp = -comp;
                }

                output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
            }
            else
            {
                comp = (comp * 31) >> 6;
                output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
            }
        }
        output.m_pixels[px][3] = 0x3c00;	// 1.0
    }
}

void cvtt::Kernels::ConfigureBC7EncodingPlanFromQuality(BC7EncodingPlan &encodingPlan, int quality)
{
    static const int kMaxQuality = 100;

    if (quality < 1)
        quality = 1;
    else if (quality > kMaxQuality)
        quality = kMaxQuality;

    const int numRGBModes = cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGB * quality / kMaxQuality;
    const int numRGBAModes = cvtt::Tables::BC7Prio::g_bc7NumPrioCodesRGBA * quality / kMaxQuality;

    const uint16_t *prioLists[] = { cvtt::Tables::BC7Prio::g_bc7PrioCodesRGB, cvtt::Tables::BC7Prio::g_bc7PrioCodesRGBA };
    const int prioListSizes[] = { numRGBModes, numRGBAModes };

    BC7FineTuningParams ftParams;
    memset(&ftParams, 0, sizeof(ftParams));

    for (int listIndex = 0; listIndex < 2; listIndex++)
    {
        int prioListSize = prioListSizes[listIndex];
        const uint16_t *prioList = prioLists[listIndex];

        for (int prioIndex = 0; prioIndex < prioListSize; prioIndex++)
        {
            const uint16_t packedMode = prioList[prioIndex];

            uint8_t seedPoints = static_cast<uint8_t>(cvtt::Tables::BC7Prio::UnpackSeedPointCount(packedMode));
            int mode = cvtt::Tables::BC7Prio::UnpackMode(packedMode);

            switch (mode)
            {
            case 0:
                ftParams.mode0SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
                break;
            case 1:
                ftParams.mode1SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
                break;
            case 2:
                ftParams.mode2SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
                break;
            case 3:
                ftParams.mode3SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
                break;
            case 4:
                ftParams.mode4SP[cvtt::Tables::BC7Prio::UnpackRotation(packedMode)][cvtt::Tables::BC7Prio::UnpackIndexSelector(packedMode)] = seedPoints;
                break;
            case 5:
                ftParams.mode5SP[cvtt::Tables::BC7Prio::UnpackRotation(packedMode)] = seedPoints;
                break;
            case 6:
                ftParams.mode6SP = seedPoints;
                break;
            case 7:
                ftParams.mode7SP[cvtt::Tables::BC7Prio::UnpackPartition(packedMode)] = seedPoints;
                break;
            }
        }
    }

    ConfigureBC7EncodingPlanFromFineTuningParams(encodingPlan, ftParams);
}

// Generates a BC7 encoding plan from fine-tuning parameters.
bool cvtt::Kernels::ConfigureBC7EncodingPlanFromFineTuningParams(BC7EncodingPlan &encodingPlan, const BC7FineTuningParams &params)
{
    memset(&encodingPlan, 0, sizeof(encodingPlan));

    // Mode 0
    for (int partition = 0; partition < 16; partition++)
    {
        uint8_t sp = params.mode0SP[partition];
        if (sp == 0)
            continue;

        encodingPlan.mode0PartitionEnabled |= static_cast<uint16_t>(1) << partition;

        for (int subset = 0; subset < 3; subset++)
        {
            int shape = cvtt::Internal::BC7Data::g_shapes3[partition][subset];
            encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
        }
    }

    // Mode 1
    for (int partition = 0; partition < 64; partition++)
    {
        uint8_t sp = params.mode1SP[partition];
        if (sp == 0)
            continue;

        encodingPlan.mode1PartitionEnabled |= static_cast<uint64_t>(1) << partition;

        for (int subset = 0; subset < 2; subset++)
        {
            int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
            encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
        }
    }

    // Mode 2
    for (int partition = 0; partition < 64; partition++)
    {
        uint8_t sp = params.mode2SP[partition];
        if (sp == 0)
            continue;

        encodingPlan.mode2PartitionEnabled |= static_cast<uint64_t>(1) << partition;

        for (int subset = 0; subset < 3; subset++)
        {
            int shape = cvtt::Internal::BC7Data::g_shapes3[partition][subset];
            encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
        }
    }

    // Mode 3
    for (int partition = 0; partition < 64; partition++)
    {
        uint8_t sp = params.mode3SP[partition];
        if (sp == 0)
            continue;

        encodingPlan.mode3PartitionEnabled |= static_cast<uint64_t>(1) << partition;

        for (int subset = 0; subset < 2; subset++)
        {
            int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
            encodingPlan.seedPointsForShapeRGB[shape] = std::max(encodingPlan.seedPointsForShapeRGB[shape], sp);
        }
    }

    // Mode 4
    for (int rotation = 0; rotation < 4; rotation++)
    {
        for (int indexMode = 0; indexMode < 2; indexMode++)
            encodingPlan.mode4SP[rotation][indexMode] = params.mode4SP[rotation][indexMode];
    }

    // Mode 5
    for (int rotation = 0; rotation < 4; rotation++)
        encodingPlan.mode5SP[rotation] = params.mode5SP[rotation];

    // Mode 6
    {
        uint8_t sp = params.mode6SP;
        if (sp != 0)
        {
            encodingPlan.mode6Enabled = true;

            int shape = cvtt::Internal::BC7Data::g_shapes1[0][0];
            encodingPlan.seedPointsForShapeRGBA[shape] = std::max(encodingPlan.seedPointsForShapeRGBA[shape], sp);
        }
    }

    // Mode 7
    for (int partition = 0; partition < 64; partition++)
    {
        uint8_t sp = params.mode7SP[partition];
        if (sp == 0)
            continue;

        encodingPlan.mode7RGBAPartitionEnabled |= static_cast<uint64_t>(1) << partition;

        for (int subset = 0; subset < 2; subset++)
        {
            int shape = cvtt::Internal::BC7Data::g_shapes2[partition][subset];
            encodingPlan.seedPointsForShapeRGBA[shape] = std::max(encodingPlan.seedPointsForShapeRGBA[shape], sp);
        }
    }

    for (int i = 0; i < BC7EncodingPlan::kNumRGBShapes; i++)
    {
        if (encodingPlan.seedPointsForShapeRGB[i] > 0)
        {
            encodingPlan.rgbShapeList[encodingPlan.rgbNumShapesToEvaluate] = i;
            encodingPlan.rgbNumShapesToEvaluate++;
        }
    }

    for (int i = 0; i < BC7EncodingPlan::kNumRGBAShapes; i++)
    {
        if (encodingPlan.seedPointsForShapeRGBA[i] > 0)
        {
            encodingPlan.rgbaShapeList[encodingPlan.rgbaNumShapesToEvaluate] = i;
            encodingPlan.rgbaNumShapesToEvaluate++;
        }
    }

    encodingPlan.mode7RGBPartitionEnabled = (encodingPlan.mode7RGBAPartitionEnabled & ~encodingPlan.mode3PartitionEnabled);

    return true;
}

#endif