2 files changed, 940 insertions, 146 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 41cd0230cc..d32db920ad 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -115,7 +115,7 @@ will limit its functionality to IPv4 only.
 ## etcpak
 
 - Upstream: https://github.com/wolfpld/etcpak
-- Version: git (7c3cb6fe708d4ae330b0ab2af1ad472bae2a37a2, 2021)
+- Version: git (10fc4ce627f9a17ed49bf97fcc3796a712033ba1, 2022)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp
index d60164bcc8..f488f3b282 100644
--- a/thirdparty/etcpak/ProcessRGB.cpp
+++ b/thirdparty/etcpak/ProcessRGB.cpp
@@ -28,6 +28,10 @@
 #  define _bswap64(x) __builtin_bswap64(x)
 #endif
 
+static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2
+// common T-/H-mode table
+static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+
 // thresholds for the early compression-mode decision scheme
 // default: 0.03, 0.09, and 0.38
 float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f };
@@ -36,13 +40,17 @@ static const uint8_t ModeUndecided = 0;
 static const uint8_t ModePlanar = 0x1;
 static const uint8_t ModeTH = 0x2;
 
+const unsigned int R = 2;
+const unsigned int G = 1;
+const unsigned int B = 0;
+
 struct Luma
 {
 #ifdef __AVX2__
     float max, min;
     uint8_t minIdx = 255, maxIdx = 255;
     __m128i luma8;
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
     float max, min;
     uint8_t minIdx = 255, maxIdx = 255;
     uint8x16_t luma8;
@@ -52,8 +60,206 @@ struct Luma
 #endif
 };
 
+#ifdef __AVX2__
+struct Plane
+{
+    uint64_t plane;
+    uint64_t error;
+    __m256i sum4;
+};
+#endif
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+struct Channels
+{
+#ifdef __AVX2__
+    __m128i r8, g8, b8;
+#elif defined __ARM_NEON && defined __aarch64__
+    uint8x16x2_t r, g, b;
+#endif
+};
+#endif
+
 namespace
 {
+static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max )
+{
+    return val < min ? min : ( val > max ? max : val );
+}
+
+static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val )
+{
+    return val < min ? min : val;
+}
+
+static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max )
+{
+    return val > max ? max : val;
+}
+
+// slightly faster than std::sort
+static void insertionSort( uint8_t* arr1, uint8_t* arr2 )
+{
+    for( uint8_t i = 1; i < 16; ++i )
+    {
+        uint8_t value = arr1[i];
+        uint8_t hole = i;
+
+        for( ; hole > 0 && value < arr1[hole - 1]; --hole )
+        {
+            arr1[hole] = arr1[hole - 1];
+            arr2[hole] = arr2[hole - 1];
+        }
+        arr1[hole] = value;
+        arr2[hole] = i;
+    }
+}
+
+//converts indices from  |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1| previously used by T- and H-modes
+//                     into  |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1| which should be used for all modes.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static etcpak_force_inline int indexConversion( int pixelIndices )
+{
+    int correctIndices = 0;
+    int LSB[4][4];
+    int MSB[4][4];
+    int shift = 0;
+    for( int y = 3; y >= 0; y-- )
+    {
+        for( int x = 3; x >= 0; x-- )
+        {
+            LSB[x][y] = ( pixelIndices >> shift ) & 1;
+            shift++;
+            MSB[x][y] = ( pixelIndices >> shift ) & 1;
+            shift++;
+        }
+    }
+    shift = 0;
+    for( int x = 0; x < 4; x++ )
+    {
+        for( int y = 0; y < 4; y++ )
+        {
+            correctIndices |= ( LSB[x][y] << shift );
+            correctIndices |= ( MSB[x][y] << ( 16 + shift ) );
+            shift++;
+        }
+    }
+    return correctIndices;
+}
+
+// Swapping two RGB-colors
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] )
+{
+    uint8_t temp = colors[0][R];
+    colors[0][R] = colors[1][R];
+    colors[1][R] = temp;
+
+    temp = colors[0][G];
+    colors[0][G] = colors[1][G];
+    colors[1][G] = temp;
+
+    temp = colors[0][B];
+    colors[0][B] = colors[1][B];
+    colors[1][B] = temp;
+}
+
+
+// calculates quantized colors for T or H modes
+void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode )
+{
+    if( t_mode )
+    {
+        quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 );
+        quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 );
+        quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 );
+    }
+    else // clamped to [1,14] to get a wider range
+    {
+        quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 );
+        quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 );
+        quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 );
+    }
+
+    // clamped to [1,14] to get a wider range
+    quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 );
+    quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 );
+    quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 );
+}
+
+// three decoding functions come from ETCPACK v2.74 and are slightly changed.
+static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] )
+{
+    // The color should be retrieved as:
+    //
+    // c = round(255/(r_bits^2-1))*comp_color
+    //
+    // This is similar to bit replication
+    //
+    // Note -- this code only work for bit replication from 4 bits and up --- 3 bits needs
+    // two copy operations.
+    colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R];
+    colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G];
+    colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B];
+    colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R];
+    colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G];
+    colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B];
+}
+
+// calculates the paint colors from the block colors
+// using a distance d and one of the H- or T-patterns.
+static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
+{
+    //////////////////////////////////////////////
+    //
+    //		C3      C1		C4----C1---C2
+    //		|		|			  |
+    //		|		|			  |
+    //		|-------|			  |
+    //		|		|			  |
+    //		|		|			  |
+    //		C4      C2			  C3
+    //
+    //////////////////////////////////////////////
+
+    // C4
+    pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
+    pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
+    pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
+
+    // C3
+    pColors[0][R] = colors[0][R];
+    pColors[0][G] = colors[0][G];
+    pColors[0][B] = colors[0][B];
+    // C2
+    pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 );
+    pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 );
+    pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 );
+    // C1
+    pColors[2][R] = colors[1][R];
+    pColors[2][G] = colors[1][G];
+    pColors[2][B] = colors[1][B];
+}
+
+static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
+{
+    pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
+    pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
+    pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
+
+    // C1
+    pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 );
+    pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 );
+    pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 );
+    // C2
+    pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] );
+    pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] );
+    pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] );
+    // C3
+    pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 );
+    pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 );
+    pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 );
+}
 
 #if defined _MSC_VER && !defined __clang__
 static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask )
@@ -586,127 +792,107 @@ static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cv
     return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1));
 }
 
-struct Plane
+static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics )
 {
-    uint64_t plane;
-    uint64_t error;
-    __m256i sum4;
-};
-
-static etcpak_force_inline Plane Planar_AVX2( const uint8_t* src, const uint8_t mode )
-{
-    __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
-    __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
-    __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
-    __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
-
-    __m128i rgb0 = _mm_shuffle_epi8(d0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
-    __m128i rgb1 = _mm_shuffle_epi8(d1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
-    __m128i rgb2 = _mm_shuffle_epi8(d2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
-    __m128i rgb3 = _mm_shuffle_epi8(d3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
-
-    __m128i rg0 = _mm_unpacklo_epi32(rgb0, rgb1);
-    __m128i rg1 = _mm_unpacklo_epi32(rgb2, rgb3);
-    __m128i b0 = _mm_unpackhi_epi32(rgb0, rgb1);
-    __m128i b1 = _mm_unpackhi_epi32(rgb2, rgb3);
-
-    // swap channels
-    __m128i b8 = _mm_unpacklo_epi64(rg0, rg1);
-    __m128i g8 = _mm_unpackhi_epi64(rg0, rg1);
-    __m128i r8 = _mm_unpacklo_epi64(b0, b1);
+    __m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() );
+    __m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() );
+    __m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() );
 
-    __m128i t0 = _mm_sad_epu8(r8, _mm_setzero_si128());
-    __m128i t1 = _mm_sad_epu8(g8, _mm_setzero_si128());
-    __m128i t2 = _mm_sad_epu8(b8, _mm_setzero_si128());
+    __m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
+    __m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
+    __m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
 
-    __m128i r8s = _mm_shuffle_epi8(r8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
-    __m128i g8s = _mm_shuffle_epi8(g8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
-    __m128i b8s = _mm_shuffle_epi8(b8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+    __m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() );
+    __m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() );
+    __m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() );
 
-    __m128i s0 = _mm_sad_epu8(r8s, _mm_setzero_si128());
-    __m128i s1 = _mm_sad_epu8(g8s, _mm_setzero_si128());
-    __m128i s2 = _mm_sad_epu8(b8s, _mm_setzero_si128());
+    __m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 );
+    __m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 );
+    __m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 );
 
-    __m256i sr0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t0), s0, 1);
-    __m256i sg0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t1), s1, 1);
-    __m256i sb0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t2), s2, 1);
+    __m256i sr1 = _mm256_slli_epi64( sr0, 32 );
+    __m256i sg1 = _mm256_slli_epi64( sg0, 16 );
 
-    __m256i sr1 = _mm256_slli_epi64(sr0, 32);
-    __m256i sg1 = _mm256_slli_epi64(sg0, 16);
+    __m256i srb = _mm256_or_si256( sr1, sb0 );
+    __m256i srgb = _mm256_or_si256( srb, sg1 );
 
-    __m256i srb = _mm256_or_si256(sr1, sb0);
-    __m256i srgb = _mm256_or_si256(srb, sg1);
+    if( mode != ModePlanar && useHeuristics )
+    {
+        Plane plane;
+        plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+        return plane;
+    }
 
-    __m128i t3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t0), _mm_castsi128_ps(t1), _MM_SHUFFLE(2, 0, 2, 0)));
-    __m128i t4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 2, 0));
-    __m128i t5 = _mm_hadd_epi32(t3, t4);
-    __m128i t6 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(1, 1, 1, 1));
-    __m128i t7 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(2, 2, 2, 2));
+    __m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) );
+    __m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) );
+    __m128i t5 = _mm_hadd_epi32( t3, t4 );
+    __m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+    __m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) );
 
-    __m256i sr = _mm256_broadcastw_epi16(t5);
-    __m256i sg = _mm256_broadcastw_epi16(t6);
-    __m256i sb = _mm256_broadcastw_epi16(t7);
+    __m256i sr = _mm256_broadcastw_epi16( t5 );
+    __m256i sg = _mm256_broadcastw_epi16( t6 );
+    __m256i sb = _mm256_broadcastw_epi16( t7 );
 
-    __m256i r08 = _mm256_cvtepu8_epi16(r8);
-    __m256i g08 = _mm256_cvtepu8_epi16(g8);
-    __m256i b08 = _mm256_cvtepu8_epi16(b8);
+    __m256i r08 = _mm256_cvtepu8_epi16( ch.r8 );
+    __m256i g08 = _mm256_cvtepu8_epi16( ch.g8 );
+    __m256i b08 = _mm256_cvtepu8_epi16( ch.b8 );
 
-    __m256i r16 = _mm256_slli_epi16(r08, 4);
-    __m256i g16 = _mm256_slli_epi16(g08, 4);
-    __m256i b16 = _mm256_slli_epi16(b08, 4);
+    __m256i r16 = _mm256_slli_epi16( r08, 4 );
+    __m256i g16 = _mm256_slli_epi16( g08, 4 );
+    __m256i b16 = _mm256_slli_epi16( b08, 4 );
 
-    __m256i difR0 = _mm256_sub_epi16(r16, sr);
-    __m256i difG0 = _mm256_sub_epi16(g16, sg);
-    __m256i difB0 = _mm256_sub_epi16(b16, sb);
+    __m256i difR0 = _mm256_sub_epi16( r16, sr );
+    __m256i difG0 = _mm256_sub_epi16( g16, sg );
+    __m256i difB0 = _mm256_sub_epi16( b16, sb );
 
-    __m256i difRyz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
-    __m256i difGyz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
-    __m256i difByz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
+    __m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
+    __m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
+    __m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
 
-    __m256i difRxz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
-    __m256i difGxz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
-    __m256i difBxz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
+    __m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
+    __m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
+    __m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
 
-    __m256i difRGyz = _mm256_hadd_epi32(difRyz, difGyz);
-    __m256i difByzxz = _mm256_hadd_epi32(difByz, difBxz);
+    __m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz );
+    __m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz );
 
-    __m256i difRGxz = _mm256_hadd_epi32(difRxz, difGxz);
+    __m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz );
 
-    __m128i sumRGyz = _mm_add_epi32(_mm256_castsi256_si128(difRGyz), _mm256_extracti128_si256(difRGyz, 1));
-    __m128i sumByzxz = _mm_add_epi32(_mm256_castsi256_si128(difByzxz), _mm256_extracti128_si256(difByzxz, 1));
-    __m128i sumRGxz = _mm_add_epi32(_mm256_castsi256_si128(difRGxz), _mm256_extracti128_si256(difRGxz, 1));
+    __m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) );
+    __m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) );
+    __m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) );
 
-    __m128i sumRGByz = _mm_hadd_epi32(sumRGyz, sumByzxz);
-    __m128i sumRGByzxz = _mm_hadd_epi32(sumRGxz, sumByzxz);
+    __m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz );
+    __m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz );
 
-    __m128i sumRGBxz = _mm_shuffle_epi32(sumRGByzxz, _MM_SHUFFLE(2, 3, 1, 0));
+    __m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) );
 
-    __m128 sumRGByzf = _mm_cvtepi32_ps(sumRGByz);
-    __m128 sumRGBxzf = _mm_cvtepi32_ps(sumRGBxz);
+    __m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz );
+    __m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz );
 
-    const float value = (255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f;
+    const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f;
 
-    __m128 scale = _mm_set1_ps(-4.0f / value);
+    __m128 scale = _mm_set1_ps( -4.0f / value );
 
-    __m128 af = _mm_mul_ps(sumRGBxzf, scale);
-    __m128 bf = _mm_mul_ps(sumRGByzf, scale);
+    __m128 af = _mm_mul_ps( sumRGBxzf, scale );
+    __m128 bf = _mm_mul_ps( sumRGByzf, scale );
 
-    __m128 df = _mm_mul_ps(_mm_cvtepi32_ps(t5), _mm_set1_ps(4.0f / 16.0f));
+    __m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) );
 
     // calculating the three colors RGBO, RGBH, and RGBV.  RGB = df - af * x - bf * y;
-    __m128 cof0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
-    __m128 chf0 = _mm_fnmadd_ps(af, _mm_set1_ps( 425.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
-    __m128 cvf0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps( 425.0f), df));
+    __m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
+    __m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
+    __m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) );
 
     // convert to r6g7b6
-    __m128i cohv = r6g7b6_AVX2(cof0, chf0, cvf0);
+    __m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 );
 
-    uint64_t rgbho = _mm_extract_epi64(cohv, 0);
-    uint32_t rgbv0 = _mm_extract_epi32(cohv, 2);
+    uint64_t rgbho = _mm_extract_epi64( cohv, 0 );
+    uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 );
 
     // Error calculation
     uint64_t error = 0;
-    if( mode != ModePlanar )
+    if( !useHeuristics )
     {
         auto ro0 = ( rgbho >> 48 ) & 0x3F;
         auto go0 = ( rgbho >> 40 ) & 0x7F;
@@ -820,7 +1006,15 @@ static etcpak_force_inline Plane Planar_AVX2( const uint8_t* src, const uint8_t
     Plane plane;
 
     plane.plane = result;
-    plane.error = error;
+    if( useHeuristics )
+    {
+        plane.error = 0;
+        mode = ModePlanar;
+    }
+    else
+    {
+        plane.error = error;
+    }
     plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1));
 
     return plane;
@@ -1570,7 +1764,7 @@ static etcpak_force_inline uint8_t convert7(float f)
     return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
 }
 
-static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode )
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics )
 {
     int32_t r = 0;
     int32_t g = 0;
@@ -1645,7 +1839,7 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t*
 
     // Error calculation
     uint64_t error = 0;
-    if( ModePlanar != mode )
+    if( ModePlanar != mode && useHeuristics )
     {
         auto ro0 = coR;
         auto go0 = coG;
@@ -1756,7 +1950,7 @@ static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src )
     uint16x4_t accu2 = vpadd_u16( accu4, accu4 );
     uint16x4_t accu1 = vpadd_u16( accu2, accu2 );
     return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) );
-#else 
+#else
     return vdupq_n_s16( vaddvq_u16( accu8 ) );
 #endif
 }
@@ -1783,7 +1977,7 @@ static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x )
     return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 );
 }
 
-static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode )
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics )
 {
     uint8x16x4_t srcBlock = vld4q_u8( src );
 
@@ -1828,7 +2022,7 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint
     int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) );
 
     uint64_t error = 0;
-    if( mode != ModePlanar )
+    if( mode != ModePlanar && useHeuristics )
     {
         int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 );
 
@@ -1924,6 +2118,376 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint
 
 #endif
 
+#ifdef __AVX2__
+uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 )
+#else
+uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist )
+#endif
+{
+    uint32_t blockErr = 0, bestBlockErr = MaxError;
+
+    uint32_t pixColors;
+    uint8_t possibleColors[4][3];
+    uint8_t colors[2][3];
+
+    decompressColor( colorsRGB444, colors );
+
+#ifdef __AVX2__
+    __m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 );
+#endif
+
+    // test distances
+    for( uint8_t d = startDist; d < 8; ++d )
+    {
+        if( d >= 2 && dist == d - 2 ) break;
+
+        blockErr = 0;
+        pixColors = 0;
+
+        if( tMode )
+        {
+            calculatePaintColors59T( d, colors, possibleColors );
+        }
+        else
+        {
+            calculatePaintColors58H( d, colors, possibleColors );
+        }
+
+#ifdef __AVX2__
+        // RGB ordering
+        __m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask );
+        __m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask );
+        __m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask );
+
+        // extends 3x128 bits RGB into 3x256 bits RGB for error comparisions
+        static const __m128i zero = _mm_setzero_si128();
+        __m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero );
+        __m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero );
+        __m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero );
+        __m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero );
+        __m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero );
+        __m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero );
+
+        __m256i b8 = _mm256_set_m128i( b8Hi, b8Lo );
+        __m256i g8 = _mm256_set_m128i( g8Hi, g8Lo );
+        __m256i r8 = _mm256_set_m128i( r8Hi, r8Lo );
+
+        // caculates differences between the pixel colrs and the palette colors
+        __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) );
+        __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) );
+        __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) );
+
+        // luma-based error calculations
+        static const __m256i bWeight = _mm256_set1_epi16( 14 );
+        static const __m256i gWeight = _mm256_set1_epi16( 76 );
+        static const __m256i rWeight = _mm256_set1_epi16( 38 );
+
+        diffb = _mm256_mullo_epi16( diffb, bWeight );
+        diffg = _mm256_mullo_epi16( diffg, gWeight );
+        diffr = _mm256_mullo_epi16( diffr, rWeight );
+
+        // obtains the error with the current palette color
+        __m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
+
+        // error calucations with the remaining three palette colors
+        static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
+        for( uint8_t c = 1; c < 4; c++ )
+        {
+            __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) );
+            __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) );
+            __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) );
+
+            diffb = _mm256_mullo_epi16( diffb, bWeight );
+            diffg = _mm256_mullo_epi16( diffg, gWeight );
+            diffr = _mm256_mullo_epi16( diffr, rWeight );
+
+            // error comparison with the previous best color
+            __m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
+            __m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors );
+            __m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr );
+            lowestPixErr = minErr;
+
+            // update pixel colors
+            uint32_t updPixColors = _mm256_movemask_epi8( cmpRes );
+            uint32_t prevPixColors = pixColors & ~updPixColors;
+            uint32_t mskPixColors = masks[c] & updPixColors;
+            pixColors = prevPixColors | mskPixColors;
+        }
+
+        // accumulate the block error
+        alignas( 32 ) uint16_t pixErr16[16] = { 0, };
+        _mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr );
+        for( uint8_t p = 0; p < 16; p++ )
+        {
+            blockErr += (int)( pixErr16[p] ) * pixErr16[p];
+        }
+#else
+        for( size_t y = 0; y < 4; ++y )
+        {
+            for( size_t x = 0; x < 4; ++x )
+            {
+                uint32_t bestPixErr = MaxError;
+                pixColors <<= 2; // Make room for next value
+
+                // Loop possible block colors
+                for( uint8_t c = 0; c < 4; ++c )
+                {
+                    int diff[3];
+                    diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R];
+                    diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G];
+                    diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B];
+
+                    const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] );
+                    uint32_t pixErr = err * err;
+
+                    // Choose best error
+                    if( pixErr < bestPixErr )
+                    {
+                        bestPixErr = pixErr;
+                        pixColors ^= ( pixColors & 3 ); // Reset the two first bits
+                        pixColors |= c;
+                    }
+                }
+                blockErr += bestPixErr;
+            }
+        }
+#endif
+
+        if( blockErr < bestBlockErr )
+        {
+            bestBlockErr = blockErr;
+            dist = d;
+            pixIndices = pixColors;
+        }
+    }
+
+    return bestBlockErr;
+}
+
+
+// main T-/H-mode compression function
+#ifdef __AVX2__
+uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 )
+#else
+uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode )
+#endif
+{
+#ifdef __AVX2__
+    alignas( 8 ) uint8_t luma[16] = { 0, };
+    _mm_storeu_si128 ( (__m128i* )luma, l.luma8 );
+#elif defined __ARM_NEON && defined __aarch64__
+    alignas( 8 ) uint8_t luma[16] = { 0 };
+    vst1q_u8( luma, l.luma8 );
+#else
+    uint8_t* luma = l.val;
+#endif
+
+    uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+    // 1) sorts the pairs of (luma, pix_idx)
+    insertionSort( luma, pixIdx );
+
+    // 2) finds the min (left+right)
+    uint8_t minSumRangeIdx = 0;
+    uint16_t minSumRangeValue;
+    uint16_t sum;
+    static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8};
+    const int16_t temp = luma[15] - luma[0];
+
+    minSumRangeValue = luma[15] - luma[1] + diffBonus[0];
+    for( uint8_t i = 1; i < 14; i++ )
+    {
+        sum = temp - luma[i+1] + luma[i] + diffBonus[i];
+        if( minSumRangeValue > sum )
+        {
+            minSumRangeValue = sum;
+            minSumRangeIdx = i;
+        }
+    }
+
+    sum = luma[14] - luma[0] + diffBonus[14];
+    if( minSumRangeValue > sum )
+    {
+        minSumRangeValue = sum;
+        minSumRangeIdx = 14;
+    }
+    uint8_t lRange, rRange;
+
+    lRange = luma[minSumRangeIdx] - luma[0];
+    rRange = luma[15] - luma[minSumRangeIdx + 1];
+
+    // 3) sets a proper mode
+    bool swap = false;
+    if( lRange >= rRange )
+    {
+        if( lRange >= rRange * 2 )
+        {
+            swap = true;
+            tMode = true;
+        }
+    }
+    else
+    {
+        if( lRange * 2 <= rRange ) tMode = true;
+    }
+    // 4) calculates the two base colors
+    uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] };
+
+    uint16_t r[4], g[4], b[4];
+    for( uint8_t i = 0; i < 4; ++i )
+    {
+        uint8_t idx = rangeIdx[i] * 4;
+        b[i] = src[idx];
+        g[i] = src[idx + 1];
+        r[i] = src[idx + 2];
+    }
+
+    uint8_t mid_rgb[2][3];
+    if( swap )
+    {
+        mid_rgb[1][B] = ( b[0] + b[1] ) / 2;
+        mid_rgb[1][G] = ( g[0] + g[1] ) / 2;
+        mid_rgb[1][R] = ( r[0] + r[1] ) / 2;
+
+        uint16_t sum_rgb[3] = { 0, 0, 0 };
+        for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
+        {
+            uint8_t idx = pixIdx[i] * 4;
+            sum_rgb[B] += src[idx];
+            sum_rgb[G] += src[idx + 1];
+            sum_rgb[R] += src[idx + 2];
+        }
+        const uint8_t temp = 15 - minSumRangeIdx;
+        mid_rgb[0][B] = sum_rgb[B] / temp;
+        mid_rgb[0][G] = sum_rgb[G] / temp;
+        mid_rgb[0][R] = sum_rgb[R] / temp;
+    }
+    else
+    {
+        mid_rgb[0][B] = (b[0] + b[1]) / 2;
+        mid_rgb[0][G] = (g[0] + g[1]) / 2;
+        mid_rgb[0][R] = (r[0] + r[1]) / 2;
+        if( tMode )
+        {
+            uint16_t sum_rgb[3] = { 0, 0, 0 };
+            for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
+            {
+                uint8_t idx = pixIdx[i] * 4;
+                sum_rgb[B] += src[idx];
+                sum_rgb[G] += src[idx + 1];
+                sum_rgb[R] += src[idx + 2];
+            }
+            const uint8_t temp = 15 - minSumRangeIdx;
+            mid_rgb[1][B] = sum_rgb[B] / temp;
+            mid_rgb[1][G] = sum_rgb[G] / temp;
+            mid_rgb[1][R] = sum_rgb[R] / temp;
+        }
+        else
+        {
+            mid_rgb[1][B] = (b[2] + b[3]) / 2;
+            mid_rgb[1][G] = (g[2] + g[3]) / 2;
+            mid_rgb[1][R] = (r[2] + r[3]) / 2;
+        }
+    }
+
+    // 5) sets the start distance index
+    uint32_t startDistCandidate;
+    uint32_t avgDist;
+    if( tMode )
+    {
+        if( swap )
+        {
+            avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6;
+        }
+        else
+        {
+            avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6;
+        }
+    }
+    else
+    {
+        avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12;
+    }
+
+    if( avgDist <= 16)
+    {
+        startDistCandidate = 0;
+    }
+    else if( avgDist <= 23 )
+    {
+        startDistCandidate = 1;
+    }
+    else if( avgDist <= 32 )
+    {
+        startDistCandidate = 2;
+    }
+    else if( avgDist <= 41 )
+    {
+        startDistCandidate = 3;
+    }
+    else
+    {
+        startDistCandidate = 4;
+    }
+
+    uint32_t bestErr = MaxError;
+    uint32_t bestPixIndices;
+    uint8_t bestDist = 10;
+    uint8_t colorsRGB444[2][3];
+    compressColor( mid_rgb, colorsRGB444, tMode );
+    compressed1 = 0;
+
+    // 6) finds the best candidate with the lowest error
+#ifdef __AVX2__
+    // Vectorized ver
+    bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 );
+#else
+    // Scalar ver
+    bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate );
+#endif
+
+    // 7) outputs the final T or H block
+    if( tMode )
+    {
+        // Put the compress params into the compression block
+        compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23;
+        compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19;
+        compressed1 |= ( colorsRGB444[0][B] ) << 15;
+        compressed1 |= ( colorsRGB444[1][R] ) << 11;
+        compressed1 |= ( colorsRGB444[1][G] ) << 7;
+        compressed1 |= ( colorsRGB444[1][B] ) << 3;
+        compressed1 |= bestDist & 0x7;
+    }
+    else
+    {
+        int bestRGB444ColPacked[2];
+        bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B];
+        bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B];
+        if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) )
+        {
+            swapColors( colorsRGB444 );
+            // Reshuffle pixel indices to to exchange C1 with C3, and C2 with C4
+            bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) );
+        }
+
+        // Put the compress params into the compression block
+        compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22;
+        compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18;
+        compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14;
+        compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10;
+        compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6;
+        compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2;
+        compressed1 |= ( bestDist >> 1 ) & 0x3;
+    }
+
+    bestPixIndices = indexConversion( bestPixIndices );
+    compressed2 = 0;
+    compressed2 = ( compressed2 & ~( ( 0x2 << 31 ) - 1 ) ) | ( bestPixIndices & ( ( 2 << 31 ) - 1 ) );
+
+    return bestErr;
+}
+//#endif
+
 template<class T, class S>
 static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error)
 {
@@ -2025,7 +2589,7 @@ static inline int16_t hMax( __m128i buffer, uint8_t& idx )
 
     return result;
 }
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
 static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx )
 {
     const uint8_t max = vmaxvq_u8( buffer );
@@ -2072,7 +2636,7 @@ static inline int16_t hMin( __m128i buffer, uint8_t& idx )
     idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
     return result;
 }
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
 static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
 {
     const uint8_t min = vminvq_u8( buffer );
@@ -2109,8 +2673,153 @@ static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
 }
 #endif
 
-static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
+// During search it is not convenient to store the bits the way they are stored in the
+// file format. Hence, after search, it is converted to this format.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 )
 {
+    // Put bits in twotimer configuration for 59 (red overflows)
+    //
+    // Go from this bit layout:
+    //
+    //     |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32|
+    //     |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--|
+    //
+    //     |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    //     |----------------------------------------index bits---------------------------------------------|
+    //
+    //
+    //  To this:
+    //
+    //      63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+    //      -----------------------------------------------------------------------------------------------
+    //     |// // //|R0a  |//|R0b  |G0         |B0         |R1         |G1         |B1          |da  |df|db|
+    //      -----------------------------------------------------------------------------------------------
+    //
+    //     |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    //     |----------------------------------------index bits---------------------------------------------|
+    //
+    //      63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+    //      -----------------------------------------------------------------------------------------------
+    //     | base col1    | dcol 2 | base col1    | dcol 2 | base col 1   | dcol 2 | table  | table  |df|fp|
+    //     | R1' (5 bits) | dR2    | G1' (5 bits) | dG2    | B1' (5 bits) | dB2    | cw 1   | cw 2   |bt|bt|
+    //      ------------------------------------------------------------------------------------------------
+
+    uint8_t R0a;
+    uint8_t bit, a, b, c, d, bits;
+
+    R0a = ( thumbT59W1 >> 25 ) & 0x3;
+
+    // Fix middle part
+    thumbTW1 = thumbT59W1 << 1;
+    // Fix R0a (top two bits of R0)
+    thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 );
+    // Fix db (lowest bit of d)
+    thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 );
+
+    // Make sure that red overflows:
+    a = ( thumbTW1 >> 28 ) & 0x1;
+    b = ( thumbTW1 >> 27 ) & 0x1;
+    c = ( thumbTW1 >> 25 ) & 0x1;
+    d = ( thumbTW1 >> 24 ) & 0x1;
+
+    // The following bit abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
+    // The following logical expression checks for the presence of any of those:
+    bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
+    bits = 0xf * bit;
+    thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29;
+    thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26;
+
+    // Set diffbit
+    thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2;
+    thumbTW2 = thumbT59W2;
+}
+
+// During search it is not convenient to store the bits the way they are stored in the
+// file format. Hence, after search, it is converted to this format.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 )
+{
+    // Put bits in twotimer configuration for 58 (red doesn't overflow, green does)
+    //
+    // Go from this bit layout:
+    //
+    //
+    //     |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32|
+    //     |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1|
+    //
+    //     |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    //     |---------------------------------------index bits----------------------------------------------|
+    //
+    //  To this:
+    //
+    //      63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+    //      -----------------------------------------------------------------------------------------------
+    //     |//|R0         |G0      |// // //|G0|B0|//|B0b     |R1         |G1         |B0         |d2|df|d1|
+    //      -----------------------------------------------------------------------------------------------
+    //
+    //     |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+    //     |---------------------------------------index bits----------------------------------------------|
+    //
+    //      63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+    //      -----------------------------------------------------------------------------------------------
+    //     | base col1    | dcol 2 | base col1    | dcol 2 | base col 1   | dcol 2 | table  | table  |df|fp|
+    //     | R1' (5 bits) | dR2    | G1' (5 bits) | dG2    | B1' (5 bits) | dB2    | cw 1   | cw 2   |bt|bt|
+    //      -----------------------------------------------------------------------------------------------
+    //
+    //
+    // Thus, what we are really doing is going from this bit layout:
+    //
+    //
+    //     |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32   |
+    //     |-------empty-----|part0---------------|part1|part2------------------------------------------|part3|
+    //
+    //  To this:
+    //
+    //      63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+    //      --------------------------------------------------------------------------------------------------|
+    //     |//|part0               |// // //|part1|//|part2                                          |df|part3|
+    //      --------------------------------------------------------------------------------------------------|
+
+    unsigned int part0, part1, part2, part3;
+    uint8_t bit, a, b, c, d, bits;
+
+    // move parts
+    part0 = ( thumbH58W1 >> 19 ) & 0x7f;
+    part1 = ( thumbH58W1 >> 17 ) & 0x3;
+    part2 = ( thumbH58W1 >> 1 ) & 0xffff;
+    part3 = thumbH58W1 & 0x1;
+    thumbHW1 = 0;
+    thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
+    thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 );
+    thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 );
+    thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 );
+
+    // Make sure that red does not overflow:
+    bit = ( thumbHW1 >> 30 ) & 0x1;
+    thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 );
+
+    // Make sure that green overflows:
+    a = ( thumbHW1 >> 20 ) & 0x1;
+    b = ( thumbHW1 >> 19 ) & 0x1;
+    c = ( thumbHW1 >> 17 ) & 0x1;
+    d = ( thumbHW1 >> 16 ) & 0x1;
+    // The following bit abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
+    // The following logical expression checks for the presence of any of those:
+    bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
+    bits = 0xf * bit;
+    thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 );
+    thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 );
+
+    // Set diffbit
+    thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2;
+    thumbHW2 = thumbH58W2;
+}
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+static etcpak_force_inline Channels GetChannels( const uint8_t* src )
+{
+    Channels ch;
 #ifdef __AVX2__
     __m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 );
     __m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 );
@@ -2128,30 +2837,10 @@ static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
     __m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 );
 
     // swap channels
-    __m128i b8 = _mm_unpacklo_epi64( rg0, rg1 );
-    __m128i g8 = _mm_unpackhi_epi64( rg0, rg1 );
-    __m128i r8 = _mm_unpacklo_epi64( b0, b1 );
-
-    __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( b8 ), _mm256_set1_epi16( 14 ) );
-    __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( g8 ), _mm256_set1_epi16( 76 ) );
-    __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( r8 ), _mm256_set1_epi16( 38 ) );
-
-    __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
-    __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
-    __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
-    __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
-
-    static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
-    static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
-    __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
-    __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
-    __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
-    luma.luma8 = luma_8bit;
-
-    // min/max calculation
-    luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
-    luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
-#elif defined __ARM_NEON
+    ch.b8 = _mm_unpacklo_epi64( rg0, rg1 );
+    ch.g8 = _mm_unpackhi_epi64( rg0, rg1 );
+    ch.r8 = _mm_unpacklo_epi64( b0, b1 );
+#elif defined __ARM_NEON && defined __aarch64__
     //load pixel data into 4 rows
     uint8x16_t px0 = vld1q_u8( src + 0 );
     uint8x16_t px1 = vld1q_u8( src + 16 );
@@ -2172,12 +2861,48 @@ static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
     uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() );
     uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() );
     uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() );
-    uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( red.val[0] ), 14 );
-    uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( red.val[1] ), 14 );
-    uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( grn.val[0] ), 76 );
-    uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( grn.val[1] ), 76 );
-    uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( blu.val[0] ), 38 );
-    uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( blu.val[1] ), 38 );
+    ch.r = red;
+    ch.b = blu;
+    ch.g = grn;
+#endif
+    return ch;
+}
+#endif
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma )
+#else
+static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
+#endif
+{
+#ifdef __AVX2__
+    __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) );
+    __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) );
+    __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) );
+
+    __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
+    __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
+    __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
+    __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
+
+    static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
+    static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
+    __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
+    __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
+    __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
+    luma.luma8 = luma_8bit;
+
+    // min/max calculation
+    luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
+    luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
+#elif defined __ARM_NEON && defined __aarch64__
+    //load pixel data into 4 rows
+    uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 );
+    uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 );
+    uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 );
+    uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 );
+    uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 );
+    uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 );
 
     //calculate luma for rows 0,1 and 2,3
     uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 );
@@ -2253,7 +2978,7 @@ static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma )
     {
         return ModeTH;
     }
-    return 0;
+    return ModeUndecided;
 }
 
 static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics )
@@ -2267,33 +2992,33 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
 #endif
 
     uint8_t mode = ModeUndecided;
+    Luma luma;
+#ifdef __AVX2__
+    Channels ch = GetChannels( src );
     if( useHeuristics )
     {
-        Luma luma;
-        CalculateLuma( src, luma );
+        CalculateLuma( ch, luma );
         mode = SelectModeETC2( luma );
     }
-#ifdef __AVX2__
-    auto plane = Planar_AVX2( src, mode );
 
+    auto plane = Planar_AVX2( ch, mode, useHeuristics );
     if( useHeuristics && mode == ModePlanar ) return plane.plane;
 
-    alignas(32) v4i a[8];
-
+    alignas( 32 ) v4i a[8];
     __m128i err0 = PrepareAverages_AVX2( a, plane.sum4 );
 
     // Get index of minimum error (err0)
-    __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
+    __m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) );
     __m128i errMin0 = _mm_min_epu32(err0, err1);
 
-    __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
-    __m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
+    __m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+    __m128i errMin2 = _mm_min_epu32( errMin1, errMin0 );
 
-    __m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
+    __m128i errMask = _mm_cmpeq_epi32( errMin2, err0 );
 
-    uint32_t mask = _mm_movemask_epi8(errMask);
+    uint32_t mask = _mm_movemask_epi8( errMask );
 
-    size_t idx = _bit_scan_forward(mask) >> 2;
+    size_t idx = _bit_scan_forward( mask ) >> 2;
 
     d = EncodeAverages_AVX2( a, idx );
 
@@ -2309,12 +3034,54 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
         FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
     }
 
-    return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1, plane.plane, plane.error );
+    if( useHeuristics )
+    {
+        if( mode == ModeTH )
+        {
+            uint64_t result = 0;
+            uint64_t error = 0;
+            uint32_t compressed[4] = { 0, 0, 0, 0 };
+            bool tMode = false;
+
+            error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 );
+            if( tMode )
+            {
+                stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+            }
+            else
+            {
+                stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+            }
+
+            result = (uint32_t)_bswap( compressed[2] );
+            result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
+
+            plane.plane = result;
+            plane.error = error;
+        }
+        else
+        {
+            plane.plane = 0;
+            plane.error = MaxError;
+        }
+    }
+
+    return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error );
 #else
+    if( useHeuristics )
+    {
+#ifdef defined __ARM_NEON && defined __aarch64__
+        Channels ch = GetChannels( src );
+        CalculateLuma( ch, luma );
+#else
+        CalculateLuma( src, luma );
+#endif
+        mode = SelectModeETC2( luma );
+    }
 #ifdef __ARM_NEON
-    auto result = Planar_NEON( src, mode );
+    auto result = Planar_NEON( src, mode, useHeuristics );
 #else
-    auto result = Planar( src, mode );
+    auto result = Planar( src, mode, useHeuristics );
 #endif
     if( result.second == 0 ) return result.first;
 
@@ -2333,6 +3100,33 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
     auto id = g_id[idx];
     FindBestFit( terr, tsel, a, id, src );
 
+    if( useHeuristics )
+    {
+        if( mode == ModeTH )
+        {
+            uint32_t compressed[4] = { 0, 0, 0, 0 };
+            bool tMode = false;
+
+            result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode );
+            if( tMode )
+            {
+                stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+            }
+            else
+            {
+                stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+            }
+
+            result.first = (uint32_t)_bswap( compressed[2] );
+            result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
+        }
+        else
+        {
+            result.first = 0;
+            result.second = MaxError;
+        }
+    }
+
     return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
 #endif
 }