summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRĂ©mi Verschelde <remi@verschelde.fr>2022-05-12 15:30:20 +0200
committerGitHub <noreply@github.com>2022-05-12 15:30:20 +0200
commit4906c66a49245adcfe35cb9f51316fbdd0a987f2 (patch)
treec27c5070e3c9ae9402e9b325f4711b1c797df20b
parenta624bfe150b6f438075993cdc96ea2703f08bfc7 (diff)
parent9570f3bf45f62d9f35de5426e89f989bf10f8d92 (diff)
Merge pull request #60964 from akien-mga/etcpak-update
-rw-r--r--thirdparty/README.md2
-rw-r--r--thirdparty/etcpak/ProcessRGB.cpp1084
2 files changed, 940 insertions, 146 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 41cd0230cc..d32db920ad 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -115,7 +115,7 @@ will limit its functionality to IPv4 only.
## etcpak
- Upstream: https://github.com/wolfpld/etcpak
-- Version: git (7c3cb6fe708d4ae330b0ab2af1ad472bae2a37a2, 2021)
+- Version: git (10fc4ce627f9a17ed49bf97fcc3796a712033ba1, 2022)
- License: BSD-3-Clause
Files extracted from upstream source:
diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp
index d60164bcc8..f488f3b282 100644
--- a/thirdparty/etcpak/ProcessRGB.cpp
+++ b/thirdparty/etcpak/ProcessRGB.cpp
@@ -28,6 +28,10 @@
# define _bswap64(x) __builtin_bswap64(x)
#endif
+static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2
+// common T-/H-mode table
+static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+
// thresholds for the early compression-mode decision scheme
// default: 0.03, 0.09, and 0.38
float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f };
@@ -36,13 +40,17 @@ static const uint8_t ModeUndecided = 0;
static const uint8_t ModePlanar = 0x1;
static const uint8_t ModeTH = 0x2;
+const unsigned int R = 2;
+const unsigned int G = 1;
+const unsigned int B = 0;
+
struct Luma
{
#ifdef __AVX2__
float max, min;
uint8_t minIdx = 255, maxIdx = 255;
__m128i luma8;
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
float max, min;
uint8_t minIdx = 255, maxIdx = 255;
uint8x16_t luma8;
@@ -52,8 +60,206 @@ struct Luma
#endif
};
+#ifdef __AVX2__
+struct Plane
+{
+ uint64_t plane;
+ uint64_t error;
+ __m256i sum4;
+};
+#endif
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+struct Channels
+{
+#ifdef __AVX2__
+ __m128i r8, g8, b8;
+#elif defined __ARM_NEON && defined __aarch64__
+ uint8x16x2_t r, g, b;
+#endif
+};
+#endif
+
namespace
{
+static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max )
+{
+ return val < min ? min : ( val > max ? max : val );
+}
+
+static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val )
+{
+ return val < min ? min : val;
+}
+
+static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max )
+{
+ return val > max ? max : val;
+}
+
+// slightly faster than std::sort
+static void insertionSort( uint8_t* arr1, uint8_t* arr2 )
+{
+ for( uint8_t i = 1; i < 16; ++i )
+ {
+ uint8_t value = arr1[i];
+ uint8_t hole = i;
+
+ for( ; hole > 0 && value < arr1[hole - 1]; --hole )
+ {
+ arr1[hole] = arr1[hole - 1];
+ arr2[hole] = arr2[hole - 1];
+ }
+ arr1[hole] = value;
+ arr2[hole] = i;
+ }
+}
+
+//converts indices from |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1| previously used by T- and H-modes
+// into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1| which should be used for all modes.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static etcpak_force_inline int indexConversion( int pixelIndices )
+{
+ int correctIndices = 0;
+ int LSB[4][4];
+ int MSB[4][4];
+ int shift = 0;
+ for( int y = 3; y >= 0; y-- )
+ {
+ for( int x = 3; x >= 0; x-- )
+ {
+ LSB[x][y] = ( pixelIndices >> shift ) & 1;
+ shift++;
+ MSB[x][y] = ( pixelIndices >> shift ) & 1;
+ shift++;
+ }
+ }
+ shift = 0;
+ for( int x = 0; x < 4; x++ )
+ {
+ for( int y = 0; y < 4; y++ )
+ {
+ correctIndices |= ( LSB[x][y] << shift );
+ correctIndices |= ( MSB[x][y] << ( 16 + shift ) );
+ shift++;
+ }
+ }
+ return correctIndices;
+}
+
+// Swapping two RGB-colors
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] )
+{
+ uint8_t temp = colors[0][R];
+ colors[0][R] = colors[1][R];
+ colors[1][R] = temp;
+
+ temp = colors[0][G];
+ colors[0][G] = colors[1][G];
+ colors[1][G] = temp;
+
+ temp = colors[0][B];
+ colors[0][B] = colors[1][B];
+ colors[1][B] = temp;
+}
+
+
+// calculates quantized colors for T or H modes
+void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode )
+{
+ if( t_mode )
+ {
+ quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 );
+ quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 );
+ quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 );
+ }
+ else // clamped to [1,14] to get a wider range
+ {
+ quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 );
+ quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 );
+ quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 );
+ }
+
+ // clamped to [1,14] to get a wider range
+ quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 );
+ quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 );
+ quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 );
+}
+
+// three decoding functions come from ETCPACK v2.74 and are slightly changed.
+static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] )
+{
+ // The color should be retrieved as:
+ //
+ // c = round(255/(r_bits^2-1))*comp_color
+ //
+ // This is similar to bit replication
+ //
+ // Note -- this code only work for bit replication from 4 bits and up --- 3 bits needs
+ // two copy operations.
+ colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R];
+ colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G];
+ colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B];
+ colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R];
+ colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G];
+ colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B];
+}
+
+// calculates the paint colors from the block colors
+// using a distance d and one of the H- or T-patterns.
+static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
+{
+ //////////////////////////////////////////////
+ //
+ // C3 C1 C4----C1---C2
+ // | | |
+ // | | |
+ // |-------| |
+ // | | |
+ // | | |
+ // C4 C2 C3
+ //
+ //////////////////////////////////////////////
+
+ // C4
+ pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
+ pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
+ pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
+
+ // C3
+ pColors[0][R] = colors[0][R];
+ pColors[0][G] = colors[0][G];
+ pColors[0][B] = colors[0][B];
+ // C2
+ pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 );
+ pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 );
+ pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 );
+ // C1
+ pColors[2][R] = colors[1][R];
+ pColors[2][G] = colors[1][G];
+ pColors[2][B] = colors[1][B];
+}
+
+static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
+{
+ pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
+ pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
+ pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
+
+ // C1
+ pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 );
+ pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 );
+ pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 );
+ // C2
+ pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] );
+ pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] );
+ pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] );
+ // C3
+ pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 );
+ pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 );
+ pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 );
+}
#if defined _MSC_VER && !defined __clang__
static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask )
@@ -586,127 +792,107 @@ static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cv
return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1));
}
-struct Plane
+static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics )
{
- uint64_t plane;
- uint64_t error;
- __m256i sum4;
-};
-
-static etcpak_force_inline Plane Planar_AVX2( const uint8_t* src, const uint8_t mode )
-{
- __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
- __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
- __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
- __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
-
- __m128i rgb0 = _mm_shuffle_epi8(d0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
- __m128i rgb1 = _mm_shuffle_epi8(d1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
- __m128i rgb2 = _mm_shuffle_epi8(d2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
- __m128i rgb3 = _mm_shuffle_epi8(d3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
-
- __m128i rg0 = _mm_unpacklo_epi32(rgb0, rgb1);
- __m128i rg1 = _mm_unpacklo_epi32(rgb2, rgb3);
- __m128i b0 = _mm_unpackhi_epi32(rgb0, rgb1);
- __m128i b1 = _mm_unpackhi_epi32(rgb2, rgb3);
-
- // swap channels
- __m128i b8 = _mm_unpacklo_epi64(rg0, rg1);
- __m128i g8 = _mm_unpackhi_epi64(rg0, rg1);
- __m128i r8 = _mm_unpacklo_epi64(b0, b1);
+ __m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() );
+ __m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() );
+ __m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() );
- __m128i t0 = _mm_sad_epu8(r8, _mm_setzero_si128());
- __m128i t1 = _mm_sad_epu8(g8, _mm_setzero_si128());
- __m128i t2 = _mm_sad_epu8(b8, _mm_setzero_si128());
+ __m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
+ __m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
+ __m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
- __m128i r8s = _mm_shuffle_epi8(r8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
- __m128i g8s = _mm_shuffle_epi8(g8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
- __m128i b8s = _mm_shuffle_epi8(b8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+ __m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() );
+ __m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() );
+ __m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() );
- __m128i s0 = _mm_sad_epu8(r8s, _mm_setzero_si128());
- __m128i s1 = _mm_sad_epu8(g8s, _mm_setzero_si128());
- __m128i s2 = _mm_sad_epu8(b8s, _mm_setzero_si128());
+ __m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 );
+ __m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 );
+ __m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 );
- __m256i sr0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t0), s0, 1);
- __m256i sg0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t1), s1, 1);
- __m256i sb0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t2), s2, 1);
+ __m256i sr1 = _mm256_slli_epi64( sr0, 32 );
+ __m256i sg1 = _mm256_slli_epi64( sg0, 16 );
- __m256i sr1 = _mm256_slli_epi64(sr0, 32);
- __m256i sg1 = _mm256_slli_epi64(sg0, 16);
+ __m256i srb = _mm256_or_si256( sr1, sb0 );
+ __m256i srgb = _mm256_or_si256( srb, sg1 );
- __m256i srb = _mm256_or_si256(sr1, sb0);
- __m256i srgb = _mm256_or_si256(srb, sg1);
+ if( mode != ModePlanar && useHeuristics )
+ {
+ Plane plane;
+ plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+ return plane;
+ }
- __m128i t3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t0), _mm_castsi128_ps(t1), _MM_SHUFFLE(2, 0, 2, 0)));
- __m128i t4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i t5 = _mm_hadd_epi32(t3, t4);
- __m128i t6 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i t7 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) );
+ __m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) );
+ __m128i t5 = _mm_hadd_epi32( t3, t4 );
+ __m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+ __m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) );
- __m256i sr = _mm256_broadcastw_epi16(t5);
- __m256i sg = _mm256_broadcastw_epi16(t6);
- __m256i sb = _mm256_broadcastw_epi16(t7);
+ __m256i sr = _mm256_broadcastw_epi16( t5 );
+ __m256i sg = _mm256_broadcastw_epi16( t6 );
+ __m256i sb = _mm256_broadcastw_epi16( t7 );
- __m256i r08 = _mm256_cvtepu8_epi16(r8);
- __m256i g08 = _mm256_cvtepu8_epi16(g8);
- __m256i b08 = _mm256_cvtepu8_epi16(b8);
+ __m256i r08 = _mm256_cvtepu8_epi16( ch.r8 );
+ __m256i g08 = _mm256_cvtepu8_epi16( ch.g8 );
+ __m256i b08 = _mm256_cvtepu8_epi16( ch.b8 );
- __m256i r16 = _mm256_slli_epi16(r08, 4);
- __m256i g16 = _mm256_slli_epi16(g08, 4);
- __m256i b16 = _mm256_slli_epi16(b08, 4);
+ __m256i r16 = _mm256_slli_epi16( r08, 4 );
+ __m256i g16 = _mm256_slli_epi16( g08, 4 );
+ __m256i b16 = _mm256_slli_epi16( b08, 4 );
- __m256i difR0 = _mm256_sub_epi16(r16, sr);
- __m256i difG0 = _mm256_sub_epi16(g16, sg);
- __m256i difB0 = _mm256_sub_epi16(b16, sb);
+ __m256i difR0 = _mm256_sub_epi16( r16, sr );
+ __m256i difG0 = _mm256_sub_epi16( g16, sg );
+ __m256i difB0 = _mm256_sub_epi16( b16, sb );
- __m256i difRyz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
- __m256i difGyz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
- __m256i difByz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
+ __m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
+ __m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
+ __m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
- __m256i difRxz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
- __m256i difGxz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
- __m256i difBxz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
+ __m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
+ __m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
+ __m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
- __m256i difRGyz = _mm256_hadd_epi32(difRyz, difGyz);
- __m256i difByzxz = _mm256_hadd_epi32(difByz, difBxz);
+ __m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz );
+ __m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz );
- __m256i difRGxz = _mm256_hadd_epi32(difRxz, difGxz);
+ __m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz );
- __m128i sumRGyz = _mm_add_epi32(_mm256_castsi256_si128(difRGyz), _mm256_extracti128_si256(difRGyz, 1));
- __m128i sumByzxz = _mm_add_epi32(_mm256_castsi256_si128(difByzxz), _mm256_extracti128_si256(difByzxz, 1));
- __m128i sumRGxz = _mm_add_epi32(_mm256_castsi256_si128(difRGxz), _mm256_extracti128_si256(difRGxz, 1));
+ __m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) );
+ __m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) );
+ __m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) );
- __m128i sumRGByz = _mm_hadd_epi32(sumRGyz, sumByzxz);
- __m128i sumRGByzxz = _mm_hadd_epi32(sumRGxz, sumByzxz);
+ __m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz );
+ __m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz );
- __m128i sumRGBxz = _mm_shuffle_epi32(sumRGByzxz, _MM_SHUFFLE(2, 3, 1, 0));
+ __m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) );
- __m128 sumRGByzf = _mm_cvtepi32_ps(sumRGByz);
- __m128 sumRGBxzf = _mm_cvtepi32_ps(sumRGBxz);
+ __m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz );
+ __m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz );
- const float value = (255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f;
+ const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f;
- __m128 scale = _mm_set1_ps(-4.0f / value);
+ __m128 scale = _mm_set1_ps( -4.0f / value );
- __m128 af = _mm_mul_ps(sumRGBxzf, scale);
- __m128 bf = _mm_mul_ps(sumRGByzf, scale);
+ __m128 af = _mm_mul_ps( sumRGBxzf, scale );
+ __m128 bf = _mm_mul_ps( sumRGByzf, scale );
- __m128 df = _mm_mul_ps(_mm_cvtepi32_ps(t5), _mm_set1_ps(4.0f / 16.0f));
+ __m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) );
// calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
- __m128 cof0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
- __m128 chf0 = _mm_fnmadd_ps(af, _mm_set1_ps( 425.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
- __m128 cvf0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps( 425.0f), df));
+ __m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
+ __m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
+ __m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) );
// convert to r6g7b6
- __m128i cohv = r6g7b6_AVX2(cof0, chf0, cvf0);
+ __m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 );
- uint64_t rgbho = _mm_extract_epi64(cohv, 0);
- uint32_t rgbv0 = _mm_extract_epi32(cohv, 2);
+ uint64_t rgbho = _mm_extract_epi64( cohv, 0 );
+ uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 );
// Error calculation
uint64_t error = 0;
- if( mode != ModePlanar )
+ if( !useHeuristics )
{
auto ro0 = ( rgbho >> 48 ) & 0x3F;
auto go0 = ( rgbho >> 40 ) & 0x7F;
@@ -820,7 +1006,15 @@ static etcpak_force_inline Plane Planar_AVX2( const uint8_t* src, const uint8_t
Plane plane;
plane.plane = result;
- plane.error = error;
+ if( useHeuristics )
+ {
+ plane.error = 0;
+ mode = ModePlanar;
+ }
+ else
+ {
+ plane.error = error;
+ }
plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1));
return plane;
@@ -1570,7 +1764,7 @@ static etcpak_force_inline uint8_t convert7(float f)
return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
}
-static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode )
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics )
{
int32_t r = 0;
int32_t g = 0;
@@ -1645,7 +1839,7 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t*
// Error calculation
uint64_t error = 0;
- if( ModePlanar != mode )
+ if( ModePlanar != mode && useHeuristics )
{
auto ro0 = coR;
auto go0 = coG;
@@ -1756,7 +1950,7 @@ static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src )
uint16x4_t accu2 = vpadd_u16( accu4, accu4 );
uint16x4_t accu1 = vpadd_u16( accu2, accu2 );
return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) );
-#else
+#else
return vdupq_n_s16( vaddvq_u16( accu8 ) );
#endif
}
@@ -1783,7 +1977,7 @@ static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x )
return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 );
}
-static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode )
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics )
{
uint8x16x4_t srcBlock = vld4q_u8( src );
@@ -1828,7 +2022,7 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint
int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) );
uint64_t error = 0;
- if( mode != ModePlanar )
+ if( mode != ModePlanar && useHeuristics )
{
int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 );
@@ -1924,6 +2118,376 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint
#endif
+#ifdef __AVX2__
+uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 )
+#else
+uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist )
+#endif
+{
+ uint32_t blockErr = 0, bestBlockErr = MaxError;
+
+ uint32_t pixColors;
+ uint8_t possibleColors[4][3];
+ uint8_t colors[2][3];
+
+ decompressColor( colorsRGB444, colors );
+
+#ifdef __AVX2__
+ __m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 );
+#endif
+
+ // test distances
+ for( uint8_t d = startDist; d < 8; ++d )
+ {
+ if( d >= 2 && dist == d - 2 ) break;
+
+ blockErr = 0;
+ pixColors = 0;
+
+ if( tMode )
+ {
+ calculatePaintColors59T( d, colors, possibleColors );
+ }
+ else
+ {
+ calculatePaintColors58H( d, colors, possibleColors );
+ }
+
+#ifdef __AVX2__
+ // RGB ordering
+ __m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask );
+ __m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask );
+ __m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask );
+
+ // extends 3x128 bits RGB into 3x256 bits RGB for error comparisions
+ static const __m128i zero = _mm_setzero_si128();
+ __m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero );
+ __m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero );
+ __m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero );
+ __m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero );
+ __m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero );
+ __m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero );
+
+ __m256i b8 = _mm256_set_m128i( b8Hi, b8Lo );
+ __m256i g8 = _mm256_set_m128i( g8Hi, g8Lo );
+ __m256i r8 = _mm256_set_m128i( r8Hi, r8Lo );
+
+ // caculates differences between the pixel colrs and the palette colors
+ __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) );
+ __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) );
+ __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) );
+
+ // luma-based error calculations
+ static const __m256i bWeight = _mm256_set1_epi16( 14 );
+ static const __m256i gWeight = _mm256_set1_epi16( 76 );
+ static const __m256i rWeight = _mm256_set1_epi16( 38 );
+
+ diffb = _mm256_mullo_epi16( diffb, bWeight );
+ diffg = _mm256_mullo_epi16( diffg, gWeight );
+ diffr = _mm256_mullo_epi16( diffr, rWeight );
+
+ // obtains the error with the current palette color
+ __m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
+
+ // error calucations with the remaining three palette colors
+ static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
+ for( uint8_t c = 1; c < 4; c++ )
+ {
+ __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) );
+ __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) );
+ __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) );
+
+ diffb = _mm256_mullo_epi16( diffb, bWeight );
+ diffg = _mm256_mullo_epi16( diffg, gWeight );
+ diffr = _mm256_mullo_epi16( diffr, rWeight );
+
+ // error comparison with the previous best color
+ __m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
+ __m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors );
+ __m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr );
+ lowestPixErr = minErr;
+
+ // update pixel colors
+ uint32_t updPixColors = _mm256_movemask_epi8( cmpRes );
+ uint32_t prevPixColors = pixColors & ~updPixColors;
+ uint32_t mskPixColors = masks[c] & updPixColors;
+ pixColors = prevPixColors | mskPixColors;
+ }
+
+ // accumulate the block error
+ alignas( 32 ) uint16_t pixErr16[16] = { 0, };
+ _mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr );
+ for( uint8_t p = 0; p < 16; p++ )
+ {
+ blockErr += (int)( pixErr16[p] ) * pixErr16[p];
+ }
+#else
+ for( size_t y = 0; y < 4; ++y )
+ {
+ for( size_t x = 0; x < 4; ++x )
+ {
+ uint32_t bestPixErr = MaxError;
+ pixColors <<= 2; // Make room for next value
+
+ // Loop possible block colors
+ for( uint8_t c = 0; c < 4; ++c )
+ {
+ int diff[3];
+ diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R];
+ diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G];
+ diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B];
+
+ const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] );
+ uint32_t pixErr = err * err;
+
+ // Choose best error
+ if( pixErr < bestPixErr )
+ {
+ bestPixErr = pixErr;
+ pixColors ^= ( pixColors & 3 ); // Reset the two first bits
+ pixColors |= c;
+ }
+ }
+ blockErr += bestPixErr;
+ }
+ }
+#endif
+
+ if( blockErr < bestBlockErr )
+ {
+ bestBlockErr = blockErr;
+ dist = d;
+ pixIndices = pixColors;
+ }
+ }
+
+ return bestBlockErr;
+}
+
+
+// main T-/H-mode compression function
+#ifdef __AVX2__
+uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 )
+#else
+uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode )
+#endif
+{
+#ifdef __AVX2__
+ alignas( 8 ) uint8_t luma[16] = { 0, };
+ _mm_storeu_si128 ( (__m128i* )luma, l.luma8 );
+#elif defined __ARM_NEON && defined __aarch64__
+ alignas( 8 ) uint8_t luma[16] = { 0 };
+ vst1q_u8( luma, l.luma8 );
+#else
+ uint8_t* luma = l.val;
+#endif
+
+ uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+ // 1) sorts the pairs of (luma, pix_idx)
+ insertionSort( luma, pixIdx );
+
+ // 2) finds the min (left+right)
+ uint8_t minSumRangeIdx = 0;
+ uint16_t minSumRangeValue;
+ uint16_t sum;
+ static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8};
+ const int16_t temp = luma[15] - luma[0];
+
+ minSumRangeValue = luma[15] - luma[1] + diffBonus[0];
+ for( uint8_t i = 1; i < 14; i++ )
+ {
+ sum = temp - luma[i+1] + luma[i] + diffBonus[i];
+ if( minSumRangeValue > sum )
+ {
+ minSumRangeValue = sum;
+ minSumRangeIdx = i;
+ }
+ }
+
+ sum = luma[14] - luma[0] + diffBonus[14];
+ if( minSumRangeValue > sum )
+ {
+ minSumRangeValue = sum;
+ minSumRangeIdx = 14;
+ }
+ uint8_t lRange, rRange;
+
+ lRange = luma[minSumRangeIdx] - luma[0];
+ rRange = luma[15] - luma[minSumRangeIdx + 1];
+
+ // 3) sets a proper mode
+ bool swap = false;
+ if( lRange >= rRange )
+ {
+ if( lRange >= rRange * 2 )
+ {
+ swap = true;
+ tMode = true;
+ }
+ }
+ else
+ {
+ if( lRange * 2 <= rRange ) tMode = true;
+ }
+ // 4) calculates the two base colors
+ uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] };
+
+ uint16_t r[4], g[4], b[4];
+ for( uint8_t i = 0; i < 4; ++i )
+ {
+ uint8_t idx = rangeIdx[i] * 4;
+ b[i] = src[idx];
+ g[i] = src[idx + 1];
+ r[i] = src[idx + 2];
+ }
+
+ uint8_t mid_rgb[2][3];
+ if( swap )
+ {
+ mid_rgb[1][B] = ( b[0] + b[1] ) / 2;
+ mid_rgb[1][G] = ( g[0] + g[1] ) / 2;
+ mid_rgb[1][R] = ( r[0] + r[1] ) / 2;
+
+ uint16_t sum_rgb[3] = { 0, 0, 0 };
+ for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
+ {
+ uint8_t idx = pixIdx[i] * 4;
+ sum_rgb[B] += src[idx];
+ sum_rgb[G] += src[idx + 1];
+ sum_rgb[R] += src[idx + 2];
+ }
+ const uint8_t temp = 15 - minSumRangeIdx;
+ mid_rgb[0][B] = sum_rgb[B] / temp;
+ mid_rgb[0][G] = sum_rgb[G] / temp;
+ mid_rgb[0][R] = sum_rgb[R] / temp;
+ }
+ else
+ {
+ mid_rgb[0][B] = (b[0] + b[1]) / 2;
+ mid_rgb[0][G] = (g[0] + g[1]) / 2;
+ mid_rgb[0][R] = (r[0] + r[1]) / 2;
+ if( tMode )
+ {
+ uint16_t sum_rgb[3] = { 0, 0, 0 };
+ for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
+ {
+ uint8_t idx = pixIdx[i] * 4;
+ sum_rgb[B] += src[idx];
+ sum_rgb[G] += src[idx + 1];
+ sum_rgb[R] += src[idx + 2];
+ }
+ const uint8_t temp = 15 - minSumRangeIdx;
+ mid_rgb[1][B] = sum_rgb[B] / temp;
+ mid_rgb[1][G] = sum_rgb[G] / temp;
+ mid_rgb[1][R] = sum_rgb[R] / temp;
+ }
+ else
+ {
+ mid_rgb[1][B] = (b[2] + b[3]) / 2;
+ mid_rgb[1][G] = (g[2] + g[3]) / 2;
+ mid_rgb[1][R] = (r[2] + r[3]) / 2;
+ }
+ }
+
+ // 5) sets the start distance index
+ uint32_t startDistCandidate;
+ uint32_t avgDist;
+ if( tMode )
+ {
+ if( swap )
+ {
+ avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6;
+ }
+ else
+ {
+ avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6;
+ }
+ }
+ else
+ {
+ avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12;
+ }
+
+ if( avgDist <= 16)
+ {
+ startDistCandidate = 0;
+ }
+ else if( avgDist <= 23 )
+ {
+ startDistCandidate = 1;
+ }
+ else if( avgDist <= 32 )
+ {
+ startDistCandidate = 2;
+ }
+ else if( avgDist <= 41 )
+ {
+ startDistCandidate = 3;
+ }
+ else
+ {
+ startDistCandidate = 4;
+ }
+
+ uint32_t bestErr = MaxError;
+ uint32_t bestPixIndices;
+ uint8_t bestDist = 10;
+ uint8_t colorsRGB444[2][3];
+ compressColor( mid_rgb, colorsRGB444, tMode );
+ compressed1 = 0;
+
+ // 6) finds the best candidate with the lowest error
+#ifdef __AVX2__
+ // Vectorized ver
+ bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 );
+#else
+ // Scalar ver
+ bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate );
+#endif
+
+ // 7) outputs the final T or H block
+ if( tMode )
+ {
+ // Put the compress params into the compression block
+ compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23;
+ compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19;
+ compressed1 |= ( colorsRGB444[0][B] ) << 15;
+ compressed1 |= ( colorsRGB444[1][R] ) << 11;
+ compressed1 |= ( colorsRGB444[1][G] ) << 7;
+ compressed1 |= ( colorsRGB444[1][B] ) << 3;
+ compressed1 |= bestDist & 0x7;
+ }
+ else
+ {
+ int bestRGB444ColPacked[2];
+ bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B];
+ bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B];
+ if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) )
+ {
+ swapColors( colorsRGB444 );
+ // Reshuffle pixel indices to to exchange C1 with C3, and C2 with C4
+ bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) );
+ }
+
+ // Put the compress params into the compression block
+ compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22;
+ compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18;
+ compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14;
+ compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10;
+ compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6;
+ compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2;
+ compressed1 |= ( bestDist >> 1 ) & 0x3;
+ }
+
+ bestPixIndices = indexConversion( bestPixIndices );
+ compressed2 = 0;
+ compressed2 = ( compressed2 & ~( ( 0x2 << 31 ) - 1 ) ) | ( bestPixIndices & ( ( 2 << 31 ) - 1 ) );
+
+ return bestErr;
+}
+//#endif
+
template<class T, class S>
static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error)
{
@@ -2025,7 +2589,7 @@ static inline int16_t hMax( __m128i buffer, uint8_t& idx )
return result;
}
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx )
{
const uint8_t max = vmaxvq_u8( buffer );
@@ -2072,7 +2636,7 @@ static inline int16_t hMin( __m128i buffer, uint8_t& idx )
idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
return result;
}
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
{
const uint8_t min = vminvq_u8( buffer );
@@ -2109,8 +2673,153 @@ static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
}
#endif
-static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
+// During search it is not convenient to store the bits the way they are stored in the
+// file format. Hence, after search, it is converted to this format.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 )
{
+ // Put bits in twotimer configuration for 59 (red overflows)
+ //
+ // Go from this bit layout:
+ //
+ // |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32|
+ // |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--|
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |----------------------------------------index bits---------------------------------------------|
+ //
+ //
+ // To this:
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db|
+ // -----------------------------------------------------------------------------------------------
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |----------------------------------------index bits---------------------------------------------|
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
+ // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
+ // ------------------------------------------------------------------------------------------------
+
+ uint8_t R0a;
+ uint8_t bit, a, b, c, d, bits;
+
+ R0a = ( thumbT59W1 >> 25 ) & 0x3;
+
+ // Fix middle part
+ thumbTW1 = thumbT59W1 << 1;
+ // Fix R0a (top two bits of R0)
+ thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 );
+ // Fix db (lowest bit of d)
+ thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 );
+
+ // Make sure that red overflows:
+ a = ( thumbTW1 >> 28 ) & 0x1;
+ b = ( thumbTW1 >> 27 ) & 0x1;
+ c = ( thumbTW1 >> 25 ) & 0x1;
+ d = ( thumbTW1 >> 24 ) & 0x1;
+
+ // The following bit abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
+ // The following logical expression checks for the presence of any of those:
+ bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
+ bits = 0xf * bit;
+ thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29;
+ thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26;
+
+ // Set diffbit
+ thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2;
+ thumbTW2 = thumbT59W2;
+}
+
+// During search it is not convenient to store the bits the way they are stored in the
+// file format. Hence, after search, it is converted to this format.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 )
+{
+ // Put bits in twotimer configuration for 58 (red doesn't overflow, green does)
+ //
+ // Go from this bit layout:
+ //
+ //
+ // |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32|
+ // |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1|
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |---------------------------------------index bits----------------------------------------------|
+ //
+ // To this:
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B0 |d2|df|d1|
+ // -----------------------------------------------------------------------------------------------
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |---------------------------------------index bits----------------------------------------------|
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
+ // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
+ // -----------------------------------------------------------------------------------------------
+ //
+ //
+ // Thus, what we are really doing is going from this bit layout:
+ //
+ //
+ // |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 |
+ // |-------empty-----|part0---------------|part1|part2------------------------------------------|part3|
+ //
+ // To this:
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // --------------------------------------------------------------------------------------------------|
+ // |//|part0 |// // //|part1|//|part2 |df|part3|
+ // --------------------------------------------------------------------------------------------------|
+
+ unsigned int part0, part1, part2, part3;
+ uint8_t bit, a, b, c, d, bits;
+
+ // move parts
+ part0 = ( thumbH58W1 >> 19 ) & 0x7f;
+ part1 = ( thumbH58W1 >> 17 ) & 0x3;
+ part2 = ( thumbH58W1 >> 1 ) & 0xffff;
+ part3 = thumbH58W1 & 0x1;
+ thumbHW1 = 0;
+ thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
+ thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 );
+ thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 );
+ thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 );
+
+ // Make sure that red does not overflow:
+ bit = ( thumbHW1 >> 30 ) & 0x1;
+ thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 );
+
+ // Make sure that green overflows:
+ a = ( thumbHW1 >> 20 ) & 0x1;
+ b = ( thumbHW1 >> 19 ) & 0x1;
+ c = ( thumbHW1 >> 17 ) & 0x1;
+ d = ( thumbHW1 >> 16 ) & 0x1;
+ // The following bit abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
+ // The following logical expression checks for the presence of any of those:
+ bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
+ bits = 0xf * bit;
+ thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 );
+ thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 );
+
+ // Set diffbit
+ thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2;
+ thumbHW2 = thumbH58W2;
+}
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+static etcpak_force_inline Channels GetChannels( const uint8_t* src )
+{
+ Channels ch;
#ifdef __AVX2__
__m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 );
__m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 );
@@ -2128,30 +2837,10 @@ static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
__m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 );
// swap channels
- __m128i b8 = _mm_unpacklo_epi64( rg0, rg1 );
- __m128i g8 = _mm_unpackhi_epi64( rg0, rg1 );
- __m128i r8 = _mm_unpacklo_epi64( b0, b1 );
-
- __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( b8 ), _mm256_set1_epi16( 14 ) );
- __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( g8 ), _mm256_set1_epi16( 76 ) );
- __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( r8 ), _mm256_set1_epi16( 38 ) );
-
- __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
- __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
- __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
- __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
-
- static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
- static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
- __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
- __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
- __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
- luma.luma8 = luma_8bit;
-
- // min/max calculation
- luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
- luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
-#elif defined __ARM_NEON
+ ch.b8 = _mm_unpacklo_epi64( rg0, rg1 );
+ ch.g8 = _mm_unpackhi_epi64( rg0, rg1 );
+ ch.r8 = _mm_unpacklo_epi64( b0, b1 );
+#elif defined __ARM_NEON && defined __aarch64__
//load pixel data into 4 rows
uint8x16_t px0 = vld1q_u8( src + 0 );
uint8x16_t px1 = vld1q_u8( src + 16 );
@@ -2172,12 +2861,48 @@ static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() );
uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() );
uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() );
- uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( red.val[0] ), 14 );
- uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( red.val[1] ), 14 );
- uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( grn.val[0] ), 76 );
- uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( grn.val[1] ), 76 );
- uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( blu.val[0] ), 38 );
- uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( blu.val[1] ), 38 );
+ ch.r = red;
+ ch.b = blu;
+ ch.g = grn;
+#endif
+ return ch;
+}
+#endif
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma )
+#else
+static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
+#endif
+{
+#ifdef __AVX2__
+ __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) );
+ __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) );
+ __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) );
+
+ __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
+ __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
+ __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
+ __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
+
+ static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
+ static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
+ __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
+ __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
+ __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
+ luma.luma8 = luma_8bit;
+
+ // min/max calculation
+ luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
+ luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
+#elif defined __ARM_NEON && defined __aarch64__
+ //load pixel data into 4 rows
+ uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 );
+ uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 );
+ uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 );
+ uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 );
+ uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 );
+ uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 );
//calculate luma for rows 0,1 and 2,3
uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 );
@@ -2253,7 +2978,7 @@ static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma )
{
return ModeTH;
}
- return 0;
+ return ModeUndecided;
}
static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics )
@@ -2267,33 +2992,33 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
#endif
uint8_t mode = ModeUndecided;
+ Luma luma;
+#ifdef __AVX2__
+ Channels ch = GetChannels( src );
if( useHeuristics )
{
- Luma luma;
- CalculateLuma( src, luma );
+ CalculateLuma( ch, luma );
mode = SelectModeETC2( luma );
}
-#ifdef __AVX2__
- auto plane = Planar_AVX2( src, mode );
+ auto plane = Planar_AVX2( ch, mode, useHeuristics );
if( useHeuristics && mode == ModePlanar ) return plane.plane;
- alignas(32) v4i a[8];
-
+ alignas( 32 ) v4i a[8];
__m128i err0 = PrepareAverages_AVX2( a, plane.sum4 );
// Get index of minimum error (err0)
- __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
+ __m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) );
__m128i errMin0 = _mm_min_epu32(err0, err1);
- __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
- __m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
+ __m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+ __m128i errMin2 = _mm_min_epu32( errMin1, errMin0 );
- __m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
+ __m128i errMask = _mm_cmpeq_epi32( errMin2, err0 );
- uint32_t mask = _mm_movemask_epi8(errMask);
+ uint32_t mask = _mm_movemask_epi8( errMask );
- size_t idx = _bit_scan_forward(mask) >> 2;
+ size_t idx = _bit_scan_forward( mask ) >> 2;
d = EncodeAverages_AVX2( a, idx );
@@ -2309,12 +3034,54 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
}
- return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1, plane.plane, plane.error );
+ if( useHeuristics )
+ {
+ if( mode == ModeTH )
+ {
+ uint64_t result = 0;
+ uint64_t error = 0;
+ uint32_t compressed[4] = { 0, 0, 0, 0 };
+ bool tMode = false;
+
+ error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 );
+ if( tMode )
+ {
+ stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+ else
+ {
+ stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+
+ result = (uint32_t)_bswap( compressed[2] );
+ result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
+
+ plane.plane = result;
+ plane.error = error;
+ }
+ else
+ {
+ plane.plane = 0;
+ plane.error = MaxError;
+ }
+ }
+
+ return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error );
#else
+ if( useHeuristics )
+ {
+#ifdef defined __ARM_NEON && defined __aarch64__
+ Channels ch = GetChannels( src );
+ CalculateLuma( ch, luma );
+#else
+ CalculateLuma( src, luma );
+#endif
+ mode = SelectModeETC2( luma );
+ }
#ifdef __ARM_NEON
- auto result = Planar_NEON( src, mode );
+ auto result = Planar_NEON( src, mode, useHeuristics );
#else
- auto result = Planar( src, mode );
+ auto result = Planar( src, mode, useHeuristics );
#endif
if( result.second == 0 ) return result.first;
@@ -2333,6 +3100,33 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
auto id = g_id[idx];
FindBestFit( terr, tsel, a, id, src );
+ if( useHeuristics )
+ {
+ if( mode == ModeTH )
+ {
+ uint32_t compressed[4] = { 0, 0, 0, 0 };
+ bool tMode = false;
+
+ result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode );
+ if( tMode )
+ {
+ stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+ else
+ {
+ stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+
+ result.first = (uint32_t)_bswap( compressed[2] );
+ result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
+ }
+ else
+ {
+ result.first = 0;
+ result.second = MaxError;
+ }
+ }
+
return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
#endif
}