summaryrefslogtreecommitdiff
path: root/thirdparty
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty')
-rw-r--r--thirdparty/README.md4
-rw-r--r--thirdparty/enet/godot.cpp17
-rw-r--r--thirdparty/etcpak/ProcessRGB.cpp1084
-rw-r--r--thirdparty/misc/polypartition.cpp12
-rw-r--r--thirdparty/misc/polypartition.h6
-rw-r--r--thirdparty/thorvg/inc/config.h2
-rw-r--r--thirdparty/thorvg/src/lib/sw_engine/tvgSwCommon.h6
-rw-r--r--thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp2
-rw-r--r--thirdparty/thorvg/src/lib/sw_engine/tvgSwShape.cpp2
-rw-r--r--thirdparty/thorvg/src/lib/sw_engine/tvgSwStroke.cpp4
-rw-r--r--thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp19
-rw-r--r--thirdparty/thorvg/src/loaders/svg/tvgSvgLoaderCommon.h3
-rw-r--r--thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp16
-rw-r--r--thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp2
-rwxr-xr-xthirdparty/thorvg/update-thorvg.sh2
15 files changed, 998 insertions, 183 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 41cd0230cc..a7dae8118e 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -115,7 +115,7 @@ will limit its functionality to IPv4 only.
## etcpak
- Upstream: https://github.com/wolfpld/etcpak
-- Version: git (7c3cb6fe708d4ae330b0ab2af1ad472bae2a37a2, 2021)
+- Version: git (f128369e64a5f4715de8125b325e4fe7debb5194, 2022)
- License: BSD-3-Clause
Files extracted from upstream source:
@@ -625,7 +625,7 @@ instead of `miniz.h` as an external dependency.
## thorvg
- Upstream: https://github.com/Samsung/thorvg
-- Version: 0.8.0 (41093c17b3cac440bdcc53f8b69abeb5734696b5, 2022)
+- Version: 0.8.1 (c4ccb1078f4390ec749ab8e05ba7e9e35f81285f, 2022)
- License: MIT
Files extracted from upstream source:
diff --git a/thirdparty/enet/godot.cpp b/thirdparty/enet/godot.cpp
index 93d3b3bbff..d2fcc4642e 100644
--- a/thirdparty/enet/godot.cpp
+++ b/thirdparty/enet/godot.cpp
@@ -259,7 +259,7 @@ public:
class ENetDTLSServer : public ENetGodotSocket {
Ref<DTLSServer> server;
Ref<UDPServer> udp_server;
- Map<String, Ref<PacketPeerDTLS>> peers;
+ HashMap<String, Ref<PacketPeerDTLS>> peers;
int last_service = 0;
IPAddress local_address;
@@ -331,15 +331,16 @@ public:
List<String> remove;
Error err = ERR_BUSY;
// TODO this needs to be fair!
- for (Map<String, Ref<PacketPeerDTLS>>::Element *E = peers.front(); E; E = E->next()) {
- Ref<PacketPeerDTLS> peer = E->get();
+
+ for (KeyValue<String, Ref<PacketPeerDTLS>> & E : peers) {
+ Ref<PacketPeerDTLS> peer = E.value;
peer->poll();
if (peer->get_status() == PacketPeerDTLS::STATUS_HANDSHAKING) {
continue;
} else if (peer->get_status() != PacketPeerDTLS::STATUS_CONNECTED) {
// Peer disconnected, removing it.
- remove.push_back(E->key());
+ remove.push_back(E.key);
continue;
}
@@ -348,12 +349,12 @@ public:
err = peer->get_packet(&buffer, r_read);
if (err != OK || p_len < r_read) {
// Something wrong with this peer, removing it.
- remove.push_back(E->key());
+ remove.push_back(E.key);
err = FAILED;
continue;
}
- Vector<String> s = E->key().rsplit(":", false, 1);
+ Vector<String> s = E.key.rsplit(":", false, 1);
ERR_CONTINUE(s.size() != 2); // BUG!
memcpy(p_buffer, buffer, r_read);
@@ -376,8 +377,8 @@ public:
}
void close() {
- for (Map<String, Ref<PacketPeerDTLS>>::Element *E = peers.front(); E; E = E->next()) {
- E->get()->disconnect_from_peer();
+ for (KeyValue<String, Ref<PacketPeerDTLS>> &E : peers) {
+ E.value->disconnect_from_peer();
}
peers.clear();
udp_server->stop();
diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp
index d60164bcc8..fdb0967ce7 100644
--- a/thirdparty/etcpak/ProcessRGB.cpp
+++ b/thirdparty/etcpak/ProcessRGB.cpp
@@ -28,6 +28,10 @@
# define _bswap64(x) __builtin_bswap64(x)
#endif
+static const uint32_t MaxError = 1065369600; // ((38+76+14) * 255)^2
+// common T-/H-mode table
+static uint8_t tableTH[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+
// thresholds for the early compression-mode decision scheme
// default: 0.03, 0.09, and 0.38
float ecmd_threshold[3] = { 0.03f, 0.09f, 0.38f };
@@ -36,13 +40,17 @@ static const uint8_t ModeUndecided = 0;
static const uint8_t ModePlanar = 0x1;
static const uint8_t ModeTH = 0x2;
+const unsigned int R = 2;
+const unsigned int G = 1;
+const unsigned int B = 0;
+
struct Luma
{
#ifdef __AVX2__
float max, min;
uint8_t minIdx = 255, maxIdx = 255;
__m128i luma8;
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
float max, min;
uint8_t minIdx = 255, maxIdx = 255;
uint8x16_t luma8;
@@ -52,8 +60,206 @@ struct Luma
#endif
};
+#ifdef __AVX2__
+struct Plane
+{
+ uint64_t plane;
+ uint64_t error;
+ __m256i sum4;
+};
+#endif
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+struct Channels
+{
+#ifdef __AVX2__
+ __m128i r8, g8, b8;
+#elif defined __ARM_NEON && defined __aarch64__
+ uint8x16x2_t r, g, b;
+#endif
+};
+#endif
+
namespace
{
+static etcpak_force_inline uint8_t clamp( uint8_t min, int16_t val, uint8_t max )
+{
+ return val < min ? min : ( val > max ? max : val );
+}
+
+static etcpak_force_inline uint8_t clampMin( uint8_t min, int16_t val )
+{
+ return val < min ? min : val;
+}
+
+static etcpak_force_inline uint8_t clampMax( int16_t val, uint8_t max )
+{
+ return val > max ? max : val;
+}
+
+// slightly faster than std::sort
+static void insertionSort( uint8_t* arr1, uint8_t* arr2 )
+{
+ for( uint8_t i = 1; i < 16; ++i )
+ {
+ uint8_t value = arr1[i];
+ uint8_t hole = i;
+
+ for( ; hole > 0 && value < arr1[hole - 1]; --hole )
+ {
+ arr1[hole] = arr1[hole - 1];
+ arr2[hole] = arr2[hole - 1];
+ }
+ arr1[hole] = value;
+ arr2[hole] = i;
+ }
+}
+
+//converts indices from |a0|a1|e0|e1|i0|i1|m0|m1|b0|b1|f0|f1|j0|j1|n0|n1|c0|c1|g0|g1|k0|k1|o0|o1|d0|d1|h0|h1|l0|l1|p0|p1| previously used by T- and H-modes
+// into |p0|o0|n0|m0|l0|k0|j0|i0|h0|g0|f0|e0|d0|c0|b0|a0|p1|o1|n1|m1|l1|k1|j1|i1|h1|g1|f1|e1|d1|c1|b1|a1| which should be used for all modes.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static etcpak_force_inline int indexConversion( int pixelIndices )
+{
+ int correctIndices = 0;
+ int LSB[4][4];
+ int MSB[4][4];
+ int shift = 0;
+ for( int y = 3; y >= 0; y-- )
+ {
+ for( int x = 3; x >= 0; x-- )
+ {
+ LSB[x][y] = ( pixelIndices >> shift ) & 1;
+ shift++;
+ MSB[x][y] = ( pixelIndices >> shift ) & 1;
+ shift++;
+ }
+ }
+ shift = 0;
+ for( int x = 0; x < 4; x++ )
+ {
+ for( int y = 0; y < 4; y++ )
+ {
+ correctIndices |= ( LSB[x][y] << shift );
+ correctIndices |= ( MSB[x][y] << ( 16 + shift ) );
+ shift++;
+ }
+ }
+ return correctIndices;
+}
+
+// Swapping two RGB-colors
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static etcpak_force_inline void swapColors( uint8_t( colors )[2][3] )
+{
+ uint8_t temp = colors[0][R];
+ colors[0][R] = colors[1][R];
+ colors[1][R] = temp;
+
+ temp = colors[0][G];
+ colors[0][G] = colors[1][G];
+ colors[1][G] = temp;
+
+ temp = colors[0][B];
+ colors[0][B] = colors[1][B];
+ colors[1][B] = temp;
+}
+
+
+// calculates quantized colors for T or H modes
+void compressColor( uint8_t( currColor )[2][3], uint8_t( quantColor )[2][3], bool t_mode )
+{
+ if( t_mode )
+ {
+ quantColor[0][R] = clampMax( 15 * ( currColor[0][R] + 8 ) / 255, 15 );
+ quantColor[0][G] = clampMax( 15 * ( currColor[0][G] + 8 ) / 255, 15 );
+ quantColor[0][B] = clampMax( 15 * ( currColor[0][B] + 8 ) / 255, 15 );
+ }
+ else // clamped to [1,14] to get a wider range
+ {
+ quantColor[0][R] = clamp( 1, 15 * ( currColor[0][R] + 8 ) / 255, 14 );
+ quantColor[0][G] = clamp( 1, 15 * ( currColor[0][G] + 8 ) / 255, 14 );
+ quantColor[0][B] = clamp( 1, 15 * ( currColor[0][B] + 8 ) / 255, 14 );
+ }
+
+ // clamped to [1,14] to get a wider range
+ quantColor[1][R] = clamp( 1, 15 * ( currColor[1][R] + 8 ) / 255, 14 );
+ quantColor[1][G] = clamp( 1, 15 * ( currColor[1][G] + 8 ) / 255, 14 );
+ quantColor[1][B] = clamp( 1, 15 * ( currColor[1][B] + 8 ) / 255, 14 );
+}
+
+// three decoding functions come from ETCPACK v2.74 and are slightly changed.
+static etcpak_force_inline void decompressColor( uint8_t( colorsRGB444 )[2][3], uint8_t( colors )[2][3] )
+{
+ // The color should be retrieved as:
+ //
+ // c = round(255/(r_bits^2-1))*comp_color
+ //
+ // This is similar to bit replication
+ //
+ // Note -- this code only work for bit replication from 4 bits and up --- 3 bits needs
+ // two copy operations.
+ colors[0][R] = ( colorsRGB444[0][R] << 4 ) | colorsRGB444[0][R];
+ colors[0][G] = ( colorsRGB444[0][G] << 4 ) | colorsRGB444[0][G];
+ colors[0][B] = ( colorsRGB444[0][B] << 4 ) | colorsRGB444[0][B];
+ colors[1][R] = ( colorsRGB444[1][R] << 4 ) | colorsRGB444[1][R];
+ colors[1][G] = ( colorsRGB444[1][G] << 4 ) | colorsRGB444[1][G];
+ colors[1][B] = ( colorsRGB444[1][B] << 4 ) | colorsRGB444[1][B];
+}
+
+// calculates the paint colors from the block colors
+// using a distance d and one of the H- or T-patterns.
+static void calculatePaintColors59T( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
+{
+ //////////////////////////////////////////////
+ //
+ // C3 C1 C4----C1---C2
+ // | | |
+ // | | |
+ // |-------| |
+ // | | |
+ // | | |
+ // C4 C2 C3
+ //
+ //////////////////////////////////////////////
+
+ // C4
+ pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
+ pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
+ pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
+
+ // C3
+ pColors[0][R] = colors[0][R];
+ pColors[0][G] = colors[0][G];
+ pColors[0][B] = colors[0][B];
+ // C2
+ pColors[1][R] = clampMax( colors[1][R] + tableTH[d], 255 );
+ pColors[1][G] = clampMax( colors[1][G] + tableTH[d], 255 );
+ pColors[1][B] = clampMax( colors[1][B] + tableTH[d], 255 );
+ // C1
+ pColors[2][R] = colors[1][R];
+ pColors[2][G] = colors[1][G];
+ pColors[2][B] = colors[1][B];
+}
+
+static void calculatePaintColors58H( uint8_t d, uint8_t( colors )[2][3], uint8_t( pColors )[4][3] )
+{
+ pColors[3][R] = clampMin( 0, colors[1][R] - tableTH[d] );
+ pColors[3][G] = clampMin( 0, colors[1][G] - tableTH[d] );
+ pColors[3][B] = clampMin( 0, colors[1][B] - tableTH[d] );
+
+ // C1
+ pColors[0][R] = clampMax( colors[0][R] + tableTH[d], 255 );
+ pColors[0][G] = clampMax( colors[0][G] + tableTH[d], 255 );
+ pColors[0][B] = clampMax( colors[0][B] + tableTH[d], 255 );
+ // C2
+ pColors[1][R] = clampMin( 0, colors[0][R] - tableTH[d] );
+ pColors[1][G] = clampMin( 0, colors[0][G] - tableTH[d] );
+ pColors[1][B] = clampMin( 0, colors[0][B] - tableTH[d] );
+ // C3
+ pColors[2][R] = clampMax( colors[1][R] + tableTH[d], 255 );
+ pColors[2][G] = clampMax( colors[1][G] + tableTH[d], 255 );
+ pColors[2][B] = clampMax( colors[1][B] + tableTH[d], 255 );
+}
#if defined _MSC_VER && !defined __clang__
static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask )
@@ -586,127 +792,107 @@ static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cv
return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1));
}
-struct Plane
+static etcpak_force_inline Plane Planar_AVX2( const Channels& ch, uint8_t& mode, bool useHeuristics )
{
- uint64_t plane;
- uint64_t error;
- __m256i sum4;
-};
-
-static etcpak_force_inline Plane Planar_AVX2( const uint8_t* src, const uint8_t mode )
-{
- __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
- __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
- __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
- __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
-
- __m128i rgb0 = _mm_shuffle_epi8(d0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
- __m128i rgb1 = _mm_shuffle_epi8(d1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
- __m128i rgb2 = _mm_shuffle_epi8(d2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
- __m128i rgb3 = _mm_shuffle_epi8(d3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
-
- __m128i rg0 = _mm_unpacklo_epi32(rgb0, rgb1);
- __m128i rg1 = _mm_unpacklo_epi32(rgb2, rgb3);
- __m128i b0 = _mm_unpackhi_epi32(rgb0, rgb1);
- __m128i b1 = _mm_unpackhi_epi32(rgb2, rgb3);
-
- // swap channels
- __m128i b8 = _mm_unpacklo_epi64(rg0, rg1);
- __m128i g8 = _mm_unpackhi_epi64(rg0, rg1);
- __m128i r8 = _mm_unpacklo_epi64(b0, b1);
+ __m128i t0 = _mm_sad_epu8( ch.r8, _mm_setzero_si128() );
+ __m128i t1 = _mm_sad_epu8( ch.g8, _mm_setzero_si128() );
+ __m128i t2 = _mm_sad_epu8( ch.b8, _mm_setzero_si128() );
- __m128i t0 = _mm_sad_epu8(r8, _mm_setzero_si128());
- __m128i t1 = _mm_sad_epu8(g8, _mm_setzero_si128());
- __m128i t2 = _mm_sad_epu8(b8, _mm_setzero_si128());
+ __m128i r8s = _mm_shuffle_epi8( ch.r8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
+ __m128i g8s = _mm_shuffle_epi8( ch.g8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
+ __m128i b8s = _mm_shuffle_epi8( ch.b8, _mm_set_epi8( 0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0 ) );
- __m128i r8s = _mm_shuffle_epi8(r8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
- __m128i g8s = _mm_shuffle_epi8(g8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
- __m128i b8s = _mm_shuffle_epi8(b8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+ __m128i s0 = _mm_sad_epu8( r8s, _mm_setzero_si128() );
+ __m128i s1 = _mm_sad_epu8( g8s, _mm_setzero_si128() );
+ __m128i s2 = _mm_sad_epu8( b8s, _mm_setzero_si128() );
- __m128i s0 = _mm_sad_epu8(r8s, _mm_setzero_si128());
- __m128i s1 = _mm_sad_epu8(g8s, _mm_setzero_si128());
- __m128i s2 = _mm_sad_epu8(b8s, _mm_setzero_si128());
+ __m256i sr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t0 ), s0, 1 );
+ __m256i sg0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t1 ), s1, 1 );
+ __m256i sb0 = _mm256_insertf128_si256( _mm256_castsi128_si256( t2 ), s2, 1 );
- __m256i sr0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t0), s0, 1);
- __m256i sg0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t1), s1, 1);
- __m256i sb0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t2), s2, 1);
+ __m256i sr1 = _mm256_slli_epi64( sr0, 32 );
+ __m256i sg1 = _mm256_slli_epi64( sg0, 16 );
- __m256i sr1 = _mm256_slli_epi64(sr0, 32);
- __m256i sg1 = _mm256_slli_epi64(sg0, 16);
+ __m256i srb = _mm256_or_si256( sr1, sb0 );
+ __m256i srgb = _mm256_or_si256( srb, sg1 );
- __m256i srb = _mm256_or_si256(sr1, sb0);
- __m256i srgb = _mm256_or_si256(srb, sg1);
+ if( mode != ModePlanar && useHeuristics )
+ {
+ Plane plane;
+ plane.sum4 = _mm256_permute4x64_epi64( srgb, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+ return plane;
+ }
- __m128i t3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t0), _mm_castsi128_ps(t1), _MM_SHUFFLE(2, 0, 2, 0)));
- __m128i t4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i t5 = _mm_hadd_epi32(t3, t4);
- __m128i t6 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i t7 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i t3 = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( t0 ), _mm_castsi128_ps( t1 ), _MM_SHUFFLE( 2, 0, 2, 0 ) ) );
+ __m128i t4 = _mm_shuffle_epi32( t2, _MM_SHUFFLE( 3, 1, 2, 0 ) );
+ __m128i t5 = _mm_hadd_epi32( t3, t4 );
+ __m128i t6 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+ __m128i t7 = _mm_shuffle_epi32( t5, _MM_SHUFFLE( 2, 2, 2, 2 ) );
- __m256i sr = _mm256_broadcastw_epi16(t5);
- __m256i sg = _mm256_broadcastw_epi16(t6);
- __m256i sb = _mm256_broadcastw_epi16(t7);
+ __m256i sr = _mm256_broadcastw_epi16( t5 );
+ __m256i sg = _mm256_broadcastw_epi16( t6 );
+ __m256i sb = _mm256_broadcastw_epi16( t7 );
- __m256i r08 = _mm256_cvtepu8_epi16(r8);
- __m256i g08 = _mm256_cvtepu8_epi16(g8);
- __m256i b08 = _mm256_cvtepu8_epi16(b8);
+ __m256i r08 = _mm256_cvtepu8_epi16( ch.r8 );
+ __m256i g08 = _mm256_cvtepu8_epi16( ch.g8 );
+ __m256i b08 = _mm256_cvtepu8_epi16( ch.b8 );
- __m256i r16 = _mm256_slli_epi16(r08, 4);
- __m256i g16 = _mm256_slli_epi16(g08, 4);
- __m256i b16 = _mm256_slli_epi16(b08, 4);
+ __m256i r16 = _mm256_slli_epi16( r08, 4 );
+ __m256i g16 = _mm256_slli_epi16( g08, 4 );
+ __m256i b16 = _mm256_slli_epi16( b08, 4 );
- __m256i difR0 = _mm256_sub_epi16(r16, sr);
- __m256i difG0 = _mm256_sub_epi16(g16, sg);
- __m256i difB0 = _mm256_sub_epi16(b16, sb);
+ __m256i difR0 = _mm256_sub_epi16( r16, sr );
+ __m256i difG0 = _mm256_sub_epi16( g16, sg );
+ __m256i difB0 = _mm256_sub_epi16( b16, sb );
- __m256i difRyz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
- __m256i difGyz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
- __m256i difByz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
+ __m256i difRyz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
+ __m256i difGyz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
+ __m256i difByz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255 ) );
- __m256i difRxz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
- __m256i difGxz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
- __m256i difBxz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
+ __m256i difRxz = _mm256_madd_epi16( difR0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
+ __m256i difGxz = _mm256_madd_epi16( difG0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
+ __m256i difBxz = _mm256_madd_epi16( difB0, _mm256_set_epi16( 255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255 ) );
- __m256i difRGyz = _mm256_hadd_epi32(difRyz, difGyz);
- __m256i difByzxz = _mm256_hadd_epi32(difByz, difBxz);
+ __m256i difRGyz = _mm256_hadd_epi32( difRyz, difGyz );
+ __m256i difByzxz = _mm256_hadd_epi32( difByz, difBxz );
- __m256i difRGxz = _mm256_hadd_epi32(difRxz, difGxz);
+ __m256i difRGxz = _mm256_hadd_epi32( difRxz, difGxz );
- __m128i sumRGyz = _mm_add_epi32(_mm256_castsi256_si128(difRGyz), _mm256_extracti128_si256(difRGyz, 1));
- __m128i sumByzxz = _mm_add_epi32(_mm256_castsi256_si128(difByzxz), _mm256_extracti128_si256(difByzxz, 1));
- __m128i sumRGxz = _mm_add_epi32(_mm256_castsi256_si128(difRGxz), _mm256_extracti128_si256(difRGxz, 1));
+ __m128i sumRGyz = _mm_add_epi32( _mm256_castsi256_si128( difRGyz ), _mm256_extracti128_si256( difRGyz, 1 ) );
+ __m128i sumByzxz = _mm_add_epi32( _mm256_castsi256_si128( difByzxz ), _mm256_extracti128_si256( difByzxz, 1 ) );
+ __m128i sumRGxz = _mm_add_epi32( _mm256_castsi256_si128( difRGxz ), _mm256_extracti128_si256( difRGxz, 1 ) );
- __m128i sumRGByz = _mm_hadd_epi32(sumRGyz, sumByzxz);
- __m128i sumRGByzxz = _mm_hadd_epi32(sumRGxz, sumByzxz);
+ __m128i sumRGByz = _mm_hadd_epi32( sumRGyz, sumByzxz );
+ __m128i sumRGByzxz = _mm_hadd_epi32( sumRGxz, sumByzxz );
- __m128i sumRGBxz = _mm_shuffle_epi32(sumRGByzxz, _MM_SHUFFLE(2, 3, 1, 0));
+ __m128i sumRGBxz = _mm_shuffle_epi32( sumRGByzxz, _MM_SHUFFLE( 2, 3, 1, 0 ) );
- __m128 sumRGByzf = _mm_cvtepi32_ps(sumRGByz);
- __m128 sumRGBxzf = _mm_cvtepi32_ps(sumRGBxz);
+ __m128 sumRGByzf = _mm_cvtepi32_ps( sumRGByz );
+ __m128 sumRGBxzf = _mm_cvtepi32_ps( sumRGBxz );
- const float value = (255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f;
+ const float value = ( 255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f;
- __m128 scale = _mm_set1_ps(-4.0f / value);
+ __m128 scale = _mm_set1_ps( -4.0f / value );
- __m128 af = _mm_mul_ps(sumRGBxzf, scale);
- __m128 bf = _mm_mul_ps(sumRGByzf, scale);
+ __m128 af = _mm_mul_ps( sumRGBxzf, scale );
+ __m128 bf = _mm_mul_ps( sumRGByzf, scale );
- __m128 df = _mm_mul_ps(_mm_cvtepi32_ps(t5), _mm_set1_ps(4.0f / 16.0f));
+ __m128 df = _mm_mul_ps( _mm_cvtepi32_ps( t5 ), _mm_set1_ps( 4.0f / 16.0f ) );
// calculating the three colors RGBO, RGBH, and RGBV. RGB = df - af * x - bf * y;
- __m128 cof0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
- __m128 chf0 = _mm_fnmadd_ps(af, _mm_set1_ps( 425.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
- __m128 cvf0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps( 425.0f), df));
+ __m128 cof0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
+ __m128 chf0 = _mm_fnmadd_ps( af, _mm_set1_ps( 425.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( -255.0f ), df ) );
+ __m128 cvf0 = _mm_fnmadd_ps( af, _mm_set1_ps( -255.0f ), _mm_fnmadd_ps( bf, _mm_set1_ps( 425.0f ), df ) );
// convert to r6g7b6
- __m128i cohv = r6g7b6_AVX2(cof0, chf0, cvf0);
+ __m128i cohv = r6g7b6_AVX2( cof0, chf0, cvf0 );
- uint64_t rgbho = _mm_extract_epi64(cohv, 0);
- uint32_t rgbv0 = _mm_extract_epi32(cohv, 2);
+ uint64_t rgbho = _mm_extract_epi64( cohv, 0 );
+ uint32_t rgbv0 = _mm_extract_epi32( cohv, 2 );
// Error calculation
uint64_t error = 0;
- if( mode != ModePlanar )
+ if( !useHeuristics )
{
auto ro0 = ( rgbho >> 48 ) & 0x3F;
auto go0 = ( rgbho >> 40 ) & 0x7F;
@@ -820,7 +1006,15 @@ static etcpak_force_inline Plane Planar_AVX2( const uint8_t* src, const uint8_t
Plane plane;
plane.plane = result;
- plane.error = error;
+ if( useHeuristics )
+ {
+ plane.error = 0;
+ mode = ModePlanar;
+ }
+ else
+ {
+ plane.error = error;
+ }
plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1));
return plane;
@@ -1570,7 +1764,7 @@ static etcpak_force_inline uint8_t convert7(float f)
return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
}
-static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode )
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t* src, const uint8_t mode, bool useHeuristics )
{
int32_t r = 0;
int32_t g = 0;
@@ -1645,7 +1839,7 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar( const uint8_t*
// Error calculation
uint64_t error = 0;
- if( ModePlanar != mode )
+ if( ModePlanar != mode && useHeuristics )
{
auto ro0 = coR;
auto go0 = coG;
@@ -1756,7 +1950,7 @@ static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src )
uint16x4_t accu2 = vpadd_u16( accu4, accu4 );
uint16x4_t accu1 = vpadd_u16( accu2, accu2 );
return vreinterpretq_s16_u16( vcombine_u16( accu1, accu1 ) );
-#else
+#else
return vdupq_n_s16( vaddvq_u16( accu8 ) );
#endif
}
@@ -1783,7 +1977,7 @@ static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x )
return vshr_n_s16( vsub_s16( vsub_s16( p9, vshr_n_s16( p9, 8 ) ), vshr_n_s16( p6, 8 ) ), 2 );
}
-static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode )
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src, const uint8_t mode, bool useHeuristics )
{
uint8x16x4_t srcBlock = vld4q_u8( src );
@@ -1828,7 +2022,7 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint
int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) );
uint64_t error = 0;
- if( mode != ModePlanar )
+ if( mode != ModePlanar && useHeuristics )
{
int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 );
@@ -1924,6 +2118,376 @@ static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint
#endif
+#ifdef __AVX2__
+uint32_t calculateErrorTH( bool tMode, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist, __m128i r8, __m128i g8, __m128i b8 )
+#else
+uint32_t calculateErrorTH( bool tMode, uint8_t* src, uint8_t( colorsRGB444 )[2][3], uint8_t& dist, uint32_t& pixIndices, uint8_t startDist )
+#endif
+{
+ uint32_t blockErr = 0, bestBlockErr = MaxError;
+
+ uint32_t pixColors;
+ uint8_t possibleColors[4][3];
+ uint8_t colors[2][3];
+
+ decompressColor( colorsRGB444, colors );
+
+#ifdef __AVX2__
+ __m128i reverseMask = _mm_set_epi8( 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 );
+#endif
+
+ // test distances
+ for( uint8_t d = startDist; d < 8; ++d )
+ {
+ if( d >= 2 && dist == d - 2 ) break;
+
+ blockErr = 0;
+ pixColors = 0;
+
+ if( tMode )
+ {
+ calculatePaintColors59T( d, colors, possibleColors );
+ }
+ else
+ {
+ calculatePaintColors58H( d, colors, possibleColors );
+ }
+
+#ifdef __AVX2__
+ // RGB ordering
+ __m128i b8Rev = _mm_shuffle_epi8( b8, reverseMask );
+ __m128i g8Rev = _mm_shuffle_epi8( g8, reverseMask );
+ __m128i r8Rev = _mm_shuffle_epi8( r8, reverseMask );
+
+ // extends 3x128 bits RGB into 3x256 bits RGB for error comparisions
+ static const __m128i zero = _mm_setzero_si128();
+ __m128i b8Lo = _mm_unpacklo_epi8( b8Rev, zero );
+ __m128i g8Lo = _mm_unpacklo_epi8( g8Rev, zero );
+ __m128i r8Lo = _mm_unpacklo_epi8( r8Rev, zero );
+ __m128i b8Hi = _mm_unpackhi_epi8( b8Rev, zero );
+ __m128i g8Hi = _mm_unpackhi_epi8( g8Rev, zero );
+ __m128i r8Hi = _mm_unpackhi_epi8( r8Rev, zero );
+
+ __m256i b8 = _mm256_set_m128i( b8Hi, b8Lo );
+ __m256i g8 = _mm256_set_m128i( g8Hi, g8Lo );
+ __m256i r8 = _mm256_set_m128i( r8Hi, r8Lo );
+
+ // caculates differences between the pixel colrs and the palette colors
+ __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[0][B] ) ) );
+ __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[0][G] ) ) );
+ __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[0][R] ) ) );
+
+ // luma-based error calculations
+ static const __m256i bWeight = _mm256_set1_epi16( 14 );
+ static const __m256i gWeight = _mm256_set1_epi16( 76 );
+ static const __m256i rWeight = _mm256_set1_epi16( 38 );
+
+ diffb = _mm256_mullo_epi16( diffb, bWeight );
+ diffg = _mm256_mullo_epi16( diffg, gWeight );
+ diffr = _mm256_mullo_epi16( diffr, rWeight );
+
+ // obtains the error with the current palette color
+ __m256i lowestPixErr = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
+
+ // error calucations with the remaining three palette colors
+ static const uint32_t masks[4] = { 0, 0x55555555, 0xAAAAAAAA, 0xFFFFFFFF };
+ for( uint8_t c = 1; c < 4; c++ )
+ {
+ __m256i diffb = _mm256_abs_epi16( _mm256_sub_epi16( b8, _mm256_set1_epi16( possibleColors[c][B] ) ) );
+ __m256i diffg = _mm256_abs_epi16( _mm256_sub_epi16( g8, _mm256_set1_epi16( possibleColors[c][G] ) ) );
+ __m256i diffr = _mm256_abs_epi16( _mm256_sub_epi16( r8, _mm256_set1_epi16( possibleColors[c][R] ) ) );
+
+ diffb = _mm256_mullo_epi16( diffb, bWeight );
+ diffg = _mm256_mullo_epi16( diffg, gWeight );
+ diffr = _mm256_mullo_epi16( diffr, rWeight );
+
+ // error comparison with the previous best color
+ __m256i pixErrors = _mm256_add_epi16( _mm256_add_epi16( diffb, diffg ), diffr );
+ __m256i minErr = _mm256_min_epu16( lowestPixErr, pixErrors );
+ __m256i cmpRes = _mm256_cmpeq_epi16( pixErrors, minErr );
+ lowestPixErr = minErr;
+
+ // update pixel colors
+ uint32_t updPixColors = _mm256_movemask_epi8( cmpRes );
+ uint32_t prevPixColors = pixColors & ~updPixColors;
+ uint32_t mskPixColors = masks[c] & updPixColors;
+ pixColors = prevPixColors | mskPixColors;
+ }
+
+ // accumulate the block error
+ alignas( 32 ) uint16_t pixErr16[16] = { 0, };
+ _mm256_storeu_si256( (__m256i*)pixErr16, lowestPixErr );
+ for( uint8_t p = 0; p < 16; p++ )
+ {
+ blockErr += (int)( pixErr16[p] ) * pixErr16[p];
+ }
+#else
+ for( size_t y = 0; y < 4; ++y )
+ {
+ for( size_t x = 0; x < 4; ++x )
+ {
+ uint32_t bestPixErr = MaxError;
+ pixColors <<= 2; // Make room for next value
+
+ // Loop possible block colors
+ for( uint8_t c = 0; c < 4; ++c )
+ {
+ int diff[3];
+ diff[R] = src[4 * ( x * 4 + y ) + R] - possibleColors[c][R];
+ diff[G] = src[4 * ( x * 4 + y ) + G] - possibleColors[c][G];
+ diff[B] = src[4 * ( x * 4 + y ) + B] - possibleColors[c][B];
+
+ const uint32_t err = 38 * abs( diff[R] ) + 76 * abs( diff[G] ) + 14 * abs( diff[B] );
+ uint32_t pixErr = err * err;
+
+ // Choose best error
+ if( pixErr < bestPixErr )
+ {
+ bestPixErr = pixErr;
+ pixColors ^= ( pixColors & 3 ); // Reset the two first bits
+ pixColors |= c;
+ }
+ }
+ blockErr += bestPixErr;
+ }
+ }
+#endif
+
+ if( blockErr < bestBlockErr )
+ {
+ bestBlockErr = blockErr;
+ dist = d;
+ pixIndices = pixColors;
+ }
+ }
+
+ return bestBlockErr;
+}
+
+
+// main T-/H-mode compression function
+#ifdef __AVX2__
+uint32_t compressBlockTH( uint8_t* src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool& tMode, __m128i r8, __m128i g8, __m128i b8 )
+#else
+uint32_t compressBlockTH( uint8_t *src, Luma& l, uint32_t& compressed1, uint32_t& compressed2, bool &tMode )
+#endif
+{
+#ifdef __AVX2__
+ alignas( 8 ) uint8_t luma[16] = { 0, };
+ _mm_storeu_si128 ( (__m128i* )luma, l.luma8 );
+#elif defined __ARM_NEON && defined __aarch64__
+ alignas( 8 ) uint8_t luma[16] = { 0 };
+ vst1q_u8( luma, l.luma8 );
+#else
+ uint8_t* luma = l.val;
+#endif
+
+ uint8_t pixIdx[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+ // 1) sorts the pairs of (luma, pix_idx)
+ insertionSort( luma, pixIdx );
+
+ // 2) finds the min (left+right)
+ uint8_t minSumRangeIdx = 0;
+ uint16_t minSumRangeValue;
+ uint16_t sum;
+ static const uint8_t diffBonus[15] = {8, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 8};
+ const int16_t temp = luma[15] - luma[0];
+
+ minSumRangeValue = luma[15] - luma[1] + diffBonus[0];
+ for( uint8_t i = 1; i < 14; i++ )
+ {
+ sum = temp - luma[i+1] + luma[i] + diffBonus[i];
+ if( minSumRangeValue > sum )
+ {
+ minSumRangeValue = sum;
+ minSumRangeIdx = i;
+ }
+ }
+
+ sum = luma[14] - luma[0] + diffBonus[14];
+ if( minSumRangeValue > sum )
+ {
+ minSumRangeValue = sum;
+ minSumRangeIdx = 14;
+ }
+ uint8_t lRange, rRange;
+
+ lRange = luma[minSumRangeIdx] - luma[0];
+ rRange = luma[15] - luma[minSumRangeIdx + 1];
+
+ // 3) sets a proper mode
+ bool swap = false;
+ if( lRange >= rRange )
+ {
+ if( lRange >= rRange * 2 )
+ {
+ swap = true;
+ tMode = true;
+ }
+ }
+ else
+ {
+ if( lRange * 2 <= rRange ) tMode = true;
+ }
+ // 4) calculates the two base colors
+ uint8_t rangeIdx[4] = { pixIdx[0], pixIdx[minSumRangeIdx], pixIdx[minSumRangeIdx + 1], pixIdx[15] };
+
+ uint16_t r[4], g[4], b[4];
+ for( uint8_t i = 0; i < 4; ++i )
+ {
+ uint8_t idx = rangeIdx[i] * 4;
+ b[i] = src[idx];
+ g[i] = src[idx + 1];
+ r[i] = src[idx + 2];
+ }
+
+ uint8_t mid_rgb[2][3];
+ if( swap )
+ {
+ mid_rgb[1][B] = ( b[0] + b[1] ) / 2;
+ mid_rgb[1][G] = ( g[0] + g[1] ) / 2;
+ mid_rgb[1][R] = ( r[0] + r[1] ) / 2;
+
+ uint16_t sum_rgb[3] = { 0, 0, 0 };
+ for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
+ {
+ uint8_t idx = pixIdx[i] * 4;
+ sum_rgb[B] += src[idx];
+ sum_rgb[G] += src[idx + 1];
+ sum_rgb[R] += src[idx + 2];
+ }
+ const uint8_t temp = 15 - minSumRangeIdx;
+ mid_rgb[0][B] = sum_rgb[B] / temp;
+ mid_rgb[0][G] = sum_rgb[G] / temp;
+ mid_rgb[0][R] = sum_rgb[R] / temp;
+ }
+ else
+ {
+ mid_rgb[0][B] = (b[0] + b[1]) / 2;
+ mid_rgb[0][G] = (g[0] + g[1]) / 2;
+ mid_rgb[0][R] = (r[0] + r[1]) / 2;
+ if( tMode )
+ {
+ uint16_t sum_rgb[3] = { 0, 0, 0 };
+ for( uint8_t i = minSumRangeIdx + 1; i < 16; i++ )
+ {
+ uint8_t idx = pixIdx[i] * 4;
+ sum_rgb[B] += src[idx];
+ sum_rgb[G] += src[idx + 1];
+ sum_rgb[R] += src[idx + 2];
+ }
+ const uint8_t temp = 15 - minSumRangeIdx;
+ mid_rgb[1][B] = sum_rgb[B] / temp;
+ mid_rgb[1][G] = sum_rgb[G] / temp;
+ mid_rgb[1][R] = sum_rgb[R] / temp;
+ }
+ else
+ {
+ mid_rgb[1][B] = (b[2] + b[3]) / 2;
+ mid_rgb[1][G] = (g[2] + g[3]) / 2;
+ mid_rgb[1][R] = (r[2] + r[3]) / 2;
+ }
+ }
+
+ // 5) sets the start distance index
+ uint32_t startDistCandidate;
+ uint32_t avgDist;
+ if( tMode )
+ {
+ if( swap )
+ {
+ avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] ) / 6;
+ }
+ else
+ {
+ avgDist = ( b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 6;
+ }
+ }
+ else
+ {
+ avgDist = ( b[1] - b[0] + g[1] - g[0] + r[1] - r[0] + b[3] - b[2] + g[3] - g[2] + r[3] - r[2] ) / 12;
+ }
+
+ if( avgDist <= 16)
+ {
+ startDistCandidate = 0;
+ }
+ else if( avgDist <= 23 )
+ {
+ startDistCandidate = 1;
+ }
+ else if( avgDist <= 32 )
+ {
+ startDistCandidate = 2;
+ }
+ else if( avgDist <= 41 )
+ {
+ startDistCandidate = 3;
+ }
+ else
+ {
+ startDistCandidate = 4;
+ }
+
+ uint32_t bestErr = MaxError;
+ uint32_t bestPixIndices;
+ uint8_t bestDist = 10;
+ uint8_t colorsRGB444[2][3];
+ compressColor( mid_rgb, colorsRGB444, tMode );
+ compressed1 = 0;
+
+ // 6) finds the best candidate with the lowest error
+#ifdef __AVX2__
+ // Vectorized ver
+ bestErr = calculateErrorTH( tMode, colorsRGB444, bestDist, bestPixIndices, startDistCandidate, r8, g8, b8 );
+#else
+ // Scalar ver
+ bestErr = calculateErrorTH( tMode, src, colorsRGB444, bestDist, bestPixIndices, startDistCandidate );
+#endif
+
+ // 7) outputs the final T or H block
+ if( tMode )
+ {
+ // Put the compress params into the compression block
+ compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 23;
+ compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 19;
+ compressed1 |= ( colorsRGB444[0][B] ) << 15;
+ compressed1 |= ( colorsRGB444[1][R] ) << 11;
+ compressed1 |= ( colorsRGB444[1][G] ) << 7;
+ compressed1 |= ( colorsRGB444[1][B] ) << 3;
+ compressed1 |= bestDist & 0x7;
+ }
+ else
+ {
+ int bestRGB444ColPacked[2];
+ bestRGB444ColPacked[0] = (colorsRGB444[0][R] << 8) + (colorsRGB444[0][G] << 4) + colorsRGB444[0][B];
+ bestRGB444ColPacked[1] = (colorsRGB444[1][R] << 8) + (colorsRGB444[1][G] << 4) + colorsRGB444[1][B];
+ if( ( bestRGB444ColPacked[0] >= bestRGB444ColPacked[1] ) ^ ( ( bestDist & 1 ) == 1 ) )
+ {
+ swapColors( colorsRGB444 );
+ // Reshuffle pixel indices to to exchange C1 with C3, and C2 with C4
+ bestPixIndices = ( 0x55555555 & bestPixIndices ) | ( 0xaaaaaaaa & ( ~bestPixIndices ) );
+ }
+
+ // Put the compress params into the compression block
+ compressed1 |= ( colorsRGB444[0][R] & 0xf ) << 22;
+ compressed1 |= ( colorsRGB444[0][G] & 0xf ) << 18;
+ compressed1 |= ( colorsRGB444[0][B] & 0xf ) << 14;
+ compressed1 |= ( colorsRGB444[1][R] & 0xf ) << 10;
+ compressed1 |= ( colorsRGB444[1][G] & 0xf ) << 6;
+ compressed1 |= ( colorsRGB444[1][B] & 0xf ) << 2;
+ compressed1 |= ( bestDist >> 1 ) & 0x3;
+ }
+
+ bestPixIndices = indexConversion( bestPixIndices );
+ compressed2 = 0;
+ compressed2 = ( compressed2 & ~( ( 0x2 << 31 ) - 1 ) ) | ( bestPixIndices & ( ( 2 << 31 ) - 1 ) );
+
+ return bestErr;
+}
+//#endif
+
template<class T, class S>
static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error)
{
@@ -2025,7 +2589,7 @@ static inline int16_t hMax( __m128i buffer, uint8_t& idx )
return result;
}
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
static inline int16_t hMax( uint8x16_t buffer, uint8_t& idx )
{
const uint8_t max = vmaxvq_u8( buffer );
@@ -2072,7 +2636,7 @@ static inline int16_t hMin( __m128i buffer, uint8_t& idx )
idx = _tzcnt_u32( _mm_movemask_epi8( mask ) );
return result;
}
-#elif defined __ARM_NEON
+#elif defined __ARM_NEON && defined __aarch64__
static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
{
const uint8_t min = vminvq_u8( buffer );
@@ -2109,8 +2673,153 @@ static inline int16_t hMin( uint8x16_t buffer, uint8_t& idx )
}
#endif
-static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
+// During search it is not convenient to store the bits the way they are stored in the
+// file format. Hence, after search, it is converted to this format.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static inline void stuff59bits( unsigned int thumbT59W1, unsigned int thumbT59W2, unsigned int& thumbTW1, unsigned int& thumbTW2 )
{
+ // Put bits in twotimer configuration for 59 (red overflows)
+ //
+ // Go from this bit layout:
+ //
+ // |63 62 61 60 59|58 57 56 55|54 53 52 51|50 49 48 47|46 45 44 43|42 41 40 39|38 37 36 35|34 33 32|
+ // |----empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|--dist--|
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |----------------------------------------index bits---------------------------------------------|
+ //
+ //
+ // To this:
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // |// // //|R0a |//|R0b |G0 |B0 |R1 |G1 |B1 |da |df|db|
+ // -----------------------------------------------------------------------------------------------
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |----------------------------------------index bits---------------------------------------------|
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
+ // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
+ // ------------------------------------------------------------------------------------------------
+
+ uint8_t R0a;
+ uint8_t bit, a, b, c, d, bits;
+
+ R0a = ( thumbT59W1 >> 25 ) & 0x3;
+
+ // Fix middle part
+ thumbTW1 = thumbT59W1 << 1;
+ // Fix R0a (top two bits of R0)
+ thumbTW1 = ( thumbTW1 & ~( 0x3 << 27 ) ) | ( ( R0a & 0x3 ) << 27 );
+ // Fix db (lowest bit of d)
+ thumbTW1 = ( thumbTW1 & ~0x1 ) | ( thumbT59W1 & 0x1 );
+
+ // Make sure that red overflows:
+ a = ( thumbTW1 >> 28 ) & 0x1;
+ b = ( thumbTW1 >> 27 ) & 0x1;
+ c = ( thumbTW1 >> 25 ) & 0x1;
+ d = ( thumbTW1 >> 24 ) & 0x1;
+
+ // The following bit abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
+ // The following logical expression checks for the presence of any of those:
+ bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
+ bits = 0xf * bit;
+ thumbTW1 = ( thumbTW1 & ~( 0x7 << 29 ) ) | ( bits & 0x7 ) << 29;
+ thumbTW1 = ( thumbTW1 & ~( 0x1 << 26 ) ) | ( !bit & 0x1 ) << 26;
+
+ // Set diffbit
+ thumbTW1 = ( thumbTW1 & ~0x2 ) | 0x2;
+ thumbTW2 = thumbT59W2;
+}
+
+// During search it is not convenient to store the bits the way they are stored in the
+// file format. Hence, after search, it is converted to this format.
+// NO WARRANTY --- SEE STATEMENT IN TOP OF FILE (C) Ericsson AB 2005-2013. All Rights Reserved.
+static inline void stuff58bits( unsigned int thumbH58W1, unsigned int thumbH58W2, unsigned int& thumbHW1, unsigned int& thumbHW2 )
+{
+ // Put bits in twotimer configuration for 58 (red doesn't overflow, green does)
+ //
+ // Go from this bit layout:
+ //
+ //
+ // |63 62 61 60 59 58|57 56 55 54|53 52 51 50|49 48 47 46|45 44 43 42|41 40 39 38|37 36 35 34|33 32|
+ // |-------empty-----|---red 0---|--green 0--|--blue 0---|---red 1---|--green 1--|--blue 1---|d2 d1|
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |---------------------------------------index bits----------------------------------------------|
+ //
+ // To this:
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // |//|R0 |G0 |// // //|G0|B0|//|B0b |R1 |G1 |B0 |d2|df|d1|
+ // -----------------------------------------------------------------------------------------------
+ //
+ // |31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00|
+ // |---------------------------------------index bits----------------------------------------------|
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // -----------------------------------------------------------------------------------------------
+ // | base col1 | dcol 2 | base col1 | dcol 2 | base col 1 | dcol 2 | table | table |df|fp|
+ // | R1' (5 bits) | dR2 | G1' (5 bits) | dG2 | B1' (5 bits) | dB2 | cw 1 | cw 2 |bt|bt|
+ // -----------------------------------------------------------------------------------------------
+ //
+ //
+ // Thus, what we are really doing is going from this bit layout:
+ //
+ //
+ // |63 62 61 60 59 58|57 56 55 54 53 52 51|50 49|48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33|32 |
+ // |-------empty-----|part0---------------|part1|part2------------------------------------------|part3|
+ //
+ // To this:
+ //
+ // 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+ // --------------------------------------------------------------------------------------------------|
+ // |//|part0 |// // //|part1|//|part2 |df|part3|
+ // --------------------------------------------------------------------------------------------------|
+
+ unsigned int part0, part1, part2, part3;
+ uint8_t bit, a, b, c, d, bits;
+
+ // move parts
+ part0 = ( thumbH58W1 >> 19 ) & 0x7f;
+ part1 = ( thumbH58W1 >> 17 ) & 0x3;
+ part2 = ( thumbH58W1 >> 1 ) & 0xffff;
+ part3 = thumbH58W1 & 0x1;
+ thumbHW1 = 0;
+ thumbHW1 = ( thumbHW1 & ~( 0x7f << 24 ) ) | ( ( part0 & 0x7f ) << 24 );
+ thumbHW1 = ( thumbHW1 & ~( 0x3 << 19 ) ) | ( ( part1 & 0x3 ) << 19 );
+ thumbHW1 = ( thumbHW1 & ~( 0xffff << 2 ) ) | ( ( part2 & 0xffff ) << 2 );
+ thumbHW1 = ( thumbHW1 & ~0x1 ) | ( part3 & 0x1 );
+
+ // Make sure that red does not overflow:
+ bit = ( thumbHW1 >> 30 ) & 0x1;
+ thumbHW1 = ( thumbHW1 & ~( 0x1 << 31 ) ) | ( ( !bit & 0x1 ) << 31 );
+
+ // Make sure that green overflows:
+ a = ( thumbHW1 >> 20 ) & 0x1;
+ b = ( thumbHW1 >> 19 ) & 0x1;
+ c = ( thumbHW1 >> 17 ) & 0x1;
+ d = ( thumbHW1 >> 16 ) & 0x1;
+ // The following bit abcd bit sequences should be padded with ones: 0111, 1010, 1011, 1101, 1110, 1111
+ // The following logical expression checks for the presence of any of those:
+ bit = ( a & c ) | ( !a & b & c & d ) | ( a & b & !c & d );
+ bits = 0xf * bit;
+ thumbHW1 = ( thumbHW1 & ~( 0x7 << 21 ) ) | ( ( bits & 0x7 ) << 21 );
+ thumbHW1 = ( thumbHW1 & ~( 0x1 << 18 ) ) | ( ( !bit & 0x1 ) << 18 );
+
+ // Set diffbit
+ thumbHW1 = ( thumbHW1 & ~0x2 ) | 0x2;
+ thumbHW2 = thumbH58W2;
+}
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+static etcpak_force_inline Channels GetChannels( const uint8_t* src )
+{
+ Channels ch;
#ifdef __AVX2__
__m128i d0 = _mm_loadu_si128( ( (__m128i*)src ) + 0 );
__m128i d1 = _mm_loadu_si128( ( (__m128i*)src ) + 1 );
@@ -2128,30 +2837,10 @@ static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
__m128i b1 = _mm_unpackhi_epi32( rgb2, rgb3 );
// swap channels
- __m128i b8 = _mm_unpacklo_epi64( rg0, rg1 );
- __m128i g8 = _mm_unpackhi_epi64( rg0, rg1 );
- __m128i r8 = _mm_unpacklo_epi64( b0, b1 );
-
- __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( b8 ), _mm256_set1_epi16( 14 ) );
- __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( g8 ), _mm256_set1_epi16( 76 ) );
- __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( r8 ), _mm256_set1_epi16( 38 ) );
-
- __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
- __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
- __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
- __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
-
- static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
- static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
- __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
- __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
- __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
- luma.luma8 = luma_8bit;
-
- // min/max calculation
- luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
- luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
-#elif defined __ARM_NEON
+ ch.b8 = _mm_unpacklo_epi64( rg0, rg1 );
+ ch.g8 = _mm_unpackhi_epi64( rg0, rg1 );
+ ch.r8 = _mm_unpacklo_epi64( b0, b1 );
+#elif defined __ARM_NEON && defined __aarch64__
//load pixel data into 4 rows
uint8x16_t px0 = vld1q_u8( src + 0 );
uint8x16_t px1 = vld1q_u8( src + 16 );
@@ -2172,12 +2861,48 @@ static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
uint8x16x2_t red = vzipq_u8( rr, uint8x16_t() );
uint8x16x2_t grn = vzipq_u8( gg, uint8x16_t() );
uint8x16x2_t blu = vzipq_u8( bb, uint8x16_t() );
- uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( red.val[0] ), 14 );
- uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( red.val[1] ), 14 );
- uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( grn.val[0] ), 76 );
- uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( grn.val[1] ), 76 );
- uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( blu.val[0] ), 38 );
- uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( blu.val[1] ), 38 );
+ ch.r = red;
+ ch.b = blu;
+ ch.g = grn;
+#endif
+ return ch;
+}
+#endif
+
+#if defined __AVX2__ || (defined __ARM_NEON && defined __aarch64__)
+static etcpak_force_inline void CalculateLuma( Channels& ch, Luma& luma )
+#else
+static etcpak_force_inline void CalculateLuma( const uint8_t* src, Luma& luma )
+#endif
+{
+#ifdef __AVX2__
+ __m256i b16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.b8 ), _mm256_set1_epi16( 14 ) );
+ __m256i g16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.g8 ), _mm256_set1_epi16( 76 ) );
+ __m256i r16_luma = _mm256_mullo_epi16( _mm256_cvtepu8_epi16( ch.r8 ), _mm256_set1_epi16( 38 ) );
+
+ __m256i luma_16bit = _mm256_add_epi16( _mm256_add_epi16( g16_luma, r16_luma ), b16_luma );
+ __m256i luma_8bit_m256i = _mm256_srli_epi16( luma_16bit, 7 );
+ __m128i luma_8bit_lo = _mm256_extractf128_si256( luma_8bit_m256i, 0 );
+ __m128i luma_8bit_hi = _mm256_extractf128_si256( luma_8bit_m256i, 1 );
+
+ static const __m128i interleaving_mask_lo = _mm_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0 );
+ static const __m128i interleaving_mask_hi = _mm_set_epi8( 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1 );
+ __m128i luma_8bit_lo_moved = _mm_shuffle_epi8( luma_8bit_lo, interleaving_mask_lo );
+ __m128i luma_8bit_hi_moved = _mm_shuffle_epi8( luma_8bit_hi, interleaving_mask_hi );
+ __m128i luma_8bit = _mm_or_si128( luma_8bit_hi_moved, luma_8bit_lo_moved );
+ luma.luma8 = luma_8bit;
+
+ // min/max calculation
+ luma.min = hMin( luma_8bit, luma.minIdx ) * 0.00392156f;
+ luma.max = hMax( luma_8bit, luma.maxIdx ) * 0.00392156f;
+#elif defined __ARM_NEON && defined __aarch64__
+ //load pixel data into 4 rows
+ uint16x8_t red0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[0] ), 14 );
+ uint16x8_t red1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.r.val[1] ), 14 );
+ uint16x8_t grn0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[0] ), 76 );
+ uint16x8_t grn1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.g.val[1] ), 76 );
+ uint16x8_t blu0 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[0] ), 38 );
+ uint16x8_t blu1 = vmulq_n_u16( vreinterpretq_u16_u8( ch.b.val[1] ), 38 );
//calculate luma for rows 0,1 and 2,3
uint16x8_t lum_r01 = vaddq_u16( vaddq_u16( red0, grn0 ), blu0 );
@@ -2253,7 +2978,7 @@ static etcpak_force_inline uint8_t SelectModeETC2( const Luma& luma )
{
return ModeTH;
}
- return 0;
+ return ModeUndecided;
}
static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool useHeuristics )
@@ -2267,33 +2992,33 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
#endif
uint8_t mode = ModeUndecided;
+ Luma luma;
+#ifdef __AVX2__
+ Channels ch = GetChannels( src );
if( useHeuristics )
{
- Luma luma;
- CalculateLuma( src, luma );
+ CalculateLuma( ch, luma );
mode = SelectModeETC2( luma );
}
-#ifdef __AVX2__
- auto plane = Planar_AVX2( src, mode );
+ auto plane = Planar_AVX2( ch, mode, useHeuristics );
if( useHeuristics && mode == ModePlanar ) return plane.plane;
- alignas(32) v4i a[8];
-
+ alignas( 32 ) v4i a[8];
__m128i err0 = PrepareAverages_AVX2( a, plane.sum4 );
// Get index of minimum error (err0)
- __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
+ __m128i err1 = _mm_shuffle_epi32( err0, _MM_SHUFFLE( 2, 3, 0, 1 ) );
__m128i errMin0 = _mm_min_epu32(err0, err1);
- __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
- __m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
+ __m128i errMin1 = _mm_shuffle_epi32( errMin0, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+ __m128i errMin2 = _mm_min_epu32( errMin1, errMin0 );
- __m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
+ __m128i errMask = _mm_cmpeq_epi32( errMin2, err0 );
- uint32_t mask = _mm_movemask_epi8(errMask);
+ uint32_t mask = _mm_movemask_epi8( errMask );
- size_t idx = _bit_scan_forward(mask) >> 2;
+ size_t idx = _bit_scan_forward( mask ) >> 2;
d = EncodeAverages_AVX2( a, idx );
@@ -2309,12 +3034,54 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
}
- return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1, plane.plane, plane.error );
+ if( useHeuristics )
+ {
+ if( mode == ModeTH )
+ {
+ uint64_t result = 0;
+ uint64_t error = 0;
+ uint32_t compressed[4] = { 0, 0, 0, 0 };
+ bool tMode = false;
+
+ error = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode, ch.r8, ch.g8, ch.b8 );
+ if( tMode )
+ {
+ stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+ else
+ {
+ stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+
+ result = (uint32_t)_bswap( compressed[2] );
+ result |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
+
+ plane.plane = result;
+ plane.error = error;
+ }
+ else
+ {
+ plane.plane = 0;
+ plane.error = MaxError;
+ }
+ }
+
+ return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, plane.plane, plane.error );
#else
+ if( useHeuristics )
+ {
+#if defined __ARM_NEON && defined __aarch64__
+ Channels ch = GetChannels( src );
+ CalculateLuma( ch, luma );
+#else
+ CalculateLuma( src, luma );
+#endif
+ mode = SelectModeETC2( luma );
+ }
#ifdef __ARM_NEON
- auto result = Planar_NEON( src, mode );
+ auto result = Planar_NEON( src, mode, useHeuristics );
#else
- auto result = Planar( src, mode );
+ auto result = Planar( src, mode, useHeuristics );
#endif
if( result.second == 0 ) return result.first;
@@ -2333,6 +3100,33 @@ static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src, bool us
auto id = g_id[idx];
FindBestFit( terr, tsel, a, id, src );
+ if( useHeuristics )
+ {
+ if( mode == ModeTH )
+ {
+ uint32_t compressed[4] = { 0, 0, 0, 0 };
+ bool tMode = false;
+
+ result.second = compressBlockTH( (uint8_t*)src, luma, compressed[0], compressed[1], tMode );
+ if( tMode )
+ {
+ stuff59bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+ else
+ {
+ stuff58bits( compressed[0], compressed[1], compressed[2], compressed[3] );
+ }
+
+ result.first = (uint32_t)_bswap( compressed[2] );
+ result.first |= static_cast<uint64_t>( _bswap( compressed[3] ) ) << 32;
+ }
+ else
+ {
+ result.first = 0;
+ result.second = MaxError;
+ }
+ }
+
return EncodeSelectors( d, terr, tsel, id, result.first, result.second );
#endif
}
diff --git a/thirdparty/misc/polypartition.cpp b/thirdparty/misc/polypartition.cpp
index df144c57a6..a725125ed0 100644
--- a/thirdparty/misc/polypartition.cpp
+++ b/thirdparty/misc/polypartition.cpp
@@ -1357,12 +1357,12 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
// Note that while set doesn't actually have to be implemented as
// a tree, complexity requirements for operations are the same as
// for the balanced binary search tree.
- Set<ScanLineEdge> edgeTree;
+ RBSet<ScanLineEdge> edgeTree;
// Store iterators to the edge tree elements.
// This makes deleting existing edges much faster.
- Set<ScanLineEdge>::Element **edgeTreeIterators, *edgeIter;
- edgeTreeIterators = new Set<ScanLineEdge>::Element *[maxnumvertices];
- //Pair<Set<ScanLineEdge>::iterator, bool> edgeTreeRet;
+ RBSet<ScanLineEdge>::Element **edgeTreeIterators, *edgeIter;
+ edgeTreeIterators = new RBSet<ScanLineEdge>::Element *[maxnumvertices];
+ //Pair<RBSet<ScanLineEdge>::iterator, bool> edgeTreeRet;
for (i = 0; i < numvertices; i++) {
edgeTreeIterators[i] = nullptr;
}
@@ -1569,8 +1569,8 @@ int TPPLPartition::MonotonePartition(TPPLPolyList *inpolys, TPPLPolyList *monoto
// Adds a diagonal to the doubly-connected list of vertices.
void TPPLPartition::AddDiagonal(MonotoneVertex *vertices, long *numvertices, long index1, long index2,
- TPPLVertexType *vertextypes, Set<ScanLineEdge>::Element **edgeTreeIterators,
- Set<ScanLineEdge> *edgeTree, long *helpers) {
+ TPPLVertexType *vertextypes, RBSet<ScanLineEdge>::Element **edgeTreeIterators,
+ RBSet<ScanLineEdge> *edgeTree, long *helpers) {
long newindex1, newindex2;
newindex1 = *numvertices;
diff --git a/thirdparty/misc/polypartition.h b/thirdparty/misc/polypartition.h
index b2d905a3ef..fae7909079 100644
--- a/thirdparty/misc/polypartition.h
+++ b/thirdparty/misc/polypartition.h
@@ -26,7 +26,7 @@
#include "core/math/vector2.h"
#include "core/templates/list.h"
-#include "core/templates/set.h"
+#include "core/templates/rb_set.h"
typedef double tppl_float;
@@ -224,8 +224,8 @@ public:
// Helper functions for MonotonePartition.
bool Below(TPPLPoint &p1, TPPLPoint &p2);
void AddDiagonal(MonotoneVertex *vertices, long *numvertices, long index1, long index2,
- TPPLVertexType *vertextypes, Set<ScanLineEdge>::Element **edgeTreeIterators,
- Set<ScanLineEdge> *edgeTree, long *helpers);
+ TPPLVertexType *vertextypes, RBSet<ScanLineEdge>::Element **edgeTreeIterators,
+ RBSet<ScanLineEdge> *edgeTree, long *helpers);
// Triangulates a monotone polygon, used in Triangulate_MONO.
int TriangulateMonotone(TPPLPoly *inPoly, TPPLPolyList *triangles);
diff --git a/thirdparty/thorvg/inc/config.h b/thirdparty/thorvg/inc/config.h
index d72574bc65..879b70442b 100644
--- a/thirdparty/thorvg/inc/config.h
+++ b/thirdparty/thorvg/inc/config.h
@@ -13,5 +13,5 @@
#define THORVG_JPG_LOADER_SUPPORT 1
-#define THORVG_VERSION_STRING "0.8.0"
+#define THORVG_VERSION_STRING "0.8.1"
#endif
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwCommon.h b/thirdparty/thorvg/src/lib/sw_engine/tvgSwCommon.h
index be7042d6de..157fdb8f82 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwCommon.h
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwCommon.h
@@ -99,9 +99,9 @@ struct SwSize
struct SwOutline
{
SwPoint* pts; //the outline's points
- uint16_t ptsCnt; //number of points in the glyph
- uint16_t reservedPtsCnt;
- uint16_t* cntrs; //the contour end points
+ uint32_t ptsCnt; //number of points in the glyph
+ uint32_t reservedPtsCnt;
+ uint32_t* cntrs; //the contour end points
uint16_t cntrsCnt; //number of contours in glyph
uint16_t reservedCntrsCnt;
uint8_t* types; //curve type
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
index f24d2d6f27..c02e28b432 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwImage.cpp
@@ -46,7 +46,7 @@ static bool _genOutline(SwImage* image, const Matrix* transform, SwMpool* mpool,
if (outline->reservedCntrsCnt < 1) {
outline->reservedCntrsCnt = 1;
- outline->cntrs = static_cast<uint16_t*>(realloc(outline->cntrs, outline->reservedCntrsCnt * sizeof(uint16_t)));
+ outline->cntrs = static_cast<uint32_t*>(realloc(outline->cntrs, outline->reservedCntrsCnt * sizeof(uint32_t)));
outline->closed = static_cast<bool*>(realloc(outline->closed, outline->reservedCntrsCnt * sizeof(bool)));
outline->closed[0] = true;
}
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwShape.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwShape.cpp
index 2a2c6a1da3..e5b540bcc3 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwShape.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwShape.cpp
@@ -64,7 +64,7 @@ static bool _growOutlineContour(SwOutline& outline, uint32_t n)
{
if (outline.reservedCntrsCnt >= outline.cntrsCnt + n) return false;
outline.reservedCntrsCnt = outline.cntrsCnt + n;
- outline.cntrs = static_cast<uint16_t*>(realloc(outline.cntrs, outline.reservedCntrsCnt * sizeof(uint16_t)));
+ outline.cntrs = static_cast<uint32_t*>(realloc(outline.cntrs, outline.reservedCntrsCnt * sizeof(uint32_t)));
return true;
}
diff --git a/thirdparty/thorvg/src/lib/sw_engine/tvgSwStroke.cpp b/thirdparty/thorvg/src/lib/sw_engine/tvgSwStroke.cpp
index 04aa9a36ec..fa213cc5d3 100644
--- a/thirdparty/thorvg/src/lib/sw_engine/tvgSwStroke.cpp
+++ b/thirdparty/thorvg/src/lib/sw_engine/tvgSwStroke.cpp
@@ -780,7 +780,7 @@ static void _exportBorderOutline(const SwStroke& stroke, SwOutline* outline, uin
auto src = border->tags;
auto tags = outline->types + outline->ptsCnt;
auto cntrs = outline->cntrs + outline->cntrsCnt;
- uint16_t idx = outline->ptsCnt;
+ auto idx = outline->ptsCnt;
while (cnt > 0) {
@@ -921,7 +921,7 @@ SwOutline* strokeExportOutline(SwStroke* stroke, SwMpool* mpool, unsigned tid)
outline->reservedPtsCnt = ptsCnt;
}
if (outline->reservedCntrsCnt < cntrsCnt) {
- outline->cntrs = static_cast<uint16_t*>(realloc(outline->cntrs, sizeof(uint16_t) * cntrsCnt));
+ outline->cntrs = static_cast<uint32_t*>(realloc(outline->cntrs, sizeof(uint32_t) * cntrsCnt));
outline->reservedCntrsCnt = cntrsCnt;
}
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
index a842b7fa8b..42bfd4de70 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoader.cpp
@@ -337,7 +337,10 @@ static unsigned char _parserColor(const char* value, char** end)
r = svgUtilStrtof(value, end);
*end = _skipSpace(*end, nullptr);
- if (**end == '%') r = 255 * r / 100;
+ if (**end == '%') {
+ r = 255 * r / 100;
+ (*end)++;
+ }
*end = _skipSpace(*end, nullptr);
if (r < 0 || r > 255) {
@@ -1145,10 +1148,13 @@ static bool _attrParseSymbolNode(void* data, const char* key, const char* value)
if (!strcmp(key, "viewBox")) {
if (!_parseNumber(&value, &symbol->vx) || !_parseNumber(&value, &symbol->vy)) return false;
if (!_parseNumber(&value, &symbol->vw) || !_parseNumber(&value, &symbol->vh)) return false;
+ symbol->hasViewBox = true;
} else if (!strcmp(key, "width")) {
symbol->w = _toFloat(loader->svgParse, value, SvgParserLengthType::Horizontal);
+ symbol->hasWidth = true;
} else if (!strcmp(key, "height")) {
symbol->h = _toFloat(loader->svgParse, value, SvgParserLengthType::Vertical);
+ symbol->hasHeight = true;
} else if (!strcmp(key, "preserveAspectRatio")) {
if (!strcmp(value, "none")) symbol->preserveAspect = false;
} else if (!strcmp(key, "overflow")) {
@@ -1306,6 +1312,12 @@ static SvgNode* _createSymbolNode(SvgLoaderData* loader, SvgNode* parent, const
loader->svgParse->node->node.symbol.preserveAspect = true;
loader->svgParse->node->node.symbol.overflowVisible = false;
+ loader->svgParse->node->node.symbol.hasViewBox = false;
+ loader->svgParse->node->node.symbol.hasWidth = false;
+ loader->svgParse->node->node.symbol.hasHeight = false;
+ loader->svgParse->node->node.symbol.vx = 0.0f;
+ loader->svgParse->node->node.symbol.vy = 0.0f;
+
func(buf, bufLength, _attrParseSymbolNode, loader);
return loader->svgParse->node;
@@ -2722,6 +2734,7 @@ static void _svgLoaderParserXmlOpen(SvgLoaderData* loader, const char* content,
}
/* default value for opacity */
loader->svgParse->gradStop = {0.0f, 0, 0, 0, 255};
+ loader->svgParse->flags = SvgStopStyleFlags::StopDefault;
simpleXmlParseAttributes(attrs, attrsLength, _attrParseStops, loader);
loader->latestGradient->stops.push(loader->svgParse->gradStop);
} else if (!isIgnoreUnsupportedLogElements(tagName)) {
@@ -2865,7 +2878,7 @@ static SvgStyleGradient* _gradientDup(Array<SvgStyleGradient*>* gradients, const
auto gradList = gradients->data;
for (uint32_t i = 0; i < gradients->count; ++i) {
- if (!strcmp((*gradList)->id, id)) {
+ if ((*gradList)->id && !strcmp((*gradList)->id, id)) {
result = _cloneGradient(*gradList);
break;
}
@@ -2875,7 +2888,7 @@ static SvgStyleGradient* _gradientDup(Array<SvgStyleGradient*>* gradients, const
if (result && result->ref) {
gradList = gradients->data;
for (uint32_t i = 0; i < gradients->count; ++i) {
- if (!strcmp((*gradList)->id, result->ref)) {
+ if ((*gradList)->id && !strcmp((*gradList)->id, result->ref)) {
if (result->stops.count == 0) _cloneGradStops(result->stops, (*gradList)->stops);
//TODO: Properly inherit other property
break;
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoaderCommon.h b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoaderCommon.h
index 1f25e82adc..dc9ed558c3 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgSvgLoaderCommon.h
+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgLoaderCommon.h
@@ -173,6 +173,9 @@ struct SvgSymbolNode
float vx, vy, vw, vh;
bool preserveAspect;
bool overflowVisible;
+ bool hasViewBox;
+ bool hasWidth;
+ bool hasHeight;
};
struct SvgUseNode
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp b/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
index 90705f2523..a3f34fd46b 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
+++ b/thirdparty/thorvg/src/loaders/svg/tvgSvgSceneBuilder.cpp
@@ -576,15 +576,17 @@ static unique_ptr<Scene> _useBuildHelper(const SvgNode* node, const Box& vBox, c
if (node->node.use.symbol) {
auto symbol = node->node.use.symbol->node.symbol;
- auto width = symbol.w;
+ auto width = (symbol.hasWidth ? symbol.w : vBox.w);
if (node->node.use.isWidthSet) width = node->node.use.w;
- auto height = symbol.h;
+ auto height = (symbol.hasHeight ? symbol.h : vBox.h);;
if (node->node.use.isHeightSet) height = node->node.use.h;
+ auto vw = (symbol.hasViewBox ? symbol.vw : width);
+ auto vh = (symbol.hasViewBox ? symbol.vh : height);
Matrix mViewBox = {1, 0, 0, 0, 1, 0, 0, 0, 1};
- if ((!mathEqual(width, symbol.vw) || !mathEqual(height, symbol.vh)) && symbol.vw > 0 && symbol.vh > 0) {
- auto sx = width / symbol.vw;
- auto sy = height / symbol.vh;
+ if ((!mathEqual(width, vw) || !mathEqual(height, vh)) && vw > 0 && vh > 0) {
+ auto sx = width / vw;
+ auto sy = height / vh;
if (symbol.preserveAspect) {
if (sx < sy) sy = sx;
else sx = sy;
@@ -592,8 +594,8 @@ static unique_ptr<Scene> _useBuildHelper(const SvgNode* node, const Box& vBox, c
auto tvx = symbol.vx * sx;
auto tvy = symbol.vy * sy;
- auto tvw = symbol.vw * sx;
- auto tvh = symbol.vh * sy;
+ auto tvw = vw * sx;
+ auto tvh = vh * sy;
tvy -= (symbol.h - tvh) * 0.5f;
tvx -= (symbol.w - tvw) * 0.5f;
mViewBox = {sx, 0, -tvx, 0, sy, -tvy, 0, 0, 1};
diff --git a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
index a12689c7dd..245bc7603a 100644
--- a/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
+++ b/thirdparty/thorvg/src/loaders/svg/tvgXmlParser.cpp
@@ -26,6 +26,8 @@
#ifdef _WIN32
#include <malloc.h>
+#elif __FreeBSD__
+ #include<stdlib.h>
#else
#include <alloca.h>
#endif
diff --git a/thirdparty/thorvg/update-thorvg.sh b/thirdparty/thorvg/update-thorvg.sh
index 29b5677983..77badb8b60 100755
--- a/thirdparty/thorvg/update-thorvg.sh
+++ b/thirdparty/thorvg/update-thorvg.sh
@@ -1,4 +1,4 @@
-VERSION=0.8.0
+VERSION=0.8.1
rm -rf AUTHORS inc LICENSE src *.zip
curl -L -O https://github.com/Samsung/thorvg/archive/refs/tags/v$VERSION.zip
bsdtar --strip-components=1 -xvf *.zip