Diffstat (limited to 'thirdparty/meshoptimizer/vertexcodec.cpp')
-rw-r--r--  thirdparty/meshoptimizer/vertexcodec.cpp | 88
1 file changed, 71 insertions(+), 17 deletions(-)
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
index 7925ea862c..4bd11121d2 100644
--- a/thirdparty/meshoptimizer/vertexcodec.cpp
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -50,6 +50,12 @@
 #define SIMD_TARGET
 #endif
 
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -472,6 +478,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		typedef int unaligned_int;
 #endif
 
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
 
@@ -490,11 +508,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
 
@@ -512,7 +544,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -604,24 +640,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
 
 static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
-
-	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
-	uint8x16_t masked = vandq_u8(mask, byte_mask);
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
-#ifdef __aarch64__
-	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
-	mask0 = vaddv_u8(vget_low_u8(masked));
-	mask1 = vaddv_u8(vget_high_u8(masked));
-#else
-	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
-	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
-	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
-	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
 
-	mask0 = vget_lane_u8(sum3, 0);
-	mask1 = vget_lane_u8(sum3, 1);
-#endif
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
 }
 
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
@@ -639,6 +664,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 	case 1:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel2 = vld1_u8(data);
 		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
 		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -655,11 +692,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel4 = vld1_u8(data);
 		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
 		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -675,7 +726,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -715,7 +770,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
 	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	// TODO: This can use v8x16_bitmask in the future
 	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
 	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
 }
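
The SIMD_LATENCYOPT blocks replace the two table lookups (kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]) with scalar math that depends only on the 4- or 8-byte group header, so the address of the next group can be computed before the shuffle/movemask chain for the current group retires. For mode 1 the header holds 16 2-bit fields and, as I read the codec, the value 3 marks an element whose byte is stored verbatim after the header; datacnt counts those escapes. The following standalone sketch (the test harness and its mixing constant are my own, not part of the patch) checks the bit arrangement against a naive field walk:

#include <assert.h>
#include <stdio.h>

// Count 2-bit fields equal to 3 in a 32-bit mode 1 header, mirroring the patch.
static int countMode1(unsigned int data32)
{
	// low bit of each 2-bit field is now set iff both bits were set (field == 3)
	data32 &= data32 >> 1;

	// duplicate the word at a 30-bit offset; between the two copies, each field's
	// indicator bit lands exactly once at a bit position divisible by 4
	unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

	// multiply-sum of the 16 nibble-aligned bits; the top nibble holds the total
	return int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
}

int main()
{
	for (unsigned int seed = 0; seed < 100000; ++seed)
	{
		unsigned int header = seed * 2654435761u; // arbitrary mixing constant (mine)

		// reference: walk all 16 2-bit fields and count the escape value 3
		int expected = 0;
		for (int i = 0; i < 16; ++i)
			expected += ((header >> (2 * i)) & 3) == 3;

		// the 4-bit sum is only exact for counts below 16; the encoder guarantees
		// this because a group of 16 escapes would have been emitted as mode 3
		if (expected < 16)
			assert(countMode1(header) == expected);
	}

	printf("ok\n");
}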
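
Mode 2 needs no rearrangement because its 16 fields are already nibble-aligned: two shift-and-AND steps collapse each 4-bit field to its low bit (set only for the escape value 15), and the same multiply-sum counts them. Another standalone sketch under the same assumptions:

#include <assert.h>
#include <stdio.h>

// Count 4-bit fields equal to 15 in a 64-bit mode 2 header, mirroring the patch.
static int countMode2(unsigned long long data64)
{
	data64 &= data64 >> 1;
	data64 &= data64 >> 2; // low bit of each nibble is now set iff the nibble was 15

	// sum the 16 nibble-aligned indicator bits into the top nibble
	return int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
}

int main()
{
	for (unsigned long long seed = 0; seed < 100000; ++seed)
	{
		unsigned long long header = seed * 0x9e3779b97f4a7c15ull; // arbitrary mixing constant (mine)

		// reference: walk all 16 4-bit fields and count the escape value 15
		int expected = 0;
		for (int i = 0; i < 16; ++i)
			expected += ((header >> (4 * i)) & 15) == 15;

		if (expected < 16) // a full group of escapes would have been encoded as mode 3
			assert(countMode2(header) == expected);
	}

	printf("ok\n");
}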
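
The new neonMoveMask works because 0xff * 0x000103070f1f3f80 = 0x0102040810204080: multiplying a 64-bit lane whose bytes are each 0x00 or 0xff by the magic constant adds one shifted copy of that pattern per 0xff byte, which deposits bit i into the top byte when byte i is 0xff, with no cross-byte carries under that input restriction. This drops the byte-mask load and the vpadd reduction chain (and the clang/gcc-only vaddv path) in favour of one multiply per half, matching the existing wasmMoveMask. A scalar sketch that exhaustively verifies the constant over all 256 byte masks (the harness is mine; the constant is from the patch):

#include <assert.h>
#include <stdio.h>

int main()
{
	const unsigned long long magic = 0x000103070f1f3f80ull;

	for (int bits = 0; bits < 256; ++bits)
	{
		// build an 8-byte lane where byte i is 0xff iff bit i of 'bits' is set,
		// matching the "8 groups of 0xff or 0x00" precondition in the comment
		unsigned long long lane = 0;
		for (int i = 0; i < 8; ++i)
			if (bits & (1 << i))
				lane |= 0xffull << (i * 8);

		// one multiply gathers all per-byte flags into the top byte
		assert(int((lane * magic) >> 56) == bits);
	}

	printf("ok\n");
}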