Diffstat (limited to 'thirdparty')
203 files changed, 27065 insertions, 20557 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 383ee3197b..27f1613e9e 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -51,7 +51,7 @@ Files extracted from upstream source:
 ## cvtt
 
 - Upstream: https://github.com/elasota/ConvectionKernels
-- Version: git (dc2dbbe0ae2cf2be06ef56d1021e2222a56c7fe2, 2021)
+- Version: git (350416daa4e98f1c17ffc273b134d0120a2ef230, 2022)
 - License: MIT
 
 Files extracted from upstream source:
@@ -74,7 +74,7 @@ Files extracted from upstream source:
 ## embree
 
 - Upstream: https://github.com/embree/embree
-- Version: 3.13.1 (12b99393438a4cc9e478e33459eed78bec6233fd, 2021)
+- Version: 3.13.5 (698442324ccddd11725fb8875275dc1384f7fb40, 2022)
 - License: Apache 2.0
 
 Files extracted from upstream:
@@ -173,6 +173,23 @@ comments.
 Apply the patches in the `patches/` folder when syncing on newer upstream
 commits.
 
+## glad
+
+- Upstream: https://github.com/Dav1dde/glad
+- Version: 2.0.2 (f237a2bfcec0d9b82b90ec9af4af265c40de7183, 2022)
+- License: CC0 1.0 and Apache 2.0
+
+Files extracted from upstream source:
+- `LICENSE`
+
+Files generated from [upstream web instance](https://gen.glad.sh/):
+- `KHR/khrplatform.h`
+- `gl.c`
+- `glad/gl.h`
+- `glx.c`
+- `glad/glx.h`
+
+
 ## glslang
 
 - Upstream: https://github.com/KhronosGroup/glslang
@@ -290,18 +307,15 @@ Files extracted from upstream source:
 ## libtheora
 
 - Upstream: https://www.theora.org
-- Version: 1.1.1 (2010)
+- Version: git (7180717276af1ebc7da15c83162d6c5d6203aabf, 2020)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
 
-- all .c, .h in lib/
+- all .c, .h in lib/, except arm/ and c64x/ folders
 - all .h files in include/theora/ as theora/
 - COPYING and LICENSE
 
-Upstream patches included in the `patches` directory have been applied
-on top of the 1.1.1 source (not included in any stable release yet).
-
 
 ## libvorbis
 
@@ -577,7 +591,7 @@ in 10.40, it can be found in the `patches` folder.
## recastnavigation - Upstream: https://github.com/recastnavigation/recastnavigation -- Version: git (5a870d427e47abd4a8e4ce58a95582ec049434d5, 2022) +- Version: git (4fef0446609b23d6ac180ed822817571525528a1, 2022) - License: zlib Files extracted from upstream source: diff --git a/thirdparty/cvtt/ConvectionKernels_BC67.cpp b/thirdparty/cvtt/ConvectionKernels_BC67.cpp index 791859b232..021d658c08 100644 --- a/thirdparty/cvtt/ConvectionKernels_BC67.cpp +++ b/thirdparty/cvtt/ConvectionKernels_BC67.cpp @@ -726,10 +726,10 @@ namespace cvtt if (carry) { uint32_t bitMask = (1 << carry) - 1; - for (int i = 0; i < 4; i++) + for (int i = 0; i < entriesRemaining; i++) { m_vector[i] >>= carry; - if (i != 3) + if (i != entriesRemaining - 1) m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - carry); } } @@ -3058,14 +3058,11 @@ void cvtt::Internal::BC6HComputer::SignExtendSingle(int &v, int bits) void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned) { - UnpackingVector pv; - pv.Init(pBC); - int numModeBits = 2; - int modeBits = pv.Unpack(2); + int modeBits = pBC[0] & 0x3; if (modeBits != 0 && modeBits != 1) { - modeBits |= pv.Unpack(3) << 2; + modeBits = pBC[0] & 0x1f; numModeBits += 3; } @@ -3102,6 +3099,9 @@ void cvtt::Internal::BC6HComputer::UnpackOne(PixelBlockF16 &output, const uint8_ for (int ch = 0; ch < 3; ch++) eps[subset][epi][ch] = 0; + UnpackingVector pv; + pv.Init(pBC); + { uint32_t header[3]; uint16_t codedEPs[2][2][3]; diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h index 645681ac63..6d411e4852 100644 --- a/thirdparty/embree/common/algorithms/parallel_for.h +++ b/thirdparty/embree/common/algorithms/parallel_for.h @@ -26,7 +26,6 @@ namespace embree abort(); // -- GODOT end -- } - #elif defined(TASKING_TBB) #if TBB_INTERFACE_VERSION >= 12002 tbb::task_group_context context; diff --git a/thirdparty/embree/common/algorithms/parallel_for_for.h b/thirdparty/embree/common/algorithms/parallel_for_for.h index 92c37a4a38..7838ef11b3 100644 --- a/thirdparty/embree/common/algorithms/parallel_for_for.h +++ b/thirdparty/embree/common/algorithms/parallel_for_for.h @@ -30,15 +30,20 @@ namespace embree template<typename ArrayArray> __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) { init(array2,minStepSize); + } + + template<typename SizeFunc> + __forceinline ParallelForForState (const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) { + init(numArrays,getSize,minStepSize); } - template<typename ArrayArray> - __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + template<typename SizeFunc> + __forceinline void init ( const size_t numArrays, const SizeFunc& getSize, const size_t minStepSize ) { /* first calculate total number of elements */ size_t N = 0; - for (size_t i=0; i<array2.size(); i++) { - N += array2[i] ? array2[i]->size() : 0; + for (size_t i=0; i<numArrays; i++) { + N += getSize(i); } this->N = N; @@ -54,8 +59,8 @@ namespace embree size_t k0 = (++taskIndex)*N/taskCount; for (size_t i=0, k=0; taskIndex < taskCount; i++) { - assert(i<array2.size()); - size_t j=0, M = array2[i] ? 
array2[i]->size() : 0; + assert(i<numArrays); + size_t j=0, M = getSize(i); while (j<M && k+M-j >= k0 && taskIndex < taskCount) { assert(taskIndex<taskCount); i0[taskIndex] = i; @@ -67,6 +72,12 @@ namespace embree } } + template<typename ArrayArray> + __forceinline void init ( ArrayArray& array2, const size_t minStepSize ) + { + init(array2.size(),[&](size_t i) { return array2[i] ? array2[i]->size() : 0; },minStepSize); + } + __forceinline size_t size() const { return N; } diff --git a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h index b15b44a991..8c3f4aace7 100644 --- a/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h +++ b/thirdparty/embree/common/algorithms/parallel_for_for_prefix_sum.h @@ -17,15 +17,20 @@ namespace embree __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize) : ParallelForForState(array2,minStepSize) {} + template<typename SizeFunc> + __forceinline ParallelForForPrefixSumState (size_t numArrays, const SizeFunc& getSize, const size_t minStepSize) + : ParallelForForState(numArrays,getSize,minStepSize) {} + ParallelPrefixSumState<Value> prefix_state; }; - template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) + template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0_( ParallelForForPrefixSumState<Value>& state, Index minStepSize, + const SizeFunc& getSize, const Value& identity, const Func& func, const Reduction& reduction) { /* calculate number of tasks to use */ const size_t taskCount = state.taskCount; + /* perform parallel prefix sum */ parallel_for(taskCount, [&](const size_t taskIndex) { @@ -38,9 +43,9 @@ namespace embree size_t k=k0; Value N=identity; for (size_t i=i0; k<k1; i++) { - const size_t size = array2[i] ? array2[i]->size() : 0; + const size_t size = getSize(i); const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i)); + if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k)); k+=r1-r0; j0 = 0; } state.prefix_state.counts[taskIndex] = N; @@ -58,9 +63,10 @@ namespace embree return sum; } - template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> - __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, - const Value& identity, const Func& func, const Reduction& reduction) + template<typename SizeFunc, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1_( ParallelForForPrefixSumState<Value>& state, Index minStepSize, + const SizeFunc& getSize, + const Value& identity, const Func& func, const Reduction& reduction) { /* calculate number of tasks to use */ const size_t taskCount = state.taskCount; @@ -76,9 +82,9 @@ namespace embree size_t k=k0; Value N=identity; for (size_t i=i0; k<k1; i++) { - const size_t size = array2[i] ? 
array2[i]->size() : 0; + const size_t size = getSize(i); const size_t r0 = j0, r1 = min(size,r0+k1-k); - if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N))); + if (r1 > r0) N = reduction(N, func((Index)i,range<Index>((Index)r0,(Index)r1),(Index)k,reduction(state.prefix_state.sums[taskIndex],N))); k+=r1-r0; j0 = 0; } state.prefix_state.counts[taskIndex] = N; @@ -96,6 +102,30 @@ namespace embree return sum; } + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, + ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum0_(state,minStepSize, + [&](Index i) { return array2[i] ? array2[i]->size() : 0; }, + identity, + [&](Index i, const range<Index>& r, Index k) { return func(array2[i], r, k, i); }, + reduction); + } + + template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction> + __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, + ArrayArray& array2, Index minStepSize, + const Value& identity, const Func& func, const Reduction& reduction) + { + return parallel_for_for_prefix_sum1_(state,minStepSize, + [&](Index i) { return array2[i] ? array2[i]->size() : 0; }, + identity, + [&](Index i, const range<Index>& r, Index k, const Value& base) { return func(array2[i], r, k, i, base); }, + reduction); + } + template<typename ArrayArray, typename Value, typename Func, typename Reduction> __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction) diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h index 8271372ea4..cd0078f2e6 100644 --- a/thirdparty/embree/common/algorithms/parallel_reduce.h +++ b/thirdparty/embree/common/algorithms/parallel_reduce.h @@ -26,7 +26,7 @@ namespace embree const Index threadCount = (Index) TaskScheduler::threadCount(); taskCount = min(taskCount,threadCount,maxTasks); - /* parallel invokation of all tasks */ + /* parallel invocation of all tasks */ dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack parallel_for(taskCount, [&](const Index taskIndex) { const Index k0 = first+(taskIndex+0)*(last-first)/taskCount; diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h index bc43155358..e4eb3df9a4 100644 --- a/thirdparty/embree/common/math/bbox.h +++ b/thirdparty/embree/common/math/bbox.h @@ -77,7 +77,7 @@ namespace embree return lower > upper; } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline bool BBox<Vec3fa>::empty() const { return !all(le_mask(lower,upper)); } @@ -196,11 +196,11 @@ namespace embree } template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper)); } template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper)); } /*! 
blending */ @@ -228,11 +228,11 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined (__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined (__AVX__) #include "../simd/avx.h" #endif diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h index 529584ea16..e62e4ad2a4 100644 --- a/thirdparty/embree/common/math/color.h +++ b/thirdparty/embree/common/math/color.h @@ -152,21 +152,38 @@ namespace embree } __forceinline const Color rcp ( const Color& a ) { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Color)reciprocal; +#else #if defined(__AVX512VL__) const Color r = _mm_rcp14_ps(a.m128); #else const Color r = _mm_rcp_ps(a.m128); #endif - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r) + +#endif //defined(__aarch64__) } __forceinline const Color rsqrt( const Color& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif //defined(__aarch64__) } __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp index 03919ae20c..f51c642bfc 100644 --- a/thirdparty/embree/common/math/constants.cpp +++ b/thirdparty/embree/common/math/constants.cpp @@ -5,23 +5,4 @@ namespace embree { - TrueTy True; - FalseTy False; - ZeroTy zero; - OneTy one; - NegInfTy neg_inf; - PosInfTy inf; - PosInfTy pos_inf; - NaNTy nan; - UlpTy ulp; - PiTy pi; - OneOverPiTy one_over_pi; - TwoPiTy two_pi; - OneOverTwoPiTy one_over_two_pi; - FourPiTy four_pi; - OneOverFourPiTy one_over_four_pi; - StepTy step; - ReverseStepTy reverse_step; - EmptyTy empty; - UndefinedTy undefined; } diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h index 578473a8ab..07a1a868ba 100644 --- a/thirdparty/embree/common/math/constants.h +++ b/thirdparty/embree/common/math/constants.h @@ -24,13 +24,13 @@ namespace embree __forceinline operator bool( ) const { return true; } }; - extern MAYBE_UNUSED TrueTy True; + const constexpr TrueTy True = TrueTy(); struct FalseTy { __forceinline operator bool( ) const { return false; } }; - extern MAYBE_UNUSED FalseTy False; + const constexpr FalseTy False = FalseTy(); struct ZeroTy { @@ -48,7 +48,7 @@ namespace embree __forceinline operator unsigned char ( ) const { return 0; } }; - extern MAYBE_UNUSED ZeroTy zero; + const constexpr ZeroTy zero = ZeroTy(); struct OneTy { @@ -66,7 +66,7 @@ namespace embree __forceinline operator unsigned char ( ) const { return 1; } }; - extern MAYBE_UNUSED OneTy one; + const constexpr OneTy one = OneTy(); struct NegInfTy { @@ -85,7 +85,7 @@ namespace embree }; - extern MAYBE_UNUSED NegInfTy neg_inf; + const constexpr 
NegInfTy neg_inf = NegInfTy(); struct PosInfTy { @@ -103,8 +103,8 @@ namespace embree __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); } }; - extern MAYBE_UNUSED PosInfTy inf; - extern MAYBE_UNUSED PosInfTy pos_inf; + const constexpr PosInfTy inf = PosInfTy(); + const constexpr PosInfTy pos_inf = PosInfTy(); struct NaNTy { @@ -112,15 +112,15 @@ namespace embree __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); } }; - extern MAYBE_UNUSED NaNTy nan; + const constexpr NaNTy nan = NaNTy(); struct UlpTy { __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); } __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); } }; - - extern MAYBE_UNUSED UlpTy ulp; + + const constexpr UlpTy ulp = UlpTy(); struct PiTy { @@ -128,7 +128,7 @@ namespace embree __forceinline operator float ( ) const { return float(M_PI); } }; - extern MAYBE_UNUSED PiTy pi; + const constexpr PiTy pi = PiTy(); struct OneOverPiTy { @@ -136,7 +136,7 @@ namespace embree __forceinline operator float ( ) const { return float(M_1_PI); } }; - extern MAYBE_UNUSED OneOverPiTy one_over_pi; + const constexpr OneOverPiTy one_over_pi = OneOverPiTy(); struct TwoPiTy { @@ -144,7 +144,7 @@ namespace embree __forceinline operator float ( ) const { return float(2.0*M_PI); } }; - extern MAYBE_UNUSED TwoPiTy two_pi; + const constexpr TwoPiTy two_pi = TwoPiTy(); struct OneOverTwoPiTy { @@ -152,7 +152,7 @@ namespace embree __forceinline operator float ( ) const { return float(0.5*M_1_PI); } }; - extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; + const constexpr OneOverTwoPiTy one_over_two_pi = OneOverTwoPiTy(); struct FourPiTy { @@ -160,7 +160,7 @@ namespace embree __forceinline operator float ( ) const { return float(4.0*M_PI); } }; - extern MAYBE_UNUSED FourPiTy four_pi; + const constexpr FourPiTy four_pi = FourPiTy(); struct OneOverFourPiTy { @@ -168,30 +168,42 @@ namespace embree __forceinline operator float ( ) const { return float(0.25*M_1_PI); } }; - extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; + const constexpr OneOverFourPiTy one_over_four_pi = OneOverFourPiTy(); struct StepTy { + __forceinline operator double ( ) const { return 0; } + __forceinline operator float ( ) const { return 0; } + __forceinline operator long long( ) const { return 0; } + __forceinline operator unsigned long long( ) const { return 0; } + __forceinline operator long ( ) const { return 0; } + __forceinline operator unsigned long ( ) const { return 0; } + __forceinline operator int ( ) const { return 0; } + __forceinline operator unsigned int ( ) const { return 0; } + __forceinline operator short ( ) const { return 0; } + __forceinline operator unsigned short ( ) const { return 0; } + __forceinline operator char ( ) const { return 0; } + __forceinline operator unsigned char ( ) const { return 0; } }; - extern MAYBE_UNUSED StepTy step; + const constexpr StepTy step = StepTy(); struct ReverseStepTy { }; - extern MAYBE_UNUSED ReverseStepTy reverse_step; + const constexpr ReverseStepTy reverse_step = ReverseStepTy(); struct EmptyTy { }; - extern MAYBE_UNUSED EmptyTy empty; + const constexpr EmptyTy empty = EmptyTy(); struct FullTy { }; - extern MAYBE_UNUSED FullTy full; + const constexpr FullTy full = FullTy(); struct UndefinedTy { }; - extern MAYBE_UNUSED UndefinedTy undefined; + const constexpr UndefinedTy undefined = UndefinedTy(); } diff --git a/thirdparty/embree/common/math/math.h 
b/thirdparty/embree/common/math/math.h index 4bc54c1a6a..7930c17727 100644 --- a/thirdparty/embree/common/math/math.h +++ b/thirdparty/embree/common/math/math.h @@ -53,6 +53,16 @@ namespace embree __forceinline float rcp ( const float x ) { +#if defined(__aarch64__) + // Move scalar to vector register and do rcp. + __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) @@ -66,30 +76,71 @@ namespace embree #else return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); #endif + +#endif //defined(__aarch64__) } __forceinline float signmsk ( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = 0x80000000; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#endif } __forceinline float xorf( const float x, const float y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128 b; + a[0] = x; + b[0] = y; + a = _mm_xor_ps(a, b); + return a[0]; +#else return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); +#endif } __forceinline float andf( const float x, const unsigned y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = y; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); +#endif } __forceinline float rsqrt( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + a[0] = x; + __m128 value = _mm_rsqrt_ps(a); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + return value[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); #else __m128 r = _mm_rsqrt_ss(a); #endif - r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); -#if defined(__ARM_NEON) - r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), + _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + return _mm_cvtss_f32(c); #endif - return _mm_cvtss_f32(r); } #if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) @@ -146,7 +197,17 @@ namespace embree __forceinline double floor( const double x ) { return ::floor (x); } __forceinline double ceil ( const double x ) { return ::ceil (x); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + __forceinline float mini(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_min_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float mini(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -155,7 +216,17 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + 
__forceinline float maxi(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_max_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float maxi(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -172,9 +243,12 @@ namespace embree __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; } __forceinline float min(float a, float b) { return a<b ? a:b; } __forceinline double min(double a, double b) { return a<b ? a:b; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; } #endif +#if defined(__EMSCRIPTEN__) + __forceinline long min(long a, long b) { return a<b ? a:b; } +#endif template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } @@ -189,9 +263,12 @@ namespace embree __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; } __forceinline float max(float a, float b) { return a<b ? b:a; } __forceinline double max(double a, double b) { return a<b ? b:a; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; } #endif +#if defined(__EMSCRIPTEN__) + __forceinline long max(long a, long b) { return a<b ? b:a; } +#endif template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } @@ -231,6 +308,15 @@ namespace embree __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + +#elif defined (__aarch64__) && defined(__clang__) +#pragma clang fp contract(fast) +__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; } +__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; } +__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } +__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } +#pragma clang fp contract(on) + #else __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } @@ -326,7 +412,7 @@ namespace embree return x | (y << 1) | (z << 2); } -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) template<> __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h index 080800efcd..78efccda72 100644 --- a/thirdparty/embree/common/math/quaternion.h +++ b/thirdparty/embree/common/math/quaternion.h @@ -242,13 +242,17 @@ namespace embree T cosTheta = dot(q0, 
q1_); QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); - if (unlikely(all(cosTheta > 0.9995f))) { - return normalize(lerp(q0, q1, t)); - } + + // spherical linear interpolation const T phi = t * fastapprox::acos(cosTheta); T sinPhi, cosPhi; fastapprox::sincos(phi, sinPhi, cosPhi); QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); - return msub(cosPhi, q0, qperp); + QuaternionT<T> qslerp = msub(cosPhi, q0, qperp); + + // regular linear interpolation as fallback + QuaternionT<T> qlerp = normalize(lerp(q0, q1, t)); + + return select(cosTheta > 0.9995f, qlerp, qslerp); } } diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h index fd16c26e81..daf9dd96d2 100644 --- a/thirdparty/embree/common/math/transcendental.h +++ b/thirdparty/embree/common/math/transcendental.h @@ -27,7 +27,7 @@ __forceinline T sin(const T &v) // Reduced range version of x auto x = v - kReal * piOverTwoVec; auto kMod4 = k & 3; - auto sinUseCos = (kMod4 == 1 | kMod4 == 3); + auto sinUseCos = (kMod4 == 1) | (kMod4 == 3); auto flipSign = (kMod4 > 1); // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, @@ -76,8 +76,8 @@ __forceinline T cos(const T &v) auto x = v - kReal * piOverTwoVec; auto kMod4 = k & 3; - auto cosUseCos = (kMod4 == 0 | kMod4 == 2); - auto flipSign = (kMod4 == 1 | kMod4 == 2); + auto cosUseCos = (kMod4 == 0) | (kMod4 == 2); + auto flipSign = (kMod4 == 1) | (kMod4 == 2); const float sinC2 = -0.16666667163372039794921875; const float sinC4 = +8.333347737789154052734375e-3; diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h index d62aef51f3..f6d98ffa0d 100644 --- a/thirdparty/embree/common/math/vec2.h +++ b/thirdparty/embree/common/math/vec2.h @@ -144,7 +144,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); } @@ -205,11 +205,11 @@ namespace embree #include "vec2fa.h" -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined(__AVX__) #include "../simd/avx.h" #endif @@ -221,7 +221,7 @@ namespace embree { template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} #endif diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h index a51fb68fd0..4f222894c2 100644 --- a/thirdparty/embree/common/math/vec2fa.h +++ b/thirdparty/embree/common/math/vec2fa.h @@ -97,6 +97,12 @@ namespace embree __forceinline Vec2fa rcp ( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec2fa)reciprocal; +#else #if defined(__AVX512VL__) const Vec2fa r = _mm_rcp14_ps(a.m128); #else @@ -104,13 +110,15 @@ namespace embree #endif #if defined(__AVX2__) - const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); + const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // 
First, compute 1 - a * r (which will be very close to 0) + const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n #else - const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0) + const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -118,12 +126,21 @@ namespace embree __forceinline Vec2fa rsqrt( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif } __forceinline Vec2fa zero_fix(const Vec2fa& a) { @@ -156,7 +173,7 @@ namespace embree __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -165,7 +182,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -227,7 +244,7 @@ namespace embree __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h index ce94eff327..254f6c4011 100644 --- a/thirdparty/embree/common/math/vec3.h +++ b/thirdparty/embree/common/math/vec3.h @@ -197,7 +197,7 @@ namespace embree template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); } @@ -207,7 +207,6 @@ namespace embree template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); } template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); } template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); } - template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& 
a, const Vec3<T>& b, const Vec3<T>& c ) { const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; @@ -266,11 +265,11 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined(__AVX__) #include "../simd/avx.h" #endif @@ -291,14 +290,14 @@ namespace embree template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } -#elif defined(__SSE__) +#elif defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); } #endif -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h index 586039741d..8564cf6d10 100644 --- a/thirdparty/embree/common/math/vec3fa.h +++ b/thirdparty/embree/common/math/vec3fa.h @@ -55,7 +55,13 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// static __forceinline Vec3fa load( const void* const a ) { +#if defined(__aarch64__) + __m128 t = _mm_load_ps((float*)a); + t[3] = 0.0f; + return Vec3fa(t); +#else return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); +#endif } static __forceinline Vec3fa loadu( const void* const a ) { @@ -89,12 +95,20 @@ namespace embree __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } __forceinline Vec3fa operator -( const Vec3fa& a ) { +#if defined(__aarch64__) + return vnegq_f32(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); return _mm_xor_ps(a.m128, mask); +#endif } __forceinline Vec3fa abs ( const Vec3fa& a ) { +#if defined(__aarch64__) + return _mm_abs_ps(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return _mm_and_ps(a.m128, mask); +#endif } __forceinline Vec3fa sign ( const Vec3fa& a ) { return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); @@ -102,6 +116,10 @@ namespace embree __forceinline Vec3fa rcp ( const Vec3fa& a ) { +#if defined(__aarch64__) + return vdivq_f32(vdupq_n_f32(1.0f),a.m128); +#else + #if defined(__AVX512VL__) const Vec3fa r = _mm_rcp14_ps(a.m128); #else @@ -109,13 +127,15 @@ namespace embree #endif #if defined(__AVX2__) - const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); + const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) + const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n #else - const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0) + const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec3fa sqrt ( const 
Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -123,12 +143,20 @@ namespace embree __forceinline Vec3fa rsqrt( const Vec3fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif } __forceinline Vec3fa zero_fix(const Vec3fa& a) { @@ -161,7 +189,7 @@ namespace embree __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -170,7 +198,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -187,16 +215,16 @@ namespace embree /// Ternary Operators //////////////////////////////////////////////////////////////////////////////// -#if defined(__AVX2__) +#if defined(__AVX2__) || defined(__ARM_NEON) __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } #else __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } #endif __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } @@ -218,8 +246,26 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline float reduce_add(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = 0.0f; + return vaddvq_f32(t); + } - __forceinline float reduce_add(const Vec3fa& v) { + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vminvq_f32(t); + } + __forceinline float reduce_max(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vmaxvq_f32(t); + } +#else + __forceinline float reduce_add(const Vec3fa& v) { const vfloat4 a(v.m128); const vfloat4 b = shuffle<1>(a); const vfloat4 c 
= shuffle<2>(a); @@ -229,6 +275,7 @@ namespace embree __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } +#endif //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators @@ -241,8 +288,13 @@ namespace embree __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } - __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + #if defined(__aarch64__) + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } +#else + __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } +#endif __forceinline bool isvalid ( const Vec3fa& v ) { return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); @@ -261,7 +313,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) @@ -335,7 +387,11 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined (__SSE4_1__) +#if defined(__aarch64__) + __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } + __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } +#elif defined (__SSE4_1__) __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } @@ -393,8 +449,10 @@ namespace embree __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } - __forceinline Vec3fx( const Vec3fa& other, const float w1) { -#if defined (__SSE4_1__) + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__aarch64__) + m128 = other.m128; m128[3] = w1; +#elif defined (__SSE4_1__) m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); #else const vint4 mask(-1,-1,-1,0); @@ -526,7 +584,7 @@ namespace embree __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { const 
vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -535,7 +593,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -626,7 +684,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h index 694804c40d..d4cc3125cd 100644 --- a/thirdparty/embree/common/math/vec3ia.h +++ b/thirdparty/embree/common/math/vec3ia.h @@ -65,7 +65,9 @@ namespace embree __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } -#if defined(__SSSE3__) +#if (defined(__aarch64__)) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } +#elif defined(__SSSE3__) __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } #endif @@ -81,7 +83,7 @@ namespace embree __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } @@ -116,7 +118,7 @@ namespace embree __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } #endif @@ -127,18 +129,38 @@ namespace embree __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } +#if !defined(__ARM_NEON) __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } +#endif //////////////////////////////////////////////////////////////////////////////// - /// Reductions + /// Select //////////////////////////////////////////////////////////////////////////////// + __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline int reduce_add(const Vec3ia& v) { return 
vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0))); } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF))); } + __forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000))); } +#else __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } - +#endif + //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// @@ -156,19 +178,7 @@ namespace embree __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { -#if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); -#endif - } - -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } #else diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h index 0ed107928a..10c53f47b4 100644 --- a/thirdparty/embree/common/math/vec4.h +++ b/thirdparty/embree/common/math/vec4.h @@ -149,7 +149,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } @@ -205,7 +205,7 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif @@ -225,7 +225,7 @@ namespace embree template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } -#elif defined(__SSE__) +#elif defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); } diff --git a/thirdparty/embree/common/simd/arm/avx2neon.h b/thirdparty/embree/common/simd/arm/avx2neon.h new file mode 100644 index 0000000000..dd321d3d64 --- /dev/null +++ b/thirdparty/embree/common/simd/arm/avx2neon.h @@ -0,0 +1,1196 @@ +#pragma once + +#if !defined(__aarch64__) +#error "avx2neon is only supported for AARCH64" +#endif + 
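Note on the new file: avx2neon.h emulates 256-bit AVX/AVX2 intrinsics on NEON, which only has 128-bit registers. Each __m256/__m256i/__m256d is stored as a pair of 128-bit halves, and most intrinsics are emulated by applying the corresponding 128-bit operation to both halves (the UNARY_AVX_OP/BINARY_AVX_OP macros below). A minimal standalone sketch of that pattern, with illustrative names (m256_sketch, add256, movemask256 are not upstream identifiers):

    #include <arm_neon.h>

    struct m256_sketch {
        float32x4_t lo, hi;  // two 128-bit NEON registers stand in for one 256-bit value
    };

    // _mm256_add_ps style op: apply the 128-bit operation to each half.
    static inline m256_sketch add256(m256_sketch a, m256_sketch b) {
        return { vaddq_f32(a.lo, b.lo), vaddq_f32(a.hi, b.hi) };
    }

    // _mm256_movemask_ps style op: gather per-lane sign bits, high half above low.
    static inline int movemask256(m256_sketch a) {
        uint32x4_t lo = vshrq_n_u32(vreinterpretq_u32_f32(a.lo), 31);
        uint32x4_t hi = vshrq_n_u32(vreinterpretq_u32_f32(a.hi), 31);
        int mlo = vgetq_lane_u32(lo, 0) | (vgetq_lane_u32(lo, 1) << 1)
                | (vgetq_lane_u32(lo, 2) << 2) | (vgetq_lane_u32(lo, 3) << 3);
        int mhi = vgetq_lane_u32(hi, 0) | (vgetq_lane_u32(hi, 1) << 1)
                | (vgetq_lane_u32(hi, 2) << 2) | (vgetq_lane_u32(hi, 3) << 3);
        return (mhi << 4) | mlo;
    }

Operations that cross the 128-bit boundary (permutes, movemask, extract/insert) need per-half glue like the (mhi << 4) | mlo merge above, which is where most of the hand-written code in this header goes.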
+#include "sse2neon.h" + +#define AVX2NEON_ABI static inline __attribute__((always_inline)) + + +struct __m256 { + __m128 lo,hi; + __m256() {} +}; + + + + +struct __m256i { + __m128i lo,hi; + explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {} + operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;} + __m256i() {} +}; + + + + +struct __m256d { + float64x2_t lo,hi; + __m256d() {} + __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} + __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {} +}; + +#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;} + + +#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;} +#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;} + +#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;} + + +#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;} + + + +#define _mm_stream_load_si128 _mm_load_si128 +#define _mm256_stream_load_si256 _mm256_load_si256 + + +AVX2NEON_ABI +__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8) +{ + __m128 af = _mm_castsi128_ps(a); + __m128 bf = _mm_castsi128_ps(b); + __m128 blendf = _mm_blend_ps(af, bf, imm8); + return _mm_castps_si128(blendf); +} + +AVX2NEON_ABI +int _mm_movemask_popcnt(__m128 a) +{ + return __builtin_popcount(_mm_movemask_ps(a)); +} + +AVX2NEON_ABI +__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask) +{ + float32x4_t res; + uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask); + for (int i=0;i<4;i++) { + if (mask_u32[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0; + } + return vreinterpretq_m128_f32(res); +} + +AVX2NEON_ABI +void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) +{ + float32x4_t a_f32 = vreinterpretq_f32_m128(a); + uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask); + for (int i=0;i<4;i++) { + if (mask_u32[i] & 0x80000000) mem_addr[i] = a_f32[i]; + } +} + +AVX2NEON_ABI +void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a) +{ + uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask); + int32x4_t a_s32 = vreinterpretq_s32_m128i(a); + for (int i=0;i<4;i++) { + if (mask_u32[i] & 0x80000000) mem_addr[i] = a_s32[i]; + } +} + + +#define _mm_fmadd_ss _mm_fmadd_ps +#define _mm_fmsub_ss _mm_fmsub_ps +#define _mm_fnmsub_ss _mm_fnmsub_ps +#define _mm_fnmadd_ss _mm_fnmadd_ps + +template<int code> +AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b) +{ + float v; + v = 0; + v += (code & 0x10) ? a[0]*b[0] : 0; + v += (code & 0x20) ? a[1]*b[1] : 0; + v += (code & 0x40) ? a[2]*b[2] : 0; + v += (code & 0x80) ? a[3]*b[3] : 0; + float32x4_t res; + res[0] = (code & 0x1) ? v : 0; + res[1] = (code & 0x2) ? v : 0; + res[2] = (code & 0x4) ? v : 0; + res[3] = (code & 0x8) ? 
v : 0; + return res; +} + +template<> +inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + m[3] = 0; + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +template<> +inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b) +{ + float v; + float32x4_t m = _mm_mul_ps(a,b); + v = vaddvq_f32(m); + return _mm_set1_ps(v); +} + +#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b)) + + +AVX2NEON_ABI +__m128 _mm_permutevar_ps (__m128 a, __m128i b) +{ + uint32x4_t b_u32 = vreinterpretq_u32_m128i(b); + float32x4_t x; + for (int i=0;i<4;i++) + { + x[i] = a[b_u32[i]]; + } + return vreinterpretq_m128_f32(x); +} + +AVX2NEON_ABI +__m256i _mm256_setzero_si256() +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_setzero_ps() +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(0.0f); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_undefined_si256() +{ + return _mm256_setzero_si256(); +} + +AVX2NEON_ABI +__m256 _mm256_undefined_ps() +{ + return _mm256_setzero_ps(); +} + +CAST_SIMD_TYPE(__m256d, _mm256_castps_pd, __m256, float64x2_t) +CAST_SIMD_TYPE(__m256i, _mm256_castps_si256, __m256, __m128i) +CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i, __m128) +CAST_SIMD_TYPE(__m256, _mm256_castpd_ps , __m256d, __m128) +CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i, float64x2_t) +CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d, __m128i) + + + + +AVX2NEON_ABI +__m128 _mm256_castps256_ps128 (__m256 a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256i _mm256_castsi128_si256 (__m128i a) +{ + __m256i res; + res.lo = a ; + res.hi = vdupq_n_s32(0); + return res; +} + +AVX2NEON_ABI +__m128i _mm256_castsi256_si128 (__m256i a) +{ + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_castps128_ps256 (__m128 a) +{ + __m256 res; + res.lo = a; + res.hi = vdupq_n_f32(0); + return res; +} + + +AVX2NEON_ABI +__m256 _mm256_broadcast_ss (float const * mem_addr) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(*mem_addr); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) +{ + __m256i res; + res.lo = _mm_set_epi32(e3,e2,e1,e0); + res.hi = _mm_set_epi32(e7,e6,e5,e4); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_set1_epi32 (int a) +{ + __m256i res; + res.lo = res.hi = vdupq_n_s32(a); + return res; +} +AVX2NEON_ABI +__m256i _mm256_set1_epi8 (int a) +{ + __m256i res; + res.lo = res.hi = vdupq_n_s8(a); + return res; +} +AVX2NEON_ABI +__m256i _mm256_set1_epi16 (int a) +{ + __m256i res; + res.lo = res.hi = vdupq_n_s16(a); + return res; +} + + + + +AVX2NEON_ABI +int _mm256_movemask_ps(const __m256& v) +{ + return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo); +} + +template<int imm8> +AVX2NEON_ABI +__m256 __mm256_permute_ps (const __m256& a) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8); + return res; + +} + +#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a) + + +template<int imm8> +AVX2NEON_ABI +__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b) +{ + __m256 res; + res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8); + res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8); + return res; + +} + +template<int imm8> +AVX2NEON_ABI +__m256i __mm256_shuffle_epi32 (const __m256i a) +{ + __m256i res; + res.lo = _mm_shuffle_epi32(a.lo,imm8); + res.hi = _mm_shuffle_epi32(a.hi,imm8); + return res; + +} + +template<int imm8> +AVX2NEON_ABI +__m256i __mm256_srli_si256 (__m256i a) +{ + 
__m256i res;
+ res.lo = _mm_srli_si128(a.lo,imm8);
+ res.hi = _mm_srli_si128(a.hi,imm8);
+ return res;
+}
+
+template<int imm8>
+AVX2NEON_ABI
+__m256i __mm256_slli_si256 (__m256i a)
+{
+ __m256i res;
+ res.lo = _mm_slli_si128(a.lo,imm8);
+ res.hi = _mm_slli_si128(a.hi,imm8);
+ return res;
+}
+
+
+#define _mm256_srli_si256(a,b) __mm256_srli_si256<b>(a)
+#define _mm256_slli_si256(a,b) __mm256_slli_si256<b>(a)
+
+
+#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
+#define _mm256_shuffle_epi32(a,c) __mm256_shuffle_epi32<c>(a)
+
+
+AVX2NEON_ABI
+__m256i _mm256_set1_epi64x (long long a)
+{
+ __m256i res;
+ int64x2_t t = vdupq_n_s64(a);
+ res.lo = res.hi = __m128i(t);
+ return res;
+}
+
+
+AVX2NEON_ABI
+__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
+{
+ __m256 res;
+ __m128 tmp = _mm_setzero_ps();
+ switch (imm8 & 0x3) // bits 1:0 select the source half; bit 2 is ignored
+ {
+ case 0: tmp = a.lo; break;
+ case 1: tmp = a.hi; break;
+ case 2: tmp = b.lo; break;
+ case 3: tmp = b.hi; break;
+ }
+ if (imm8 & 0x8) // bit 3 zeroes the lane instead
+ tmp = _mm_setzero_ps();
+
+ res.lo = tmp;
+ imm8 >>= 4;
+
+ switch (imm8 & 0x3)
+ {
+ case 0: tmp = a.lo; break;
+ case 1: tmp = a.hi; break;
+ case 2: tmp = b.lo; break;
+ case 3: tmp = b.hi; break;
+ }
+ if (imm8 & 0x8)
+ tmp = _mm_setzero_ps();
+
+ res.hi = tmp;
+
+ return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_moveldup_ps (__m256 a)
+{
+ __m256 res;
+ res.lo = _mm_moveldup_ps(a.lo);
+ res.hi = _mm_moveldup_ps(a.hi);
+ return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_movehdup_ps (__m256 a)
+{
+ __m256 res;
+ res.lo = _mm_movehdup_ps(a.lo);
+ res.hi = _mm_movehdup_ps(a.hi);
+ return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
+{
+ __m256 res = a;
+ if (imm8 & 1) res.hi = b;
+ else res.lo = b;
+ return res;
+}
+
+
+AVX2NEON_ABI
+__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
+{
+ if (imm8 & 1) return a.hi;
+ return a.lo;
+}
+
+
+AVX2NEON_ABI
+__m256d _mm256_movedup_pd (__m256d a)
+{
+ __m256d res;
+ res.lo = _mm_movedup_pd(a.lo);
+ res.hi = _mm_movedup_pd(a.hi);
+ return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_abs_epi32(__m256i a)
+{
+ __m256i res;
+ res.lo = vabsq_s32(a.lo);
+ res.hi = vabsq_s32(a.hi);
+ return res;
+}
+
+UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
+UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
+UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
+UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
+UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
+UNARY_AVX_OP(__m256i,_mm256_abs_epi16,_mm_abs_epi16)
+
+
+BINARY_AVX_OP(__m256i,_mm256_add_epi8,_mm_add_epi8)
+BINARY_AVX_OP(__m256i,_mm256_adds_epi8,_mm_adds_epi8)
+
+BINARY_AVX_OP(__m256i,_mm256_hadd_epi32,_mm_hadd_epi32)
+BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
+BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
+BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
+
+BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
+BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
+BINARY_AVX_OP(__m256i,_mm256_min_epi16,_mm_min_epi16)
+BINARY_AVX_OP(__m256i,_mm256_max_epi16,_mm_max_epi16)
+BINARY_AVX_OP(__m256i,_mm256_min_epi8,_mm_min_epi8)
+BINARY_AVX_OP(__m256i,_mm256_max_epi8,_mm_max_epi8)
+BINARY_AVX_OP(__m256i,_mm256_min_epu16,_mm_min_epu16)
+BINARY_AVX_OP(__m256i,_mm256_max_epu16,_mm_max_epu16)
+BINARY_AVX_OP(__m256i,_mm256_min_epu8,_mm_min_epu8)
+BINARY_AVX_OP(__m256i,_mm256_max_epu8,_mm_max_epu8)
+BINARY_AVX_OP(__m256i,_mm256_sign_epi16,_mm_sign_epi16)
+
+
+BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
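+// The wrapper one-liners above and below all follow the same split-register
+// pattern: each 256-bit op is emulated as two independent 128-bit NEON ops
+// on the .lo/.hi halves. Roughly (editorial sketch; the real BINARY_AVX_OP
+// definition lives earlier in this header and may differ in detail):
+//
+//   #define BINARY_AVX_OP(type,func,basic_func)   \
+//     AVX2NEON_ABI type func(type a, type b)      \
+//     { type res;                                 \
+//       res.lo = basic_func(a.lo, b.lo);          \
+//       res.hi = basic_func(a.hi, b.hi);          \
+//       return res; }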
+BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t) + +BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps) +BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps) + +BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps) +BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps) +BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps) +BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps) + +BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps) +BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps) +BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps) +BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps) + +BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t) +BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t) +BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t) + + + +BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128) +BINARY_AVX_OP(__m256i,_mm256_andnot_si256,_mm_andnot_si128) +BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128) +BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128) + + +BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps) +BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps) +TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps) +TERNARY_AVX_OP(__m256i,_mm256_blendv_epi8,_mm_blendv_epi8) + + +TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps) +TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps) +TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps) + + + +BINARY_AVX_OP(__m256i,_mm256_packs_epi32,_mm_packs_epi32) +BINARY_AVX_OP(__m256i,_mm256_packs_epi16,_mm_packs_epi16) +BINARY_AVX_OP(__m256i,_mm256_packus_epi32,_mm_packus_epi32) +BINARY_AVX_OP(__m256i,_mm256_packus_epi16,_mm_packus_epi16) + + +BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi64,_mm_unpackhi_epi64) +BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32) +BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi16,_mm_unpackhi_epi16) +BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi8,_mm_unpackhi_epi8) + +BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi64,_mm_unpacklo_epi64) +BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32) +BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi16,_mm_unpacklo_epi16) +BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi8,_mm_unpacklo_epi8) + +BINARY_AVX_OP(__m256i,_mm256_mulhrs_epi16,_mm_mulhrs_epi16) +BINARY_AVX_OP(__m256i,_mm256_mulhi_epu16,_mm_mulhi_epu16) +BINARY_AVX_OP(__m256i,_mm256_mulhi_epi16,_mm_mulhi_epi16) +//BINARY_AVX_OP(__m256i,_mm256_mullo_epu16,_mm_mullo_epu16) +BINARY_AVX_OP(__m256i,_mm256_mullo_epi16,_mm_mullo_epi16) + +BINARY_AVX_OP(__m256i,_mm256_subs_epu16,_mm_subs_epu16) +BINARY_AVX_OP(__m256i,_mm256_adds_epu16,_mm_adds_epu16) +BINARY_AVX_OP(__m256i,_mm256_subs_epi16,_mm_subs_epi16) +BINARY_AVX_OP(__m256i,_mm256_adds_epi16,_mm_adds_epi16) +BINARY_AVX_OP(__m256i,_mm256_sub_epi16,_mm_sub_epi16) +BINARY_AVX_OP(__m256i,_mm256_add_epi16,_mm_add_epi16) +BINARY_AVX_OP(__m256i,_mm256_sub_epi8,_mm_sub_epi8) + + +BINARY_AVX_OP(__m256i,_mm256_hadd_epi16,_mm_hadd_epi16) +BINARY_AVX_OP(__m256i,_mm256_hadds_epi16,_mm_hadds_epi16) + + + + +BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32) +BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32) + +BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi8,_mm_cmpeq_epi8) +BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi8,_mm_cmpgt_epi8) + +BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi16,_mm_cmpeq_epi16) +BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi16,_mm_cmpgt_epi16) + + +BINARY_AVX_OP(__m256i,_mm256_shuffle_epi8,_mm_shuffle_epi8) + + +BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps) 
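+// Editorial sketch (hypothetical helper, not part of the upstream header):
+// a quick check of the dpps_neon specializations defined near the top of
+// this file. With mask 0x7f, lane 3 is dropped from the dot product and the
+// xyz result is broadcast to every lane.
+AVX2NEON_ABI bool _avx2neon_dp_ps_sanity()
+{
+ __m128 a = _mm_set_ps(9.0f, 3.0f, 2.0f, 1.0f); // lanes {1, 2, 3, 9}
+ __m128 b = _mm_set_ps(9.0f, 1.0f, 1.0f, 1.0f); // lanes {1, 1, 1, 9}
+ __m128 d = _mm_dp_ps(a, b, 0x7f);              // 1*1 + 2*1 + 3*1 = 6
+ return d[0] == 6.0f && d[3] == 6.0f;
+}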
+BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps) +BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps) +BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps) +BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps) +BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps) + + +AVX2NEON_ABI +__m256i _mm256_cvtps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvtps_epi32(a.lo); + res.hi = _mm_cvtps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_cvttps_epi32 (__m256 a) +{ + __m256i res; + res.lo = _mm_cvttps_epi32(a.lo); + res.hi = _mm_cvttps_epi32(a.hi); + return res; + +} + +AVX2NEON_ABI +__m256 _mm256_loadu_ps (float const * mem_addr) +{ + __m256 res; + res.lo = *(__m128 *)(mem_addr + 0); + res.hi = *(__m128 *)(mem_addr + 4); + return res; +} +#define _mm256_load_ps _mm256_loadu_ps + + +AVX2NEON_ABI +int _mm256_testz_ps (const __m256& a, const __m256& b) +{ + __m256 t = a; + if (&a != &b) + t = _mm256_and_ps(a,b); + + int32x4_t l = vshrq_n_s32(vreinterpretq_s32_m128(t.lo),31); + int32x4_t h = vshrq_n_s32(vreinterpretq_s32_m128(t.hi),31); + return vaddvq_s32(vaddq_s32(l,h)) == 0; +} + + +AVX2NEON_ABI +__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) +{ + __m256i res; + int64x2_t t0 = {e0,e1}; + int64x2_t t1 = {e2,e3}; + res.lo = __m128i(t0); + res.hi = __m128i(t1); + return res; +} +AVX2NEON_ABI +__m256i _mm256_setr_epi64x (int64_t e0, int64_t e1, int64_t e2, int64_t e3) +{ + __m256i res; + int64x2_t t0 = {e0,e1}; + int64x2_t t1 = {e2,e3}; + res.lo = __m128i(t0); + res.hi = __m128i(t1); + return res; +} + + + +AVX2NEON_ABI +__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) +{ + int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15}; + int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31}; + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_setr_epi8 (char e0, char e1, char e2, char e3, char e4, char e5, char e6, char e7, char e8, char e9, char e10, char e11, char e12, char e13, char e14, char e15, char e16, char e17, char e18, char e19, char e20, char e21, char e22, char e23, char e24, char e25, char e26, char e27, char e28, char e29, char e30, char e31) +{ + int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15}; + int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31}; + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) +{ + int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7}; + int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15}; + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_setr_epi16 (short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15) +{ + int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7}; 
+ int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15}; + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + + + + +AVX2NEON_ABI +int _mm256_movemask_epi8(const __m256i& a) +{ + return (_mm_movemask_epi8(a.hi) << 16) | _mm_movemask_epi8(a.lo); +} + + +AVX2NEON_ABI +int _mm256_testz_si256(const __m256i& a,const __m256i& b) +{ + uint32x4_t lo = vandq_u32(a.lo,b.lo); + uint32x4_t hi = vandq_u32(a.hi,b.hi); + + return (vaddvq_u32(lo) + vaddvq_u32(hi)) == 0; +} + +AVX2NEON_ABI +__m256d _mm256_setzero_pd () +{ + __m256d res; + res.lo = res.hi = vdupq_n_f64(0); + return res; +} + +AVX2NEON_ABI +int _mm256_movemask_pd (__m256d a) +{ + return (_mm_movemask_pd(a.hi) << 2) | _mm_movemask_pd(a.lo); +} + +AVX2NEON_ABI +__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b) +{ + __m256i res; + res.lo = _mm_cmpeq_epi64(a.lo, b.lo); + res.hi = _mm_cmpeq_epi64(a.hi, b.hi); + return res; +} + +AVX2NEON_ABI +__m256d _mm256_cmpeq_pd (__m256d a, __m256d b) +{ + __m256d res; + res.lo = _mm_cmpeq_pd(a.lo, b.lo); + res.hi = _mm_cmpeq_pd(a.hi, b.hi); + return res; +} + + +AVX2NEON_ABI +int _mm256_testz_pd (const __m256d& a, const __m256d& b) +{ + __m256d t = a; + + if (&a != &b) + t = _mm256_and_pd(a,b); + + return _mm256_movemask_pd(t) == 0; +} + +AVX2NEON_ABI +__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) +{ + __m256d res; + res.lo = _mm_blendv_pd(a.lo, b.lo, mask.lo); + res.hi = _mm_blendv_pd(a.hi, b.hi, mask.hi); + return res; +} + +template<int imm8> +AVX2NEON_ABI +__m256 __mm256_dp_ps (__m256 a, __m256 b) +{ + __m256 res; + res.lo = _mm_dp_ps(a.lo, b.lo, imm8); + res.hi = _mm_dp_ps(a.hi, b.hi, imm8); + return res; +} + +#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b) + +AVX2NEON_ABI +double _mm256_permute4x64_pd_select(__m256d a, const int imm8) +{ + switch (imm8 & 3) { + case 0: + return ((float64x2_t)a.lo)[0]; + case 1: + return ((float64x2_t)a.lo)[1]; + case 2: + return ((float64x2_t)a.hi)[0]; + case 3: + return ((float64x2_t)a.hi)[1]; + } + __builtin_unreachable(); + return 0; +} + +AVX2NEON_ABI +__m256d _mm256_permute4x64_pd (__m256d a, const int imm8) +{ + float64x2_t lo,hi; + lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0); + lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2); + hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4); + hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6); + + __m256d res; + res.lo = lo; res.hi = hi; + return res; +} + +AVX2NEON_ABI +__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) +{ + return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8)); +} + + +AVX2NEON_ABI +__m256i _mm256_loadu_si256 (__m256i const * mem_addr) +{ + __m256i res; + res.lo = *(__m128i *)((int32_t *)mem_addr + 0); + res.hi = *(__m128i *)((int32_t *)mem_addr + 4); + return res; +} + +#define _mm256_load_si256 _mm256_loadu_si256 + +AVX2NEON_ABI +void _mm256_storeu_ps (float * mem_addr, __m256 a) +{ + *(__m128 *)(mem_addr + 0) = a.lo; + *(__m128 *)(mem_addr + 4) = a.hi; +} + +#define _mm256_store_ps _mm256_storeu_ps +#define _mm256_stream_ps _mm256_storeu_ps + + +AVX2NEON_ABI +void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) +{ + *(__m128i *)((int32_t *)mem_addr + 0) = a.lo; + *(__m128i *)((int32_t *)mem_addr + 4) = a.hi; +} + +#define _mm256_store_si256 _mm256_storeu_si256 + + + +AVX2NEON_ABI +__m256i _mm256_permute4x64_epi64 (const __m256i a, const int imm8) +{ + uint8x16x2_t tbl = {a.lo, a.hi}; + + uint8_t sz = sizeof(uint64_t); + uint8_t u64[4] = { + (uint8_t)(((imm8 >> 0) & 0x3) * sz), + (uint8_t)(((imm8 >> 2) & 0x3) * sz), + (uint8_t)(((imm8 >> 4) & 0x3) * 
sz), + (uint8_t)(((imm8 >> 6) & 0x3) * sz), + }; + + uint8x16_t idx_lo = { + // lo[0] bytes + (uint8_t)(u64[0]+0), (uint8_t)(u64[0]+1), (uint8_t)(u64[0]+2), (uint8_t)(u64[0]+3), + (uint8_t)(u64[0]+4), (uint8_t)(u64[0]+5), (uint8_t)(u64[0]+6), (uint8_t)(u64[0]+7), + + // lo[1] bytes + (uint8_t)(u64[1]+0), (uint8_t)(u64[1]+1), (uint8_t)(u64[1]+2), (uint8_t)(u64[1]+3), + (uint8_t)(u64[1]+4), (uint8_t)(u64[1]+5), (uint8_t)(u64[1]+6), (uint8_t)(u64[1]+7), + }; + uint8x16_t idx_hi = { + // hi[0] bytes + (uint8_t)(u64[2]+0), (uint8_t)(u64[2]+1), (uint8_t)(u64[2]+2), (uint8_t)(u64[2]+3), + (uint8_t)(u64[2]+4), (uint8_t)(u64[2]+5), (uint8_t)(u64[2]+6), (uint8_t)(u64[2]+7), + + // hi[1] bytes + (uint8_t)(u64[3]+0), (uint8_t)(u64[3]+1), (uint8_t)(u64[3]+2), (uint8_t)(u64[3]+3), + (uint8_t)(u64[3]+4), (uint8_t)(u64[3]+5), (uint8_t)(u64[3]+6), (uint8_t)(u64[3]+7), + }; + + uint8x16_t lo = vqtbl2q_u8(tbl, idx_lo); + uint8x16_t hi = vqtbl2q_u8(tbl, idx_hi); + + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_permute2x128_si256(const __m256i a,const __m256i b, const int imm8) +{ + return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8)); +} + + + +AVX2NEON_ABI +__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask) +{ + __m256 res; + res.lo = _mm_maskload_ps(mem_addr,mask.lo); + res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepu8_epi32 (__m128i a) +{ + uint8x16_t a_u8 = vreinterpretq_u8_m128i(a); // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(a_u8)); // 00HH 00GG 00FF 00EE 00DD 00CC 00BB 00AA + uint32x4_t lo = vmovl_u16(vget_low_u16(u16x8)); // 0000 00DD 0000 00CC 0000 00BB 0000 00AA + uint32x4_t hi = vmovl_high_u16(u16x8); // 0000 00HH 0000 00GG 0000 00FF 0000 00EE + + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepi8_epi32 (__m128i a) +{ + int8x16_t a_s8 = vreinterpretq_s8_m128i(a); // xxxx xxxx xxxx xxxx HHGG FFEE DDCC BBAA + int16x8_t s16x8 = vmovl_s8(vget_low_s8(a_s8)); // ssHH ssGG ssFF ssEE ssDD ssCC ssBB ssAA + int32x4_t lo = vmovl_s16(vget_low_s16(s16x8)); // ssss ssDD ssss ssCC ssss ssBB ssss ssAA + int32x4_t hi = vmovl_high_s16(s16x8); // ssss ssHH ssss ssGG ssss ssFF ssss ssEE + + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_cvtepi16_epi32 (__m128i a) +{ + int16x8_t a_s16 = vreinterpretq_s16_m128i(a); // HHHH GGGG FFFF EEEE DDDD CCCC BBBB AAAA + int32x4_t lo = vmovl_s16(vget_low_s16(a_s16)); // ssss DDDD ssss CCCC ssss BBBB ssss AAAA + int32x4_t hi = vmovl_high_s16(a_s16); // ssss HHHH ssss GGGG ssss FFFF ssss EEEE + + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + + + +AVX2NEON_ABI +void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a) +{ + _mm_maskstore_epi32(mem_addr,mask.lo,a.lo); + _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi); +} + +AVX2NEON_ABI +__m256i _mm256_slli_epi64 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_slli_epi64(a.lo,imm8); + res.hi = _mm_slli_epi64(a.hi,imm8); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_slli_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_slli_epi32(a.lo,imm8); + res.hi = _mm_slli_epi32(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i __mm256_slli_epi16 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_slli_epi16(a.lo,imm8); + res.hi = _mm_slli_epi16(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_srli_epi32 (__m256i a, int 
imm8) +{ + __m256i res; + res.lo = _mm_srli_epi32(a.lo,imm8); + res.hi = _mm_srli_epi32(a.hi,imm8); + return res; +} + +AVX2NEON_ABI +__m256i __mm256_srli_epi16 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srli_epi16(a.lo,imm8); + res.hi = _mm_srli_epi16(a.hi,imm8); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cvtepu16_epi32(__m128i a) +{ + __m256i res; + res.lo = vmovl_u16(vget_low_u16(a)); + res.hi = vmovl_high_u16(a); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_cvtepu8_epi16(__m128i a) +{ + __m256i res; + res.lo = vmovl_u8(vget_low_u8(a)); + res.hi = vmovl_high_u8(a); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_srai_epi32 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srai_epi32(a.lo,imm8); + res.hi = _mm_srai_epi32(a.hi,imm8); + return res; +} + +AVX2NEON_ABI +__m256i _mm256_srai_epi16 (__m256i a, int imm8) +{ + __m256i res; + res.lo = _mm_srai_epi16(a.lo,imm8); + res.hi = _mm_srai_epi16(a.hi,imm8); + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_sllv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,count.lo); + res.hi = vshlq_s32(a.hi,count.hi); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_srav_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo)); + res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi)); + return res; + +} + +AVX2NEON_ABI +__m256i _mm256_srlv_epi32 (__m256i a, __m256i count) +{ + __m256i res; + res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo))); + res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi))); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) +{ + return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8)); +} + + +AVX2NEON_ABI +__m128i _mm256_extractf128_si256 (__m256i a, const int imm8) +{ + if (imm8 & 1) return a.hi; + return a.lo; +} + +AVX2NEON_ABI +__m256 _mm256_set1_ps(float x) +{ + __m256 res; + res.lo = res.hi = vdupq_n_f32(x); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) +{ + __m256 res; + res.lo = _mm_set_ps(e3,e2,e1,e0); + res.hi = _mm_set_ps(e7,e6,e5,e4); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_broadcast_ps (__m128 const * mem_addr) +{ + __m256 res; + res.lo = res.hi = *mem_addr; + return res; +} + +AVX2NEON_ABI +__m256 _mm256_cvtepi32_ps (__m256i a) +{ + __m256 res; + res.lo = _mm_cvtepi32_ps(a.lo); + res.hi = _mm_cvtepi32_ps(a.hi); + return res; +} +AVX2NEON_ABI +void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) +{ + uint32x4_t mask_lo = mask.lo; + uint32x4_t mask_hi = mask.hi; + float32x4_t a_lo = a.lo; + float32x4_t a_hi = a.hi; + + for (int i=0;i<4;i++) { + if (mask_lo[i] & 0x80000000) mem_addr[i] = a_lo[i]; + if (mask_hi[i] & 0x80000000) mem_addr[i+4] = a_hi[i]; + } +} + +AVX2NEON_ABI +__m256d _mm256_andnot_pd (__m256d a, __m256d b) +{ + __m256d res; + res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo))); + res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi))); + return res; +} + +AVX2NEON_ABI +__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8) +{ + __m256 res; + res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf); + res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4); + return res; + +} + + +AVX2NEON_ABI +__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) +{ + return __m256i(_mm256_blend_ps(__m256(a),__m256(b),imm8)); + +} + +AVX2NEON_ABI +__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8) +{ + 
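// As on real AVX2 hardware, the same 8 control bits are applied to each
+ // 128-bit lane (contrast _mm256_blend_ps above, which splits imm8 4/4).
+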
__m256i res; + res.lo = _mm_blend_epi16(a.lo,b.lo,imm8); + res.hi = _mm_blend_epi16(a.hi,b.hi,imm8); + return res; +} + + + +AVX2NEON_ABI +__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale) +{ + int32x4_t vindex_lo = vindex.lo; + int32x4_t vindex_hi = vindex.hi; + int32x4_t lo,hi; + for (int i=0;i<4;i++) + { + lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale)); + hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale)); + } + + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} + + +AVX2NEON_ABI +__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale) +{ + uint32x4_t mask_lo = mask.lo; + uint32x4_t mask_hi = mask.hi; + int32x4_t vindex_lo = vindex.lo; + int32x4_t vindex_hi = vindex.hi; + int32x4_t lo,hi; + lo = hi = _mm_setzero_si128(); + for (int i=0;i<4;i++) + { + if (mask_lo[i] >> 31) lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale)); + if (mask_hi[i] >> 31) hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale)); + } + + __m256i res; + res.lo = lo; res.hi = hi; + return res; +} diff --git a/thirdparty/embree/common/simd/arm/emulation.h b/thirdparty/embree/common/simd/arm/emulation.h index 1c3875fb27..4327298019 100644 --- a/thirdparty/embree/common/simd/arm/emulation.h +++ b/thirdparty/embree/common/simd/arm/emulation.h @@ -11,33 +11,28 @@ #include "sse2neon.h" -__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) { - __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c))); - return _mm_fmadd_ps(a, b, neg_c); -} - -__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c), - vreinterpretq_f32_m128(b), - vreinterpretq_f32_m128(a))); -#else - return _mm_sub_ps(c, _mm_mul_ps(a, b)); -#endif -} +__forceinline __m128 _mm_abs_ps(__m128 a) { return vabsq_f32(a); } + +__forceinline __m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c) { return vfmaq_f32(c, a, b); } +__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { return vfmsq_f32(c, a, b); } +__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmaq_f32(c, a, b)); } +__forceinline __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmsq_f32(c, a, b)); } -__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { - return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c)))); +__forceinline __m128 _mm_broadcast_ss (float const * mem_addr) +{ + return vdupq_n_f32(*mem_addr); } +// AVX2 emulation leverages Intel FMA defs above. Include after them. 
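+// Sign conventions of the wrappers above, for reference:
+//   _mm_fmadd_ps (a,b,c) =  a*b + c     _mm_fnmadd_ps(a,b,c) = -a*b + c
+//   _mm_fmsub_ps (a,b,c) =  a*b - c     _mm_fnmsub_ps(a,b,c) = -a*b - c
+// Editorial sketch (hypothetical helper, not part of the upstream patch):
+__forceinline bool _fma_emulation_sanity()
+{
+ __m128 a = vdupq_n_f32(2.0f), b = vdupq_n_f32(3.0f), c = vdupq_n_f32(1.0f);
+ return _mm_fmadd_ps (a, b, c)[0] ==  7.0f   //  2*3 + 1
+     && _mm_fmsub_ps (a, b, c)[0] ==  5.0f   //  2*3 - 1
+     && _mm_fnmadd_ps(a, b, c)[0] == -5.0f   // -2*3 + 1
+     && _mm_fnmsub_ps(a, b, c)[0] == -7.0f;  // -2*3 - 1
+}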
+#include "avx2neon.h" /* Dummy defines for floating point control */ #define _MM_MASK_MASK 0x1f80 #define _MM_MASK_DIV_ZERO 0x200 -#define _MM_FLUSH_ZERO_ON 0x8000 +// #define _MM_FLUSH_ZERO_ON 0x8000 #define _MM_MASK_DENORM 0x100 #define _MM_SET_EXCEPTION_MASK(x) -#define _MM_SET_FLUSH_ZERO_MODE(x) +// #define _MM_SET_FLUSH_ZERO_MODE(x) __forceinline int _mm_getcsr() { @@ -48,3 +43,43 @@ __forceinline void _mm_mfence() { __sync_synchronize(); } + +__forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr) +{ + uint8x8_t t0 = vld1_u8((uint8_t*)ptr); + uint16x8_t t1 = vmovl_u8(t0); + uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); + return vreinterpretq_s32_u32(t2); +} + +__forceinline __m128i _mm_load4epu16_epi32(__m128i *ptr) +{ + uint16x8_t t0 = vld1q_u16((uint16_t*)ptr); + uint32x4_t t1 = vmovl_u16(vget_low_u16(t0)); + return vreinterpretq_s32_u32(t1); +} + +__forceinline __m128i _mm_load4epi8_f32(__m128i *ptr) +{ + int8x8_t t0 = vld1_s8((int8_t*)ptr); + int16x8_t t1 = vmovl_s8(t0); + int32x4_t t2 = vmovl_s16(vget_low_s16(t1)); + float32x4_t t3 = vcvtq_f32_s32(t2); + return vreinterpretq_s32_f32(t3); +} + +__forceinline __m128i _mm_load4epu8_f32(__m128i *ptr) +{ + uint8x8_t t0 = vld1_u8((uint8_t*)ptr); + uint16x8_t t1 = vmovl_u8(t0); + uint32x4_t t2 = vmovl_u16(vget_low_u16(t1)); + return vreinterpretq_s32_u32(t2); +} + +__forceinline __m128i _mm_load4epi16_f32(__m128i *ptr) +{ + int16x8_t t0 = vld1q_s16((int16_t*)ptr); + int32x4_t t1 = vmovl_s16(vget_low_s16(t0)); + float32x4_t t2 = vcvtq_f32_s32(t1); + return vreinterpretq_s32_f32(t2); +} diff --git a/thirdparty/embree/common/simd/arm/sse2neon.h b/thirdparty/embree/common/simd/arm/sse2neon.h index 7eb25cf2c5..43416662d7 100644 --- a/thirdparty/embree/common/simd/arm/sse2neon.h +++ b/thirdparty/embree/common/simd/arm/sse2neon.h @@ -52,7 +52,7 @@ /* Enable precise implementation of math operations * This would slow down the computation a bit, but gives consistent result with - * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) */ /* _mm_min_ps and _mm_max_ps */ #ifndef SSE2NEON_PRECISE_MINMAX @@ -66,36 +66,29 @@ #ifndef SSE2NEON_PRECISE_SQRT #define SSE2NEON_PRECISE_SQRT (0) #endif -#ifndef SSE2NEON_PRECISE_RSQRT -#define SSE2NEON_PRECISE_RSQRT (0) +/* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) #endif +/* compiler specific definitions */ #if defined(__GNUC__) || defined(__clang__) #pragma push_macro("FORCE_INLINE") #pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) #define ALIGN_STRUCT(x) __attribute__((aligned(x))) -#ifndef likely -#define likely(x) __builtin_expect(!!(x), 1) -#endif -#ifndef unlikely -#define unlikely(x) __builtin_expect(!!(x), 0) -#endif -#else -#error "Macro name collisions may happen with unsupported compiler." -#ifdef FORCE_INLINE -#undef FORCE_INLINE -#endif +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +#else /* non-GNU / non-clang compilers */ +#warning "Macro name collisions may happen with unsupported compiler." 
+#ifndef FORCE_INLINE #define FORCE_INLINE static inline +#endif #ifndef ALIGN_STRUCT #define ALIGN_STRUCT(x) __declspec(align(x)) #endif -#endif -#ifndef likely -#define likely(x) (x) -#endif -#ifndef unlikely -#define unlikely(x) (x) +#define _sse2neon_likely(x) (x) +#define _sse2neon_unlikely(x) (x) #endif #include <stdint.h> @@ -155,6 +148,14 @@ * argument "a" of mm_shuffle_ps that will be places in fp1 of result. * fp0 is the same for fp0 of result. */ +#if defined(__aarch64__) +#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\ +2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } ) +#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\ +2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*\ +4)+16+3) } ) +#endif + #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) @@ -169,6 +170,14 @@ #define _MM_ROUND_DOWN 0x2000 #define _MM_ROUND_UP 0x4000 #define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 /* indicate immediate constant argument in a given range */ #define __constrange(a, b) const @@ -189,7 +198,10 @@ typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ #else typedef float32x4_t __m128d; #endif -typedef int64x2_t __m128i; /* 128-bit vector containing integers */ +// Note: upstream sse2neon declares __m128i as int64x2_t. However, there's +// many places within embree that assume __m128i can be indexed as a +// 4 element u32. 
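+// For illustration (editorial note), code such as
+//     __m128i v = _mm_set1_epi32(7);
+//     int x = v[2];   // GCC/Clang vector subscripting
+// expects each subscript to address a 32-bit lane, which only holds if
+// __m128i is a 4 x i32 vector type.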
+typedef int32x4_t __m128i; /* 128-bit vector containing integers */ /* type-safe casting between types */ @@ -221,28 +233,28 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */ #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) -#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) -#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) -#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) -#define vreinterpretq_m128i_s64(x) (x) +#define vreinterpretq_m128i_s8(x) vreinterpretq_s32_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s32_s16(x) +#define vreinterpretq_m128i_s32(x) (x) +#define vreinterpretq_m128i_s64(x) vreinterpretq_s32_s64(x) -#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) -#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) -#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) -#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) +#define vreinterpretq_m128i_u8(x) vreinterpretq_s32_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s32_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s32_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s32_u64(x) -#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) -#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s32(x) -#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) -#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) -#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) -#define vreinterpretq_s64_m128i(x) (x) +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s32(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s32(x) +#define vreinterpretq_s32_m128i(x) (x) +#define vreinterpretq_s64_m128i(x) vreinterpretq_s64_s32(x) -#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) -#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) -#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) -#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s32(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s32(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s32(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s32(x) #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) @@ -281,6 +293,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */ #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) #define vreinterpretq_f64_m128d(x) (x) @@ -303,10 +316,10 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */ #endif // A struct is defined in this header file called 'SIMDVec' which can be used -// by applications which attempt to access the contents of an _m128 struct +// by applications which attempt to access the contents of an __m128 struct // directly. It is important to note that accessing the __m128 struct directly // is bad coding practice by Microsoft: @see: -// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// https://docs.microsoft.com/en-us/cpp/cpp/m128 // // However, some legacy source code may try to access the contents of an __m128 // struct directly so the developer can use the SIMDVec as an alias for it. 
Any @@ -342,13 +355,48 @@ typedef union ALIGN_STRUCT(16) SIMDVec { #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) +/* SSE macros */ +#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode +#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode +#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode +#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode + +// Function declaration +// SSE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(); +FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); +FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); +FORCE_INLINE __m128 _mm_set_ps1(float); +FORCE_INLINE __m128 _mm_setzero_ps(void); +// SSE2 +FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_castps_si128(__m128); +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); +FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); +FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); +FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); +FORCE_INLINE __m128d _mm_set_pd(double, double); +FORCE_INLINE __m128i _mm_set1_epi32(int); +FORCE_INLINE __m128i _mm_setzero_si128(); +// SSE4.1 +FORCE_INLINE __m128d _mm_ceil_pd(__m128d); +FORCE_INLINE __m128 _mm_ceil_ps(__m128); +FORCE_INLINE __m128d _mm_floor_pd(__m128d); +FORCE_INLINE __m128 _mm_floor_ps(__m128); +FORCE_INLINE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE __m128 _mm_round_ps(__m128, int); +// SSE4.2 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); + /* Backwards compatibility for compilers with lack of specific type support */ // Older gcc does not define vld1q_u8_x4 type -#if defined(__GNUC__) && !defined(__clang__) && \ - ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \ - (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \ - (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7) +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 10 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ + (__GNUC__ <= 9 && defined(__aarch64__))) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { uint8x16x4_t ret; @@ -443,8 +491,6 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +------+------+------+------+------+------+-------------+ */ -/* Set/get methods */ - /* Constants for use with _mm_prefetch. */ enum _mm_hint { _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ @@ -457,1098 +503,1568 @@ enum _mm_hint { _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ }; -// Loads one cache line of data from address p to a location closer to the -// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx -FORCE_INLINE void _mm_prefetch(const void *p, int i) +// The bit field mapping to the FPCR(floating-point control register) +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t bit24 : 1; + uint8_t res2 : 7; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. 
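+// (The _mm_shuffle_ps_NNNN helpers below encode the selector digits in
+// their names, highest lane first; e.g. the 1032 variant that follows is
+// equivalent to _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)).)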
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) { - (void) i; - __builtin_prefetch(p); + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); } -// Pause the processor. This is typically used in spin-wait loops and depending -// on the x86 processor typical values are in the 40-100 cycle range. The -// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most -// Arm cores. Experience with several databases has shown has shown an 'isb' is -// a reasonable approximation. -FORCE_INLINE void _mm_pause() +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) { - __asm__ __volatile__("isb\n"); + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); } -// Copy the lower single-precision (32-bit) floating-point element of a to dst. -// -// dst[31:0] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 -FORCE_INLINE float _mm_cvtss_f32(__m128 a) +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) { - return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); } -// Convert the lower single-precision (32-bit) floating-point element in b to a -// double-precision (64-bit) floating-point element, store the result in the -// lower element of dst, and copy the upper element from a to the upper element -// of dst. 
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); 
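+ // note: reads lane 2 of b without the vreinterpretq_f32_m128 wrapper its
+ // siblings use; this only works because __m128 is float32x4_t here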
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// Kahan summation for accurate summation of floating-point numbers. +// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html +FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) +{ + y -= *c; + float t = *sum + y; + *c = (t - *sum) - y; + *sum = t; +} + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. // -// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -// dst[127:64] := a[127:64] +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd -FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. +// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { - double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. 
#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); #else - return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); #endif -} + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 -#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 64-bit integer, and store the result in dst. -// -// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 -FORCE_INLINE int _mm_cvtss_si64(__m128 a) -{ + // De-interleave #if defined(__aarch64__) - return vgetq_lane_s64( - vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0); + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); #else - float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t diff = data - floor(data); - if (diff > 0.5) - return (int64_t) ceil(data); - if (unlikely(diff == 0.5)) { - int64_t f = (int64_t) floor(data); - int64_t c = (int64_t) ceil(data); - return c & 1 ? 
f : c; - } - return (int64_t) floor(data); + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); #endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); } +#endif // ARMv7 polyfill -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 32-bit integers with truncation, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi -FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) { - return vreinterpret_m64_s32( - vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); } -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer with truncation, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si -FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
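+// (e.g. equivalent to _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)))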
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) { - return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); } -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 32-bit integers with truncation, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 -#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer with truncation, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 -#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 64-bit integer with truncation, and store the result in dst. -// -// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 -FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) { - return vgetq_lane_s64( - vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); } -// Sets the 128-bit value to zero -// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx -FORCE_INLINE __m128i _mm_setzero_si128(void) +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) { - return vreinterpretq_m128i_s32(vdupq_n_s32(0)); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); } -// Clears the four single-precision, floating-point values. 
-// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setzero_ps(void) +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) { - return vreinterpretq_m128_f32(vdupq_n_f32(0)); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); } -// Return vector of type __m128d with all elements set to zero. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd -FORCE_INLINE __m128d _mm_setzero_pd(void) +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) { + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) #if defined(__aarch64__) - return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) #else - return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) #endif -} -// Sets the four single-precision, floating-point values to w. +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. // -// r0 := r1 := r2 := r3 := w +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } // -// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set1_ps(float _w) +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified +// by imm. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) +// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflelo_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified +// by imm. +// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +/* SSE */ + +// Adds the four single-precision, floating-point values of a and b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { - return vreinterpretq_m128_f32(vdupq_n_f32(_w)); + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Sets the four single-precision, floating-point values to w. -// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps1(float _w) +// adds the scalar single-precision floating point values of a and b. +// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) { - return vreinterpretq_m128_f32(vdupq_n_f32(_w)); + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of <a>. + return vreinterpretq_m128_f32(vaddq_f32(a, value)); } -// Sets the four single-precision, floating-point values to the four inputs. -// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. +// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { - float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; - return vreinterpretq_m128_f32(vld1q_f32(data)); + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } -// Copy single-precision (32-bit) floating-point element a to the lower element -// of dst, and zero the upper 3 elements. 
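// A minimal check of the _mm_add_ss contract above (illustrative sketch, not
// part of this header): only lane 0 is summed; lanes 1-3 pass through from a.
static inline void add_ss_demo(void)
{
    // _mm_set_ps(w, z, y, x) places x in lane 0.
    __m128 r = _mm_add_ss(_mm_set_ps(4, 3, 2, 1), _mm_set_ps(40, 30, 20, 10));
    float out[4];
    _mm_storeu_ps(out, r);
    // out is {11, 2, 3, 4}: lane 0 = 1 + 10, the rest copied from a.
    (void) out;
}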
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss -FORCE_INLINE __m128 _mm_set_ss(float a) +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) { - float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; - return vreinterpretq_m128_f32(vld1q_f32(data)); + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap } -// Sets the four single-precision, floating-point values to the four inputs in -// reverse order. -// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) { - float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; - return vreinterpretq_m128_f32(vld1q_f32(data)); + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); } -// Sets the 8 signed 16-bit integer values in reverse order. +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. // -// Return Value -// r0 := w0 -// r1 := w1 -// ... -// r7 := w7 -FORCE_INLINE __m128i _mm_setr_epi16(short w0, - short w1, - short w2, - short w3, - short w4, - short w5, - short w6, - short w7) +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) { - int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; - return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } -// Sets the 4 signed 32-bit integer values in reverse order -// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx -FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { - int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; - return vreinterpretq_m128i_s32(vld1q_s32(data)); + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Set packed 64-bit integers in dst with the supplied values in reverse order. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 -FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { - return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); } -// Sets the 16 signed 8-bit integer values to b. +// Compares for greater than or equal. 
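// Note on the two averages above: NEON's vrhadd ("rounding halving add")
// computes (a + b + 1) >> 1 without intermediate overflow, which is exactly
// the pavg semantics quoted in the pseudocode.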
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compares for greater than. // -// r0 := b -// r1 := b -// ... -// r15 := b +// r0 := (a0 > b0) ? 0xffffffff : 0x0 +// r1 := (a1 > b1) ? 0xffffffff : 0x0 +// r2 := (a2 > b2) ? 0xffffffff : 0x0 +// r3 := (a3 > b3) ? 0xffffffff : 0x0 // -// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { - return vreinterpretq_m128i_s8(vdupq_n_s8(w)); + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Broadcast double-precision (64-bit) floating-point value a to all elements of -// dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd -FORCE_INLINE __m128d _mm_set1_pd(double d) +// Compares for greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vdupq_n_f64(d)); -#else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); -#endif + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); } -// Sets the 8 signed 16-bit integer values to w. +// Compares for less than or equal. // -// r0 := w -// r1 := w -// ... -// r7 := w +// r0 := (a0 <= b0) ? 0xffffffff : 0x0 +// r1 := (a1 <= b1) ? 0xffffffff : 0x0 +// r2 := (a2 <= b2) ? 0xffffffff : 0x0 +// r3 := (a3 <= b3) ? 0xffffffff : 0x0 // -// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set1_epi16(short w) +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { - return vreinterpretq_m128i_s16(vdupq_n_s16(w)); + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Sets the 16 signed 8-bit integer values. -// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set_epi8(signed char b15, - signed char b14, - signed char b13, - signed char b12, - signed char b11, - signed char b10, - signed char b9, - signed char b8, - signed char b7, - signed char b6, - signed char b5, - signed char b4, - signed char b3, - signed char b2, - signed char b1, - signed char b0) +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); + return _mm_move_ss(a, _mm_cmple_ps(a, b)); } -// Sets the 8 signed 16-bit integer values. 
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set_epi16(short i7, - short i6, - short i5, - short i4, - short i3, - short i2, - short i1, - short i0) +// Compares for less than +// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { - int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - return vreinterpretq_m128i_s16(vld1q_s16(data)); + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Sets the 16 signed 8-bit integer values in reverse order. -// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx -FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, - signed char b1, - signed char b2, - signed char b3, - signed char b4, - signed char b5, - signed char b6, - signed char b7, - signed char b8, - signed char b9, - signed char b10, - signed char b11, - signed char b12, - signed char b13, - signed char b14, - signed char b15) +// Compares for less than +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); } -// Sets the 4 signed 32-bit integer values to i. -// -// r0 := i -// r1 := i -// r2 := i -// r3 := I -// -// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi32(int _i) +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { - return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Sets the 2 signed 64-bit integer values to i. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) -FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { - return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); } -// Sets the 2 signed 64-bit integer values to i. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x -FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { - return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Sets the 4 signed 32-bit integer values. -// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +// Compares for not greater than or equal. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { - int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; - return vreinterpretq_m128i_s32(vld1q_s32(data)); + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); } -// Returns the __m128i structure with its two 64-bit integer values -// initialized to the values of the two 64-bit integers passed in. -// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx -FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { - return vreinterpretq_m128i_s64( - vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Returns the __m128i structure with its two 64-bit integer values -// initialized to the values of the two 64-bit integers passed in. -// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx -FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { - return _mm_set_epi64x((int64_t) i1, (int64_t) i2); + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); } -// Set packed double-precision (64-bit) floating-point elements in dst with the -// supplied values. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd -FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { - double ALIGN_STRUCT(16) data[2] = {e0, e1}; -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); -#else - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); -#endif + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Set packed double-precision (64-bit) floating-point elements in dst with the -// supplied values in reverse order. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd -FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { - return _mm_set_pd(e0, e1); + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); } -// Copy double-precision (64-bit) floating-point element a to the lower element -// of dst, and zero the upper element. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd -FORCE_INLINE __m128d _mm_set_sd(double a) +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { - return _mm_set_pd(0, a); + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Broadcast double-precision (64-bit) floating-point value a to all elements of -// dst. 
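// Note on the *_ss comparisons above: each runs the packed *_ps compare and
// then _mm_move_ss(a, result), which keeps the 32-bit mask only in lane 0 and
// restores lanes 1-3 from a, matching the SSE scalar-compare contract.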
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 -#define _mm_set_pd1 _mm_set1_pd +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx -FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). +// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) { - vst1q_f32(p, vreinterpretq_f32_m128(a)); + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); } -// Store the lower single-precision (32-bit) floating-point element from a into -// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte -// boundary or a general-protection exception may be generated. -// -// MEM[mem_addr+31:mem_addr] := a[31:0] -// MEM[mem_addr+63:mem_addr+32] := a[31:0] -// MEM[mem_addr+95:mem_addr+64] := a[31:0] -// MEM[mem_addr+127:mem_addr+96] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 -FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - vst1q_f32(p, vdupq_n_f32(a0)); + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); } -// Store the lower single-precision (32-bit) floating-point element from a into -// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte -// boundary or a general-protection exception may be generated. -// -// MEM[mem_addr+31:mem_addr] := a[31:0] -// MEM[mem_addr+63:mem_addr+32] := a[31:0] -// MEM[mem_addr+95:mem_addr+64] := a[31:0] -// MEM[mem_addr+127:mem_addr+96] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps -#define _mm_store1_ps _mm_store_ps1 +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} -// Store 4 single-precision (32-bit) floating-point elements from a into memory -// in reverse order. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. 
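// The self-compare trick above in scalar form (illustrative, not part of this
// header): IEEE-754 makes x == x false exactly when x is NaN, so the
// "ordered" predicate per lane is simply:
static inline int ordered_ref(float a, float b)
{
    return (a == a) && (b == b); /* false iff either operand is NaN */
}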
-// -// MEM[mem_addr+31:mem_addr] := a[127:96] -// MEM[mem_addr+63:mem_addr+32] := a[95:64] -// MEM[mem_addr+95:mem_addr+64] := a[63:32] -// MEM[mem_addr+127:mem_addr+96] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps -FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +// Compares for unordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { - float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); - float32x4_t rev = vextq_f32(tmp, tmp, 2); - vst1q_f32(p, rev); + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); } -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +// Compares the lower single-precision floating point scalar values of a and b +// using an equality operation. : +// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { - vst1q_f32(p, vreinterpretq_f32_m128(a)); + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; } -// Stores four 32-bit integer values as (as a __m128i value) at the address p. -// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than or equal operation. : +// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; } -// Stores four 32-bit integer values as (as a __m128i value) at the address p. -// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +// Compares the lower single-precision floating point scalar values of a and b +// using a greater than operation. : +// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; } -// Stores the lower single - precision, floating - point value. -// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx -FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +// Compares the lower single-precision floating point scalar values of a and b +// using a less than or equal operation. : +// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { - vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; } -// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point -// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary -// or a general-protection exception may be generated. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd -FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +// Compares the lower single-precision floating point scalar values of a and b +// using a less than operation. : +// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx +// Important note: the MSDN documentation is incorrect; if either value is a +// NaN, this returns zero, where the docs claim one. +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { -#if defined(__aarch64__) - vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); -#else - vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); -#endif + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; } -// Store the upper double-precision (64-bit) floating-point element from a into -// memory. +// Compares the lower single-precision floating point scalar values of a and b +// using an inequality operation. : +// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. // -// MEM[mem_addr+63:mem_addr] := a[127:64] +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd -FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { -#if defined(__aarch64__) - vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); -#else - vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); -#endif + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); } -// Store the lower double-precision (64-bit) floating-point element from a into -// memory. +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. // -// MEM[mem_addr+63:mem_addr] := a[63:0] +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd -FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { #if defined(__aarch64__) - vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); #else - vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); + return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); #endif }
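// A small demo of the NaN behaviour noted for _mm_comilt_ss above: with a NaN
// operand every comi* predicate here reports 0, except _mm_comineq_ss, which
// is defined as the negation of _mm_comieq_ss and therefore reports 1.
// Illustrative only, not part of this header; NAN comes from <math.h>.
static inline void comi_nan_demo(void)
{
    int lt = _mm_comilt_ss(_mm_set_ss(NAN), _mm_set_ss(1.0f)); /* 0 */
    int ne = _mm_comineq_ss(_mm_set_ss(NAN), _mm_set_ss(1.0f)); /* 1 */
    (void) lt;
    (void) ne;
}

-// Store 2 double-precision (64-bit) floating-point elements from a into memory -// in reverse order.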
mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. // -// MEM[mem_addr+63:mem_addr] := a[127:64] -// MEM[mem_addr+127:mem_addr+64] := a[63:0] +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd -FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { - float32x4_t f = vreinterpretq_f32_m128d(a); - _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } -// Store the lower double-precision (64-bit) floating-point element from a into -// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte -// boundary or a general-protection exception may be generated. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 -FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { #if defined(__aarch64__) - float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); - vst1q_f64((float64_t *) mem_addr, - vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); #else - float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); - vst1q_f32((float32_t *) mem_addr, - vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int32_t) data; #endif } -// Store the lower double-precision (64-bit) floating-point element from a into -// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte -// boundary or a general-protection exception may be generated. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd -#define _mm_store1_pd _mm_store_pd1 - -// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point -// elements) from a into memory. mem_addr does not need to be aligned on any -// particular boundary. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd -FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) { - _mm_store_pd(mem_addr, a); + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); } -// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. 
-// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx -FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +// dst[95:64] := a[95:64] +// dst[127:96] := a[127:96] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) { - uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); - uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); - *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); } -// Stores the lower two single-precision floating point values of a to the -// address p. +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point elements, and store the results in +// the upper 2 elements of dst. // -// *p0 := a0 -// *p1 := a1 +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) // -// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx -FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) { - *p = vreinterpret_m64_f32(vget_low_f32(a)); + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); } -// Stores the upper two single-precision, floating-point values of a to the -// address p. +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. // -// *p0 := a2 -// *p1 := a3 +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR // -// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx -FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) { - *p = vreinterpret_m64_f32(vget_high_f32(a)); + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); }
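// The widening chain in _mm_cvtpi8_ps above, spelled out as an illustrative
// sketch (local names are ours, not part of this header):
//   int8x8_t    s8  = vreinterpret_s8_m64(a);        // 8 x i8
//   int16x8_t   s16 = vmovl_s8(s8);                  // widen to 8 x i16
//   int32x4_t   s32 = vmovl_s16(vget_low_s16(s16));  // low 4 lanes to i32
//   float32x4_t f   = vcvtq_f32_s32(s32);            // 4 x i32 -> 4 x f32

-// Loads a single single-precision, floating-point value, copying it into all -// four words -// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load1_ps(const float *p) +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF.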
+// +// FOR j := 0 to 3 +// i := 16*j +// k := 32*j +// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) +// dst[i+15:i] := 0x7FFF +// ELSE +// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) { - return vreinterpretq_m128_f32(vld1q_dup_f32(p)); + const __m128 i16Min = _mm_set_ps1((float) INT16_MIN); + const __m128 i16Max = _mm_set_ps1((float) INT16_MAX); + const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); + const __m128i maxMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max))); + const __m128i betweenMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max))); + const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), + _mm_setzero_si128()); + __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX)); + __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN)); + __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); + __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); + return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32))); } -// Load a single-precision (32-bit) floating-point element from memory into all -// elements of dst. +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. // -// dst[31:0] := MEM[mem_addr+31:mem_addr] -// dst[63:32] := MEM[mem_addr+31:mem_addr] -// dst[95:64] := MEM[mem_addr+31:mem_addr] -// dst[127:96] := MEM[mem_addr+31:mem_addr] +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 -#define _mm_load_ps1 _mm_load1_ps +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) -// Sets the lower two single-precision, floating-point values with 64 -// bits of data loaded from the address p; the upper two values are passed -// through from a. +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. 
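// Scalar equivalent of the mask-and-merge saturation in _mm_cvtps_pi16 above
// (illustrative; assumes <stdint.h>/<math.h>, with lrintf standing in for the
// round-to-nearest of _mm_cvtps_epi32). Anything outside the two explicit
// ranges, including NaN, falls through to INT16_MIN, mirroring minMask:
static inline int16_t cvtps_pi16_lane_ref(float x)
{
    if (x >= (float) INT16_MAX && x <= (float) INT32_MAX)
        return INT16_MAX;           /* maxMask path */
    if (x > (float) INT16_MIN && x < (float) INT16_MAX)
        return (int16_t) lrintf(x); /* betweenMask path */
    return INT16_MIN;               /* minMask path */
}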
// -// Return Value -// r0 := *p0 -// r1 := *p1 -// r2 := a2 -// r3 := a3 +// FOR j := 0 to 3 +// i := 8*j +// k := 32*j +// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) +// dst[i+7:i] := 0x7F +// ELSE +// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) +// FI +// ENDFOR // -// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx -FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + const __m128 i8Min = _mm_set_ps1((float) INT8_MIN); + const __m128 i8Max = _mm_set_ps1((float) INT8_MAX); + const __m128 i32Max = _mm_set_ps1((float) INT32_MAX); + const __m128i maxMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max))); + const __m128i betweenMask = _mm_castps_si128( + _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max))); + const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask), + _mm_setzero_si128()); + __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX)); + __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN)); + __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a)); + __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt); + int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32)); + int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16)); + uint32_t bitMask[2] = {0xFFFFFFFF, 0}; + int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask)); + + return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) { return vreinterpretq_m128_f32( - vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); } -// Load 4 single-precision (32-bit) floating-point elements from memory into dst -// in reverse order. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. // -// dst[31:0] := MEM[mem_addr+127:mem_addr+96] -// dst[63:32] := MEM[mem_addr+95:mem_addr+64] -// dst[95:64] := MEM[mem_addr+63:mem_addr+32] -// dst[127:96] := MEM[mem_addr+31:mem_addr] +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps -FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) { - float32x4_t v = vrev64q_f32(vld1q_f32(p)); - return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); } -// Sets the upper two single-precision, floating-point values with 64 -// bits of data loaded from the address p; the lower two values are passed -// through from a. 
+// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. // -// r0 := a0 -// r1 := a1 -// r2 := *p0 -// r3 := *p1 +// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:32] := a[127:32] // -// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx -FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// +// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) { return vreinterpretq_m128_f32( - vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } -// Loads four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load_ps(const float *p) +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) { - return vreinterpretq_m128_f32(vld1q_f32(p)); + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } -// Loads four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) { - // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are - // equivalent for neon - return vreinterpretq_m128_f32(vld1q_f32(p)); +#if defined(__aarch64__) + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int64_t) data; +#endif } -// Load unaligned 16-bit integer from memory into the first element of dst. +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
// -// dst[15:0] := MEM[mem_addr+15:mem_addr] -// dst[MAX:16] := 0 +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 -FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) { - return vreinterpretq_m128i_s16( - vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); } -// Load unaligned 64-bit integer from memory into the first element of dst. +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. // -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[MAX:64] := 0 +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 -FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) { - return vreinterpretq_m128i_s64( - vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); } -// Load a double-precision (64-bit) floating-point element from memory into the -// lower of dst, and zero the upper element. mem_addr does not need to be -// aligned on any particular boundary. +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. // -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := 0 +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd -FORCE_INLINE __m128d _mm_load_sd(const double *p) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) { -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); -#else - const float *fp = (const float *) p; - float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; - return vreinterpretq_m128d_f32(vld1q_f32(data)); -#endif + return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } -// Loads two double-precision from 16-byte aligned memory, floating-point -// values. +// Divides the four single-precision, floating-point values of a and b. 
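// Note on the cvt*/cvtt* pairs above: the cvt forms honour the current
// rounding mode (vrndiq_f32/vcvtnq_s32_f32 on AArch64, _mm_round_ps
// elsewhere), while the cvtt forms always truncate toward zero, which is the
// native behaviour of vcvtq_s32_f32.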
// -// dst[127:0] := MEM[mem_addr+127:mem_addr] // +// r0 := a0 / b0 +// r1 := a1 / b1 +// r2 := a2 / b2 +// r3 := a3 / b3 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd -FORCE_INLINE __m128d _mm_load_pd(const double *p) +// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_f64(p)); +#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else - const float *fp = (const float *) p; - float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; - return vreinterpretq_m128d_f32(vld1q_f32(data)); + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#if SSE2NEON_PRECISE_DIV + // Additional Newton-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); +#endif + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); #endif } -// Loads two double-precision from unaligned memory, floating-point values. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd -FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +// Divides the scalar single-precision floating point value of a by b. +// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { - return _mm_load_pd(p); + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } -// Loads an single - precision, floating - point value into the low word and -// clears the upper three words. -// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_load_ss(const float *p) -{ - return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); -} +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) -// Load 64-bit integer from memory into the first element of dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 -FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +// Free aligned memory that was allocated with _mm_malloc. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) { - /* Load the lower 64 bits of the value pointed to by p into the - * lower 64 bits of the result, zeroing the upper 64 bits of the result. - */ - return vreinterpretq_m128i_s32( - vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); + free(addr); } -// Load a double-precision (64-bit) floating-point element from memory into the -// lower element of dst, and copy the upper element from a to dst. mem_addr does -// not need to be aligned on any particular boundary. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := a[127:64] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd -FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
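// Scalar sketch of the reciprocal refinement in _mm_div_ps above
// (illustrative, not part of this header): vrecpeq_f32 yields a coarse
// estimate r of 1/b, and vrecpsq_f32(r, b) computes (2 - b*r), so each
// multiply performs the Newton-Raphson update r' = r * (2 - b*r), roughly
// doubling the number of correct bits per step.
static inline float recip_nr_ref(float b, float r /* coarse estimate of 1/b */)
{
    r = r * (2.0f - b * r); /* first refinement, always applied        */
    r = r * (2.0f - b * r); /* second step, under SSE2NEON_PRECISE_DIV */
    return r;
}

// In the FPCR/FPSCR reads below, bit 24 is the flush-to-zero (FZ) flag and
// bits 23:22 hold the rounding mode: 00 nearest, 01 up, 10 down, 11 toward
// zero.

+// Macro: Get the flush zero bits from the MXCSR control and status register.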
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() { + union { + fpcr_bitfield field; #if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); + uint64_t value; #else - return vreinterpretq_m128d_f32( - vcombine_f32(vld1_f32((const float *) p), - vget_high_f32(vreinterpretq_f32_m128d(a)))); + uint32_t value; #endif -} + } r; -// Load 2 double-precision (64-bit) floating-point elements from memory into dst -// in reverse order. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. -// -// dst[63:0] := MEM[mem_addr+127:mem_addr+64] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd -FORCE_INLINE __m128d _mm_loadr_pd(const double *p) -{ #if defined(__aarch64__) - float64x2_t v = vld1q_f64(p); - return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else - int64x2_t v = vld1q_s64((const int64_t *) p); - return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif + + return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; } -// Sets the low word to the single-precision, floating-point value of b -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) -FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +// Macro: Get the rounding mode bits from the MXCSR control and status register. +// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE() { - return vreinterpretq_m128_f32( - vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), - vreinterpretq_f32_m128(a), 0)); + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + if (r.field.bit22) { + return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; + } else { + return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + } } -// Move the lower double-precision (64-bit) floating-point element from b to the -// lower element of dst, and copy the upper element from a to the upper element -// of dst. -// -// dst[63:0] := b[63:0] -// dst[127:64] := a[127:64] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd -FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + __extension__({ \ + vreinterpret_m64_s16( \ + vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ + }) + +// Loads four single-precision, floating-point values. 
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) { - return vreinterpretq_m128d_f32( - vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), - vget_high_f32(vreinterpretq_f32_m128d(a)))); + return vreinterpretq_m128_f32(vld1q_f32(p)); } -// Copy the lower 64-bit integer in a to the lower element of dst, and zero the -// upper element. +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. // -// dst[63:0] := a[63:0] -// dst[127:64] := 0 +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 -FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Loads a single-precision, floating-point value into the low word and +// clears the upper three words. +// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_load_ss(const float *p) { - return vreinterpretq_m128i_s64( - vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); } -// Return vector of type __m128 with undefined elements. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps -FORCE_INLINE __m128 _mm_undefined_ps(void) +// Loads a single-precision, floating-point value, copying it into all +// four words. +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) { -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif - __m128 a; - return a; -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#endif + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); } -/* Logic/Binary operations */ - -// Computes the bitwise AND-NOT of the four single-precision, floating-point -// values of a and b. +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. // -// r0 := ~a0 & b0 -// r1 := ~a1 & b1 -// r2 := ~a2 & b2 -// r3 := ~a3 & b3 +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 // -// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx -FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) { - return vreinterpretq_m128_s32( - vbicq_s32(vreinterpretq_s32_m128(b), - vreinterpretq_s32_m128(a))); // *NOTE* argument swap + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); } -// Compute the bitwise NOT of packed double-precision (64-bit) floating-point -// elements in a and then AND with b, and store the results in dst. +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a.
// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -// ENDFOR +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd -FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) { - // *NOTE* argument swap - return vreinterpretq_m128d_s64( - vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); } -// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the -// 128-bit value in a. +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. // -// r := (~a) & b +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] // -// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) { - return vreinterpretq_m128i_s32( - vbicq_s32(vreinterpretq_s32_m128i(b), - vreinterpretq_s32_m128i(a))); // *NOTE* argument swap + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); } -// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in -// b. -// -// r := a & b -// -// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) { - return vreinterpretq_m128i_s32( - vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); } -// Computes the bitwise AND of the four single-precision, floating-point values -// of a and b. +// Load unaligned 16-bit integer from memory into the first element of dst. // -// r0 := a0 & b0 -// r1 := a1 & b1 -// r2 := a2 & b2 -// r3 := a3 & b3 +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 // -// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { - return vreinterpretq_m128_s32( - vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); } -// Compute the bitwise AND of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. +// Load unaligned 64-bit integer from memory into the first element of dst. 
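+// Reversal sketch for _mm_loadr_ps above (illustrative): vrev64q_f32 swaps
+// the lanes within each 64-bit half ({0,1,2,3} -> {1,0,3,2}) and
+// vextq_f32(v, v, 2) rotates by two lanes, giving the fully reversed
+// {3,2,1,0}.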
// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] AND b[i+63:i] -// ENDFOR +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd -FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { - return vreinterpretq_m128d_s64( - vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); } -// Computes the bitwise OR of the four single-precision, floating-point values -// of a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx -FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +// Allocate aligned blocks of memory. +// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { - return vreinterpretq_m128_s32( - vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; } -// Computes bitwise EXOR (exclusive-or) of the four single-precision, -// floating-point values of a and b. -// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) { - return vreinterpretq_m128_s32( - veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); } -// Compute the bitwise XOR of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. 
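+// Masked-store sketch for _mm_maskmove_si64 above (illustrative; `buf` is a
+// hypothetical 8-byte destination):
+//
+//     char buf[8] = {0};
+//     __m64 val  = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
+//     __m64 mask = _mm_set_pi8(-1, 0, -1, 0, -1, 0, -1, 0);
+//     _mm_maskmove_si64(val, mask, buf);  // writes only buf[1,3,5,7]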
// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) // ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd -FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { - return vreinterpretq_m128d_s64( - veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } -// Compute the bitwise OR of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd -FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +// Computes the maximums of the four single-precision, floating-point values of +// a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { - return vreinterpretq_m128d_s64( - vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_b, _a), _a, _b); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif } -// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. // -// r := a | b +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +// ENDFOR // -// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx -FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { - return vreinterpretq_m128i_s32( - vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } -// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in -// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx -FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { - return vreinterpretq_m128i_s32( - veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } -// Duplicate the low double-precision (64-bit) floating-point element from a, -// and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd -FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. 
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { -#if (__aarch64__) - return vreinterpretq_m128d_f64( - vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); -#else - return vreinterpretq_m128d_u64( - vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); -#endif + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } -// Duplicate odd-indexed single-precision (32-bit) floating-point elements -// from a, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps -FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { -#if __has_builtin(__builtin_shufflevector) - return vreinterpretq_m128_f32(__builtin_shufflevector( - vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); #else - float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); - float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); - float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; - return vreinterpretq_m128_f32(vld1q_f32(data)); + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } -// Duplicate even-indexed single-precision (32-bit) floating-point elements -// from a, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps -FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { -#if __has_builtin(__builtin_shufflevector) - return vreinterpretq_m128_f32(__builtin_shufflevector( - vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); -#else - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); - float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -#endif + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. 
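+// Behavioural note on SSE2NEON_PRECISE_MINMAX (used by _mm_min_ps and
+// _mm_max_ps above): x86 MINPS/MAXPS return the second operand when either
+// input is NaN, while NEON vminq/vmaxq propagate the NaN. The vbsl/vclt
+// form matches x86 because an unordered compare yields all-zero lanes and
+// therefore selects b. Illustrative check (NAN is from <math.h>):
+//
+//     __m128 a = _mm_set1_ps(NAN), b = _mm_set1_ps(1.0f);
+//     _mm_min_ps(a, b);  // {1,1,1,1} when precise, NaN lanes otherwise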
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); } // Moves the upper two values of B into the lower two values of A. @@ -1577,315 +2093,419 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } -// Compute the absolute value of packed signed 32-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 -FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8 +FORCE_INLINE int _mm_movemask_pi8(__m64 a) { - return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); + uint8x8_t input = vreinterpret_u8_m64(a); +#if defined(__aarch64__) + static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t tmp = vshr_n_u8(input, 7); + return vaddv_u8(vshl_u8(tmp, shift)); +#else + // Refer the implementation of `_mm_movemask_epi8` + uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); + uint32x2_t paired16 = + vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); + uint8x8_t paired32 = + vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); + return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); +#endif } -// Compute the absolute value of packed signed 16-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 -FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +// NEON does not provide this method +// Creates a 4-bit mask from the most significant bits of the four +// single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_ps(__m128 a) { - return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); + uint32x4_t input = vreinterpretq_u32_m128(a); +#if defined(__aarch64__) + static const int32x4_t shift = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +#else + // Uses the exact same method as _mm_movemask_epi8, see that for details. + // Shift out everything but the sign bits with a 32-bit unsigned shift + // right. + uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); + // Merge the two pairs together with a 64-bit unsigned shift right + add. + uint8x16_t paired = + vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + // Extract the result. 
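+    // Worked example (illustrative): with sign bits s3..s0, the first shift
+    // leaves s_i in bit 0 of each 32-bit lane; vsra then packs s1 into bit 1
+    // of byte 0 and s3 into bit 1 of byte 8, so combining byte 0 with byte 8
+    // shifted left by 2 yields the 4-bit mask (s3 s2 s1 s0).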
+ return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif } -// Compute the absolute value of packed signed 8-bit integers in a, and store -// the unsigned results in dst. +// Multiplies the four single-precision, floating-point values of a and b. // -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) -// ENDFOR +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 -FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) { - return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Compute the absolute value of packed signed 32-bit integers in a, and store -// the unsigned results in dst. +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. // -// FOR j := 0 to 1 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR +// dst[31:0] := a[31:0] * b[31:0] +// dst[127:32] := a[127:32] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 -FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) { - return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); + return _mm_move_ss(a, _mm_mul_ps(a, b)); } -// Compute the absolute value of packed signed 16-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 -FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) { - return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); } -// Compute the absolute value of packed signed 8-bit integers in a, and store -// the unsigned results in dst. +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. 
// // FOR j := 0 to 7 // i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) +// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 // ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 -FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) -{ - return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); -} +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) -// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift -// the result right by imm8 bytes, and store the low 16 bytes in dst. +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. // -// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) -// dst[127:0] := tmp[127:0] +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 -#define _mm_alignr_epi8(a, b, imm) \ - __extension__({ \ - __m128i ret; \ - if (unlikely((imm) >= 32)) { \ - ret = _mm_setzero_si128(); \ - } else { \ - uint8x16_t tmp_low, tmp_high; \ - if (imm >= 16) { \ - const int idx = imm - 16; \ - tmp_low = vreinterpretq_u8_m128i(a); \ - tmp_high = vdupq_n_u8(0); \ - ret = \ - vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ - } else { \ - const int idx = imm; \ - tmp_low = vreinterpretq_u8_m128i(b); \ - tmp_high = vreinterpretq_u8_m128i(a); \ - ret = \ - vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ - } \ - } \ - ret; \ - }) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) -// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift -// the result right by imm8 bytes, and store the low 8 bytes in dst. -// -// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) -// dst[63:0] := tmp[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 -#define _mm_alignr_pi8(a, b, imm) \ - __extension__({ \ - __m64 ret; \ - if (unlikely((imm) >= 16)) { \ - ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ - } else { \ - uint8x8_t tmp_low, tmp_high; \ - if (imm >= 8) { \ - const int idx = imm - 8; \ - tmp_low = vreinterpret_u8_m64(a); \ - tmp_high = vdup_n_u8(0); \ - ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ - } else { \ - const int idx = imm; \ - tmp_low = vreinterpret_u8_m64(b); \ - tmp_high = vreinterpret_u8_m64(a); \ - ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ - } \ - } \ - ret; \ - }) +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of b and places it into the high end of the result. -FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); -} +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. 
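+// Worked example of the rounding average behind _m_pavgb/_m_pavgw above:
+// dst = (a + b + 1) >> 1 rounds halves up, so for a = 10 and b = 13 the
+// result is (10 + 13 + 1) >> 1 = 12, matching x86 PAVGB/PAVGW.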
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) -// takes the lower two 32-bit values from a and swaps them and places in high -// end of result takes the higher two 32 bit values from b and swaps them and -// places in low end of result. -FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) { - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); + (void) i; + __builtin_prefetch(p); } -FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw +#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. 
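+// The refinement used below is one Newton-Raphson step for f(x) = 1/x - d:
+// x1 = x0 * (2 - d * x0), where vrecpsq_f32(x0, d) computes (2 - d * x0)
+// and the initial estimate x0 comes from vrecpeq_f32. Each step roughly
+// squares the relative error of the estimate.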
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
 {
-    float32x2_t a21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+// dst[31:0] := (1.0 / a[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
 {
-    float32x2_t a03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+    return _mm_move_ss(a, _mm_rcp_ps(a));
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// The approximation has roughly 1% relative error.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
 {
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Newton-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
 {
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
 }
 
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
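+// Reduction sketch for the implementation below: vabd_u8 produces eight
+// absolute byte differences, and the widening pairwise adds fold them into
+// one total (8 x u8 -> 4 x u16 -> 2 x u32 -> 1 x u64), e.g. |1 - 4| taken
+// eight times sums to 24, returned in the low 16 bits of the result.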
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) { - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); + uint64x1_t t = vpaddl_u32(vpaddl_u16( + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); + return vreinterpret_m64_u16( + vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0)); } -// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the -// high -FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE +FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) { - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif } -FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) { - float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); } -FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) { - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } -FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. 
The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { - float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t b22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif } -FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) { - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); } -FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) { - float32x2_t a33 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); - float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); - return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } -FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +FORCE_INLINE void _mm_setcsr(unsigned int a) { - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); + _MM_SET_ROUNDING_MODE(a); } -FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. 
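+// Round-trip sketch for the rounding-mode accessors above (illustrative;
+// the matching _MM_GET_ROUNDING_MODE is defined earlier in this file):
+//
+//     _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
+//     assert(_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN);
+//
+// Note that this _mm_setcsr only applies the rounding-mode bits of its
+// argument; the other MXCSR fields are not emulated here.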
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) { - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); } -FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) { - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); + return vreinterpretq_m128_f32(vdupq_n_f32(0)); } -// NEON does not support a general purpose permute intrinsic -// Selects four specific single-precision, floating-point values from a and b, -// based on the mask i. -// -// C equivalent: -// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, -// __constrange(0, 255) int imm) { -// __m128 ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; -// return ret; -// } -// -// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx -#define _mm_shuffle_ps_default(a, b, imm) \ +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16 +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pi16(a, imm) \ __extension__({ \ - float32x4_t ret; \ - ret = vmovq_n_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128_f32(ret); \ + vreinterpret_m64_s16(__builtin_shufflevector( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \ }) +#else +#define _mm_shuffle_pi16(a, imm) \ + __extension__({ \ + int16x4_t ret; \ + ret = \ + vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + vreinterpret_m64_s16(ret); \ + }) +#endif + +// Guarantees that every preceding store is globally visible before any +// subsequent store. 
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+    __sync_synchronize();
+}
 
 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
 // int imm)
@@ -1963,1876 +2583,1721 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
     })
 #endif
 
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+// r0 := sqrt(a0)
+// r1 := sqrt(a1)
+// r2 := sqrt(a2)
+// r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
 }
 
-// takes the lower two 32-bit values from a and swaps them and places in low end
-// of result takes the higher two 32 bit values from a and swaps them and places
-// in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
 {
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
 }
 
-// rotates the least significant 32 bits into the most signficant 32 bits, and
-// shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+// Stores four single-precision, floating-point values.
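+// In the SSE2NEON_PRECISE_SQRT path above, vrsqrteq_f32(0) returns +Inf, so
+// lanes whose estimate is +Inf are first cleared to zero; sqrt(s) is then
+// formed as s * 1/sqrt(s), which yields 0 for s = 0 instead of 0 * Inf = NaN.
+// Each vrsqrts step is a Newton-Raphson iteration, since
+// vrsqrtsq_f32(x * x, s) computes (3 - s * x * x) / 2.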
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) { - return vreinterpretq_m128i_s32( - vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); + vst1q_f32(p, vreinterpretq_f32_m128(a)); } -// rotates the most significant 32 bits into the least signficant 32 bits, and -// shifts the rest up -FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) { - return vreinterpretq_m128i_s32( - vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); } -// gets the lower 64 bits of a, and places it in the upper 64 bits -// gets the lower 64 bits of a and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +// Stores the lower single - precision, floating - point value. +// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) { - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); } -// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the -// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[31:0] +// MEM[mem_addr+63:mem_addr+32] := a[31:0] +// MEM[mem_addr+95:mem_addr+64] := a[31:0] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Stores the upper two single-precision, floating-point values of a to the +// address p. +// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) { - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); + *p = vreinterpret_m64_f32(vget_high_f32(a)); } -// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the -// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and -// places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +// Stores the lower two single-precision floating point values of a to the +// address p. 
+// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) { - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); + *p = vreinterpret_m64_f32(vget_low_f32(a)); } -FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// MEM[mem_addr+31:mem_addr] := a[127:96] +// MEM[mem_addr+63:mem_addr+32] := a[95:64] +// MEM[mem_addr+95:mem_addr+64] := a[63:32] +// MEM[mem_addr+127:mem_addr+96] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) { - int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); - int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); - return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); } -FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +// Stores four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) { - int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); + vst1q_f32(p, vreinterpretq_f32_m128(a)); } -FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +// Stores 16-bits of integer data a at the address p. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16 +FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) { - int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); - return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); + vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); } -// Shuffle packed 8-bit integers in a according to shuffle control mask in the -// corresponding 8-bit element of b, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 -FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +// Stores 64-bits of integer data a at the address p. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64 +FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) { - int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a - uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b - uint8x16_t idx_masked = - vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits -#if defined(__aarch64__) - return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); -#elif defined(__GNUC__) - int8x16_t ret; - // %e and %f represent the even and odd D registers - // respectively. 
- __asm__ __volatile__( - "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" - "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" - : [ret] "=&w"(ret) - : [tbl] "w"(tbl), [idx] "w"(idx_masked)); - return vreinterpretq_m128i_s8(ret); -#else - // use this line if testing on aarch64 - int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; - return vreinterpretq_m128i_s8( - vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), - vtbl2_s8(a_split, vget_high_u8(idx_masked)))); -#endif + vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); } -// C equivalent: -// __m128i _mm_shuffle_epi32_default(__m128i a, -// __constrange(0, 255) int imm) { -// __m128i ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; -// return ret; -// } -#define _mm_shuffle_epi32_default(a, imm) \ - __extension__({ \ - int32x4_t ret; \ - ret = vmovq_n_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128i_s32(ret); \ - }) +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} -// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) -// int imm) -#if defined(__aarch64__) -#define _mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ - }) +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); #else -#define _mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ - }) + vst1q_f32(p, vreinterpretq_f32_m128(a)); #endif +} -// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. 
-// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx -// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_epi32(a, imm) \ - __extension__({ \ - int32x4_t _input = vreinterpretq_s32_m128i(a); \ - int32x4_t _shuf = __builtin_shufflevector( \ - _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ - ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ - vreinterpretq_m128i_s32(_shuf); \ - }) -#else // generic -#define _mm_shuffle_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_epi_1032((a)); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_epi_2301((a)); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_epi_0321((a)); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_epi_2103((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_shuffle_epi_1010((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_epi_1001((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_epi_0101((a)); \ - break; \ - case _MM_SHUFFLE(2, 2, 1, 1): \ - ret = _mm_shuffle_epi_2211((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 2, 2): \ - ret = _mm_shuffle_epi_0122((a)); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 2): \ - ret = _mm_shuffle_epi_3332((a)); \ - break; \ - case _MM_SHUFFLE(0, 0, 0, 0): \ - ret = _mm_shuffle_epi32_splat((a), 0); \ - break; \ - case _MM_SHUFFLE(1, 1, 1, 1): \ - ret = _mm_shuffle_epi32_splat((a), 1); \ - break; \ - case _MM_SHUFFLE(2, 2, 2, 2): \ - ret = _mm_shuffle_epi32_splat((a), 2); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 3): \ - ret = _mm_shuffle_epi32_splat((a), 3); \ - break; \ - default: \ - ret = _mm_shuffle_epi32_default((a), (imm)); \ - break; \ - } \ - ret; \ - }) -#endif +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} -// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified -// by imm. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) -// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) -#define _mm_shufflelo_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ - int16x4_t lowBits = vget_low_s16(ret); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ - 1); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ - 2); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ - 3); \ - vreinterpretq_m128i_s16(ret); \ - }) +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. 
+// +// dst[31:0] := a[31:0] - b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} -// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shufflelo_epi16(a, imm) \ - __extension__({ \ - int16x8_t _input = vreinterpretq_s16_m128i(a); \ - int16x8_t _shuf = __builtin_shufflevector( \ - _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ - (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ - vreinterpretq_m128i_s16(_shuf); \ - }) -#else // generic -#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) -#endif +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) -// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified -// by imm. -// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx -// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) -#define _mm_shufflehi_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ - int16x4_t highBits = vget_high_s16(ret); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ - 5); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ - 6); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ - 7); \ - vreinterpretq_m128i_s16(ret); \ - }) +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss -// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shufflehi_epi16(a, imm) \ - __extension__({ \ - int16x8_t _input = vreinterpretq_s16_m128i(a); \ - int16x8_t _shuf = __builtin_shufflevector( \ - _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ - (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ - (((imm) >> 6) & 0x3) + 4); \ - vreinterpretq_m128i_s16(_shuf); \ - }) -#else // generic -#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +// Return vector of type __m128i with undefined elements. 
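+// Worked example for _MM_TRANSPOSE4_PS above (illustrative): vtrnq_f32
+// transposes 2x2 sub-blocks and the vcombine calls stitch the halves, so
+// rows {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15} become
+// {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}.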
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" #endif + __m128i a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} -// Shuffle double-precision (64-bit) floating-point elements using the control -// in imm8, and store the results in dst. +// Return vector of type __m128 with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. // -// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd -#if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_pd(a, b, imm8) \ - vreinterpretq_m128d_s64(__builtin_shufflevector( \ - vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ - ((imm8 & 0x2) >> 1) + 2)) +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else -#define _mm_shuffle_pd(a, b, imm8) \ - _mm_castsi128_pd(_mm_set_epi64x( \ - vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ - vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif +} -// Blend packed 16-bit integers from a and b using control mask imm8, and store -// the results in dst. +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. // -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[j] -// dst[i+15:i] := b[i+15:i] -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, -// __constrange(0,255) int imm) -#define _mm_blend_epi16(a, b, imm) \ - __extension__({ \ - const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ - uint16x8_t _mask_vec = vld1q_u16(_mask); \ - uint16x8_t _a = vreinterpretq_u16_m128i(a); \ - uint16x8_t _b = vreinterpretq_u16_m128i(b); \ - vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ - }) - -// Blend packed 8-bit integers from a and b using mask, and store the results in -// dst. 
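The _mm_undefined_si128/_mm_undefined_ps helpers above intentionally return an uninitialized value, matching Intel's contract that the contents are unspecified; the push/pop pragma pair exists only to keep GCC and Clang from raising -Wuninitialized on that return. A minimal sketch of the same pattern outside this header (illustrative only, not part of the patch):

    #if defined(__GNUC__) || defined(__clang__)
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wuninitialized"
    #endif
    static inline int undefined_int(void)
    {
        int v; /* deliberately uninitialized: callers promise to overwrite it */
        return v;
    }
    #if defined(__GNUC__) || defined(__clang__)
    #pragma GCC diagnostic pop
    #endif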
+// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 // -// FOR j := 0 to 15 -// i := j*8 -// IF mask[i+7] -// dst[i+7:i] := b[i+7:i] -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR -FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { - // Use a signed shift right to create a mask with the sign bit - uint8x16_t mask = - vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); - uint8x16_t a = vreinterpretq_u8_m128i(_a); - uint8x16_t b = vreinterpretq_u8_m128i(_b); - return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif } -/* Shifts */ +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. +// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} +/* SSE2 */ -// Shift packed 16-bit integers in a right by imm while shifting in sign -// bits, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 -FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or +// unsigned 16-bit integers in b. +// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) { - const int count = (imm & ~15) ? 15 : imm; - return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while -// shifting in zeros. +// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or +// unsigned 32-bit integers in b. // -// r0 := a0 << count -// r1 := a1 << count -// ... -// r7 := a7 << count +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 // -// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx -#define _mm_slli_epi16(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (unlikely((imm)) <= 0) { \ - ret = a; \ - } \ - if (unlikely((imm) > 15)) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_s16( \ - vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ - } \ - ret; \ - }) - -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while -// shifting in zeros. 
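A common use of _mm_xor_ps is branch-free sign manipulation: XORing against a vector whose lanes hold only the sign bit negates every lane. A minimal sketch, assuming the _mm_set_ps, _mm_set1_epi32, and _mm_castsi128_ps helpers defined elsewhere in this header (illustrative):

    __m128 v = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f);
    __m128 sign = _mm_castsi128_ps(_mm_set1_epi32((int) 0x80000000));
    __m128 neg = _mm_xor_ps(v, sign); /* flips the sign bit of all four lanes */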
-// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
-    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
-        return a;
-    if (unlikely(imm > 31))
-        return _mm_setzero_si128();
    return vreinterpretq_m128i_s32(
-        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
+// unsigned 64-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
-    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
-        return a;
-    if (unlikely(imm > 63))
-        return _mm_setzero_si128();
    return vreinterpretq_m128i_s64(
-        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
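On AArch64 the packed-double arithmetic above maps directly onto float64x2_t instructions; 32-bit Arm has no 64-bit float vectors, so the #else paths round-trip through scalar doubles in memory. Either way the observable result is the plain lanewise sum (illustrative sketch; note _mm_set_pd takes the high lane first):

    __m128d x = _mm_set_pd(2.5, 1.5);  /* lanes: {1.5, 2.5}  */
    __m128d y = _mm_set_pd(0.5, 10.0); /* lanes: {10.0, 0.5} */
    __m128d s = _mm_add_pd(x, y);      /* lanes: {11.5, 3.0} */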
// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[7:0] > 15 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) -// FI -// ENDFOR +// dst[63:0] := a[63:0] + b[63:0] +// dst[127:64] := a[127:64] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 -#define _mm_srli_epi16(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (unlikely(imm) == 0) { \ - ret = a; \ - } \ - if (likely(0 < (imm) && (imm) < 16)) { \ - ret = vreinterpretq_m128i_u16( \ - vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1]; + return vld1q_f32((float32_t *) c); +#endif +} -// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. +// Add 64-bit integers a and b, and store the result in dst. // -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) -// FI -// ENDFOR +// dst[63:0] := a[63:0] + b[63:0] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 -// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) -#define _mm_srli_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (unlikely((imm) == 0)) { \ - ret = a; \ - } \ - if (likely(0 < (imm) && (imm) < 32)) { \ - ret = vreinterpretq_m128i_u32( \ - vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} -// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. +// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b +// and saturates. // -// FOR j := 0 to 1 -// i := j*64 -// IF imm8[7:0] > 63 -// dst[i+63:i] := 0 -// ELSE -// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) -// FI -// ENDFOR +// r0 := SignedSaturate(a0 + b0) +// r1 := SignedSaturate(a1 + b1) +// ... +// r7 := SignedSaturate(a7 + b7) // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 -#define _mm_srli_epi64(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (unlikely((imm) == 0)) { \ - ret = a; \ - } \ - if (likely(0 < (imm) && (imm) < 64)) { \ - ret = vreinterpretq_m128i_u64( \ - vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) +// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} -// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, -// and store the results in dst. +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. 
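The _mm_adds_* family maps onto NEON's saturating vqadd* instructions, so an overflowing sum clamps to the type's limits instead of wrapping the way _mm_add_* does. A small sketch of the difference (illustrative):

    __m128i big = _mm_set1_epi16(0x7FFF);   /* INT16_MAX in every lane      */
    __m128i one = _mm_set1_epi16(1);
    __m128i sat = _mm_adds_epi16(big, one); /* stays 0x7FFF (saturates)     */
    __m128i wrp = _mm_add_epi16(big, one);  /* wraps to 0x8000, i.e. -32768 */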
//
-// FOR j := 0 to 3
-// i := j*32
-// IF imm8[7:0] > 31
-// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
-// ELSE
-// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
-// FI
+// FOR j := 0 to 15
+// i := j*8
+// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
// ENDFOR
//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm) \
-    __extension__({ \
-        __m128i ret; \
-        if (unlikely((imm) == 0)) { \
-            ret = a; \
-        } \
-        if (likely(0 < (imm) && (imm) < 32)) { \
-            ret = vreinterpretq_m128i_s32( \
-                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
-        } else { \
-            ret = vreinterpretq_m128i_s32( \
-                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
-        } \
-        ret; \
-    })
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}

-// Shifts the 128 - bit value in a right by imm bytes while shifting in
-// zeros.imm must be an immediate.
-//
-// r := srl(a, imm*8)
-//
-// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm) \
-    __extension__({ \
-        __m128i ret; \
-        if (unlikely((imm) <= 0)) { \
-            ret = a; \
-        } \
-        if (unlikely((imm) > 15)) { \
-            ret = _mm_setzero_si128(); \
-        } else { \
-            ret = vreinterpretq_m128i_s8( \
-                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
-        } \
-        ret; \
-    })
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}

-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
-// must be an immediate.
-//
-// r := a << (imm * 8)
-//
-// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm) \
-    __extension__({ \
-        __m128i ret; \
-        if (unlikely((imm) <= 0)) { \
-            ret = a; \
-        } \
-        if (unlikely((imm) > 15)) { \
-            ret = _mm_setzero_si128(); \
-        } else { \
-            ret = vreinterpretq_m128i_s8(vextq_s8( \
-                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
-        } \
-        ret; \
-    })
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates.
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}

-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
//
-// r0 := a0 << count
-// r1 := a1 << count
-// ...
-// r7 := a7 << count +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR // -// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (unlikely(c > 15)) - return _mm_setzero_si128(); - - int16x8_t vc = vdupq_n_s16((int16_t) c); - return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while -// shifting in zeros. +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. // -// r0 := a0 << count -// r1 := a1 << count -// r2 := a2 << count -// r3 := a3 << count +// r := a & b // -// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (unlikely(c > 31)) - return _mm_setzero_si128(); - - int32x4_t vc = vdupq_n_s32((int32_t) c); - return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while -// shifting in zeros. +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. // -// r0 := a0 << count -// r1 := a1 << count +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR // -// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) { - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (unlikely(c > 63)) - return _mm_setzero_si128(); - - int64x2_t vc = vdupq_n_s64((int64_t) c); - return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); } -// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits -// while shifting in zeros. +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. // -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// ... 
-// r7 := srl(a7, count) +// r := (~a) & b // -// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) { - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (unlikely(c > 15)) - return _mm_setzero_si128(); - - int16x8_t vc = vdupq_n_s16(-(int16_t) c); - return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap } -// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits -// while shifting in zeros. +// Computes the average of the 8 unsigned 16-bit integers in a and the 8 +// unsigned 16-bit integers in b and rounds. // -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// r2 := srl(a2, count) -// r3 := srl(a3, count) +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r7 := (a7 + b7) / 2 // -// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (unlikely(c > 31)) - return _mm_setzero_si128(); - - int32x4_t vc = vdupq_n_s32(-(int32_t) c); - return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); } -// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits -// while shifting in zeros. +// Computes the average of the 16 unsigned 8-bit integers in a and the 16 +// unsigned 8-bit integers in b and rounds. // -// r0 := srl(a0, count) -// r1 := srl(a1, count) +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r15 := (a15 + b15) / 2 // -// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) { - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (unlikely(c > 63)) - return _mm_setzero_si128(); - - int64x2_t vc = vdupq_n_s64(-(int64_t) c); - return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } -// NEON does not provide a version of this function. -// Creates a 16-bit mask from the most significant bits of the 16 signed or -// unsigned 8-bit integers in a and zero extends the upper bits. -// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_epi8(__m128i a) -{ - // Use increasingly wide shifts+adds to collect the sign bits - // together. - // Since the widening shifts would be rather confusing to follow in little - // endian, everything will be illustrated in big endian order instead. This - // has a different result - the bits would actually be reversed on a big - // endian machine. - - // Starting input (only half the elements are shown): - // 89 ff 1d c0 00 10 99 33 - uint8x16_t input = vreinterpretq_u8_m128i(a); - - // Shift out everything but the sign bits with an unsigned shift right. 
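The *NOTE* argument swap comments above flag that NEON's vbic computes first & ~second, the mirror image of _mm_andnot_*'s (~a) & b, so the operands are passed in reverse inside the wrappers. Call sites are unaffected (illustrative sketch):

    __m128i mask = _mm_set1_epi32(0x0000FFFF);
    __m128i data = _mm_set1_epi32(0x12345678);
    /* (~mask) & data keeps only the bits NOT selected by mask */
    __m128i r = _mm_andnot_si128(mask, data); /* every lane: 0x12340000 */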
- // - // Bytes of the vector:: - // 89 ff 1d c0 00 10 99 33 - // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) - // | | | | | | | | - // 01 01 00 01 00 00 01 00 - // - // Bits of first important lane(s): - // 10001001 (89) - // \______ - // | - // 00000001 (01) - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - - // Merge the even lanes together with a 16-bit unsigned shift right + add. - // 'xx' represents garbage data which will be ignored in the final result. - // In the important bytes, the add functions like a binary OR. - // - // 01 01 00 01 00 00 01 00 - // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) - // \| \| \| \| - // xx 03 xx 01 xx 00 xx 02 - // - // 00000001 00000001 (01 01) - // \_______ | - // \| - // xxxxxxxx xxxxxx11 (xx 03) - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) - // Repeat with a wider 32-bit shift + add. - // xx 03 xx 01 xx 00 xx 02 - // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> - // 14)) - // \| \| - // xx xx xx 0d xx xx xx 02 - // - // 00000011 00000001 (03 01) - // \\_____ || - // '----.\|| - // xxxxxxxx xxxx1101 (xx 0d) - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) - // Last, an even wider 64-bit shift + add to get our result in the low 8 bit - // lanes. xx xx xx 0d xx xx xx 02 - // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> - // 28)) - // \| - // xx xx xx xx xx xx xx d2 - // - // 00001101 00000010 (0d 02) - // \ \___ | | - // '---. \| | - // xxxxxxxx 11010010 (xx d2) - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} - // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. - // xx xx xx xx xx xx xx d2 - // || return paired64[0] - // d2 - // Note: Little endian would return the correct value 4b (01001011) instead. - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); } -// Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 -FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) { - return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); } -// Copy the 64-bit integer a to the lower element of dst, and zero the upper -// element. -// -// dst[63:0] := a[63:0] -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 -FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb514099.aspx +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) { - return vreinterpretq_m128i_s64( - vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); } -// NEON does not provide this method -// Creates a 4-bit mask from the most significant bits of the four -// single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_ps(__m128 a) +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { - uint32x4_t input = vreinterpretq_u32_m128(a); #if defined(__aarch64__) - static const int32x4_t shift = {0, 1, 2, 3}; - uint32x4_t tmp = vshrq_n_u32(input, 31); - return vaddvq_u32(vshlq_u32(tmp, shift)); + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); #else - // Uses the exact same method as _mm_movemask_epi8, see that for details. - // Shift out everything but the sign bits with a 32-bit unsigned shift - // right. - uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); - // Merge the two pairs together with a 64-bit unsigned shift right + add. - uint8x16_t paired = - vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); - // Extract the result. - return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); #endif } -// Compute the bitwise NOT of a and then AND with a 128-bit vector containing -// all 1's, and return 1 if the result is zero, otherwise return 0. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones -FORCE_INLINE int _mm_test_all_ones(__m128i a) +// Applies a type cast to reinterpret four 32-bit integers passed in as a +// 128-bit parameter as packed 32-bit floating point values. +// https://msdn.microsoft.com/en-us/library/bb514029.aspx +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) { - return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == - ~(uint64_t) 0; + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); } -// Compute the bitwise AND of 128 bits (representing integer data) in a and -// mask, and return 1 if the result is zero, otherwise return 0. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros -FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. 
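Because the _mm_cast* helpers above compile to a bare vreinterpretq (no instructions), they are the idiomatic way to do bit-level tricks on float data. For example, a branch-free absolute value that clears the sign bit through the integer view of the register, a sketch using only intrinsics defined in this header (illustrative):

    __m128 v = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f);
    __m128i bits = _mm_castps_si128(v);        /* free reinterpret        */
    __m128i mask = _mm_set1_epi32(0x7FFFFFFF); /* all bits but the sign   */
    __m128 absv = _mm_castsi128_ps(_mm_and_si128(bits, mask));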
+// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const *p)
{
-    int64x2_t a_and_mask =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
-    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
-                                                                           : 1;
+    (void) p;
+    // no portable user-space equivalent on Arm; implemented as a no-op
}

-/* Math operations */
+// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
+// unsigned 16-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}

-// Subtracts the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 - b0
-// r1 := a1 - b1
-// r2 := a2 - b2
-// r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-//
-// dst[31:0] := a[31:0] - b[31:0]
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
+// unsigned 8-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
-    return _mm_move_ss(a, _mm_sub_ps(a, b));
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

-// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
-// and store the results in dst.
-// r0 := a0 - b0
-// r1 := a1 - b1
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
-    return vreinterpretq_m128i_s64(
-        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
}

-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
-// unsigned 32-bit integers of a.
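On 32-bit Arm the _mm_cmpeq_pd fallback has no 64-bit float compare available, so it compares the two 32-bit halves of each double and ANDs each half with its vrev64-swapped neighbour: a 64-bit lane ends up all-ones only when both halves matched. Note this is bitwise equality, so it diverges from IEEE semantics in the corner cases (+0.0 vs -0.0 compare unequal; two NaNs with identical bits compare equal). A scalar sketch of the per-lane logic, with hypothetical half-word variables (illustrative):

    uint32_t eq_lo = (a_lo == b_lo) ? 0xFFFFFFFFu : 0; /* low 32 bits   */
    uint32_t eq_hi = (a_hi == b_hi) ? 0xFFFFFFFFu : 0; /* high 32 bits  */
    uint32_t both = eq_lo & eq_hi; /* vrev64q_u32 + vandq_u32, per lane */
    /* the final 64-bit lane is {both, both}: all-ones iff bitwise equal */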
-// -// r0 := a0 - b0 -// r1 := a1 - b1 -// r2 := a2 - b2 -// r3 := a3 - b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx -FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s32( - vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); } -// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and -// store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 -FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s16( - vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif } -// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and -// store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 -FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s8( - vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif } -// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. // -// dst[63:0] := a[63:0] - b[63:0] +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... 
+// r7 := (a7 > b7) ? 0xffff : 0x0 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 -FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { - return vreinterpret_m64_s64( - vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit -// integers of a and saturates.. -// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx -FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. +// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { - return vreinterpretq_m128i_u16( - vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit -// integers of a and saturates. +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. // -// r0 := UnsignedSaturate(a0 - b0) -// r1 := UnsignedSaturate(a1 - b1) +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 // ... -// r15 := UnsignedSaturate(a15 - b15) +// r15 := (a15 > b15) ? 0xff : 0x0 // -// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( - vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } -// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers -// of a and saturates. -// -// r0 := SignedSaturate(a0 - b0) -// r1 := SignedSaturate(a1 - b1) -// ... -// r15 := SignedSaturate(a15 - b15) -// -// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd +FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s8( - vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) > (*(double *) &b1) ? 
~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
}

-// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
-// of a and saturates.
-//
-// r0 := SignedSaturate(a0 - b0)
-// r1 := SignedSaturate(a1 - b1)
-// ...
-// r7 := SignedSaturate(a7 - b7)
-//
-// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
}

-// Subtract packed double-precision (64-bit) floating-point elements in b from
-// packed double-precision (64-bit) floating-point elements in a, and store the
-// results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// dst[i+63:i] := a[i+63:i] - b[i+63:i]
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
-FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
-    return vld1q_f32((float32_t *) c);
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

-// Subtract the lower double-precision (64-bit) floating-point element in b from
-// the lower double-precision (64-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
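All of the *_sd comparison wrappers share one shape on AArch64: run the packed compare over both lanes, then use _mm_move_sd to keep lane 0 of the comparison mask and lane 1 of a. Roughly (illustrative sketch):

    static inline __m128d cmpgt_sd_sketch(__m128d a, __m128d b)
    {
        __m128d full = _mm_cmpgt_pd(a, b); /* both lanes compared */
        return _mm_move_sd(a, full);       /* {mask(a0 > b0), a1} */
    }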
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
-FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
-    return _mm_move_sd(a, _mm_sub_pd(a, b));
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
}

-// Add packed unsigned 16-bit integers in a and b using saturation, and store
-// the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for less than.
+//
+// r0 := (a0 < b0) ? 0xffff : 0x0
+// r1 := (a1 < b1) ? 0xffff : 0x0
+// ...
+// r7 := (a7 < b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
-        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-// for i in 0..15
-// if b[i] < 0
-// r[i] := -a[i]
-// else if b[i] == 0
-// r[i] := 0
-// else
-// r[i] := a[i]
-// fi
-// done
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
-    int8x16_t a = vreinterpretq_s8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}

-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}

-    // (b == 0) ? 0xFF : 0
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ #if defined(__aarch64__) - int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif - - // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') - // based on ltMask - int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); - // res = masked & (~zeroMask) - int8x16_t res = vbicq_s8(masked, zeroMask); - - return vreinterpretq_m128i_s8(res); } -// Negate packed 16-bit integers in a when the corresponding signed -// 16-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. -// -// for i in 0..7 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFF : 0 - uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); - // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) - int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else - int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif - - // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative - // 'a') based on ltMask - int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); - // res = masked & (~zeroMask) - int16x8_t res = vbicq_s16(masked, zeroMask); - return vreinterpretq_m128i_s16(res); } -// Negate packed 32-bit integers in a when the corresponding signed -// 32-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. 
-// -// for i in 0..3 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFFFFFF : 0 - uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); - - // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) - int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); #else - int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); #endif - - // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative - // 'a') based on ltMask - int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); - // res = masked & (~zeroMask) - int32x4_t res = vbicq_s32(masked, zeroMask); - return vreinterpretq_m128i_s32(res); } -// Negate packed 16-bit integers in a when the corresponding signed 16-bit -// integer in b is negative, and store the results in dst. Element in dst are -// zeroed out when the corresponding element in b is zero. -// -// FOR j := 0 to 3 -// i := j*16 -// IF b[i+15:i] < 0 -// dst[i+15:i] := -(a[i+15:i]) -// ELSE IF b[i+15:i] == 0 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 -FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) { - int16x4_t a = vreinterpret_s16_m64(_a); - int16x4_t b = vreinterpret_s16_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFF : 0 - uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} - // (b == 0) ? 0xFFFF : 0 +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
#if defined(__aarch64__)
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
#else
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
-
-    // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
-    // based on ltMask
-    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x4_t res = vbic_s16(masked, zeroMask);
-
-    return vreinterpret_m64_s16(res);
}

-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-//
-// FOR j := 0 to 1
-// i := j*32
-// IF b[i+31:i] < 0
-// dst[i+31:i] := -(a[i+31:i])
-// ELSE IF b[i+31:i] == 0
-// dst[i+31:i] := 0
-// ELSE
-// dst[i+31:i] := a[i+31:i]
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}

-    // (b == 0) ? 0xFFFFFFFF : 0
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
#if defined(__aarch64__)
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
#else
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) > (*(double *) &b1)) ?
~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif - - // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') - // based on ltMask - int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); - // res = masked & (~zeroMask) - int32x2_t res = vbic_s32(masked, zeroMask); - - return vreinterpret_m64_s32(res); } -// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer -// in b is negative, and store the results in dst. Element in dst are zeroed out -// when the corresponding element in b is zero. -// -// FOR j := 0 to 7 -// i := j*8 -// IF b[i+7:i] < 0 -// dst[i+7:i] := -(a[i+7:i]) -// ELSE IF b[i+7:i] == 0 -// dst[i+7:i] := 0 -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 -FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) { - int8x8_t a = vreinterpret_s8_m64(_a); - int8x8_t b = vreinterpret_s8_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFF : 0 - uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} - // (b == 0) ? 0xFF : 0 +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ #if defined(__aarch64__) - int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); #else - int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif - - // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') - // based on ltMask - int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); - // res = masked & (~zeroMask) - int8x8_t res = vbic_s8(masked, zeroMask); - - return vreinterpret_m64_s8(res); } -// Average packed unsigned 16-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 -FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. 
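The cmpn* forms above are the bitwise complement (veorq against all-ones) of the positive compare, not the opposite ordered compare, and that matters for NaN: an unordered comparison makes the positive predicate false, so its complement comes out true. Illustrative sketch (assumes the _mm_set_pd/_mm_set1_pd helpers from this file and NAN from <math.h>):

    __m128d x = _mm_set_pd(1.0, NAN);                 /* lanes: {NaN, 1.0}    */
    __m128d lt = _mm_cmplt_pd(x, _mm_set1_pd(1.0));   /* lane 0 -> 0:
                                                         (NaN < 1.0) is false */
    __m128d nge = _mm_cmpnge_pd(x, _mm_set1_pd(1.0)); /* lane 0 -> ~0:
                                                         !(NaN >= 1.0) is true */
    /* cmplt and cmpnge differ exactly on unordered (NaN) inputs */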
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) { - return vreinterpret_m64_u16( - vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); } -// Average packed unsigned 8-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 -FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) { - return vreinterpret_m64_u8( - vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +#if defined(__aarch64__) + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif } -// Average packed unsigned 8-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb -#define _m_pavgb(a, b) _mm_avg_pu8(a, b) - -// Average packed unsigned 16-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw -#define _m_pavgw(a, b) _mm_avg_pu16(a, b) - -// Extract a 16-bit integer from a, selected with imm8, and store the result in -// the lower element of dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw -#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) - -// Copy a to dst, and insert the 16-bit integer i into dst at the location -// specified by imm8. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw -#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) - -// Compare packed signed 16-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw -#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) - -// Compare packed unsigned 8-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub -#define _m_pmaxub(a, b) _mm_max_pu8(a, b) - -// Compare packed signed 16-bit integers in a and b, and store packed minimum -// values in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw -#define _m_pminsw(a, b) _mm_min_pi16(a, b) - -// Compare packed unsigned 8-bit integers in a and b, and store packed minimum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub -#define _m_pminub(a, b) _mm_min_pu8(a, b) - -// Computes the average of the 16 unsigned 8-bit integers in a and the 16 -// unsigned 8-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... -// r15 := (a15 + b15) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_u8( - vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); } -// Computes the average of the 8 unsigned 16-bit integers in a and the 8 -// unsigned 16-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... -// r7 := (a7 + b7) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) { - return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), - vreinterpretq_u16_m128i(b)); +#if defined(__aarch64__) + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? ~UINT64_C(0) + : UINT64_C(0); + d[1] = ((*(double *) &a1) == (*(double *) &a1) && + (*(double *) &b1) == (*(double *) &b1)) + ? ~UINT64_C(0) + : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif } -// Adds the four single-precision, floating-point values of a and b. -// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) { - return vreinterpretq_m128_f32( - vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? ~UINT64_C(0) + : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif } -// Add packed double-precision (64-bit) floating-point elements in a and b, and -// store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd -FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) { #if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; - return vld1q_f32((float32_t *) c); + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? UINT64_C(0) + : ~UINT64_C(0); + d[1] = ((*(double *) &a1) == (*(double *) &a1) && + (*(double *) &b1) == (*(double *) &b1)) + ? UINT64_C(0) + : ~UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } -// Add the lower double-precision (64-bit) floating-point element in a and b, -// store the result in the lower element of dst, and copy the upper element from -// a to the upper element of dst. -// -// dst[63:0] := a[63:0] + b[63:0] -// dst[127:64] := a[127:64] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd -FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
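The `_mm_cmpord_*` and `_mm_cmpunord_*` emulations above rely on the IEEE 754 rule that a NaN never compares equal to anything, itself included, so x == x is false exactly when x is NaN. The same test in scalar form, purely for illustration:

#include <stdbool.h>

/* Neither input is NaN exactly when both self-comparisons hold. */
static bool both_ordered(double a, double b)
{
    return (a == a) && (b == b);
}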
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) { #if defined(__aarch64__) - return _mm_move_sd(a, _mm_add_pd(a, b)); + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1]; - return vld1q_f32((float32_t *) c); + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? UINT64_C(0) + : ~UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } -// Add 64-bit integers a and b, and store the result in dst. -// -// dst[63:0] := a[63:0] + b[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 -FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { - return vreinterpret_m64_s64( - vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); -} +#if defined(__aarch64__) + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); -// adds the scalar single-precision floating point values of a and b. -// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); - float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); - // the upper values in the result must be the remnants of <a>. - return vreinterpretq_m128_f32(vaddq_f32(a, value)); + return (*(double *) &a0 >= *(double *) &b0); +#endif } -// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s64( - vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#if defined(__aarch64__) + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 > *(double *) &b0); +#endif } -// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. 
-// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s32( - vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#if defined(__aarch64__) + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 <= *(double *) &b0); +#endif } -// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or -// unsigned 16-bit integers in b. -// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s16( - vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#if defined(__aarch64__) + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 < *(double *) &b0); +#endif } -// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or -// unsigned 8-bit integers in b. -// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) -FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s8( - vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#if defined(__aarch64__) + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return vgetq_lane_u64(and_results, 0) & 0x1; +#endif } -// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b -// and saturates. -// -// r0 := SignedSaturate(a0 + b0) -// r1 := SignedSaturate(a1 + b1) -// ... -// r7 := SignedSaturate(a7 + b7) -// -// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). 
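Each `_mm_comi*_sd` wrapper above reduces a full-width NEON comparison mask to the 0-or-1 result the SSE intrinsic promises, by extracting lane 0 and masking it down to a single bit. A usage sketch with illustrative values, assuming the definitions above are in scope:

#include <assert.h>

static void comi_usage(void)
{
    __m128d two = _mm_set_pd(0.0, 2.0); /* lower lane = 2.0 */
    __m128d one = _mm_set_pd(0.0, 1.0); /* lower lane = 1.0 */
    assert(_mm_comigt_sd(two, one) == 1);
    assert(_mm_comigt_sd(one, two) == 0);
}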
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s16( - vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); + return !_mm_comieq_sd(a, b); } -// Add packed signed 8-bit integers in a and b using saturation, and store the -// results in dst. +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. // -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +// FOR j := 0 to 1 +// i := j*32 +// m := j*64 +// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) // ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 -FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { - return vreinterpretq_m128i_s8( - vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +#else + double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + return _mm_set_pd(a1, a0); +#endif } -// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in -// b and saturates.. -// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx -FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +// Converts the four signed 32-bit integer values of a to single-precision, +// floating-point values +// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { - return vreinterpretq_m128i_u8( - vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); } -// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or -// unsigned 16-bit integers from b. +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. // -// r0 := (a0 * b0)[15:0] -// r1 := (a1 * b1)[15:0] -// ... -// r7 := (a7 * b7)[15:0] +// FOR j := 0 to 1 +// i := 32*j +// k := 64*j +// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +// ENDFOR // -// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32 +FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) { - return vreinterpretq_m128i_s16( - vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0 = ((double *) &rnd)[0]; + double d1 = ((double *) &rnd)[1]; + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); } -// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or -// unsigned 32-bit integers from b. -// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. 
+//
+//   FOR j := 0 to 1
+//     i := 32*j
+//     k := 64*j
+//     dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+//   ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
+FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
-    return vreinterpretq_m128i_s32(
-        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
+    return vreinterpret_m64_s32(vld1_s32(data));
}

-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
//
-//   FOR j := 0 to 3
-//     i := j*16
-//     tmp[31:0] := a[i+15:i] * b[i+15:i]
-//     dst[i+15:i] := tmp[31:16]
+//   FOR j := 0 to 1
+//     i := 32*j
+//     k := 64*j
+//     dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
//   ENDFOR
+//   dst[127:64] := 0
//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}

-// Multiplies the four single-precision, floating-point values of a and b.
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
//
-//   r0 := a0 * b0
-//   r1 := a1 * b1
-//   r2 := a2 * b2
-//   r3 := a3 * b3
+//   FOR j := 0 to 1
+//     i := j*32
+//     m := j*64
+//     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+//   ENDFOR
//
-// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
{
-    return vreinterpretq_m128_f32(
-        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
}

-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+//   r0 := (int) a0
+//   r1 := (int) a1
+//   r2 := (int) a2
+//   r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { #if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2]; - c[0] = da[0] * db[0]; - c[1] = da[1] * db[1]; - return vld1q_f32((float32_t *) c); + float *f = (float *) &a; + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32( + vbslq_s32(is_delta_half, r_even, r_normal)); + } + case _MM_ROUND_DOWN: + return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), + floorf(f[0])); + case _MM_ROUND_UP: + return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), + ceilf(f[0])); + default: // _MM_ROUND_TOWARD_ZERO + return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], + (int32_t) f[0]); + } #endif } -// Multiply the lower double-precision (64-bit) floating-point element in a and -// b, store the result in the lower element of dst, and copy the upper element -// from a to the upper element of dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd -FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := 64*j +// k := 32*j +// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { - return _mm_move_sd(a, _mm_mul_pd(a, b)); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif } -// Multiply the lower single-precision (32-bit) floating-point element in a and -// b, store the result in the lower element of dst, and copy the upper 3 packed -// elements from a to the upper elements of dst. +// Copy the lower double-precision (64-bit) floating-point element of a to dst. 
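The ARMv7 `_MM_ROUND_NEAREST` branch above emulates round-half-to-even without hardware support: it computes a round-half-away-from-zero candidate and a forced-even candidate ([a] + {0,1}) & ~1, then selects the even one only when the fractional part is exactly +/- 0.5. A scalar model of the same selection, illustrative only:

#include <stdint.h>

/* Scalar model of the NEON round-to-nearest-even emulation above. */
static int32_t round_half_to_even(float a)
{
    float half = (a >= 0.0f) ? 0.5f : -0.5f;    /* +/- 0.5, sign of a */
    int32_t r_normal = (int32_t) (a + half);    /* round half away from zero */
    int32_t r_trunc = (int32_t) a;              /* truncate toward zero */
    int32_t plusone = (r_trunc > 0) ? 1 : 0;    /* the sign-bit shift, spelled out */
    int32_t r_even = (r_trunc + plusone) & ~1;  /* ([a] + {0,1}) & ~1 */
    float delta = a - (float) r_trunc;
    return (delta == half) ? r_even : r_normal; /* even result on exact ties */
}

For example, 2.5f and 3.5f both land on even integers: 2 and 4 respectively.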
// -// dst[31:0] := a[31:0] * b[31:0] -// dst[127:32] := a[127:32] +// dst[63:0] := a[63:0] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss -FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { - return _mm_move_ss(a, _mm_mul_ps(a, b)); +#if defined(__aarch64__) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *) &a)[0]; +#endif } -// Multiply the low unsigned 32-bit integers from each packed 64-bit element in -// a and b, and store the unsigned 64-bit results in dst. +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. // -// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) -// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) -FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +// dst[31:0] := Convert_FP64_To_Int32(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) { - // vmull_u32 upcasts instead of masking, so we downcast. - uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); - uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); - return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +#if defined(__aarch64__) + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = ((double *) &rnd)[0]; + return (int32_t) ret; +#endif } -// Multiply the low unsigned 32-bit integers from a and b, and store the -// unsigned 64-bit result in dst. +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. // -// dst[63:0] := a[31:0] * b[31:0] +// dst[63:0] := Convert_FP64_To_Int64(a[63:0]) // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 -FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) { - return vreinterpret_m64_u64(vget_low_u64( - vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +#if defined(__aarch64__) + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = ((double *) &rnd)[0]; + return (int64_t) ret; +#endif } -// Multiply the low signed 32-bit integers from each packed 64-bit element in -// a and b, and store the signed 64-bit results in dst. +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. // -// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 -// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 -FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +// dst[63:0] := Convert_FP64_To_Int64(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. 
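Note the contrast with the truncating conversions further down: `_mm_cvtsd_si32` above honors the current rounding mode (via vrndiq_f64 on AArch64, or _mm_round_pd with _MM_FROUND_CUR_DIRECTION otherwise), while `_mm_cvttsd_si32` always chops toward zero. A small illustrative sketch, assuming this header's definitions are in scope:

static void cvtsd_modes(void)
{
    __m128d v = _mm_set_pd(0.0, 2.7); /* lower lane = 2.7 */
    int nearest = _mm_cvtsd_si32(v);  /* 3 under the default round-to-nearest */
    int chopped = _mm_cvttsd_si32(v); /* always 2; truncation ignores the mode */
    (void) nearest;
    (void) chopped;
}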
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) { - // vmull_s32 upcasts instead of masking, so we downcast. - int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); - int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +#else + return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], + vreinterpretq_f32_m128(a), 0)); +#endif } -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit -// integers from b. +// Copy the lower 32-bit integer in a to dst. // -// r0 := (a0 * b0) + (a1 * b1) -// r1 := (a2 * b2) + (a3 * b3) -// r2 := (a4 * b4) + (a5 * b5) -// r3 := (a6 * b6) + (a7 * b7) -// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx -FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) { - int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), - vget_low_s16(vreinterpretq_s16_m128i(b))); - int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), - vget_high_s16(vreinterpretq_s16_m128i(b))); - - int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); - int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); - - return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); } -// Multiply packed signed 16-bit integers in a and b, producing intermediate -// signed 32-bit integers. Shift right by 15 bits while rounding up, and store -// the packed 16-bit integers in dst. +// Copy the lower 64-bit integer in a to dst. // -// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) -// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) -// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) -// ... -// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) -FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) { - // Has issues due to saturation - // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); - - // Multiply - int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), - vget_low_s16(vreinterpretq_s16_m128i(b))); - int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), - vget_high_s16(vreinterpretq_s16_m128i(b))); - - // Rounding narrowing shift right - // narrow = (int16_t)((mul + 16384) >> 15); - int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); - int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); - - // Join together - return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); } -// Vertically multiply each unsigned 8-bit integer from a with the corresponding -// signed 8-bit integer from b, producing intermediate signed 16-bit integers. -// Horizontally add adjacent pairs of intermediate signed 16-bit integers, -// and pack the saturated results in dst. 
-// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + -// a[i+7:i]*b[i+7:i] ) -// ENDFOR -FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +// Copy the lower 64-bit integer in a to dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd +FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) { #if defined(__aarch64__) - uint8x16_t a = vreinterpretq_u8_m128i(_a); - int8x16_t b = vreinterpretq_s8_m128i(_b); - int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), - vmovl_s8(vget_low_s8(b))); - int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), - vmovl_s8(vget_high_s8(b))); - return vreinterpretq_m128i_s16( - vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - // This would be much simpler if x86 would choose to zero extend OR sign - // extend, not both. This could probably be optimized better. - uint16x8_t a = vreinterpretq_u16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - - // Zero extend a - int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); - int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); - - // Sign extend by shifting left then shifting right. - int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); - int16x8_t b_odd = vshrq_n_s16(b, 8); - - // multiply - int16x8_t prod1 = vmulq_s16(a_even, b_even); - int16x8_t prod2 = vmulq_s16(a_odd, b_odd); - - // saturated add - return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); + double bf = (double) b; + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } -// Computes the fused multiple add product of 32-bit floating point numbers. +// Copy the lower 64-bit integer in a to dst. // -// Return Value -// Multiplies A and B, and adds C to the temporary result before returning it. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd -FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd +FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) { #if defined(__aarch64__) - return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), - vreinterpretq_f32_m128(b), - vreinterpretq_f32_m128(a))); + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - return _mm_add_ps(_mm_mul_ps(a, b), c); + double bf = (double) b; + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } -// Alternatively add and subtract packed single-precision (32-bit) -// floating-point elements in a to/from packed elements in b, and store the -// results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps -FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) { - __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; - return _mm_fmadd_ps(b, mask, a); + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); } -// Horizontally add adjacent pairs of double-precision (64-bit) floating-point -// elements in a and b, and pack the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd -FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128 +#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd +#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); #if defined(__aarch64__) return vreinterpretq_m128d_f64( - vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[] = {da[0] + da[1], db[0] + db[1]}; - return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); #endif } -// Compute the absolute differences of packed unsigned 8-bit integers in a and -// b, then horizontally sum each consecutive 8 differences to produce two -// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low -// 16 bits of 64-bit elements in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
+FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
{
-    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
-    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
-    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
-    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
-    return (__m128i) vsetq_lane_u16(r4, r, 4);
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
}

-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
+FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
{
-    uint16x4_t t =
-        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
-    return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+    return vreinterpret_m64_s32(vld1_s32(data));
}

-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values using truncate.
+// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
//
-//   FOR j := 0 to 7
-//     i := j*8
-//     tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
-//   ENDFOR
-//   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
-//                tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0
+//   dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
+FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
+{
+    double ret = *((double *) &a);
+    return (int32_t) ret;
+}

-// Divides the four single-precision, floating-point values of a and b.
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
//
-//   r0 := a0 / b0
-//   r1 := a1 / b1
-//   r2 := a2 / b2
-//   r3 := a3 / b3
+//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
//
-// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
{
-#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-    return vreinterpretq_m128_f32(
-        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#if defined(__aarch64__)
+    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
#else
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#endif
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+    double ret = *((double *) &a);
+    return (int64_t) ret;
#endif
}

-// Divides the scalar single-precision floating point value of a by b.
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)

// Divide packed double-precision (64-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
@@ -3875,266 +4340,230 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
#endif
}

-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+// Extracts the selected signed or unsigned 16-bit integer from a and zero
+// extends.
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Inserts the least significant 16 bits of b into the selected 16-bit integer
+// of a.
+// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm) \
+    __extension__({ \
+        vreinterpretq_m128i_s16( \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Load two double-precision (64-bit) floating-point values from 16-byte
+// aligned memory.
+// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) { - float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); -#if SSE2NEON_PRECISE_DIV - // Additional Netwon-Raphson iteration for accuracy - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif - return vreinterpretq_m128_f32(recip); } -// Compute the approximate reciprocal of the lower single-precision (32-bit) -// floating-point element in a, store the result in the lower element of dst, -// and copy the upper 3 packed elements from a to the upper elements of dst. The -// maximum relative error for this approximation is less than 1.5*2^-12. +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. // -// dst[31:0] := (1.0 / a[31:0]) -// dst[127:32] := a[127:32] +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss -FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) -{ - return _mm_move_ss(a, _mm_rcp_ps(a)); -} +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd -// Computes the approximations of square roots of the four single-precision, -// floating-point values of a. First computes reciprocal square roots and then -// reciprocals of the four values. +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. // -// r0 := sqrt(a0) -// r1 := sqrt(a1) -// r2 := sqrt(a2) -// r3 := sqrt(a3) +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := 0 // -// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) { -#if SSE2NEON_PRECISE_SQRT - float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); - - // Test for vrsqrteq_f32(0) -> positive infinity case. - // Change to zero, so that s * 1/sqrt(s) result is zero too. 
-    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
-    const uint32x4_t div_by_zero =
-        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
-    recip = vreinterpretq_f32_u32(
-        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-
-    // sqrt(s) = s * 1/sqrt(s)
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
-#elif defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
#else
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    return vreinterpretq_m128_f32(sq);
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
#endif
}

-// Computes the approximation of the square root of the scalar single-precision
-// floating point value of in.
-// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+// Loads a 128-bit value.
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

-// Computes the approximations of the reciprocal square roots of the four
-// single-precision floating point values of in.
-// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+//   dst[63:0] := MEM[mem_addr+63:mem_addr]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
{
-    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-#if SSE2NEON_PRECISE_RSQRT
-    // Additional Netwon-Raphson iteration for accuracy
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
#endif
-    return vreinterpretq_m128_f32(out);
}

-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// +// dst[63:0] := a[63:0] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { - return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif } -// Compare packed signed 16-bit integers in a and b, and store packed maximum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 -FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +// Load 64-bit integer from memory into the first element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) { - return vreinterpret_m64_s16( - vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); } -// Compare packed signed 16-bit integers in a and b, and store packed maximum -// values in dst. +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. // -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -// ENDFOR +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := a[127:64] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 -#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) - -// Computes the maximums of the four single-precision, floating-point values of -// a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) { -#if SSE2NEON_PRECISE_MINMAX - float32x4_t _a = vreinterpretq_f32_m128(a); - float32x4_t _b = vreinterpretq_f32_m128(b); - return vbslq_f32(vcltq_f32(_b, _a), _a, _b); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); #else - return vreinterpretq_m128_f32( - vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); #endif } -// Compare packed unsigned 8-bit integers in a and b, and store packed maximum -// values in dst. +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
//
-//   FOR j := 0 to 7
-//     i := j*8
-//     dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-//   ENDFOR
+//   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
{
-    return vreinterpret_m64_u8(
-        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
}

-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-//   FOR j := 0 to 7
-//     i := j*8
-//     dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-//   FOR j := 0 to 3
-//     i := j*16
-//     dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+// Loads two double-precision (64-bit) floating-point values from unaligned
+// memory.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
{
-    return vreinterpret_m64_s16(
-        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+    return _mm_load_pd(p);
}

-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-//   FOR j := 0 to 3
-//     i := j*16
-//     dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-//   ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Computes the minima of the four single-precision, floating-point values of a
-// and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+// Loads a 128-bit value.
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
-#else
-    return vreinterpretq_m128_f32(
-        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
+// Load unaligned 32-bit integer from memory into the first element of dst.
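`_mm_loadr_pd` above avoids per-lane shuffling: it loads the two doubles in memory order and rotates the vector by one 64-bit lane with vextq, which swaps them. A usage sketch with illustrative values, assuming this header's definitions (including its ALIGN_STRUCT alignment macro) are in scope:

static void loadr_usage(void)
{
    double ALIGN_STRUCT(16) mem[2] = {1.0, 2.0}; /* 16-byte aligned, as required */
    __m128d v = _mm_loadr_pd(mem);
    /* The lower lane now holds mem[1]: _mm_cvtsd_f64(v) == 2.0. */
    (void) v;
}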
//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-// ENDFOR
+// dst[31:0] := MEM[mem_addr+31:mem_addr]
+// dst[MAX:32] := 0
//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
{
- return vreinterpret_m64_u8(
- vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+ return vreinterpretq_m128i_s32(
+ vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
}

-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-// ENDFOR
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b, then adds each adjacent pair of 32-bit products.
//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
+// r0 := (a0 * b0) + (a1 * b1)
+// r1 := (a2 * b2) + (a3 * b3)
+// r2 := (a4 * b4) + (a5 * b5)
+// r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+ int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+ vget_low_s16(vreinterpretq_s16_m128i(b)));
+ int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+ vget_high_s16(vreinterpretq_s16_m128i(b)));

-// Computes the maximum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+ int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+ int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+ return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
{
- float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+ int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+ __m128 b = _mm_load_ps((const float *) mem_addr);
+ int8x16_t masked =
+ vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+ vreinterpretq_s8_m128(b));
+ vst1q_s8((int8_t *) mem_addr, masked);
}

-// Computes the minimum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
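+//
+// r0 := (a0 > b0) ? a0 : b0
+// r1 := (a1 > b1) ? a1 : b1
+// ...
+// r7 := (a7 > b7) ? a7 : b7
+//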
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) { - float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the @@ -4146,13 +4575,41 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } -// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the -// 16 unsigned 8-bit integers from b. -// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx -FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { - return vreinterpretq_m128i_u8( - vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; + d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_max_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2] = {fmax(da[0], db[0]), da[1]}; + return vld1q_f32((float32_t *) c); +#endif } // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 @@ -4164,110 +4621,246 @@ FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Compare packed signed 8-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 -FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { - return vreinterpretq_m128i_s8( - vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } -// Compare packed unsigned 16-bit integers in a and b, and store packed maximum -// values in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 -FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { - return vreinterpretq_m128i_u16( - vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; + d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif } -// Compare packed signed 8-bit integers in a and b, and store packed minimum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 -FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_s8( - vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_min_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2] = {fmin(da[0], db[0]), da[1]}; + return vld1q_f32((float32_t *) c); +#endif } -// Compare packed unsigned 16-bit integers in a and b, and store packed minimum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 -FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) { - return vreinterpretq_m128i_u16( - vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); } -// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 -// signed 16-bit integers from b. -// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. 
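+// (e.g. given a = {a0, a1} and b = {b0, b1}, dst = {b0, a1})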
+//
+// dst[63:0] := b[63:0]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
{
- return vreinterpretq_m128i_s16(
- vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+ return vreinterpretq_m128d_f32(
+ vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+ vget_high_f32(vreinterpretq_f32_m128d(a))));
}

-// epi versions of min/max
-// Computes the pariwise maximums of the four signed 32-bit integer values of a
-// and b.
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+ // Use increasingly wide shifts+adds to collect the sign bits
+ // together.
+ // Since the widening shifts would be rather confusing to follow in little
+ // endian, everything will be illustrated in big endian order instead. This
+ // has a different result - the bits would actually be reversed on a big
+ // endian machine.
+
+ // Starting input (only half the elements are shown):
+ // 89 ff 1d c0 00 10 99 33
+ uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+ // Shift out everything but the sign bits with an unsigned shift right.
+ //
+ // Bytes of the vector:
+ // 89 ff 1d c0 00 10 99 33
+ // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
+ // | | | | | | | |
+ // 01 01 00 01 00 00 01 00
+ //
+ // Bits of first important lane(s):
+ // 10001001 (89)
+ // \______
+ // |
+ // 00000001 (01)
+ uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+ // Merge the even lanes together with a 16-bit unsigned shift right + add.
+ // 'xx' represents garbage data which will be ignored in the final result.
+ // In the important bytes, the add functions like a binary OR.
+ //
+ // 01 01 00 01 00 00 01 00
+ // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
+ // \| \| \| \|
+ // xx 03 xx 01 xx 00 xx 02
+ //
+ // 00000001 00000001 (01 01)
+ // \_______ |
+ // \|
+ // xxxxxxxx xxxxxx11 (xx 03)
+ uint32x4_t paired16 =
+ vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+ // Repeat with a wider 32-bit shift + add.
+ // xx 03 xx 01 xx 00 xx 02
+ // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
+ // 14))
+ // \| \|
+ // xx xx xx 0d xx xx xx 02
+ //
+ // 00000011 00000001 (03 01)
+ // \\_____ ||
+ // '----.\||
+ // xxxxxxxx xxxx1101 (xx 0d)
+ uint64x2_t paired32 =
+ vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+ // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+ // lanes. xx xx xx 0d xx xx xx 02
+ // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
+ // 28))
+ // \|
+ // xx xx xx xx xx xx xx d2
+ //
+ // 00001101 00000010 (0d 02)
+ // \ \___ | |
+ // '---. \| |
+ // xxxxxxxx 11010010 (xx d2)
+ uint8x16_t paired64 =
+ vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+ // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+ // xx xx xx xx xx xx xx d2
+ // || return paired64[0]
+ // d2
+ // Note: Little endian would return the correct value 4b (01001011) instead.
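+ // Combine the two halves: byte 0 carries mask bits 0-7 and byte 8 carries
+ // mask bits 8-15.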
+ return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd +FORCE_INLINE int _mm_movemask_pd(__m128d a) +{ + uint64x2_t input = vreinterpretq_u64_m128d(a); + uint64x2_t high_bits = vshrq_n_u64(input, 63); + return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1); +} + +// Copy the lower 64-bit integer in a to dst. // -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 > b0) ? a0 : b0 -// r1 := (a1 > b1) ? a1 : b1 -// r2 := (a2 > b2) ? a2 : b2 -// r3 := (a3 > b3) ? a3 : b3 +// dst[63:0] := a[63:0] // -// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) { - return vreinterpretq_m128i_s32( - vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); } -// Computes the pariwise minima of the four signed 32-bit integer values of a -// and b. +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. // -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 < b0) ? a0 : b0 -// r1 := (a1 < b1) ? a1 : b1 -// r2 := (a2 < b2) ? a2 : b2 -// r3 := (a3 < b3) ? a3 : b3 +// dst[63:0] := a[63:0] +// dst[127:64] := 0 // -// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) { - return vreinterpretq_m128i_s32( - vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); } -// Compare packed unsigned 32-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 -FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// +// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) +// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) { - return vreinterpretq_m128i_u32( - vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); } -// Compare packed unsigned 32-bit integers in a and b, and store packed minimum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 -FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. 
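+//
+// dst[63:0] := a[63:0] * b[63:0]
+// dst[127:64] := a[127:64] * b[127:64]
+//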
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) { - return vreinterpretq_m128i_u32( - vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif } -// Multiply the packed unsigned 16-bit integers in a and b, producing -// intermediate 32-bit integers, and store the high 16 bits of the intermediate -// integers in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 -FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) { - return vreinterpret_m64_u16(vshrn_n_u32( - vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// +// dst[63:0] := a[31:0] * b[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); } // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit @@ -4321,1341 +4914,2413 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) #endif } -// Computes pairwise add of each argument as single-precision, floating-point -// values a and b. -// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx -FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or +// unsigned 16-bit integers from b. +// +// r0 := (a0 * b0)[15:0] +// r1 := (a1 * b1)[15:0] +// ... +// r7 := (a7 * b7)[15:0] +// +// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) { -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32( - vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); -#endif + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Computes pairwise add of each argument as a 16-bit signed or unsigned integer -// values a and b. -FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. 
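+//
+// dst[127:0] := a[127:0] OR b[127:0]
+//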
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
{
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
- return vreinterpretq_m128i_s16(
- vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
- vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
+ return vreinterpretq_m128d_s64(
+ vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

-// Horizontally substract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+//
+// r := a | b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(vsubq_f32(
- vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
- vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
-#else
- float32x4x2_t c =
- vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
- return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
+ return vreinterpretq_m128i_s32(
+ vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
- return vreinterpret_m64_s16(
- vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+ return vreinterpretq_m128i_s8(
+ vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+ vqmovn_s16(vreinterpretq_s16_m128i(b))));
}

-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+// r0 := SignedSaturate(a0)
+// r1 := SignedSaturate(a1)
+// r2 := SignedSaturate(a2)
+// r3 := SignedSaturate(a3)
+// r4 := SignedSaturate(b0)
+// r5 := SignedSaturate(b1)
+// r6 := SignedSaturate(b2)
+// r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
- return vreinterpret_m64_s32(
- vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+ return vreinterpretq_m128i_s16(
+ vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+ vqmovn_s32(vreinterpretq_s32_m128i(b))));
}

-// Computes pairwise difference of each argument as a 16-bit signed or unsigned
-// integer values a and b.
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
+// integers and saturates.
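+// (values below 0 saturate to 0 and values above 255 saturate to 255)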
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// ...
+// r7 := UnsignedSaturate(a7)
+// r8 := UnsignedSaturate(b0)
+// r9 := UnsignedSaturate(b1)
+// ...
+// r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
{
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Subtract
- return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+ return vreinterpretq_m128i_u8(
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+ vqmovun_s16(vreinterpretq_s16_m128i(b))));
}

-// Computes saturated pairwise sub of each argument as a 16-bit signed
-// integer values a and b.
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+// Pause the processor. This is typically used in spin-wait loops and, depending
+// on the x86 processor, typical delays are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown that an 'isb' is a
+// reasonable approximation.
+FORCE_INLINE void _mm_pause()
{
-#if defined(__aarch64__)
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
- return vreinterpretq_s64_s16(
- vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Saturated add
- return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
+ __asm__ __volatile__("isb\n");
}

-// Computes saturated pairwise difference of each argument as a 16-bit signed
-// integer values a and b.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
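+// For example, if every byte of a differs from the corresponding byte of b by
+// 1, each 64-bit lane of dst holds the sum 8.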
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { -#if defined(__aarch64__) - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - return vreinterpretq_s64_s16( - vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); -#else - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|a4|a6|b0|b2|b4|b6] - // [a1|a3|a5|a7|b1|b3|b5|b7] - int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); - int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); - // Saturated subtract - return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); -#endif + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); } -// Computes pairwise add of each argument as a 32-bit signed or unsigned integer -// values a and b. -FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) { - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - return vreinterpretq_m128i_s32( - vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), - vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); } -// Computes pairwise difference of each argument as a 32-bit signed or unsigned -// integer values a and b. -FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +// Sets the 4 signed 32-bit integer values. +// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) { - int64x2_t a = vreinterpretq_s64_m128i(_a); - int64x2_t b = vreinterpretq_s64_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|b0|b2] - // [a1|a2|b1|b3] - int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); - int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); - // Subtract - return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); } -// Kahan summation for accurate summation of floating-point numbers. -// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html -FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) { - y -= *c; - float t = *sum + y; - *c = (t - *sum) - y; - *sum = t; + return _mm_set_epi64x((int64_t) i1, (int64_t) i2); } -// Conditionally multiply the packed single-precision (32-bit) floating-point -// elements in a and b using the high 4 bits in imm8, sum the four products, -// and conditionally store the sum in dst using the low 4 bits of imm. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps -FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +// Returns the __m128i structure with its two 64-bit integer values +// initialized to the values of the two 64-bit integers passed in. +// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) { -#if defined(__aarch64__) - /* shortcuts */ - if (imm == 0xFF) { - return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); - } - if (imm == 0x7F) { - float32x4_t m = _mm_mul_ps(a, b); - m[3] = 0; - return _mm_set1_ps(vaddvq_f32(m)); - } -#endif - - float s = 0, c = 0; - float32x4_t f32a = vreinterpretq_f32_m128(a); - float32x4_t f32b = vreinterpretq_f32_m128(b); - - /* To improve the accuracy of floating-point summation, Kahan algorithm - * is used for each operation. - */ - if (imm & (1 << 4)) - _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); - if (imm & (1 << 5)) - _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); - if (imm & (1 << 6)) - _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); - if (imm & (1 << 7)) - _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); - s += c; - - float32x4_t res = { - (imm & 0x1) ? s : 0, - (imm & 0x2) ? s : 0, - (imm & 0x4) ? s : 0, - (imm & 0x8) ? s : 0, - }; - return vreinterpretq_m128_f32(res); + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); } -/* Compare operations */ +// Sets the 16 signed 8-bit integer values. +// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} -// Compares for less than -// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { - return vreinterpretq_m128_u32( - vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif } -// Compares for less than -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) -FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. 
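+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := 0
+//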
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
{
- return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+ return _mm_set_pd(0, a);
}

-// Compares for greater than.
+// Sets the 8 signed 16-bit integer values to w.
//
-// r0 := (a0 > b0) ? 0xffffffff : 0x0
-// r1 := (a1 > b1) ? 0xffffffff : 0x0
-// r2 := (a2 > b2) ? 0xffffffff : 0x0
-// r3 := (a3 > b3) ? 0xffffffff : 0x0
+// r0 := w
+// r1 := w
+// ...
+// r7 := w
//
-// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
{
- return vreinterpretq_m128_u32(
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+ return vreinterpretq_m128i_s16(vdupq_n_s16(w));
}

-// Compares for greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+// Sets the 4 signed 32-bit integer values to i.
+//
+// r0 := i
+// r1 := i
+// r2 := i
+// r3 := i
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
{
- return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+ return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
}

-// Compares for greater than or equal.
-// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
{
- return vreinterpretq_m128_u32(
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+ return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
}

-// Compares for greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
{
- return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+ return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
}

-// Compares for less than or equal.
+// Sets the 16 signed 8-bit integer values to w.
//
-// r0 := (a0 <= b0) ? 0xffffffff : 0x0
-// r1 := (a1 <= b1) ? 0xffffffff : 0x0
-// r2 := (a2 <= b2) ? 0xffffffff : 0x0
-// r3 := (a3 <= b3) ? 0xffffffff : 0x0
+// r0 := w
+// r1 := w
+// ...
+// r15 := w
//
-// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
{
- return vreinterpretq_m128_u32(
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+ return vreinterpretq_m128i_s8(vdupq_n_s8(w));
}

-// Compares for less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
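+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := a[63:0]
+//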
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) { - return _mm_move_ss(a, _mm_cmple_ps(a, b)); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif } -// Compares for equality. -// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... +// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) { - return vreinterpretq_m128_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); } -// Compares for equality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) -FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) { - return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); } -// Compares for inequality. -// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) { - return vreinterpretq_m128_u32(vmvnq_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); } -// Compares for inequality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) -FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +// Sets the 16 signed 8-bit integer values in reverse order. +// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) { - return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); } -// Compares for not greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. 
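+//
+// dst[63:0] := e1
+// dst[127:64] := e0
+//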
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) { - return _mm_cmplt_ps(a, b); + return _mm_set_pd(e0, e1); } -// Compares for not greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +// Return vector of type __m128d with all elements set to zero. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) { - return _mm_cmplt_ss(a, b); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif } -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) -FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) { - return _mm_cmple_ps(a, b); + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); } -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) -FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. +// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = __builtin_shufflevector( \ + _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032((a)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301((a)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321((a)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010((a)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101((a)); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211((a)); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122((a)); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332((a)); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat((a), 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat((a), 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat((a), 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat((a), 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default((a), (imm)); \ + break; \ + } \ + ret; \ + }) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? 
a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF count[63:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { - return _mm_cmple_ss(a, b); + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); } -// Compares for not less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF count[63:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { - return _mm_cmpgt_ps(a, b); + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); } -// Compares for not less than or equal. 
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF count[63:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { - return _mm_cmpgt_ss(a, b); + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); } -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) { - return _mm_cmpge_ps(a, b); + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); } -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) { - return _mm_cmpge_ss(a, b); + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); } -// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or -// unsigned 8-bit integers in b for equality. -// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx -FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. 
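+// (e.g. _mm_slli_si128(a, 4) computes a[127:0] << 32)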
+// +// tmp := imm8[7:0] +// IF tmp > 15 +// tmp := 16 +// FI +// dst[127:0] := a[127:0] << (tmp*8) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128 +FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm) { + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)}; return vreinterpretq_m128i_u8( - vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); + vld1q_u8(((uint8_t const *) tmp) + (16 - imm))); } -// Compare packed double-precision (64-bit) floating-point elements in a and b -// for equality, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd -FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { #if defined(__aarch64__) - return vreinterpretq_m128d_u64( - vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); + double a0 = sqrt(((double *) &a)[0]); + double a1 = sqrt(((double *) &a)[1]); + return _mm_set_pd(a1, a0); #endif } -// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or -// unsigned 16-bit integers in b for equality. -// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { - return vreinterpretq_m128i_u16( - vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_sqrt_pd(b)); +#else + return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); +#endif } -// Compare packed 32-bit integers in a and b for equality, and store the results -// in dst -FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF count[63:0] > 15 +// dst[i+15:i] := (a[i+15] ? 
0xFFFF : 0x0) +// ELSE +// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { - return vreinterpretq_m128i_u32( - vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); } -// Compare packed 64-bit integers in a and b for equality, and store the results -// in dst -FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF count[63:0] > 31 +// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { -#if defined(__aarch64__) - return vreinterpretq_m128i_u64( - vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); -#else - // ARMv7 lacks vceqq_u64 - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); -#endif + int64_t c = (int64_t) vget_low_s64((int64x2_t) count); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); } -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for lesser than. -// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx -FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) +// ELSE +// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { - return vreinterpretq_m128i_u8( - vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); } -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for greater than. +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. // -// r0 := (a0 > b0) ? 0xff : 0x0 -// r1 := (a1 > b1) ? 0xff : 0x0 -// ... -// r15 := (a15 > b15) ? 0xff : 0x0 +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := (a[i+31] ? 
0xFFFFFFFF : 0x0) +// ELSE +// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR // -// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) == 0)) { \ + ret = a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ + } \ + ret; \ + }) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF count[63:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { - return vreinterpretq_m128i_u8( - vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); } -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for less than. +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. // -// r0 := (a0 < b0) ? 0xffff : 0x0 -// r1 := (a1 < b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 < b7) ? 0xffff : 0x0 +// FOR j := 0 to 3 +// i := j*32 +// IF count[63:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) +// FI +// ENDFOR // -// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { - return vreinterpretq_m128i_u16( - vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); } -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for greater than. +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. // -// r0 := (a0 > b0) ? 0xffff : 0x0 -// r1 := (a1 > b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 > b7) ? 
0xffff : 0x0 +// FOR j := 0 to 1 +// i := j*64 +// IF count[63:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) +// FI +// ENDFOR // -// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { - return vreinterpretq_m128i_u16( - vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); } +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[7:0] > 15 +// dst[i+15:i] := 0 +// ELSE +// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely(imm & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ + } \ + ret; \ + }) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// IF imm8[7:0] > 31 +// dst[i+31:i] := 0 +// ELSE +// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely(imm & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ + } \ + ret; \ + }) -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for less than. -// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF imm8[7:0] > 63 +// dst[i+63:i] := 0 +// ELSE +// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + __extension__({ \ + __m128i ret; \ + if (_sse2neon_unlikely(imm & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ + } \ + ret; \ + }) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. 
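As a reading aid for the shift family above, a minimal sketch (not part of the upstream diff; helper name hypothetical, assumes sse2neon.h is on the include path) of the out-of-range behavior the `_sse2neon_unlikely` guards implement: arithmetic right shifts clamp an oversized count and fill with the sign bit, while logical right shifts produce zero.

#include <assert.h>
#include "sse2neon.h"

static void shift_clamp_demo(void)
{
    __m128i v = _mm_set1_epi16(-4);      /* 0xFFFC in every 16-bit lane */
    __m128i sra = _mm_srai_epi16(v, 20); /* count > 15: lanes become sign fill */
    __m128i srl = _mm_srli_epi16(v, 20); /* count > 15: lanes become zero */
    assert(_mm_extract_epi16(sra, 0) == 0xFFFF);
    assert(_mm_extract_epi16(srl, 0) == 0x0000);
}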
+// +// tmp := imm8[7:0] +// IF tmp > 15 +// tmp := 16 +// FI +// dst[127:0] := a[127:0] >> (tmp*8) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128 +FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm) { - return vreinterpretq_m128i_u32( - vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)}; + return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm)); } -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for greater than. -// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { - return vreinterpretq_m128i_u32( - vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#if defined(__aarch64__) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif } -// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers -// in b for greater than. -FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { #if defined(__aarch64__) - return vreinterpretq_m128i_u64( - vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); #else - // ARMv7 lacks vcgtq_s64. - // This is based off of Clang's SSE2 polyfill: - // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) - - // Mask the sign bit out since we need a signed AND an unsigned comparison - // and it is ugly to try and split them. 
- int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); - int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); - int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); - // Check if a > b - int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); - // Copy upper mask to lower mask - // a_hi > b_hi - int64x2_t gt_hi = vshrq_n_s64(greater, 63); - // Copy lower mask to upper mask - // a_lo > b_lo - int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); - // Compare for equality - int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); - // Copy upper mask to lower mask - // a_hi == b_hi - int64x2_t eq_hi = vshrq_n_s64(equal, 63); - // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) - int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); - return vreinterpretq_m128i_s64(ret); + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); #endif } -// Compares the four 32-bit floats in a and b to check if any values are NaN. -// Ordered compare between each value returns true for "orderable" and false for -// "not orderable" (NaN). -// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see -// also: -// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean -// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics -FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) { - // Note: NEON does not have ordered compare builtin - // Need to compare a eq a and b eq b to check for NaN - // Do AND of results to get final - uint32x4_t ceqaa = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t ceqbb = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif } -// Compares for ordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) -FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +// Stores four 32-bit integer values (as a __m128i value) at the address p. +// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) { - return _mm_move_ss(a, _mm_cmpord_ps(a, b)); + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } -// Compares for unordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) -FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. 
+// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { - uint32x4_t f32a = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t f32b = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif } -// Compares for unordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) -FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) { - return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); } -// Compares the lower single-precision floating point scalar values of a and b -// using a less than operation. : -// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important -// note!! The documentation on MSDN is incorrect! If either of the values is a -// NAN the docs say you will get a one, but in fact, it will return a zero!! -FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// +// MEM[mem_addr+63:mem_addr] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_lt_b = - vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif } -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than operation. : -// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx -FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
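A small usage sketch for the paired stores above (helper name and values are illustrative only): `_mm_storel_pd` and `_mm_storeh_pd` split a `__m128d` into its two scalar halves.

#include "sse2neon.h"

static void split_pd(__m128d v, double *lo, double *hi)
{
    _mm_storel_pd(lo, v); /* writes v[63:0]   */
    _mm_storeh_pd(hi, v); /* writes v[127:64] */
}
/* split_pd(_mm_set_pd(2.0, 1.0), &lo, &hi) leaves lo == 1.0 and hi == 2.0,
   since _mm_set_pd takes the high element first. */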
+// +// MEM[mem_addr+63:mem_addr] := a[127:64] +// MEM[mem_addr+127:mem_addr+64] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) { - // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = - vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); } -// Compares the lower single-precision floating point scalar values of a and b -// using a less than or equal operation. : -// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx -FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) { - // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_le_b = - vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0; + _mm_store_pd(mem_addr, a); } -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than or equal operation. : -// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx -FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +// Stores 128-bits of integer data a at the address p. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) { - // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = - vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } -// Compares the lower single-precision floating point scalar values of a and b -// using an equality operation. : -// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx -FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +// Stores 32-bits of integer data a at the address p. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) { - // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_eq_b = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); } -// Compares the lower single-precision floating point scalar values of a and b -// using an inequality operation. : -// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx -FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) { - // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_neq_b = vmvnq_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#elif defined(__aarch64__) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif } -// according to the documentation, these intrinsics behave the same as the -// non-'u' versions. We'll just alias them here. -#define _mm_ucomieq_ss _mm_comieq_ss -#define _mm_ucomige_ss _mm_comige_ss -#define _mm_ucomigt_ss _mm_comigt_ss -#define _mm_ucomile_ss _mm_comile_ss -#define _mm_ucomilt_ss _mm_comilt_ss -#define _mm_ucomineq_ss _mm_comineq_ss +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} -/* Conversions */ +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. 
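For the non-temporal stores above, the usual pattern is a streaming fill followed by a fence before the buffer is handed to a consumer; a sketch under the assumption that `dst` is 16-byte aligned (`fill_stream` is a hypothetical helper):

#include <stddef.h>
#include "sse2neon.h"

static void fill_stream(__m128i *dst, size_t n, __m128i pattern)
{
    for (size_t i = 0; i < n; i++)
        _mm_stream_si128(dst + i, pattern); /* bypasses the cache when possible */
    _mm_sfence();                           /* order the streamed stores */
}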
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} -// Convert packed signed 32-bit integers in b to packed single-precision -// (32-bit) floating-point elements, store the results in the lower 2 elements -// of dst, and copy the upper 2 packed elements from a to the upper elements of -// dst. +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or +// unsigned 32-bit integers of a. // -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps -FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) { - return vreinterpretq_m128_f32( - vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), - vget_high_f32(vreinterpretq_f32_m128(a)))); + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Convert the signed 32-bit integer b to a single-precision (32-bit) -// floating-point element, store the result in the lower element of dst, and -// copy the upper 3 packed elements from a to the upper elements of dst. +// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, +// and store the results in dst. +// r0 := a0 - b0 +// r1 := a1 - b1 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. 
// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:32] := a[127:32] +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss -FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] - db[0]; + c[1] = da[1] - db[1]; + return vld1q_f32((float32_t *) c); +#endif } -// Convert the signed 32-bit integer b to a single-precision (32-bit) -// floating-point element, store the result in the lower element of dst, and -// copy the upper 3 packed elements from a to the upper elements of dst. +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. // -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:32] := a[127:32] +// dst[63:0] := a[63:0] - b[63:0] // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss -#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} -// Convert the signed 64-bit integer b to a single-precision (32-bit) -// floating-point element, store the result in the lower element of dst, and -// copy the upper 3 packed elements from a to the upper elements of dst. +// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers +// of a and saturates. // -// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -// dst[127:32] := a[127:32] +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... +// r7 := SignedSaturate(a7 - b7) // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss -FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) { - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer, and store the result in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si -FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers +// of a and saturates. +// +// r0 := SignedSaturate(a0 - b0) +// r1 := SignedSaturate(a1 - b1) +// ... 
+// r15 := SignedSaturate(a15 - b15) +// +// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) { -#if defined(__aarch64__) - return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); -#else - float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t diff = data - floor(data); - if (diff > 0.5) - return (int32_t) ceil(data); - if (unlikely(diff == 0.5)) { - int32_t f = (int32_t) floor(data); - int32_t c = (int32_t) ceil(data); - return c & 1 ? f : c; - } - return (int32_t) floor(data); -#endif + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } -// Convert packed 16-bit integers in a to packed single-precision (32-bit) -// floating-point elements, and store the results in dst. +// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit +// integers of a and saturates. +// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit +// integers of a and saturates. // -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -// ENDFOR +// r0 := UnsignedSaturate(a0 - b0) +// r1 := UnsignedSaturate(a1 - b1) +// ... +// r15 := UnsignedSaturate(a15 - b15) // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps -FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) { - return vreinterpretq_m128_f32( - vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } -// Convert packed 32-bit integers in b to packed single-precision (32-bit) -// floating-point elements, store the results in the lower 2 elements of dst, -// and copy the upper 2 packed elements from a to the upper elements of dst. +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. 
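One common use of the unsigned saturating subtraction above is a branch-free absolute difference; a minimal sketch (helper name illustrative):

#include "sse2neon.h"

/* Per-lane |a - b| for unsigned 8-bit data: whichever subtraction would
   underflow saturates to zero, so OR-ing the two keeps the true difference. */
static __m128i absdiff_u8(__m128i a, __m128i b)
{
    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}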
// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps -FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { - return vreinterpretq_m128_f32( - vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), - vget_high_f32(vreinterpretq_f32_m128(a)))); +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif } -// Convert packed signed 32-bit integers in a to packed single-precision -// (32-bit) floating-point elements, store the results in the lower 2 elements -// of dst, then covert the packed signed 32-bit integers in b to -// single-precision (32-bit) floating-point element, and store the results in -// the upper 2 elements of dst. +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. // -// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) -// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps -FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... 
+// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { - return vreinterpretq_m128_f32(vcvtq_f32_s32( - vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif } -// Convert the lower packed 8-bit integers in a to packed single-precision -// (32-bit) floating-point elements, and store the results in dst. +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. // -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) -// ENDFOR +// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[127:64] +// dst[127:64] := src2[127:64] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps -FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { - return vreinterpretq_m128_f32(vcvtq_f32_s32( - vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif } -// Convert packed unsigned 16-bit integers in a to packed single-precision -// (32-bit) floating-point elements, and store the results in dst. +// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the +// lower 4 signed or unsigned 16-bit integers in b. // -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) -// ENDFOR +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// r4 := a2 +// r5 := b2 +// r6 := a3 +// r7 := b3 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps -FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { - return vreinterpretq_m128_f32( - vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif } -// Convert the lower packed unsigned 8-bit integers in a to packed -// single-precision (32-bit) floating-point elements, and store the results in -// dst. +// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the +// lower 2 signed or unsigned 32-bit integers in b. 
// -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) -// ENDFOR +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps -FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { - return vreinterpretq_m128_f32(vcvtq_f32_u32( - vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif } -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values using truncate. -// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); } -// Convert the lower double-precision (64-bit) floating-point element in a to a -// 64-bit integer with truncation, and store the result in dst. +// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower +// 8 signed or unsigned 8-bit integers in b. // -// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// ... +// r14 := a7 +// r15 := b7 // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 -FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) - return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else - double ret = *((double *) &a); - return (int64_t) ret; + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } -// Convert the lower double-precision (64-bit) floating-point element in a to a -// 64-bit integer with truncation, and store the result in dst. +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. 
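A standard application of the interleaves above is zero-extension: unpacking against a zero vector widens unsigned lanes. A sketch (assuming the little-endian lane order both SSE and NEON use here):

#include "sse2neon.h"

/* Widens the low eight unsigned 8-bit lanes of v to eight 16-bit lanes. */
static __m128i widen_lo_u8_to_u16(__m128i v)
{
    return _mm_unpacklo_epi8(v, _mm_setzero_si128()); /* {v0,0,v1,0,...} */
}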
// -// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[63:0] +// dst[127:64] := src2[63:0] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x -#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) - -// Converts the four signed 32-bit integer values of a to single-precision, -// floating-point values -// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { - return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif } -// Converts the four unsigned 8-bit integers in the lower 16 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) { - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - return vreinterpretq_m128i_u16(u16x8); + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. -// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx -FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) { - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_m128i_u32(u32x4); + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Converts the two unsigned 8-bit integers in the lower 16 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. 
+// +// FOR j := 0 to 1 +// i := j*64 +// IF ((j & 1) == 0) +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ELSE +// dst[i+63:i] := a[i+63:i] + b[i+63:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_u64(u64x2); + __m128d mask = _mm_set_pd(1.0f, -1.0f); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +#else + return _mm_add_pd(_mm_mul_pd(b, mask), a); +#endif } -// Converts the four unsigned 8-bit integers in the lower 16 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - return vreinterpretq_m128i_s16(s16x8); + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; +#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */ + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), + vreinterpretq_f32_m128(mask), + vreinterpretq_f32_m128(b))); +#else + return _mm_add_ps(_mm_mul_ps(b, mask), a); +#endif } -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_m128i_s32(s32x4); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif } -// Converts the two signed 8-bit integers in the lower 32 bits to four -// signed 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +// Computes pairwise add of each argument as single-precision, floating-point +// values a and b. 
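A quick illustration of the alternating pattern `_mm_addsub_ps` produces (values invented for the example):

#include "sse2neon.h"

static void addsub_demo(void)
{
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes {1,2,3,4} */
    __m128 b = _mm_set1_ps(1.0f);
    /* even lanes subtract, odd lanes add: {1-1, 2+1, 3-1, 4+1} = {0,3,2,5} */
    __m128 r = _mm_addsub_ps(a, b);
    (void) r;
}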
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_s64(s64x2); +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32( + vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); +#endif } -// Converts the four signed 16-bit integers in the lower 64 bits to four signed -// 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) { - return vreinterpretq_m128i_s32( - vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vsubq_f64( + vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)), + vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)))); +#else + double *da = (double *) &_a; + double *db = (double *) &_b; + double c[] = {da[0] - da[1], db[0] - db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif } -// Converts the two signed 16-bit integers in the lower 32 bits two signed -// 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { - int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_s64(s64x2); +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vsubq_f32( + vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), + vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); +#else + float32x4x2_t c = + vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif } -// Converts the four unsigned 16-bit integers in the lower 64 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. 
+// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { - return vreinterpretq_m128i_u32( - vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif } -// Converts the two unsigned 16-bit integers in the lower 32 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) { - uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_u64(u64x2); +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif } -// Converts the two unsigned 32-bit integers in the lower 64 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) { - return vreinterpretq_m128i_u64( - vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif } -// Converts the two signed 32-bit integers in the lower 64 bits to two signed -// 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. 
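The two duplication shuffles above are the classic SSE3 building blocks for interleaved complex arithmetic; a sketch of one complex multiply (assumes {re, im} pairs packed per 64 bits; `cmul_ps` is a hypothetical helper):

#include "sse2neon.h"

/* Multiplies two complex numbers per vector, lanes laid out {re, im, re, im}. */
static __m128 cmul_ps(__m128 a, __m128 b)
{
    __m128 re = _mm_moveldup_ps(a);  /* {a.re, a.re, ...} */
    __m128 im = _mm_movehdup_ps(a);  /* {a.im, a.im, ...} */
    __m128 sw = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)); /* swap re/im */
    /* (re*b) -/+ (im*sw): real lanes subtract, imaginary lanes add */
    return _mm_addsub_ps(_mm_mul_ps(re, b), _mm_mul_ps(im, sw));
}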
+// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) { - return vreinterpretq_m128i_s64( - vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); } -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values. +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. // -// r0 := (int) a0 -// r1 := (int) a1 -// r2 := (int) a2 -// r3 := (int) a3 +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR // -// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx -// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A -// does not support! It is supported on ARMv8-A however. -FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) { -#if defined(__aarch64__) - return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); -#else - uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), - vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( - vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = - vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ - int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( - vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ - int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), - vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ - float32x4_t delta = vsubq_f32( - vreinterpretq_f32_m128(a), - vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ - uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); -#endif + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); } -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 16-bit integers, and store the results in dst. Note: this intrinsic -// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and -// 0x7FFFFFFF. +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 -FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) { - return vreinterpret_m64_s16( - vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); } -// Copy the lower 32-bit integer in a to dst. +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. 
// -// dst[31:0] := a[31:0] +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 -FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) { - return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); } -// Copy the lower 64-bit integer in a to dst. +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. // -// dst[63:0] := a[63:0] +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 -FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) { - return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); } -// Copy the lower 64-bit integer in a to dst. +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. // -// dst[63:0] := a[63:0] +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x -#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, -// zero extending the upper bits. +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. // -// r0 := a -// r1 := 0x0 -// r2 := 0x0 -// r3 := 0x0 +// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +// dst[127:0] := tmp[127:0] // -// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 +FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm) { - return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + int idx; + uint8x16_t tmp[2]; + if (imm >= 16) { + idx = imm - 16; + tmp[0] = vreinterpretq_u8_m128i(a); + tmp[1] = vdupq_n_u8(0); + } else { + idx = imm; + tmp[0] = vreinterpretq_u8_m128i(b); + tmp[1] = vreinterpretq_u8_m128i(a); + } + return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx)); } -// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, -// zero extending the upper bits. +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. 
 //
-// r0 := a
-// r1 := 0x0
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
+// dst[63:0] := tmp[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm) \
+    __extension__({ \
+        __m64 ret; \
+        if (_sse2neon_unlikely((imm) >= 16)) { \
+            ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
+        } else { \
+            uint8x8_t tmp_low, tmp_high; \
+            if (imm >= 8) { \
+                const int idx = imm - 8; \
+                tmp_low = vreinterpret_u8_m64(a); \
+                tmp_high = vdup_n_u8(0); \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } else { \
+                const int idx = imm; \
+                tmp_low = vreinterpret_u8_m64(b); \
+                tmp_high = vreinterpret_u8_m64(a); \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } \
+        } \
+        ret; \
+    })
+
+// Computes the pairwise add of the 16-bit signed or unsigned integer values
+// in a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
 {
-    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
 }
 
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+// Computes the pairwise add of the 32-bit signed or unsigned integer values
+// in a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
 {
-    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
 }
 
-// Applies a type cast to reinterpret four 32-bit floating point values passed
-// in as a 128-bit parameter as packed 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
 {
-    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
 }
 
-// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
-FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
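+//
+// For example (illustrative values, high lane first):
+// _mm_hadd_pi32({a1, a0}, {b1, b0}) returns {b0+b1, a0+a1}.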
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Computes the saturated pairwise add of the 16-bit signed integer values
+// in a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
 #else
-    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
 #endif
 }
 
-// Applies a type cast to reinterpret four 32-bit integers passed in as a
-// 128-bit parameter as packed 32-bit floating point values.
-// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
 {
-    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
 }
 
-// Loads 128-bit value. :
-// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+// Computes the pairwise difference of the 16-bit signed or unsigned integer
+// values in a and b.
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
 {
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
 }
 
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+// Computes the pairwise difference of the 32-bit signed or unsigned integer
+// values in a and b.
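+// For example (illustrative values, high lane first):
+// _mm_hsub_epi32({a3, a2, a1, a0}, {b3, b2, b1, b0}) returns
+// {b2-b3, b0-b1, a2-a3, a0-a1}.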
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int64x2_t a = vreinterpretq_s64_m128i(_a);
+    int64x2_t b = vreinterpretq_s64_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|b0|b2]
+    // [a1|a3|b1|b3]
+    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+    // Subtract
+    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
+FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
+{
+    int32x4_t ab =
+        vcombine_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
+
+    int16x4_t ab_low_bits = vmovn_s32(ab);
+    int16x4_t ab_high_bits = vshrn_n_s32(ab, 16);
+
+    return vreinterpret_m64_s16(vsub_s16(ab_low_bits, ab_high_bits));
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
+FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
 {
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+    return vreinterpret_m64_s32(vsub_s32(vtrn1_s32(a, b), vtrn2_s32(a, b)));
 #else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+    int32x2x2_t trn_ab =
+        vtrn_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
+    return vreinterpret_m64_s32(vsub_s32(trn_ab.val[0], trn_ab.val[1]));
 #endif
 }
 
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
+// Computes the saturated pairwise difference of the 16-bit signed integer
+// values in a and b.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated subtract
+    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+#endif
+}
 
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-//
-// dst[63:0] := a[63:0]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) { + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); + return vreinterpret_s64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else - return vreinterpretq_m128d_f32(vcombine_f32( - vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); + int16x4x2_t res = vuzp_s16(a, b); + return vreinterpret_s64_s16(vqsub_s16(res.val[0], res.val[1])); #endif } -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 -#define _mm_load_pd1 _mm_load1_pd +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + +// a[i+7:i]*b[i+7:i] ) +// ENDFOR +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd -#define _mm_loaddup_pd _mm_load1_pd + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); -// Loads 128-bit value. : -// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. 
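+//
+// For each 16-bit result lane, with a treated as unsigned bytes and b as
+// signed bytes (illustrative formula for lane 0):
+//
+//   dst0 := Saturate_To_Int16(u8(a1) * s8(b1) + u8(a0) * s8(b0))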
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) { - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); } -// Load unaligned 32-bit integer from memory into the first element of dst. -// -// dst[31:0] := MEM[mem_addr+31:mem_addr] -// dst[MAX:32] := 0 +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. // -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 -FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) +// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) +// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) +// ... +// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) { - return vreinterpretq_m128i_s32( - vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); } -// Convert packed double-precision (64-bit) floating-point elements in a to -// packed single-precision (32-bit) floating-point elements, and store the -// results in dst. +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. 
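+//
+// Each control byte in b selects a source byte of a by its low four bits; if
+// the control byte's high bit is set, the destination byte is zeroed instead.
+// For example (illustrative values): a control byte of 0x03 picks a[3], while
+// 0x83 produces 0.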
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // Generic ARMv7-A fallback: split the table in two halves and emulate
+    // the 16-byte lookup with a pair of vtbl2 lookups.
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
 //
-// FOR j := 0 to 1
-// i := 32*j
-// k := 64*j
-// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
+// FOR j := 0 to 7
+//   i := j*8
+//   IF b[i+7] == 1
+//     dst[i+7:i] := 0
+//   ELSE
+//     index[2:0] := b[i+2:i]
+//     dst[i+7:i] := a[index*8+7:index*8]
+//   FI
 // ENDFOR
-// dst[127:64] := 0
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t)(0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..7
+//   if b[i] < 0
+//     r[i] := -a[i]
+//   else if b[i] == 0
+//     r[i] := 0
+//   else
+//     r[i] := a[i]
+//   fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
 {
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
 #if defined(__aarch64__)
-    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
-    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
 #else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
 #endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
 }
 
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
 //
-// dst[63:0] := a[63:0]
+// for i in 0..3
+//   if b[i] < 0
+//     r[i] := -a[i]
+//   else if b[i] == 0
+//     r[i] := 0
+//   else
+//     r[i] := a[i]
+//   fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+// for i in 0..15
+//   if b[i] < 0
+//     r[i] := -a[i]
+//   else if b[i] == 0
+//     r[i] := 0
+//   else
+//     r[i] := a[i]
+//   fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
 {
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
 #if defined(__aarch64__)
-    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
 #else
-    return ((double *) &a)[0];
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
 #endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
 }
 
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
 //
-// FOR j := 0 to 1
-// i := 64*j
-// k := 32*j
-// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+// FOR j := 0 to 3
+//   i := j*16
+//   IF b[i+15:i] < 0
+//     dst[i+15:i] := -(a[i+15:i])
+//   ELSE IF b[i+15:i] == 0
+//     dst[i+15:i] := 0
+//   ELSE
+//     dst[i+15:i] := a[i+15:i]
+//   FI
 // ENDFOR
 //
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
 {
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
 #if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
 #else
-    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    return _mm_set_pd(a1, a0);
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
 #endif
-}
 
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
-    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
+    // 'a') based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
 }
 
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 1
+//   i := j*32
+//   IF b[i+31:i] < 0
+//     dst[i+31:i] := -(a[i+31:i])
+//   ELSE IF b[i+31:i] == 0
+//     dst[i+31:i] := 0
+//   ELSE
+//     dst[i+31:i] := a[i+31:i]
+//   FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
 {
-    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ?
0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
+    // 'a') based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
 }
 
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Elements in dst are zeroed
+// out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 7
+//   i := j*8
+//   IF b[i+7:i] < 0
+//     dst[i+7:i] := -(a[i+7:i])
+//   ELSE IF b[i+7:i] == 0
+//     dst[i+7:i] := 0
+//   ELSE
+//     dst[i+7:i] := a[i+7:i]
+//   FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
 {
-    // Use a signed shift right to create a mask with the sign bit
-    uint32x4_t mask =
-        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
 }
 
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+//
+// FOR j := 0 to 7
+//   i := j*16
+//   IF imm8[j]
+//     dst[i+15:i] := b[i+15:i]
+//   ELSE
+//     dst[i+15:i] := a[i+15:i]
+//   FI
+// ENDFOR
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm) \
+    __extension__({ \
+        const uint16_t ones = 0xffff; \
+        const uint16_t zeros = 0x0000; \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? ones : zeros, \
+                                   ((imm) & (1 << 1)) ? ones : zeros, \
+                                   ((imm) & (1 << 2)) ? ones : zeros, \
+                                   ((imm) & (1 << 3)) ? ones : zeros, \
+                                   ((imm) & (1 << 4)) ? ones : zeros, \
+                                   ((imm) & (1 << 5)) ? ones : zeros, \
+                                   ((imm) & (1 << 6)) ? ones : zeros, \
+                                   ((imm) & (1 << 7)) ? ones : zeros}; \
+        uint16x8_t _mask_vec = vld1q_u16(_mask); \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a); \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b); \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
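+//
+// Bit j of imm8 selects 64-bit lane j of b; otherwise lane j of a is kept.
+// For example (illustrative values, high lane first):
+// _mm_blend_pd(a, b, 0x2) returns {b1, a0}.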
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + __extension__({ \ + const uint64_t _mask[2] = { \ + ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t _a = vreinterpretq_u64_m128d(a); \ + uint64x2_t _b = vreinterpretq_u64_m128d(b); \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ + }) + // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps @@ -5672,6 +7337,27 @@ FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// +// FOR j := 0 to 15 +// i := j*8 +// IF mask[i+7] +// dst[i+7:i] := b[i+7:i] +// ELSE +// dst[i+7:i] := a[i+7:i] +// FI +// ENDFOR +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + // Blend packed double-precision (64-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd @@ -5690,154 +7376,55 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) #endif } -typedef struct { - uint16_t res0; - uint8_t res1 : 6; - uint8_t bit22 : 1; - uint8_t bit23 : 1; - uint8_t res2; -#if defined(__aarch64__) - uint32_t res3; -#endif -} fpcr_bitfield; - -// Macro: Set the rounding mode bits of the MXCSR control and status register to -// the value in unsigned 32-bit integer a. The rounding mode may contain any of -// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, -// _MM_ROUND_TOWARD_ZERO -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE -FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) -{ - union { - fpcr_bitfield field; -#if defined(__aarch64__) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) - asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ -#else - asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - switch (rounding) { - case _MM_ROUND_TOWARD_ZERO: - r.field.bit22 = 1; - r.field.bit23 = 1; - break; - case _MM_ROUND_DOWN: - r.field.bit22 = 0; - r.field.bit23 = 1; - break; - case _MM_ROUND_UP: - r.field.bit22 = 1; - r.field.bit23 = 0; - break; - default: //_MM_ROUND_NEAREST - r.field.bit22 = 0; - r.field.bit23 = 0; - } - -#if defined(__aarch64__) - asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ -#else - asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ -#endif -} - -FORCE_INLINE void _mm_setcsr(unsigned int a) +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. 
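+//
+// Unlike the immediate-mask _mm_blend_ps above, the mask here is a runtime
+// vector, and only the sign bit of each 32-bit mask lane is consulted; the
+// implementation below broadcasts that bit with a signed shift right by 31.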
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) { - _MM_SET_ROUNDING_MODE(a); + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } -// Round the packed single-precision (32-bit) floating-point elements in a using -// the rounding parameter, and store the results as packed single-precision +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision // floating-point elements in dst. -// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps -FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { #if defined(__aarch64__) - switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); - default: //_MM_FROUND_CUR_DIRECTION - return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); - } + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else - float *v_float = (float *) &a; - __m128 zero, neg_inf, pos_inf; - - switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): - return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): - return (__m128){floorf(v_float[0]), floorf(v_float[1]), - floorf(v_float[2]), floorf(v_float[3])}; - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): - return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), - ceilf(v_float[3])}; - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): - zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); - neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), - floorf(v_float[2]), floorf(v_float[3])); - pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), - ceilf(v_float[2]), ceilf(v_float[3])); - return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); - default: //_MM_FROUND_CUR_DIRECTION - return (__m128){roundf(v_float[0]), roundf(v_float[1]), - roundf(v_float[2]), roundf(v_float[3])}; - } + double *f = (double *) &a; + return _mm_set_pd(ceil(f[1]), ceil(f[0])); #endif } -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 32-bit integers, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi -FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { #if defined(__aarch64__) - return vreinterpret_m64_s32( - vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)))); + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); #else - return vreinterpret_m64_s32( - vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128( - _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))))); + float *f = (float *) &a; + return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); #endif } -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 32-bit integers, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 -#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) - -// Round the packed single-precision (32-bit) floating-point elements in a up to -// an integer value, and store the results as packed single-precision -// floating-point elements in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps -FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd +FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) { - return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + return _mm_move_sd(a, _mm_ceil_pd(b)); } // Round the lower single-precision (32-bit) floating-point element in b up to @@ -5851,396 +7438,442 @@ FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { - return _mm_move_ss( - a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); + return _mm_move_ss(a, _mm_ceil_ps(b)); } -// Round the packed single-precision (32-bit) floating-point elements in a down -// to an integer value, and store the results as packed single-precision -// floating-point elements in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps -FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { - return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif } -// Round the lower single-precision (32-bit) floating-point element in b down to -// an integer value, store the result as a single-precision floating-point -// element in the lower element of dst, and copy the upper 3 packed elements -// from a to the upper elements of dst. 
-//
-// dst[31:0] := FLOOR(b[31:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
-FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
 {
-    return _mm_move_ss(
-        a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
 }
 
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-//
-// dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
+// Converts the two signed 16-bit integers in the lower 32 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
 
-/* Miscellaneous Operations */
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
 
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-// r0 := a0 >> count
-// r1 := a1 >> count
-// ...
-// r7 := a7 >> count
-//
-// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+// Converts the eight signed 8-bit integers in the lower 64 bits to eight
+// signed 16-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
 {
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (unlikely(c > 15))
-        return _mm_cmplt_epi16(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
 }
 
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-// r0 := a0 >> count
-// r1 := a1 >> count
-// r2 := a2 >> count
-// r3 := a3 >> count
-//
-// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+// Converts the four signed 8-bit integers in the lower 32 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
 {
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (unlikely(c > 31))
-        return _mm_cmplt_epi32(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
 }
 
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
-// saturates.
-// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+// Converts the two signed 8-bit integers in the lower 16 bits to two
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
 {
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
 }
 
-// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
-// integers and saturates.
-//
-// r0 := UnsignedSaturate(a0)
-// r1 := UnsignedSaturate(a1)
-// ...
-// r7 := UnsignedSaturate(a7)
-// r8 := UnsignedSaturate(b0)
-// r9 := UnsignedSaturate(b1)
-// ...
-// r15 := UnsignedSaturate(b7)
-//
-// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
 {
-    return vreinterpretq_m128i_u8(
-        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
 }
 
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
-// and saturates.
-//
-// r0 := SignedSaturate(a0)
-// r1 := SignedSaturate(a1)
-// r2 := SignedSaturate(a2)
-// r3 := SignedSaturate(a3)
-// r4 := SignedSaturate(b0)
-// r5 := SignedSaturate(b1)
-// r6 := SignedSaturate(b2)
-// r7 := SignedSaturate(b3)
-//
-// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
 {
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
 }
 
-// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
-// integers and saturates.
-//
-// r0 := UnsignedSaturate(a0)
-// r1 := UnsignedSaturate(a1)
-// r2 := UnsignedSaturate(a2)
-// r3 := UnsignedSaturate(a3)
-// r4 := UnsignedSaturate(b0)
-// r5 := UnsignedSaturate(b1)
-// r6 := UnsignedSaturate(b2)
-// r7 := UnsignedSaturate(b3)
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
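+//
+// For example (illustrative values): a low half holding {0xFFFFFFFF, 1}
+// zero-extends to {0x00000000FFFFFFFF, 1} here, whereas the signed
+// _mm_cvtepi32_epi64 above would sign-extend 0xFFFFFFFF to -1.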
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { - return vreinterpretq_m128i_u16( - vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), - vqmovun_s32(vreinterpretq_s32_m128i(b)))); + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); } -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower -// 8 signed or unsigned 8-bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// ... -// r14 := a7 -// r15 := b7 -// -// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) { -#if defined(__aarch64__) - return vreinterpretq_m128i_s8( - vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -#else - int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -#endif + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); } -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the -// lower 4 signed or unsigned 16-bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// r4 := a2 -// r5 := b2 -// r6 := a3 -// r7 := b3 -// -// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +// Converts the four unsigned 8-bit integers in the lower 32 bits to four +// unsigned 32-bit integers. +// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { -#if defined(__aarch64__) - return vreinterpretq_m128i_s16( - vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -#else - int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -#endif + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); } -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the -// lower 2 signed or unsigned 32 - bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +// Converts the two unsigned 8-bit integers in the lower 16 bits to two +// unsigned 64-bit integers. 
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd +FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) { + // Generate mask value from constant immediate bit value + const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; + const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; +#if !SSE2NEON_PRECISE_DP + const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; + const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; +#endif + // Conditional multiplication +#if !SSE2NEON_PRECISE_DP + __m128d mul = _mm_mul_pd(a, b); + const __m128d mulMask = + _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); + __m128d tmp = _mm_and_pd(mul, mulMask); +#else #if defined(__aarch64__) - return vreinterpretq_m128i_s32( - vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); + double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) + : 0; + double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) + : 0; #else - int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); + double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; + double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0; +#endif + __m128d tmp = _mm_set_pd(d1, d0); #endif + // Sum the products +#if defined(__aarch64__) + double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); +#else + double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); +#endif + // Conditionally store the sum + const __m128d sumMask = + _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); + __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); + return res; } -FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. 
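+//
+// For example (illustrative value): imm8 = 0x71 sums the products of lanes
+// 0-2 (bits 4-6 set in the high nibble) and stores that sum only into lane 0
+// (bit 0 set in the low nibble), zeroing lanes 1-3.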
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { - int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); - int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +#if defined(__aarch64__) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); + } + if (imm == 0x7F) { + float32x4_t m = _mm_mul_ps(a, b); + m[3] = 0; + return _mm_set1_ps(vaddvq_f32(m)); + } +#endif + + float s = 0, c = 0; + float32x4_t f32a = vreinterpretq_f32_m128(a); + float32x4_t f32b = vreinterpretq_f32_m128(b); + + /* To improve the accuracy of floating-point summation, Kahan algorithm + * is used for each operation. + */ + if (imm & (1 << 4)) + _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + if (imm & (1 << 5)) + _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + if (imm & (1 << 6)) + _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + if (imm & (1 << 7)) + _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); + s += c; + + float32x4_t res = { + (imm & 0x1) ? s : 0, + (imm & 0x2) ? s : 0, + (imm & 0x4) ? s : 0, + (imm & 0x8) ? s : 0, + }; + return vreinterpretq_m128_f32(res); } -// Selects and interleaves the lower two single-precision, floating-point values -// from a and b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +// Extracts the selected signed or unsigned 32-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extracts the selected signed or unsigned 64-bit integer from a and zero +// extends. +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8 +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd +FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { #if defined(__aarch64__) - return vreinterpretq_m128_f32( - vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else - float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); + double *f = (double *) &a; + return _mm_set_pd(floor(f[1]), floor(f[0])); #endif } -// Unpack and interleave double-precision (64-bit) floating-point elements from -// the low half of a and b, and store the results in dst. -// -// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { -// dst[63:0] := src1[63:0] -// dst[127:64] := src2[63:0] -// RETURN dst[127:0] -// } -// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd -FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { #if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); #else - return vreinterpretq_m128d_s64( - vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), - vget_low_s64(vreinterpretq_s64_m128d(b)))); + float *f = (float *) &a; + return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); #endif } -// Unpack and interleave double-precision (64-bit) floating-point elements from -// the high half of a and b, and store the results in dst. -// -// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { -// dst[63:0] := src1[127:64] -// dst[127:64] := src2[127:64] -// RETURN dst[127:0] -// } -// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd -FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); -#else - return vreinterpretq_m128d_s64( - vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), - vget_high_s64(vreinterpretq_s64_m128d(b)))); -#endif + return _mm_move_sd(a, _mm_floor_pd(b)); } -// Selects and interleaves the upper two single-precision, floating-point values -// from a and b. 
+// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. // -// r0 := a2 -// r1 := b2 -// r2 := a3 -// r3 := b3 +// dst[31:0] := FLOOR(b[31:0]) +// dst[127:32] := a[127:32] // -// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -#endif + return _mm_move_ss(a, _mm_floor_ps(b)); } -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper -// 8 signed or unsigned 8-bit integers in b. +// Inserts the least significant 32 bits of b into the selected 32-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ + }) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + __extension__({ \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \ + ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + vreinterpretq_m128_f32( \ + vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \ + }) + +// epi versions of min/max +// Computes the pairwise maximums of the four signed 32-bit integer values of a +// and b. // -// r0 := a8 -// r1 := b8 -// r2 := a9 -// r3 := b9 -// ... -// r14 := a15 -// r15 := b15 +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 > b0) ? 
a0 : b0 +// r1 := (a1 > b1) ? a1 : b1 +// r2 := (a2 > b2) ? a2 : b2 +// r3 := (a3 > b3) ? a3 : b3 // -// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) { -#if defined(__aarch64__) return vreinterpretq_m128i_s8( - vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -#else - int8x8_t a1 = - vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = - vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -#endif + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } -// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the -// upper 4 signed or unsigned 16-bit integers in b. +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Computes the pairwise minima of the four signed 32-bit integer values of a +// and b. // -// r0 := a4 -// r1 := b4 -// r2 := a5 -// r3 := b5 -// r4 := a6 -// r5 := b6 -// r6 := a7 -// r7 := b7 +// A 128-bit parameter that can be defined with the following equations: +// r0 := (a0 < b0) ? a0 : b0 +// r1 := (a1 < b1) ? a1 : b1 +// r2 := (a2 < b2) ? a2 : b2 +// r3 := (a3 < b3) ? a3 : b3 // -// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) { -#if defined(__aarch64__) - return vreinterpretq_m128i_s16( - vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -#else - int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -#endif + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the -// upper 2 signed or unsigned 32-bit integers in b. 
-// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) { -#if defined(__aarch64__) - return vreinterpretq_m128i_s32( - vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -#else - int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); -#endif + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } -// Interleaves the upper signed or unsigned 64-bit integer in a with the -// upper signed or unsigned 64-bit integer in b. -// -// r0 := a1 -// r1 := b1 -FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) { - int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); - int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Horizontally compute the minimum amongst the packed unsigned 16-bit integers @@ -6296,6 +7929,339 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) return dst; } +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. 
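+// For example, imm = 0b101 selects the eight a quadruplets starting at byte +// offset 4 (imm8[2] = 1) and the b quadruplet at bytes 4..7 (imm8[1:0] = 0b01), +// so dst word i is |a[4+i]-b[4]| + |a[5+i]-b[5]| + |a[6+i]-b[6]| + |a[7+i]-b[7]|.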
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // no byte offset into a; use it as-is + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); + _a = vextq_u8(_a, _a, 1); + c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); + _a = vextq_u8(_a, _a, 1); + c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); + _a = vextq_u8(_a, _a, 1); + c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b))); +#if defined(__aarch64__) + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +#else + int16x4_t c01, c23, c45, c67; + c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); + c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); + c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); + c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); + + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); +#endif +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// +// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 +// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or +// unsigned 32-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit +// integers and saturates. 
+// +// r0 := UnsignedSaturate(a0) +// r1 := UnsignedSaturate(a1) +// r2 := UnsignedSaturate(a2) +// r3 := UnsignedSaturate(a3) +// r4 := UnsignedSaturate(b0) +// r5 := UnsignedSaturate(b1) +// r6 := UnsignedSaturate(b2) +// r7 := UnsignedSaturate(b3) +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd +FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } +#else + double *v_double = (double *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + double res[2], tmp; + for (int i = 0; i < 2; i++) { + tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i]; + double roundDown = floor(tmp); // Round down value + double roundUp = ceil(tmp); // Round up value + double diffDown = tmp - roundDown; + double diffUp = roundUp - tmp; + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + res[i] = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + res[i] = roundUp; + } else { + /* If it's equidistant between round up and round down value, + * pick the one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value + */ + res[i] = roundUp; + } else { + /* If the round up value is odd, return the round down value + */ + res[i] = roundDown; + } + } + res[i] = (v_double[i] < 0) ? -res[i] : res[i]; + } + return _mm_set_pd(res[1], res[0]); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_pd(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_pd(a); + } + return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]), + v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
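+// On AArch64 every mode lowers to a single FRINT instruction (FRINTN, FRINTM, +// FRINTP, FRINTZ or FRINTI); the ARMv7 path below emulates round-to-nearest-even +// with a truncate-and-adjust sequence and handles the remaining modes with +// floorf()/ceilf().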
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if defined(__aarch64__) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_ps(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_ps(a); + } + return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), + v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), + v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), + v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_load) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t zf = + vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); + uint64x2_t cf = + vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); + uint64x2_t result = vandq_u64(zf, cf); + return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1)); } + // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. @@ -6312,6 +8278,14 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. 
Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the ZF value. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) @@ -6321,299 +8295,93 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } -// Extracts the selected signed or unsigned 8-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) -#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) - -// Inserts the least significant 8 bits of b into the selected 8-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, -// __constrange(0,16) int imm) -#define _mm_insert_epi8(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s8( \ - vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 16-bit integer from a and zero -// extends. -// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx -// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) -#define _mm_extract_epi16(a, imm) \ - vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) - -// Inserts the least significant 16 bits of b into the selected 16-bit integer -// of a. -// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx -// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, -// __constrange(0,8) int imm) -#define _mm_insert_epi16(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s16( \ - vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ - }) - -// Copy a to dst, and insert the 16-bit integer i into dst at the location -// specified by imm8. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 -#define _mm_insert_pi16(a, b, imm) \ - __extension__({ \ - vreinterpret_m64_s16( \ - vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 32-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) -#define _mm_extract_epi32(a, imm) \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) - -// Extracts the selected single-precision (32-bit) floating-point from a. -// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) -#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) - -// Inserts the least significant 32 bits of b into the selected 32-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, -// __constrange(0,4) int imm) -#define _mm_insert_epi32(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 64-bit integer from a and zero -// extends. 
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) -#define _mm_extract_epi64(a, imm) \ - vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) - -// Inserts the least significant 64 bits of b into the selected 64-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, -// __constrange(0,2) int imm) -#define _mm_insert_epi64(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s64( \ - vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ - }) +/* SSE4.2 */ -// Count the number of bits set to 1 in unsigned 32-bit integer a, and -// return that count in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 -FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) -#if __has_builtin(__builtin_popcount) - return __builtin_popcount(a); + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else - return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); -#endif -#else - uint32_t count = 0; - uint8x8_t input_val, count8x8_val; - uint16x4_t count16x4_val; - uint32x2_t count32x2_val; - - input_val = vld1_u8((uint8_t *) &a); - count8x8_val = vcnt_u8(input_val); - count16x4_val = vpaddl_u8(count8x8_val); - count32x2_val = vpaddl_u16(count16x4_val); - - vst1_u32(&count, count32x2_val); - return count; + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); #endif } -// Count the number of bits set to 1 in unsigned 64-bit integer a, and -// return that count in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 -FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) { -#if defined(__aarch64__) -#if __has_builtin(__builtin_popcountll) - return __builtin_popcountll(a); -#else - return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); -#endif +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); #else - uint64_t count = 0; - uint8x8_t input_val, count8x8_val; - uint16x4_t count16x4_val; - uint32x2_t count32x2_val; - uint64x1_t count64x1_val; - - input_val = vld1_u8((uint8_t *) &a); - count8x8_val = vcnt_u8(input_val); - count16x4_val = vpaddl_u8(count8x8_val); - count32x2_val = vpaddl_u16(count16x4_val); - count64x1_val = vpaddl_u32(count32x2_val); - vst1_u64(&count, count64x1_val); - return count; + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); #endif + return crc; } -// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision -// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the -// transposed matrix in these vectors (row0 now contains column 0, etc.). 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS -#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ - do { \ - float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ - float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ - row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ - vget_low_f32(ROW23.val[0])); \ - row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ - vget_low_f32(ROW23.val[1])); \ - row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ - vget_high_f32(ROW23.val[0])); \ - row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ - vget_high_f32(ROW23.val[1])); \ - } while (0) - -/* Crypto Extensions */ - -#if defined(__ARM_FEATURE_CRYPTO) -// Wraps vmull_p64 -FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) -{ - poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); - poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); - return vreinterpretq_u64_p128(vmull_p64(a, b)); -} -#else // ARMv7 polyfill -// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. -// -// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a -// 64-bit->128-bit polynomial multiply. -// -// It needs some work and is somewhat slow, but it is still faster than all -// known scalar methods. -// -// Algorithm adapted to C from -// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted -// from "Fast Software Polynomial Multiplication on ARM Processors Using the -// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab -// (https://hal.inria.fr/hal-01506572) -static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) { - poly8x8_t a = vreinterpret_p8_u64(_a); - poly8x8_t b = vreinterpret_p8_u64(_b); - - // Masks - uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), - vcreate_u8(0x00000000ffffffff)); - uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), - vcreate_u8(0x0000000000000000)); - - // Do the multiplies, rotating with vext to get all combinations - uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 - uint8x16_t e = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 - uint8x16_t f = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 - uint8x16_t g = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 - uint8x16_t h = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 - uint8x16_t i = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 - uint8x16_t j = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 - uint8x16_t k = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 - - // Add cross products - uint8x16_t l = veorq_u8(e, f); // L = E + F - uint8x16_t m = veorq_u8(g, h); // M = G + H - uint8x16_t n = veorq_u8(i, j); // N = I + J - - // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL - // instructions. 
-#if defined(__aarch64__) - uint8x16_t lm_p0 = vreinterpretq_u8_u64( - vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); - uint8x16_t lm_p1 = vreinterpretq_u8_u64( - vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); - uint8x16_t nk_p0 = vreinterpretq_u8_u64( - vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); - uint8x16_t nk_p1 = vreinterpretq_u8_u64( - vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); #else - uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); - uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); - uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); - uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); #endif - // t0 = (L) (P0 + P1) << 8 - // t1 = (M) (P2 + P3) << 16 - uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); - uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); - uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); - - // t2 = (N) (P4 + P5) << 24 - // t3 = (K) (P6 + P7) << 32 - uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); - uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); - uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + return crc; +} - // De-interleave -#if defined(__aarch64__) - uint8x16_t t0 = vreinterpretq_u8_u64( - vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); - uint8x16_t t1 = vreinterpretq_u8_u64( - vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); - uint8x16_t t2 = vreinterpretq_u8_u64( - vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); - uint8x16_t t3 = vreinterpretq_u8_u64( - vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); #else - uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); - uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); - uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); - uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); #endif - // Shift the cross products - uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 - uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 - uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 - uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 - - // Accumulate the products - uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); - uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); - uint8x16_t mix = veorq_u8(d, cross1); - uint8x16_t r = veorq_u8(mix, cross2); - return vreinterpretq_u64_u8(r); + return crc; } -#endif // ARMv7 polyfill -// Perform a carry-less multiplication of two 64-bit integers, selected from a -// and b according to imm8, and store the results in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 -FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) { - uint64x2_t a = vreinterpretq_u64_m128i(_a); - uint64x2_t b = vreinterpretq_u64_m128i(_b); - switch (imm & 0x11) { - case 0x00: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); - case 0x01: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); - case 0x10: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); - case 0x11: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); - default: - abort(); +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); } +#endif + return crc; } +/* AES */ + #if !defined(__ARM_FEATURE_CRYPTO) /* clang-format off */ #define SSE2NEON_AES_DATA(w) \ @@ -6752,22 +8520,22 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { /* FIXME: optimized for NEON */ uint8_t v[4][4] = { - [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, - [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, - [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, - [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, }; for (int i = 0; i < 16; i++) vreinterpretq_nth_u8_m128i(a, i) = @@ -6833,155 +8601,134 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) } #endif -/* Streaming Extensions */ +/* Others */ -// Guarantees that every preceding store is globally visible before any -// subsequent store. 
-// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx -FORCE_INLINE void _mm_sfence(void) +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) { - __sync_synchronize(); + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } } -// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- -// point elements) from a into memory using a non-temporal memory hint. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps -FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() { -#if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, (float32x4_t *) p); + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; #else - vst1q_f32(p, vreinterpretq_f32_m128(a)); + uint32_t value; #endif -} + } r; -// Stores the data in a to the address p without polluting the caches. If the -// cache line containing address p is already in the cache, the cache will be -// updated. -// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx -FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) -{ -#if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, p); +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else - vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; } -// Load 128-bits of integer data from memory into dst using a non-temporal -// memory hint. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 -FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { -#if __has_builtin(__builtin_nontemporal_store) - return __builtin_nontemporal_load(p); +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); #else - return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); #endif -} - -// Cache line containing p is flushed and invalidated from all caches in the -// coherency domain. : -// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx -FORCE_INLINE void _mm_clflush(void const *p) -{ - (void) p; - // no corollary for Neon? 
-} +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; -// Allocate aligned blocks of memory. -// https://software.intel.com/en-us/ -// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks -FORCE_INLINE void *_mm_malloc(size_t size, size_t align) -{ - void *ptr; - if (align == 1) - return malloc(size); - if (align == 2 || (sizeof(void *) == 8 && align == 4)) - align = sizeof(void *); - if (!posix_memalign(&ptr, align, size)) - return ptr; - return NULL; -} + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); -// Free aligned memory that was allocated with _mm_malloc. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free -FORCE_INLINE void _mm_free(void *addr) -{ - free(addr); + vst1_u32(&count, count32x2_val); + return count; +#endif } -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 8-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); #else - crc ^= v; - for (int bit = 0; bit < 8; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; #endif - return crc; } -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 16-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) { -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; #else - crc = _mm_crc32_u8(crc, v & 0xff); - crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); + uint32_t value; #endif - return crc; -} + } r; -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 32-bit integer v. 
-// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else - crc = _mm_crc32_u16(crc, v & 0xffff); - crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif - return crc; -} -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 64-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) -FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); + r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ #else - crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); - crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif - return crc; } #if defined(__GNUC__) || defined(__clang__) @@ -6993,4 +8740,4 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) #pragma GCC pop_options #endif -#endif +#endif
\ No newline at end of file diff --git a/thirdparty/embree/common/simd/simd.h b/thirdparty/embree/common/simd/simd.h index 195506b530..34e37b08b1 100644 --- a/thirdparty/embree/common/simd/simd.h +++ b/thirdparty/embree/common/simd/simd.h @@ -6,7 +6,7 @@ #include "../math/math.h" /* include SSE wrapper classes */ -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) # include "sse.h" #endif diff --git a/thirdparty/embree/common/simd/sse.h b/thirdparty/embree/common/simd/sse.h index 1465fb4fb0..04d90533dd 100644 --- a/thirdparty/embree/common/simd/sse.h +++ b/thirdparty/embree/common/simd/sse.h @@ -11,7 +11,7 @@ namespace embree { -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { return _mm_blendv_ps(f,t,mask); } diff --git a/thirdparty/embree/common/simd/vboold4_avx.h b/thirdparty/embree/common/simd/vboold4_avx.h index 7db0d1c5c1..450bd7a4eb 100644 --- a/thirdparty/embree/common/simd/vboold4_avx.h +++ b/thirdparty/embree/common/simd/vboold4_avx.h @@ -62,7 +62,11 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} +#if !defined(__aarch64__) __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} +#else + __forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} +#endif //////////////////////////////////////////////////////////////////////////////// /// Array Access @@ -107,9 +111,10 @@ namespace embree /// Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// +#if !defined(__aarch64__) __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } - +#endif #if defined(__AVX2__) template<int i0, int i1, int i2, int i3> diff --git a/thirdparty/embree/common/simd/vboolf16_avx512.h b/thirdparty/embree/common/simd/vboolf16_avx512.h index 19841dcea8..86b718f025 100644 --- a/thirdparty/embree/common/simd/vboolf16_avx512.h +++ b/thirdparty/embree/common/simd/vboolf16_avx512.h @@ -116,7 +116,7 @@ namespace embree __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); } //////////////////////////////////////////////////////////////////////////////// - /// Convertion Operations + /// Conversion Operations //////////////////////////////////////////////////////////////////////////////// __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); } diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h index fa84b1b6ee..9e0fdf5c6f 100644 --- a/thirdparty/embree/common/simd/vboolf4_sse2.h +++ b/thirdparty/embree/common/simd/vboolf4_sse2.h @@ -36,9 +36,11 @@ namespace embree __forceinline vboolf(__m128 input) : v(input) {} __forceinline operator const __m128&() const { return v; } + #if !defined(__EMSCRIPTEN__) __forceinline operator const __m128i() const { return _mm_castps_si128(v); } __forceinline operator const __m128d() const { return _mm_castps_pd(v); } - + #endif + __forceinline vboolf(bool a) : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} __forceinline vboolf(bool a, bool b) @@ -100,7 +102,7 @@ namespace embree __forceinline vboolf4 operator ==(const vboolf4& a, const 
vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) return _mm_blendv_ps(f, t, m); #else return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); @@ -114,6 +116,17 @@ namespace embree __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } +#if defined(__aarch64__) + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template<int i0, int i1, int i2, int i3> __forceinline vboolf4 shuffle(const vboolf4& v) { return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); @@ -123,6 +136,7 @@ namespace embree __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } +#endif template<int i0> __forceinline vboolf4 shuffle(const vboolf4& v) { @@ -135,7 +149,7 @@ namespace embree template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } #endif -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) && !defined(__aarch64__) template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); } @@ -157,7 +171,9 @@ namespace embree __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } -#if defined(__SSE4_2__) +#if defined(__aarch64__) + __forceinline size_t popcnt(const vboolf4& a) { return vaddvq_s32(vandq_u32(vreinterpretq_u32_f32(a.v),_mm_set1_epi32(1))); } +#elif defined(__SSE4_2__) __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } #else __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } diff --git a/thirdparty/embree/common/simd/vboolf8_avx.h b/thirdparty/embree/common/simd/vboolf8_avx.h index ba77cc3c5e..18cede19c6 100644 --- a/thirdparty/embree/common/simd/vboolf8_avx.h +++ b/thirdparty/embree/common/simd/vboolf8_avx.h @@ -76,7 +76,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {} - __forceinline vboolf(TrueTy) : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} + __forceinline vboolf(TrueTy) : v(_mm256_castsi256_ps(_mm256_set1_epi32(0xFFFFFFFF))) {} //////////////////////////////////////////////////////////////////////////////// /// Array Access diff --git a/thirdparty/embree/common/simd/vdouble4_avx.h b/thirdparty/embree/common/simd/vdouble4_avx.h index 55326de7dd..208bb7ac99 100644 --- 
a/thirdparty/embree/common/simd/vdouble4_avx.h +++ b/thirdparty/embree/common/simd/vdouble4_avx.h @@ -189,13 +189,20 @@ namespace embree __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); } __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); } __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); } -#else +#elif !defined(__aarch64__) __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); } __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); } __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); } __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); } +#else + __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); } + __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); } + __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); } + __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); } + __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); } + __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); } #endif __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); } diff --git a/thirdparty/embree/common/simd/vfloat16_avx512.h b/thirdparty/embree/common/simd/vfloat16_avx512.h index 9f1e2459c4..75c471cc0c 100644 --- a/thirdparty/embree/common/simd/vfloat16_avx512.h +++ b/thirdparty/embree/common/simd/vfloat16_avx512.h @@ -177,9 +177,10 @@ namespace embree __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); } __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); } - __forceinline vfloat16 rcp(const vfloat16& a) { + __forceinline vfloat16 rcp(const vfloat16& a) + { const vfloat16 r = _mm512_rcp14_ps(a); - return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); + return _mm512_fmadd_ps(r, _mm512_fnmadd_ps(a, r, vfloat16(1.0)), r); // computes r + r * (1 - a*r) } __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); } diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h index 5215bf9730..6d7e11fe72 100644 --- a/thirdparty/embree/common/simd/vfloat4_sse2.h +++ b/thirdparty/embree/common/simd/vfloat4_sse2.h @@ -42,6 +42,11 @@ namespace embree __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} +#if defined(__aarch64__) + __forceinline explicit vfloat(const vuint4& x) { + v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v)); 
+ } +#else __forceinline explicit vfloat(const vuint4& x) { const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 @@ -49,7 +54,7 @@ namespace embree const __m128 bf = _mm_castsi128_ps(b); v = _mm_add_ps(af,bf); } - +#endif //////////////////////////////////////////////////////////////////////////////// /// Constants //////////////////////////////////////////////////////////////////////////////// @@ -107,7 +112,11 @@ namespace embree #endif } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vfloat4 load(const char* ptr) { + return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) static __forceinline vfloat4 load(const char* ptr) { return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); } @@ -117,7 +126,11 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vfloat4 load(const unsigned char* ptr) { + return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) static __forceinline vfloat4 load(const unsigned char* ptr) { return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); } @@ -128,7 +141,11 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vfloat4 load(const short* ptr) { + return __m128(_mm_load4epi16_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) static __forceinline vfloat4 load(const short* ptr) { return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); } @@ -145,15 +162,19 @@ namespace embree static __forceinline void store_nt(void* ptr, const vfloat4& v) { #if defined (__SSE4_1__) +#if defined(__aarch64__) _mm_stream_ps((float*)ptr,v); #else + _mm_stream_ps((float*)ptr,v); +#endif +#else _mm_store_ps((float*)ptr,v); #endif } template<int scale = 4> static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm_i32gather_ps(ptr, index, scale); #else return vfloat4( @@ -169,7 +190,7 @@ namespace embree vfloat4 r = zero; #if defined(__AVX512VL__) return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); #else if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); @@ -223,8 +244,8 @@ namespace embree friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { #if defined(__AVX512VL__) return _mm_mask_blend_ps(m, f, t); -#elif defined(__SSE4_1__) - return _mm_blendv_ps(f, t, m); +#elif defined(__SSE4_1__) || (defined(__aarch64__)) + return _mm_blendv_ps(f, t, m); #else return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); #endif @@ -256,18 +277,34 @@ namespace embree __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } __forceinline vfloat4 operator +(const vfloat4& a) { return a; } +#if defined(__aarch64__) + __forceinline vfloat4 operator -(const vfloat4& a) { + return vnegq_f32(a); + } +#else __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } +#endif +#if defined(__aarch64__) + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } +#else __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#endif + #if 
defined(__AVX512VL__) __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } #else __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } #endif + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } - + __forceinline vfloat4 rcp(const vfloat4& a) { +#if defined(__aarch64__) + return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); +#else + #if defined(__AVX512VL__) const vfloat4 r = _mm_rcp14_ps(a); #else @@ -275,30 +312,39 @@ namespace embree #endif #if defined(__AVX2__) - return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); + return _mm_fmadd_ps(r, _mm_fnmadd_ps(a, r, vfloat4(1.0f)), r); // computes r + r * (1 - a * r) #else - return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); + return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r) #endif + +#endif //defined(__aarch64__) } __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } __forceinline vfloat4 rsqrt(const vfloat4& a) { +#if defined(__aarch64__) + vfloat4 r = _mm_rsqrt_ps(a); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + return r; +#else + #if defined(__AVX512VL__) vfloat4 r = _mm_rsqrt14_ps(a); #else vfloat4 r = _mm_rsqrt_ps(a); #endif -#if defined(__ARM_NEON) - r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#elif defined(__AVX2__) +#if defined(__AVX2__) r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); #else r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); #endif + +#endif return r; } @@ -344,7 +390,8 @@ namespace embree __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -393,9 +440,10 @@ namespace embree __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } #else __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + #endif //////////////////////////////////////////////////////////////////////////////// @@ -429,8 +477,13 @@ namespace embree __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); 
} __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } +#if defined(__aarch64__) + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } +#else __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } +#endif __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } #endif @@ -484,7 +537,7 @@ namespace embree return select(vboolf4(mask), t, f); #endif } - + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { return madd(t,b-a,a); } @@ -506,10 +559,10 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__aarch64__) - __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } - __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } - __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } - __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? 
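The NEON branch above maps floor/ceil/trunc/round directly onto vrndmq/vrndpq/vrndq/vrndnq_f32, while the SSE4.1 branch that follows uses _mm_round_ps with the matching _MM_FROUND_* flags. As an illustration only, a minimal scalar sketch of the four rounding modes, assuming nothing beyond standard C++ in the default round-to-nearest-even environment, showing the ties-to-even behavior the round() comment refers to:

#include <cmath>
#include <cstdio>

int main() {
  const float v[] = {1.5f, 2.5f, -1.5f};
  for (float x : v) {
    // nearbyint() uses the current rounding mode, FE_TONEAREST by default,
    // i.e. the ties-to-even behavior of vrndnq_f32 / _MM_FROUND_TO_NEAREST_INT.
    std::printf("x=%5.1f  floor=%5.1f  ceil=%5.1f  trunc=%5.1f  nearest=%5.1f\n",
                x, std::floor(x), std::ceil(x), std::trunc(x), std::nearbyint(x));
  }
  // Note the tie cases: nearest(1.5) == 2 and nearest(2.5) == 2 (the even neighbor).
  return 0;
}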
#elif defined (__SSE4_1__) __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } @@ -524,7 +577,9 @@ namespace embree __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } __forceinline vint4 floori(const vfloat4& a) { -#if defined(__SSE4_1__) +#if defined(__aarch64__) + return vcvtq_s32_f32(floor(a)); +#elif defined(__SSE4_1__) return vint4(floor(a)); #else return vint4(a-vfloat4(0.5f)); @@ -538,6 +593,16 @@ namespace embree __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } +#if defined(__aarch64__) + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template<int i0, int i1, int i2, int i3> __forceinline vfloat4 shuffle(const vfloat4& v) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); @@ -547,8 +612,9 @@ namespace embree __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } +#endif -#if defined(__SSE3__) +#if defined(__SSE3__) && !defined(__aarch64__) template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } @@ -559,10 +625,14 @@ namespace embree return shuffle<i,i,i,i>(v); } +#if defined(__aarch64__) + template<int i> __forceinline float extract(const vfloat4& a) { return a[i]; } +#else template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); } template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } +#endif -#if defined (__SSE4_1__) +#if defined (__SSE4_1__) && !defined(__aarch64__) template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); } template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } @@ -664,14 +734,25 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// - +#if defined(__aarch64__) + __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } +#else __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = 
min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } +#endif +#if defined(__aarch64__) + __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } + __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } + __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); } +#else __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } +#endif __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) { @@ -687,7 +768,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// __forceinline float dot(const vfloat4& a, const vfloat4& b) { diff --git a/thirdparty/embree/common/simd/vfloat8_avx.h b/thirdparty/embree/common/simd/vfloat8_avx.h index 13446454e8..b09d5e641d 100644 --- a/thirdparty/embree/common/simd/vfloat8_avx.h +++ b/thirdparty/embree/common/simd/vfloat8_avx.h @@ -107,11 +107,11 @@ namespace embree static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); } static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); } #else - static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } - static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } + static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); } + static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); } - static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } + static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); } #endif #if defined(__AVX2__) @@ -126,7 +126,7 @@ namespace embree template<int scale = 4> static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm256_i32gather_ps(ptr, index ,scale); #else return vfloat8( @@ -146,7 +146,7 @@ namespace embree vfloat8 r = zero; #if defined(__AVX512VL__) return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale); 
#else if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); @@ -215,20 +215,52 @@ namespace embree __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); } __forceinline vfloat8 operator +(const vfloat8& a) { return a; } +#if !defined(__aarch64__) __forceinline vfloat8 operator -(const vfloat8& a) { const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); return _mm256_xor_ps(a, mask); } +#else + __forceinline vfloat8 operator -(const vfloat8& a) { + __m256 res; + res.lo = vnegq_f32(a.v.lo); + res.hi = vnegq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) __forceinline vfloat8 abs(const vfloat8& a) { const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); return _mm256_and_ps(a, mask); } +#else +__forceinline vfloat8 abs(const vfloat8& a) { + __m256 res; + res.lo = vabsq_f32(a.v.lo); + res.hi = vabsq_f32(a.v.hi); + return res; +} +#endif + +#if !defined(__aarch64__) __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); } +#else + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } +#endif __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } static __forceinline vfloat8 rcp(const vfloat8& a) { +#if defined(__aarch64__) + vfloat8 ret; + const float32x4_t one = vdupq_n_f32(1.0f); + ret.v.lo = vdivq_f32(one, a.v.lo); + ret.v.hi = vdivq_f32(one, a.v.hi); + return ret; +#endif + #if defined(__AVX512VL__) const vfloat8 r = _mm256_rcp14_ps(a); #else @@ -236,9 +268,12 @@ namespace embree #endif #if defined(__AVX2__) - return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f))); + // First, compute 1 - a * r (which will be very close to 0) + const vfloat8 h_n = _mm256_fnmadd_ps(a, r, vfloat8(1.0f)); + // Then compute r + r * h_n + return _mm256_fmadd_ps(r, h_n, r); #else - return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a))); + return _mm256_add_ps(r,_mm256_mul_ps(r, _mm256_sub_ps(vfloat8(1.0f), _mm256_mul_ps(a, r)))); // computes r + r * (1 - a * r) #endif } __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } @@ -384,7 +419,7 @@ namespace embree static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { return _mm256_mask_blend_ps(m, f, t); } -#else +#elif !defined(__aarch64__) static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } @@ -395,6 +430,18 @@ namespace embree static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { return _mm256_blendv_ps(f, t, m); } +#else + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); } + static __forceinline vboolf8 operator > 
(const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); } + + static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { + return _mm256_blendv_ps(f, t, m); + } + #endif template<int mask> @@ -463,10 +510,17 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// +#if !defined(__aarch64__) __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); } __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); } __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); } __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } +#else + __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); } + __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); } +#endif + + __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } //////////////////////////////////////////////////////////////////////////////// @@ -501,9 +555,11 @@ namespace embree return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } +#if !defined(__aarch64__) template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); } template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); } template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } +#endif __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } @@ -512,7 +568,7 @@ namespace embree __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); } -#if defined (__AVX2__) +#if defined (__AVX2__) && !defined(__aarch64__) static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) { return _mm256_permutevar8x32_ps(a, index); } @@ -609,7 +665,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// - +#if !defined(__aarch64__) __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } @@ -625,7 +681,14 @@ namespace embree __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); } __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); } __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); } +#else + __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); } + __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); } + __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); } + __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); } + 
__forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); } +#endif __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v) { const vfloat8 a = select(valid,v,vfloat8(pos_inf)); @@ -642,7 +705,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators (pairs of Vec3fa's) + /// Euclidean Space Operators (pairs of Vec3fa's) //////////////////////////////////////////////////////////////////////////////// //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { diff --git a/thirdparty/embree/common/simd/vint4_sse2.h b/thirdparty/embree/common/simd/vint4_sse2.h index 9814d5c71c..eea03a771e 100644 --- a/thirdparty/embree/common/simd/vint4_sse2.h +++ b/thirdparty/embree/common/simd/vint4_sse2.h @@ -106,7 +106,14 @@ namespace embree #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vint4 load(const unsigned char* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vint4 loadu(const unsigned char* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) static __forceinline vint4 load(const unsigned char* ptr) { return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } @@ -127,7 +134,9 @@ namespace embree #endif static __forceinline vint4 load(const unsigned short* ptr) { -#if defined (__SSE4_1__) +#if defined(__aarch64__) + return __m128i(vmovl_u16(vld1_u16(ptr))); +#elif defined (__SSE4_1__) return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); #else return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); @@ -135,7 +144,12 @@ namespace embree } static __forceinline void store(unsigned char* ptr, const vint4& v) { -#if defined(__SSE4_1__) +#if defined(__aarch64__) + int32x4_t x = v; + uint16x4_t y = vqmovn_u32(uint32x4_t(x)); + uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); + vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); +#elif defined(__SSE4_1__) __m128i x = v; x = _mm_packus_epi32(x, x); x = _mm_packus_epi16(x, x); @@ -147,20 +161,26 @@ namespace embree } static __forceinline void store(unsigned short* ptr, const vint4& v) { +#if defined(__aarch64__) + uint32x4_t x = uint32x4_t(v.v); + uint16x4_t y = vqmovn_u32(x); + vst1_u16(ptr, y); +#else for (size_t i=0;i<4;i++) ptr[i] = (unsigned short)v[i]; +#endif } static __forceinline vint4 load_nt(void* ptr) { -#if defined(__SSE4_1__) - return _mm_stream_load_si128((__m128i*)ptr); +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_stream_load_si128((__m128i*)ptr); #else return _mm_load_si128((__m128i*)ptr); #endif } static __forceinline void store_nt(void* ptr, const vint4& v) { -#if defined(__SSE4_1__) +#if !defined(__aarch64__) && defined(__SSE4_1__) _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); #else _mm_store_si128((__m128i*)ptr,v); @@ -169,7 +189,7 @@ namespace embree template<int scale = 4> static __forceinline vint4 gather(const int* ptr, const vint4& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm_i32gather_epi32(ptr, index, scale); #else return vint4( @@ -185,7 +205,7 @@ namespace embree vint4 r = zero; #if defined(__AVX512VL__) return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); #else if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); @@ -222,7 +242,7 @@ namespace embree #endif } -#if defined(__x86_64__) +#if 
defined(__x86_64__) || defined(__aarch64__) static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } #endif @@ -236,6 +256,8 @@ namespace embree friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { #if defined(__AVX512VL__) return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); +#elif defined(__aarch64__) + return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v)); #elif defined(__SSE4_1__) return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); #else @@ -256,7 +278,9 @@ namespace embree __forceinline vint4 operator +(const vint4& a) { return a; } __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } -#if defined(__SSSE3__) +#if defined(__aarch64__) + __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); } +#elif defined(__SSSE3__) __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } #endif @@ -272,7 +296,7 @@ namespace embree __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } #else __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } @@ -292,8 +316,8 @@ namespace embree __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } - __forceinline vint4 operator <<(const vint4& a, int n) { return _mm_slli_epi32(a, n); } - __forceinline vint4 operator >>(const vint4& a, int n) { return _mm_srai_epi32(a, n); } + __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } + __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } @@ -309,7 +333,7 @@ namespace embree __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } #endif @@ -393,7 +417,7 @@ namespace embree #endif } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } @@ -417,6 +441,16 @@ namespace embree __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } +#if defined(__aarch64__) + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const vint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template<int i0, int i1, int i2, int i3> + __forceinline vint4 shuffle(const 
vint4& a, const vint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template<int i0, int i1, int i2, int i3> __forceinline vint4 shuffle(const vint4& v) { return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); @@ -426,7 +460,7 @@ namespace embree __forceinline vint4 shuffle(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); } - +#endif #if defined(__SSE3__) template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } @@ -438,7 +472,7 @@ namespace embree return shuffle<i,i,i,i>(v); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) && !defined(__aarch64__) template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } #else @@ -446,18 +480,27 @@ namespace embree template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; } #endif - template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } - + __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } - - __forceinline size_t toSizeT(const vint4& v) { + +#if defined(__aarch64__) + __forceinline size_t toSizeT(const vint4& v) { + uint64x2_t x = uint64x2_t(v.v); + return x[0]; + } +#else +__forceinline size_t toSizeT(const vint4& v) { #if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround return toScalar(v); +#elif defined(__ARM_NEON) + // FIXME(LTE): Do we need a swap(i.e. use lane 1)? 
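  // Editorial note, not part of the patch: on little-endian AArch64 no lane swap
  // should be needed here. vreinterpret/reinterpret_cast is a pure bitcast, so
  // lane 0 of the uint64x2_t is bits 0..63, i.e. elements 0-1 of the int32x4_t,
  // which is exactly the low 64 bits that _mm_cvtsi128_si64 returns on x86.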
+ return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0); #else return _mm_cvtsi128_si64(v); #endif } +#endif #if defined(__AVX512VL__) @@ -475,7 +518,17 @@ namespace embree /// Reductions //////////////////////////////////////////////////////////////////////////////// -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) + +#if defined(__aarch64__) + __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); } + __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); } + + __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); } + __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); } + __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); } +#else __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } @@ -483,6 +536,7 @@ namespace embree __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); } __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); } __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); } +#endif __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); } @@ -502,7 +556,7 @@ namespace embree /// Sorting networks //////////////////////////////////////////////////////////////////////////////// -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) __forceinline vint4 usort_ascending(const vint4& v) { diff --git a/thirdparty/embree/common/simd/vint8_avx.h b/thirdparty/embree/common/simd/vint8_avx.h index f43e9a8c22..48f5a9b203 100644 --- a/thirdparty/embree/common/simd/vint8_avx.h +++ b/thirdparty/embree/common/simd/vint8_avx.h @@ -79,8 +79,8 @@ namespace embree static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } static __forceinline void store_nt(void* ptr, const vint8& v) { _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); diff --git a/thirdparty/embree/common/simd/vint8_avx2.h b/thirdparty/embree/common/simd/vint8_avx2.h index e04737ffbe..d48efac3f4 100644 --- a/thirdparty/embree/common/simd/vint8_avx2.h +++ b/thirdparty/embree/common/simd/vint8_avx2.h @@ 
-393,6 +393,7 @@ namespace embree __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } +#if !defined(__aarch64__) __forceinline vint8 permute(const vint8& v, const __m256i& index) { return _mm256_permutevar8x32_epi32(v, index); } @@ -410,6 +411,9 @@ namespace embree #endif } +#endif + + //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// diff --git a/thirdparty/embree/common/simd/vuint4_sse2.h b/thirdparty/embree/common/simd/vuint4_sse2.h index 0601b9ab80..f7817da6be 100644 --- a/thirdparty/embree/common/simd/vuint4_sse2.h +++ b/thirdparty/embree/common/simd/vuint4_sse2.h @@ -95,7 +95,14 @@ namespace embree static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vuint4 load(const unsigned char* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } + static __forceinline vuint4 loadu(const unsigned char* ptr) { + return _mm_load4epu8_epi32(((__m128i*)ptr)); + } +#elif defined(__SSE4_1__) static __forceinline vuint4 load(const unsigned char* ptr) { return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } @@ -107,7 +114,9 @@ namespace embree #endif static __forceinline vuint4 load(const unsigned short* ptr) { -#if defined (__SSE4_1__) +#if defined(__aarch64__) + return _mm_load4epu16_epi32(((__m128i*)ptr)); +#elif defined (__SSE4_1__) return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); #else return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]); @@ -115,7 +124,7 @@ namespace embree } static __forceinline vuint4 load_nt(void* ptr) { -#if defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(__SSE4_1__) return _mm_stream_load_si128((__m128i*)ptr); #else return _mm_load_si128((__m128i*)ptr); @@ -123,8 +132,8 @@ namespace embree } static __forceinline void store_nt(void* ptr, const vuint4& v) { -#if defined(__SSE4_1__) - _mm_stream_ps((float*)ptr,_mm_castsi128_ps(v)); +#if !defined(__aarch64__) && defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); #else _mm_store_si128((__m128i*)ptr,v); #endif @@ -132,7 +141,7 @@ namespace embree template<int scale = 4> static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm_i32gather_epi32((const int*)ptr, index, scale); #else return vuint4( @@ -148,7 +157,7 @@ namespace embree vuint4 r = zero; #if defined(__AVX512VL__) return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); #else if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); @@ -344,6 +353,16 @@ namespace embree __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } +#if defined(__aarch64__) + template<int i0, int i1, int i2, int i3> + __forceinline vuint4 shuffle(const vuint4& v) { + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template<int i0, int i1, int i2, int i3> + __forceinline 
vuint4 shuffle(const vuint4& a, const vuint4& b) { + return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template<int i0, int i1, int i2, int i3> __forceinline vuint4 shuffle(const vuint4& v) { return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); @@ -353,7 +372,7 @@ namespace embree __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); } - +#endif #if defined(__SSE3__) template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } @@ -365,7 +384,7 @@ namespace embree return shuffle<i,i,i,i>(v); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) && !defined(__aarch64__) template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } #else @@ -373,7 +392,6 @@ namespace embree template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } #endif - template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } diff --git a/thirdparty/embree/common/simd/vuint8_avx.h b/thirdparty/embree/common/simd/vuint8_avx.h index 589cd9d731..cb8b5158c1 100644 --- a/thirdparty/embree/common/simd/vuint8_avx.h +++ b/thirdparty/embree/common/simd/vuint8_avx.h @@ -77,8 +77,8 @@ namespace embree static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); } + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } static __forceinline void store_nt(void* ptr, const vuint8& v) { _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); diff --git a/thirdparty/embree/common/simd/vuint8_avx2.h b/thirdparty/embree/common/simd/vuint8_avx2.h index 17b994522f..959143724b 100644 --- a/thirdparty/embree/common/simd/vuint8_avx2.h +++ b/thirdparty/embree/common/simd/vuint8_avx2.h @@ -385,6 +385,7 @@ namespace embree __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } +#if !defined(__aarch64__) __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { return _mm256_permutevar8x32_epi32(v, index); } @@ -401,6 +402,7 @@ namespace embree return _mm256_alignr_epi8(a, b, 4*i); #endif } +#endif // !defined(__aarch64__) 
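Several rcp() hunks earlier in this diff replace the textbook Newton step r*(2 - a*r) with r + r*(1 - a*r). The two are algebraically identical, but the residual form computes h = 1 - a*r with a single fused negate-multiply-add and then adds the small correction r*h back onto r, rather than burying the correction in the low bits of a factor that is almost exactly 1. A scalar sketch, assuming only C++11 std::fma; here r0 stands in for the low-precision hardware estimate that _mm_rcp_ps or _mm512_rcp14_ps would supply:

#include <cmath>
#include <cstdio>

// One Newton-Raphson step for 1/a, written in the residual form the patch uses.
static float refine_rcp(float a, float r0) {
  const float h = std::fma(-a, r0, 1.0f); // h = 1 - a*r0   (the fnmadd)
  return std::fma(r0, h, r0);             // r0 + r0*h      (the fmadd)
}

int main() {
  const float a  = 3.0f;
  const float r0 = 0.33f; // stand-in for the coarse hardware estimate of 1/3
  std::printf("estimate=%.9f refined=%.9f exact=%.9f\n",
              r0, refine_rcp(a, r0), 1.0f / a);
  return 0;
}

Each such step roughly doubles the number of correct bits, which is why one iteration suffices after a ~12-bit estimate.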
//////////////////////////////////////////////////////////////////////////////// /// Reductions diff --git a/thirdparty/embree/common/simd/wasm/emulation.h b/thirdparty/embree/common/simd/wasm/emulation.h new file mode 100644 index 0000000000..778ab4ae6a --- /dev/null +++ b/thirdparty/embree/common/simd/wasm/emulation.h @@ -0,0 +1,13 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// According to https://emscripten.org/docs/porting/simd.html, _MM_SET_EXCEPTION_MASK and +// _mm_setcsr are unavailable in WebAssembly. + +#define _MM_SET_EXCEPTION_MASK(x) + +__forceinline void _mm_setcsr(unsigned int) +{ +} diff --git a/thirdparty/embree/common/sys/array.h b/thirdparty/embree/common/sys/array.h index dd9190c52a..e96939b63d 100644 --- a/thirdparty/embree/common/sys/array.h +++ b/thirdparty/embree/common/sys/array.h @@ -59,8 +59,8 @@ namespace embree /********************** Iterators ****************************/ - __forceinline T* begin() const { return items; }; - __forceinline T* end () const { return items+M; }; + __forceinline T* begin() const { return (T*)items; }; + __forceinline T* end () const { return (T*)items+M; }; /********************** Capacity ****************************/ @@ -101,8 +101,8 @@ namespace embree __forceinline T& at(size_t i) { assert(i < M); return items[i]; } __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; } - __forceinline T& front() const { assert(M > 0); return items[0]; }; - __forceinline T& back () const { assert(M > 0); return items[M-1]; }; + __forceinline T& front() { assert(M > 0); return items[0]; }; + __forceinline T& back () { assert(M > 0); return items[M-1]; }; __forceinline T* data() { return items; }; __forceinline const T* data() const { return items; }; @@ -139,7 +139,7 @@ namespace embree __forceinline Ty& operator[](const unsigned i) { assert(i<N); return data[i]; } __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline Ty& operator[](const size_t i) { assert(i<N); return data[i]; } __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; } #endif @@ -196,7 +196,7 @@ namespace embree __forceinline Ty& operator[](const int i) { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; } __forceinline Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; } #endif diff --git a/thirdparty/embree/common/sys/barrier.h b/thirdparty/embree/common/sys/barrier.h index 37fc036291..c56513a2ed 100644 --- a/thirdparty/embree/common/sys/barrier.h +++ b/thirdparty/embree/common/sys/barrier.h @@ -24,7 +24,7 @@ namespace embree BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement public: - /*! intializes the barrier with some number of threads */ + /*! initializes the barrier with some number of threads */ void init(size_t count); /*! lets calling thread wait in barrier */ @@ -94,7 +94,7 @@ namespace embree LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement public: - /*! intializes the barrier with some number of threads */ + /*! initializes the barrier with some number of threads */ void init(size_t threadCount); /*! 
thread with threadIndex waits in the barrier */ diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h index ed8dd7d40a..2c2f6eccda 100644 --- a/thirdparty/embree/common/sys/intrinsics.h +++ b/thirdparty/embree/common/sys/intrinsics.h @@ -13,6 +13,9 @@ #include "../simd/arm/emulation.h" #else #include <immintrin.h> +#if defined(__EMSCRIPTEN__) +#include "../simd/wasm/emulation.h" +#endif #endif #if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER) @@ -24,24 +27,26 @@ #endif #endif -#if defined(__LZCNT__) +#if defined(__aarch64__) #if !defined(_lzcnt_u32) - #define _lzcnt_u32 __lzcnt32 + #define _lzcnt_u32 __builtin_clz #endif - #if !defined(_lzcnt_u64) - #define _lzcnt_u64 __lzcnt64 +#else + #if defined(__LZCNT__) + #if !defined(_lzcnt_u32) + #define _lzcnt_u32 __lzcnt32 + #endif + #if !defined(_lzcnt_u64) + #define _lzcnt_u64 __lzcnt64 + #endif #endif #endif #if defined(__WIN32__) -// -- GODOT start -- -#if !defined(NOMINMAX) -// -- GODOT end -- -#define NOMINMAX -// -- GODOT start -- -#endif -#include "windows.h" -// -- GODOT end -- +# if !defined(NOMINMAX) +# define NOMINMAX +# endif +# include <windows.h> #endif /* normally defined in pmmintrin.h, but we always need this */ @@ -69,7 +74,7 @@ namespace embree } __forceinline int bsf(int v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _tzcnt_u32(v); #else unsigned long r = 0; _BitScanForward(&r,v); return r; @@ -77,7 +82,7 @@ namespace embree } __forceinline unsigned bsf(unsigned v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _tzcnt_u32(v); #else unsigned long r = 0; _BitScanForward(&r,v); return r; @@ -118,7 +123,7 @@ namespace embree #endif __forceinline int bsr(int v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return 31 - _lzcnt_u32(v); #else unsigned long r = 0; _BitScanReverse(&r,v); return r; @@ -126,7 +131,7 @@ namespace embree } __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return 31 - _lzcnt_u32(v); #else unsigned long r = 0; _BitScanReverse(&r,v); return r; @@ -145,7 +150,7 @@ namespace embree __forceinline int lzcnt(const int x) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _lzcnt_u32(x); #else if (unlikely(x == 0)) return 32; @@ -214,15 +219,26 @@ namespace embree #elif defined(__X86_ASM__) __forceinline void __cpuid(int out[4], int op) { - asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); +#if defined(__ARM_NEON) + if (op == 0) { // Get CPU name + out[0] = 0x41524d20; + out[1] = 0x41524d20; + out[2] = 0x41524d20; + out[3] = 0x41524d20; + } +#else + asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op)); +#endif } - + +#if !defined(__ARM_NEON) __forceinline void __cpuid_count(int out[4], int op1, int op2) { asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2)); } - #endif - + +#endif + __forceinline uint64_t read_tsc() { #if defined(__X86_ASM__) uint32_t high,low; @@ -235,30 +251,38 @@ namespace embree } __forceinline int bsf(int v) { -#if defined(__AVX2__) +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) return _tzcnt_u32(v); #elif defined(__X86_ASM__) int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; #else return __builtin_ctz(v); #endif +#endif } #if defined(__64BIT__) __forceinline unsigned 
bsf(unsigned v) { -#if defined(__AVX2__) +#if defined(__ARM_NEON) + return __builtin_ctz(v); +#else +#if defined(__AVX2__) return _tzcnt_u32(v); #elif defined(__X86_ASM__) unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r; #else return __builtin_ctz(v); #endif +#endif } #endif __forceinline size_t bsf(size_t v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) #if defined(__X86_64__) return _tzcnt_u64(v); #else @@ -295,7 +319,7 @@ namespace embree } __forceinline int bsr(int v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return 31 - _lzcnt_u32(v); #elif defined(__X86_ASM__) int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r; @@ -304,7 +328,7 @@ namespace embree #endif } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline unsigned bsr(unsigned v) { #if defined(__AVX2__) return 31 - _lzcnt_u32(v); @@ -317,7 +341,7 @@ namespace embree #endif __forceinline size_t bsr(size_t v) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) #if defined(__X86_64__) return 63 - _lzcnt_u64(v); #else @@ -332,7 +356,7 @@ namespace embree __forceinline int lzcnt(const int x) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _lzcnt_u32(x); #else if (unlikely(x == 0)) return 32; @@ -341,18 +365,18 @@ namespace embree } __forceinline size_t blsr(size_t v) { -#if defined(__AVX2__) -#if defined(__INTEL_COMPILER) +#if defined(__AVX2__) && !defined(__aarch64__) + #if defined(__INTEL_COMPILER) return _blsr_u64(v); + #else + #if defined(__X86_64__) + return __blsr_u64(v); + #else + return __blsr_u32(v); + #endif + #endif #else -#if defined(__X86_64__) - return __blsr_u64(v); -#else - return __blsr_u32(v); -#endif -#endif -#else - return v & (v-1); + return v & (v-1); #endif } @@ -368,7 +392,7 @@ namespace embree #if defined(__X86_ASM__) int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; #else - return (v | (v << i)); + return (v | (1 << i)); #endif } @@ -376,7 +400,7 @@ namespace embree #if defined(__X86_ASM__) int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; #else - return (v & ~(v << i)); + return (v & ~(1 << i)); #endif } @@ -392,7 +416,7 @@ namespace embree #if defined(__X86_ASM__) size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; #else - return (v | (v << i)); + return (v | (1 << i)); #endif } @@ -400,7 +424,7 @@ namespace embree #if defined(__X86_ASM__) size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r; #else - return (v & ~(v << i)); + return (v & ~(1 << i)); #endif } @@ -435,8 +459,8 @@ namespace embree #endif #endif -#if defined(__SSE4_2__) - +#if defined(__SSE4_2__) || defined(__ARM_NEON) + __forceinline int popcnt(int in) { return _mm_popcnt_u32(in); } @@ -483,14 +507,14 @@ namespace embree #endif } - __forceinline void prefetchL1EX(const void* ptr) { - prefetchEX(ptr); + __forceinline void prefetchL1EX(const void* ptr) { + prefetchEX(ptr); } - - __forceinline void prefetchL2EX(const void* ptr) { - prefetchEX(ptr); + + __forceinline void prefetchL2EX(const void* ptr) { + prefetchEX(ptr); } -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); } __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); } #if defined(__X86_64__) diff --git a/thirdparty/embree/common/sys/mutex.cpp 
b/thirdparty/embree/common/sys/mutex.cpp index 789feaf2d8..8212deaa49 100644 --- a/thirdparty/embree/common/sys/mutex.cpp +++ b/thirdparty/embree/common/sys/mutex.cpp @@ -36,6 +36,7 @@ namespace embree MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0; assert(ok); delete (pthread_mutex_t*)mutex; + mutex = nullptr; } void MutexSys::lock() diff --git a/thirdparty/embree/common/sys/mutex.h b/thirdparty/embree/common/sys/mutex.h index 4cb3626d92..26af6c582c 100644 --- a/thirdparty/embree/common/sys/mutex.h +++ b/thirdparty/embree/common/sys/mutex.h @@ -7,6 +7,7 @@ #include "intrinsics.h" #include "atomic.h" +#define CPU_CACHELINE_SIZE 64 namespace embree { /*! system mutex */ @@ -83,6 +84,11 @@ namespace embree atomic<bool> flag; }; + class PaddedSpinLock : public SpinLock + { + private: + char padding[CPU_CACHELINE_SIZE - sizeof(SpinLock)]; + }; /*! safe mutex lock and unlock helper */ template<typename Mutex> class Lock { public: diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h index 3e386c4944..728bf6ed7d 100644 --- a/thirdparty/embree/common/sys/platform.h +++ b/thirdparty/embree/common/sys/platform.h @@ -92,16 +92,19 @@ //////////////////////////////////////////////////////////////////////////////// #ifdef __WIN32__ -#define dll_export __declspec(dllexport) -#define dll_import __declspec(dllimport) +# if defined(EMBREE_STATIC_LIB) +# define dll_export +# define dll_import +# else +# define dll_export __declspec(dllexport) +# define dll_import __declspec(dllimport) +# endif #else -#define dll_export __attribute__ ((visibility ("default"))) -#define dll_import +# define dll_export __attribute__ ((visibility ("default"))) +# define dll_import #endif -// -- GODOT start -- #if defined(__WIN32__) && !defined(__MINGW32__) -// -- GODOT end -- #if !defined(__noinline) #define __noinline __declspec(noinline) #endif @@ -151,9 +154,7 @@ #define DELETED = delete #endif -// -- GODOT start -- #if !defined(likely) -// -- GODOT end -- #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #define likely(expr) (expr) #define unlikely(expr) (expr) @@ -161,9 +162,7 @@ #define likely(expr) __builtin_expect((bool)(expr),true ) #define unlikely(expr) __builtin_expect((bool)(expr),false) #endif -// -- GODOT start -- #endif -// -- GODOT end -- //////////////////////////////////////////////////////////////////////////////// /// Error handling and debugging @@ -252,6 +251,7 @@ __forceinline std::string toString(long long value) { #pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning) //#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data #pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data +#pragma warning(disable:4267) // conversion from 'size_t' to 'const int', possible loss of data //#pragma warning(disable:4355) // 'this' : used in base member initializer list //#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch //#pragma warning(disable:4018) // '<' : signed / unsigned mismatch diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp index f1a59e511e..c98f61fa53 100644 --- a/thirdparty/embree/common/sys/sysinfo.cpp +++ b/thirdparty/embree/common/sys/sysinfo.cpp @@ -21,7 +21,11 @@ namespace embree std::string getPlatformName() { -#if defined(__LINUX__) && !defined(__64BIT__) +#if defined(__ANDROID__) && !defined(__64BIT__) + return 
"Android (32bit)"; +#elif defined(__ANDROID__) && defined(__64BIT__) + return "Android (64bit)"; +#elif defined(__LINUX__) && !defined(__64BIT__) return "Linux (32bit)"; #elif defined(__LINUX__) && defined(__64BIT__) return "Linux (64bit)"; @@ -248,9 +252,7 @@ namespace embree #if defined(__X86_ASM__) __noinline int64_t get_xcr0() { -// -- GODOT start -- -#if defined (__WIN32__) && !defined (__MINGW32__) -// -- GODOT end -- +#if defined (__WIN32__) && !defined (__MINGW32__) && defined(_XCR_XFEATURE_ENABLED_MASK) int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 xcr0 = _xgetbv(0); return xcr0; @@ -337,9 +339,24 @@ namespace embree if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI; return cpu_features; -#elif defined(__ARM_NEON) - /* emulated features with sse2neon */ - return CPU_FEATURE_SSE|CPU_FEATURE_SSE2|CPU_FEATURE_XMM_ENABLED; + +#elif defined(__ARM_NEON) || defined(__EMSCRIPTEN__) + + int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; + cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; + cpu_features |= CPU_FEATURE_XMM_ENABLED; + cpu_features |= CPU_FEATURE_YMM_ENABLED; + cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C; + cpu_features |= CPU_FEATURE_POPCNT; + cpu_features |= CPU_FEATURE_AVX; + cpu_features |= CPU_FEATURE_AVX2; + cpu_features |= CPU_FEATURE_FMA3; + cpu_features |= CPU_FEATURE_LZCNT; + cpu_features |= CPU_FEATURE_BMI1; + cpu_features |= CPU_FEATURE_BMI2; + cpu_features |= CPU_FEATURE_NEON_2X; + return cpu_features; + #else /* Unknown CPU. */ return 0; @@ -376,6 +393,8 @@ namespace embree if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL "; if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA "; if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI "; + if (features & CPU_FEATURE_NEON) str += "NEON "; + if (features & CPU_FEATURE_NEON_2X) str += "2xNEON "; return str; } @@ -390,6 +409,9 @@ namespace embree if (isa == AVX) return "AVX"; if (isa == AVX2) return "AVX2"; if (isa == AVX512) return "AVX512"; + + if (isa == NEON) return "NEON"; + if (isa == NEON_2X) return "2xNEON"; return "UNKNOWN"; } @@ -410,6 +432,9 @@ namespace embree if (hasISA(features,AVXI)) v += "AVXI "; if (hasISA(features,AVX2)) v += "AVX2 "; if (hasISA(features,AVX512)) v += "AVX512 "; + + if (hasISA(features,NEON)) v += "NEON "; + if (hasISA(features,NEON_2X)) v += "2xNEON "; return v; } } @@ -613,6 +638,10 @@ namespace embree #include <sys/time.h> #include <pthread.h> +#if defined(__EMSCRIPTEN__) +#include <emscripten.h> +#endif + namespace embree { unsigned int getNumberOfLogicalThreads() @@ -620,12 +649,25 @@ namespace embree static int nThreads = -1; if (nThreads != -1) return nThreads; -// -- GODOT start -- -// #if defined(__MACOSX__) #if defined(__MACOSX__) || defined(__ANDROID__) -// -- GODOT end -- nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container assert(nThreads); +#elif defined(__EMSCRIPTEN__) + // WebAssembly supports pthreads, but not pthread_getaffinity_np. Get the number of logical + // threads from the browser or Node.js using JavaScript. + nThreads = MAIN_THREAD_EM_ASM_INT({ + const isBrowser = typeof window !== 'undefined'; + const isNode = typeof process !== 'undefined' && process.versions != null && + process.versions.node != null; + if (isBrowser) { + // Return 1 if the browser does not expose hardwareConcurrency. 
+ return window.navigator.hardwareConcurrency || 1; + } else if (isNode) { + return require('os').cpus().length; + } else { + return 1; + } + }); #else cpu_set_t set; if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) diff --git a/thirdparty/embree/common/sys/sysinfo.h b/thirdparty/embree/common/sys/sysinfo.h index 72351d12e4..cefd39a0f6 100644 --- a/thirdparty/embree/common/sys/sysinfo.h +++ b/thirdparty/embree/common/sys/sysinfo.h @@ -55,7 +55,12 @@ # define isa sse # define ISA SSE # define ISA_STR "SSE" -#else +#elif defined(__ARM_NEON) +// NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. +#define isa sse2 +#define ISA NEON +#define ISA_STR "NEON" +#else #error Unknown ISA #endif @@ -133,7 +138,9 @@ namespace embree static const int CPU_FEATURE_XMM_ENABLED = 1 << 25; static const int CPU_FEATURE_YMM_ENABLED = 1 << 26; static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27; - + static const int CPU_FEATURE_NEON = 1 << 28; + static const int CPU_FEATURE_NEON_2X = 1 << 29; + /*! get CPU features */ int getCPUFeatures(); @@ -154,6 +161,8 @@ namespace embree static const int AVXI = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND; static const int AVX2 = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT; static const int AVX512 = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED; + static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; + static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; /*! converts ISA bitvector into a string */ std::string stringOfISA(int features); diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp index f4014be89b..530c3c7810 100644 --- a/thirdparty/embree/common/sys/thread.cpp +++ b/thirdparty/embree/common/sys/thread.cpp @@ -10,6 +10,9 @@ #include "../simd/arm/emulation.h" #else #include <xmmintrin.h> +#if defined(__EMSCRIPTEN__) +#include "../simd/wasm/emulation.h" +#endif #endif #if defined(PTHREADS_WIN32) @@ -158,9 +161,7 @@ namespace embree /// Linux Platform //////////////////////////////////////////////////////////////////////////////// -// -- GODOT start -- #if defined(__LINUX__) && !defined(__ANDROID__) -// -- GODOT end -- #include <fstream> #include <sstream> @@ -219,6 +220,8 @@ namespace embree /* find correct thread to affinitize to */ cpu_set_t set; + CPU_ZERO(&set); + if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) { for (int i=0, j=0; i<CPU_SETSIZE; i++) @@ -241,7 +244,8 @@ namespace embree { cpu_set_t cset; CPU_ZERO(&cset); - size_t threadID = mapThreadID(affinity); + //size_t threadID = mapThreadID(affinity); // this is not working properly in LXC containers when some processors are disabled + size_t threadID = affinity; CPU_SET(threadID, &cset); pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset); @@ -249,7 +253,6 @@ namespace embree } #endif -// -- GODOT start -- //////////////////////////////////////////////////////////////////////////////// /// Android Platform //////////////////////////////////////////////////////////////////////////////// @@ -269,7 +272,6 @@ namespace embree } } #endif -// -- GODOT end -- //////////////////////////////////////////////////////////////////////////////// /// FreeBSD Platform @@ -294,6 +296,21 @@ namespace embree #endif //////////////////////////////////////////////////////////////////////////////// +/// WebAssembly Platform 
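
On the NEON side, the ISA levels defined above are plain bitmask unions: `NEON` is the NEON bit plus the SSE/SSE2 features that sse2neon emulates, and `NEON_2X` ("double-pumped" NEON standing in for AVX2) folds in the full AVX2 set. A toy sketch of how such a containment test works, with illustrative constants in the style of the `CPU_FEATURE_*` values above:

    // Illustrative constants; the real bit assignments are in sysinfo.h above.
    enum : int {
        FEAT_SSE  = 1 << 0,
        FEAT_SSE2 = 1 << 1,
        FEAT_NEON = 1 << 28,
    };

    constexpr int ISA_NEON = FEAT_NEON | FEAT_SSE | FEAT_SSE2;

    // An ISA is usable only when *all* of its feature bits were detected.
    inline bool hasISA(int features, int isa) { return (features & isa) == isa; }
    // e.g. hasISA(detected, ISA_NEON) mirrors hasISA(features, NEON) above.
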
+//////////////////////////////////////////////////////////////////////////////// + +#if defined(__EMSCRIPTEN__) +namespace embree +{ + /*! set affinity of the calling thread */ + void setAffinity(ssize_t affinity) + { + // Setting thread affinity is not supported in WASM. + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// /// MacOSX Platform //////////////////////////////////////////////////////////////////////////////// @@ -379,9 +396,7 @@ namespace embree pthread_attr_destroy(&attr); /* set affinity */ -// -- GODOT start -- #if defined(__LINUX__) && !defined(__ANDROID__) -// -- GODOT end -- if (threadID >= 0) { cpu_set_t cset; CPU_ZERO(&cset); @@ -396,7 +411,6 @@ namespace embree CPU_SET(threadID, &cset); pthread_setaffinity_np(*tid, sizeof(cset), &cset); } -// -- GODOT start -- #elif defined(__ANDROID__) if (threadID >= 0) { cpu_set_t cset; @@ -405,7 +419,6 @@ namespace embree sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset); } #endif -// -- GODOT end -- return thread_t(tid); } @@ -424,14 +437,12 @@ namespace embree /*! destroy a hardware thread by its handle */ void destroyThread(thread_t tid) { -// -- GODOT start -- #if defined(__ANDROID__) - FATAL("Can't destroy threads on Android."); + FATAL("Can't destroy threads on Android."); // pthread_cancel not implemented. #else pthread_cancel(*(pthread_t*)tid); delete (pthread_t*)tid; #endif -// -- GODOT end -- } /*! creates thread local storage */ diff --git a/thirdparty/embree/common/sys/vector.h b/thirdparty/embree/common/sys/vector.h index f832626789..d05e1deb18 100644 --- a/thirdparty/embree/common/sys/vector.h +++ b/thirdparty/embree/common/sys/vector.h @@ -127,14 +127,15 @@ namespace embree { assert(!empty()); size_active--; - alloc.destroy(&items[size_active]); + items[size_active].~T(); } __forceinline void clear() { /* destroy elements */ - for (size_t i=0; i<size_active; i++) - alloc.destroy(&items[i]); + for (size_t i=0; i<size_active; i++){ + items[i].~T(); + } /* free memory */ alloc.deallocate(items,size_alloced); @@ -178,8 +179,9 @@ namespace embree /* destroy elements */ if (new_active < size_active) { - for (size_t i=new_active; i<size_active; i++) - alloc.destroy(&items[i]); + for (size_t i=new_active; i<size_active; i++){ + items[i].~T(); + } size_active = new_active; } @@ -195,7 +197,7 @@ namespace embree items = alloc.allocate(new_alloced); for (size_t i=0; i<size_active; i++) { ::new (&items[i]) T(std::move(old_items[i])); - alloc.destroy(&old_items[i]); + old_items[i].~T(); } for (size_t i=size_active; i<new_active; i++) { diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h index 8fa6bb12fa..6cc2495195 100644 --- a/thirdparty/embree/common/tasking/taskschedulerinternal.h +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h @@ -143,7 +143,7 @@ namespace embree /* allocate new task on right side of stack */ size_t oldStackPtr = stackPtr; TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure); - new (&tasks[right]) Task(func,thread.task,oldStackPtr,size); + new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size); right++; /* also move left pointer */ diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h index 35bd49849f..042ba7bc4c 100644 --- a/thirdparty/embree/common/tasking/taskschedulertbb.h +++ 
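
Both affinity paths above now clear the `cpu_set_t` before use. The set is plain storage, so stale stack bits would silently widen the mask; CPU_ZERO first, then CPU_SET, is the safe pattern. A minimal sketch of pinning the calling thread on the Linux (non-Android) path; the helper name is illustrative:

    #include <pthread.h>
    #include <sched.h> // CPU_ZERO / CPU_SET (glibc, needs _GNU_SOURCE)

    // Sketch: pin the calling thread to one logical CPU.
    static int pinSelfTo(int cpu) {
        cpu_set_t cset;
        CPU_ZERO(&cset);           // required: the set is not zero-initialized
        CPU_SET(cpu, &cset);
        return pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
    }
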
b/thirdparty/embree/common/tasking/taskschedulertbb.h @@ -11,14 +11,8 @@ #include "../sys/condition.h" #include "../sys/ref.h" -#if defined(__WIN32__) -// -- GODOT start -- -#if !defined(NOMINMAX) -// -- GODOT end -- +#if defined(__WIN32__) && !defined(NOMINMAX) # define NOMINMAX -// -- GODOT start -- -#endif -// -- GODOT end -- #endif // We need to define these to avoid implicit linkage against diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h index 4857e1e05e..894628e47c 100644 --- a/thirdparty/embree/include/embree3/rtcore_common.h +++ b/thirdparty/embree/include/embree3/rtcore_common.h @@ -19,9 +19,7 @@ typedef int ssize_t; #endif #endif -// -- GODOT start -- -#if defined(_WIN32) && defined(_MSC_VER) -// -- GODOT end -- +#if defined(_WIN32) && !defined(__MINGW32__) # define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) #else # define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__))) diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h index 62b7b6f4dc..0b399ef040 100644 --- a/thirdparty/embree/include/embree3/rtcore_config.h +++ b/thirdparty/embree/include/embree3/rtcore_config.h @@ -1,4 +1,3 @@ - // Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 @@ -6,23 +5,25 @@ #define RTC_VERSION_MAJOR 3 #define RTC_VERSION_MINOR 13 -#define RTC_VERSION_PATCH 1 -#define RTC_VERSION 31301 -#define RTC_VERSION_STRING "3.13.1" +#define RTC_VERSION_PATCH 5 +#define RTC_VERSION 31305 +#define RTC_VERSION_STRING "3.13.5" #define RTC_MAX_INSTANCE_LEVEL_COUNT 1 #define EMBREE_MIN_WIDTH 0 #define RTC_MIN_WIDTH EMBREE_MIN_WIDTH -#define EMBREE_STATIC_LIB -/* #undef EMBREE_API_NAMESPACE */ +#if !defined(EMBREE_STATIC_LIB) +# define EMBREE_STATIC_LIB +#endif +/* #undef EMBREE_API_NAMESPACE*/ #if defined(EMBREE_API_NAMESPACE) # define RTC_NAMESPACE -# define RTC_NAMESPACE_BEGIN namespace { +# define RTC_NAMESPACE_BEGIN namespace { # define RTC_NAMESPACE_END } -# define RTC_NAMESPACE_USE using namespace ; +# define RTC_NAMESPACE_USE using namespace; # define RTC_API_EXTERN_C # undef EMBREE_API_NAMESPACE #else diff --git a/thirdparty/embree/include/embree3/rtcore_quaternion.h b/thirdparty/embree/include/embree3/rtcore_quaternion.h index 6489fa3467..bd5fe1d89a 100644 --- a/thirdparty/embree/include/embree3/rtcore_quaternion.h +++ b/thirdparty/embree/include/embree3/rtcore_quaternion.h @@ -8,7 +8,7 @@ RTC_NAMESPACE_BEGIN /* - * Structure for transformation respresentation as a matrix decomposition using + * Structure for transformation representation as a matrix decomposition using * a quaternion */ struct RTC_ALIGN(16) RTCQuaternionDecomposition diff --git a/thirdparty/embree/include/embree3/rtcore_scene.h b/thirdparty/embree/include/embree3/rtcore_scene.h index 5878a3d402..34d87a2ce4 100644 --- a/thirdparty/embree/include/embree3/rtcore_scene.h +++ b/thirdparty/embree/include/embree3/rtcore_scene.h @@ -47,9 +47,12 @@ RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigne /* Detaches the geometry from the scene. */ RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID); -/* Gets a geometry handle from the scene. */ +/* Gets a geometry handle from the scene. This function is not thread safe and should get used during rendering. */ RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID); +/* Gets a geometry handle from the scene. This function is thread safe and should NOT get used during rendering. 
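
The `rtcGetGeometryThreadSafe` entry point declared just below complements the comment fix: `rtcGetGeometry` stays the unsynchronized fast path for use during rendering, while the thread-safe variant is for code that may run concurrently with scene edits. A usage sketch (the wrapper name is illustrative; both API calls are taken from the declarations in this hunk):

    #include <embree3/rtcore.h>

    // Sketch: pick the lookup variant by context (API added in embree 3.13.5).
    // rtcGetGeometry           - unsynchronized; fine inside the render loop
    //                            while the scene is known to be immutable.
    // rtcGetGeometryThreadSafe - internally synchronized; safe while other
    //                            threads attach/detach geometry, not for hot paths.
    RTCGeometry lookupWhileEditing(RTCScene scene, unsigned int geomID) {
        return rtcGetGeometryThreadSafe(scene, geomID);
    }
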
*/ +RTC_API RTCGeometry rtcGetGeometryThreadSafe(RTCScene scene, unsigned int geomID); + /* Commits the scene. */ RTC_API void rtcCommitScene(RTCScene scene); diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h index 8f21e3254f..cba32ca73c 100644 --- a/thirdparty/embree/kernels/builders/bvh_builder_morton.h +++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h @@ -411,7 +411,7 @@ namespace embree ReductionTy bounds[MAX_BRANCHING_FACTOR]; if (current.size() > singleThreadThreshold) { - /*! parallel_for is faster than spawing sub-tasks */ + /*! parallel_for is faster than spawning sub-tasks */ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { for (size_t i=r.begin(); i<r.end(); i++) { bounds[i] = recurse(depth+1,children[i],nullptr,true); diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h index f9a08d65cd..6e73c0d250 100644 --- a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h +++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h @@ -374,7 +374,7 @@ namespace embree const size_t begin = set.begin(); const size_t end = set.end(); - const size_t center = (begin + end)/2; + const size_t center = (begin + end + 1) / 2; PrimInfoMB linfo = empty; for (size_t i=begin; i<center; i++) @@ -594,7 +594,7 @@ namespace embree /* spawn tasks */ if (unlikely(current.size() > cfg.singleThreadThreshold)) { - /*! parallel_for is faster than spawing sub-tasks */ + /*! parallel_for is faster than spawning sub-tasks */ parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { for (size_t i=r.begin(); i<r.end(); i++) { values[i] = recurse(children[i],nullptr,true); diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h index fff4bf2a35..24c5faf8be 100644 --- a/thirdparty/embree/kernels/builders/bvh_builder_sah.h +++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h @@ -298,7 +298,7 @@ namespace embree /* spawn tasks */ if (current.size() > cfg.singleThreadThreshold) { - /*! parallel_for is faster than spawing sub-tasks */ + /*! parallel_for is faster than spawning sub-tasks */ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here for (size_t i=r.begin(); i<r.end(); i++) { values[i] = recurse(children[i],nullptr,true); diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h index ee29d09ac9..41be6183b8 100644 --- a/thirdparty/embree/kernels/builders/heuristic_binning.h +++ b/thirdparty/embree/kernels/builders/heuristic_binning.h @@ -57,14 +57,12 @@ namespace embree __forceinline Vec3ia bin(const Vec3fa& p) const { const vint4 i = floori((vfloat4(p)-ofs)*scale); -#if 1 assert(i[0] >= 0 && (size_t)i[0] < num); assert(i[1] >= 0 && (size_t)i[1] < num); assert(i[2] >= 0 && (size_t)i[2] < num); - return Vec3ia(i); -#else + + // we clamp to handle corner cases that could calculate out of bounds bin return Vec3ia(clamp(i,vint4(0),vint4(num-1))); -#endif } /*! 
faster but unsafe binning */ diff --git a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h index 4249d16ea1..354e283557 100644 --- a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h +++ b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h @@ -275,7 +275,7 @@ namespace embree openNodesBasedOnExtend(set); #endif - /* disable opening when unsufficient space for opening a node available */ + /* disable opening when insufficient space for opening a node available */ if (set.ext_range_size() < max_open_size-1) set.set_ext_range(set.end()); /* disable opening */ } diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h index a6939ba258..8b3499ac8d 100644 --- a/thirdparty/embree/kernels/builders/heuristic_spatial.h +++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h @@ -159,27 +159,25 @@ namespace embree assert(binID < BINS); bounds [binID][dim].extend(b); } - - /*! bins an array of triangles */ - template<typename SplitPrimitive> - __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping) + + /*! bins an array of primitives */ + template<typename PrimitiveSplitterFactory> + __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) { - for (size_t i=0; i<N; i++) + for (size_t i=begin; i<end; i++) { - const PrimRef prim = prims[i]; + const PrimRef& prim = source[i]; unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); - - if (unlikely(splits == 1)) + + if (unlikely(splits <= 1)) { const vint4 bin = mapping.bin(center(prim.bounds())); for (size_t dim=0; dim<3; dim++) { assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS); - numBegin[bin[dim]][dim]++; - numEnd [bin[dim]][dim]++; - bounds [bin[dim]][dim].extend(prim.bounds()); + add(dim,bin[dim],bin[dim],bin[dim],prim.bounds()); } - } + } else { const vint4 bin0 = mapping.bin(prim.bounds().lower); @@ -187,89 +185,44 @@ namespace embree for (size_t dim=0; dim<3; dim++) { + if (unlikely(mapping.invalid(dim))) + continue; + size_t bin; - PrimRef rest = prim; size_t l = bin0[dim]; size_t r = bin1[dim]; - + // same bin optimization if (likely(l == r)) { - numBegin[l][dim]++; - numEnd [l][dim]++; - bounds [l][dim].extend(prim.bounds()); + add(dim,l,l,l,prim.bounds()); continue; } - - for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) + size_t bin_start = bin0[dim]; + size_t bin_end = bin1[dim]; + BBox3fa rest = prim.bounds(); + + /* assure that split position always overlaps the primitive bounds */ + while (bin_start < bin_end && mapping.pos(bin_start+1,dim) <= rest.lower[dim]) bin_start++; + while (bin_start < bin_end && mapping.pos(bin_end ,dim) >= rest.upper[dim]) bin_end--; + + const auto splitter = splitterFactory(prim); + for (bin=bin_start; bin<bin_end; bin++) { const float pos = mapping.pos(bin+1,dim); + BBox3fa left,right; + splitter(rest,dim,pos,left,right); - PrimRef left,right; - splitPrimitive(rest,(int)dim,pos,left,right); - if (unlikely(left.bounds().empty())) l++; - bounds[bin][dim].extend(left.bounds()); + if (unlikely(left.empty())) l++; + extend(dim,bin,left); rest = right; } - if (unlikely(rest.bounds().empty())) r--; - numBegin[l][dim]++; - numEnd [r][dim]++; - bounds [bin][dim].extend(rest.bounds()); + if (unlikely(rest.empty())) r--; + 
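
Two related robustness fixes sit in the binning hunks above: heuristic_binning.h now clamps the computed bin index instead of asserting it in range, and the rewritten `bin2` tightens the candidate split range so the splitter is never asked to cut at a plane at or outside the primitive's extent. The range-tightening in isolation, as a sketch with illustrative names (`pos(b)` maps bin boundary `b` to a coordinate):

    #include <cstddef>

    struct Interval { float lower, upper; };

    // Sketch: shrink [bin_start, bin_end] until every split plane pos(b+1)
    // lies strictly inside the primitive's extent on this axis, mirroring
    // the "assure that split position always overlaps" loops above.
    template <typename PosFn>
    void clampSplitRange(const Interval& box, PosFn pos,
                         std::size_t& bin_start, std::size_t& bin_end) {
        while (bin_start < bin_end && pos(bin_start + 1) <= box.lower) bin_start++;
        while (bin_start < bin_end && pos(bin_end)       >= box.upper) bin_end--;
    }
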
add(dim,l,r,bin,rest); } - } + } } } - - /*! bins a range of primitives inside an array */ - template<typename SplitPrimitive> - void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) { - bin(splitPrimitive,prims+begin,end-begin,mapping); - } - - /*! bins an array of primitives */ - template<typename PrimitiveSplitterFactory> - __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) - { - for (size_t i=begin; i<end; i++) - { - const PrimRef &prim = source[i]; - const vint4 bin0 = mapping.bin(prim.bounds().lower); - const vint4 bin1 = mapping.bin(prim.bounds().upper); - - for (size_t dim=0; dim<3; dim++) - { - if (unlikely(mapping.invalid(dim))) - continue; - - size_t bin; - size_t l = bin0[dim]; - size_t r = bin1[dim]; - - // same bin optimization - if (likely(l == r)) - { - add(dim,l,l,l,prim.bounds()); - continue; - } - const size_t bin_start = bin0[dim]; - const size_t bin_end = bin1[dim]; - BBox3fa rest = prim.bounds(); - const auto splitter = splitterFactory(prim); - for (bin=bin_start; bin<bin_end; bin++) - { - const float pos = mapping.pos(bin+1,dim); - BBox3fa left,right; - splitter(rest,dim,pos,left,right); - if (unlikely(left.empty())) l++; - extend(dim,bin,left); - rest = right; - } - if (unlikely(rest.empty())) r--; - add(dim,l,r,bin,rest); - } - } - } - /*! bins an array of primitives */ diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h index 60d235f48d..2584c19bda 100644 --- a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h +++ b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h @@ -241,7 +241,7 @@ namespace embree SpatialBinner binner(empty); const SpatialBinMapping<SPATIAL_BINS> mapping(set); binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping); - /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/ return binner.best(mapping,logBlockSize); //,set.ext_size()); } @@ -256,7 +256,7 @@ namespace embree binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); return binner; }, [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); }); - /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/ return binner.best(mapping,logBlockSize); //,set.ext_size()); } @@ -286,6 +286,7 @@ namespace embree //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim]; //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim]; //if (unlikely(bin0 < split.pos && bin1 >= split.pos)) + if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos)) { assert(splits > 1); @@ -384,8 +385,8 @@ namespace embree new (&lset) PrimInfoExtRange(begin,center,center,local_left); new (&rset) PrimInfoExtRange(center,end,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); + assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f); + assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f); return std::pair<size_t,size_t>(left_weight,right_weight); } @@ -410,7 +411,7 @@ namespace embree 
begin,end,local_left,local_right, [&] (const PrimRef& ref) { const Vec3fa c = ref.bounds().center(); - return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); }, [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); @@ -419,8 +420,8 @@ namespace embree new (&lset) PrimInfoExtRange(begin,center,center,local_left); new (&rset) PrimInfoExtRange(center,end,end,local_right); - assert(area(lset.geomBounds) >= 0.0f); - assert(area(rset.geomBounds) >= 0.0f); + assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f); + assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f); return std::pair<size_t,size_t>(left_weight,right_weight); } diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp index d279dc4993..e2d7c27bd8 100644 --- a/thirdparty/embree/kernels/builders/primrefgen.cpp +++ b/thirdparty/embree/kernels/builders/primrefgen.cpp @@ -184,9 +184,7 @@ namespace embree // special variants for grid meshes -// -- GODOT start -- #if defined(EMBREE_GEOMETRY_GRID) -// -- GODOT end -- PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) { PrimInfo pinfo(empty); @@ -296,9 +294,7 @@ namespace embree return pinfo; } -// -- GODOT start -- #endif -// -- GODOT end -- // ==================================================================================================== // ==================================================================================================== diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h index 8cd251ddd2..aa2026a85e 100644 --- a/thirdparty/embree/kernels/builders/primrefgen_presplit.h +++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h @@ -266,7 +266,7 @@ namespace embree /* anything to split ? 
*/ if (center < numPrimitives) { - const size_t numPrimitivesToSplit = numPrimitives - center; + size_t numPrimitivesToSplit = numPrimitives - center; assert(presplitItem[center].priority >= 1.0f); /* sort presplit items in ascending order */ @@ -279,8 +279,8 @@ namespace embree }); ); - unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem; - unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; + unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem; + unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { @@ -317,11 +317,16 @@ namespace embree sum += numSubPrims; } new_center++; + + primOffset0 += new_center - center; + numPrimitivesToSplit -= new_center - center; center = new_center; + assert(numPrimitivesToSplit == (numPrimitives - center)); } /* parallel prefix sum to compute offsets for storing sub-primitives */ const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>()); + assert(numPrimitives+offset <= alloc_numPrimitives); /* iterate over range, and split primitives into sub primitives and append them to prims array */ parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { @@ -338,7 +343,7 @@ namespace embree unsigned int numSubPrims = 0; splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); const size_t newID = numPrimitives + primOffset1[j-center]; - assert(newID+numSubPrims <= alloc_numPrimitives); + assert(newID+numSubPrims-1 <= alloc_numPrimitives); prims[primrefID] = subPrims[0]; for (size_t i=1;i<numSubPrims;i++) prims[newID+i-1] = subPrims[i]; diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h index f7720bd284..da89d0b178 100644 --- a/thirdparty/embree/kernels/builders/splitter.h +++ b/thirdparty/embree/kernels/builders/splitter.h @@ -128,28 +128,30 @@ namespace embree const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask ); QuadMesh::Quad quad = mesh->quad(prim.primID()); - v[0] = mesh->vertex(quad.v[0]); - v[1] = mesh->vertex(quad.v[1]); - v[2] = mesh->vertex(quad.v[2]); - v[3] = mesh->vertex(quad.v[3]); - v[4] = mesh->vertex(quad.v[0]); - inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); - inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); - inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]); - inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]); + v[0] = mesh->vertex(quad.v[1]); + v[1] = mesh->vertex(quad.v[2]); + v[2] = mesh->vertex(quad.v[3]); + v[3] = mesh->vertex(quad.v[0]); + v[4] = mesh->vertex(quad.v[1]); + v[5] = mesh->vertex(quad.v[3]); + inv_length[0] = Vec3fa(1.0f) / (v[1] - v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2] - v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[3] - v[2]); + inv_length[3] = Vec3fa(1.0f) / (v[4] - v[3]); + inv_length[4] = Vec3fa(1.0f) / (v[5] - v[4]); } __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { - splitPolygon<4>(prim,dim,pos,v,left_o,right_o); + splitPolygon<5>(prim,dim,pos,v,left_o,right_o); } 
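
The quad splitter change in this hunk is easy to misread: the six stored vertices trace the quad's four boundary edges plus the v1-v3 diagonal, i.e. the two triangles the intersectors actually use, and `splitPolygon<5>` clips that five-edge fan. The underlying operation is a Sutherland-Hodgman-style clip against an axis plane that keeps both sides; a scalar sketch of the idea (illustrative, not embree's exact routine):

    #include <array>
    #include <cstddef>
    #include <vector>

    using Vec3 = std::array<float, 3>;

    // Sketch: clip a convex polygon against the plane x[dim] == pos,
    // emitting the vertex loops on both sides.
    void splitPolygonSketch(const std::vector<Vec3>& poly, int dim, float pos,
                            std::vector<Vec3>& left, std::vector<Vec3>& right) {
        const std::size_t n = poly.size();
        for (std::size_t i = 0; i < n; ++i) {
            const Vec3& a = poly[i];
            const Vec3& b = poly[(i + 1) % n];
            const float da = a[dim] - pos, db = b[dim] - pos;
            if (da <= 0.0f) left.push_back(a);
            if (da >= 0.0f) right.push_back(a);
            if ((da < 0.0f) != (db < 0.0f)) {       // edge crosses the plane
                const float t = da / (da - db);     // intersection parameter
                const Vec3 p{a[0] + t * (b[0] - a[0]),
                             a[1] + t * (b[1] - a[1]),
                             a[2] + t * (b[2] - a[2])};
                left.push_back(p);
                right.push_back(p);
            }
        }
    }
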
__forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { - splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o); + splitPolygon<5>(prim,dim,pos,v,inv_length,left_o,right_o); } private: - Vec3fa v[5]; - Vec3fa inv_length[4]; + Vec3fa v[6]; + Vec3fa inv_length[5]; }; struct QuadSplitterFactory diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp index a84295f0da..f6cf626465 100644 --- a/thirdparty/embree/kernels/bvh/bvh.cpp +++ b/thirdparty/embree/kernels/bvh/bvh.cpp @@ -183,7 +183,7 @@ namespace embree template class BVHN<8>; #endif -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) template class BVHN<4>; #endif } diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp index 6e9a5a538e..1d393fd06b 100644 --- a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp @@ -230,7 +230,7 @@ namespace embree continue; /* switch to single ray traversal */ -#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__)) #if FORCE_SINGLE_MODE == 0 if (single) #endif @@ -676,7 +676,7 @@ namespace embree continue; /* switch to single ray traversal */ -#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__)) #if FORCE_SINGLE_MODE == 0 if (single) #endif diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h index 717f559677..c7e040fadb 100644 --- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h @@ -170,12 +170,23 @@ namespace embree TravRayKStream<K,robust> &p = packets[rayID / K]; const size_t i = rayID % K; const vint<N> bitmask(shiftTable[rayID]); + +#if defined (__aarch64__) + const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]); + const vfloat<N> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat<N> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat<N> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); +#else const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); const vfloat<N> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); const vfloat<N> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); const vfloat<N> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); +#endif + const vfloat<N> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i])); const vfloat<N> tFar = mini(tFarX , tFarY , tFarZ, vfloat<N>(p.tfar[i])); diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h index 57530692bc..3fd9fc7d18 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h @@ -46,6 +46,14 @@ namespace embree 
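
The DEBUG blocks added to every node setter below all assert the same invariant: once an empty child appears in the list, every later slot must be empty too, i.e. the builders compact real children to the front. The check in isolation, as a sketch where `isEmpty` stands in for comparison against `NodeRef::emptyNode`:

    #include <cassert>
    #include <cstddef>

    // Sketch: verify empty children occur only at the tail of the child list.
    template <typename Child, typename Pred>
    void checkEmptyChildrenAtEnd(const Child* children, std::size_t num, Pred isEmpty) {
        bool emptyChild = false;
        for (std::size_t i = 0; i < num; ++i) {
            emptyChild |= isEmpty(children[i]);
            // once the flag is set, every following entry must also be empty
            assert(emptyChild == isEmpty(children[i]));
        }
    }
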
template<typename BuildRecord> __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i] == NodeRef::emptyNode); + assert(emptyChild == (children[i] == NodeRef::emptyNode)); + } +#endif AABBNode_t* node = ref.getAABBNode(); for (size_t i=0; i<num; i++) node->setRef(i,children[i]); return ref; @@ -60,6 +68,14 @@ namespace embree template<typename BuildRecord> __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i] == NodeRef::emptyNode); + assert(emptyChild == (children[i] == NodeRef::emptyNode)); + } +#endif AABBNode_t* node = ref.getAABBNode(); for (size_t i=0; i<num; i++) node->setRef(i,children[i]); diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h index c4cea7d8ba..001f526c25 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h @@ -31,6 +31,14 @@ namespace embree template<typename BuildRecord> __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i].ref == NodeRef::emptyNode); + assert(emptyChild == (children[i].ref == NodeRef::emptyNode)); + } +#endif AABBNodeMB_t* node = ref.getAABBNodeMB(); LBBox3fa bounds = empty; diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h index 46a81d7581..3b966fd054 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h @@ -41,6 +41,14 @@ namespace embree template<typename BuildRecord> __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i].ref == NodeRef::emptyNode); + assert(emptyChild == (children[i].ref == NodeRef::emptyNode)); + } +#endif if (likely(ref.isAABBNodeMB())) { for (size_t i=0; i<num; i++) ref.getAABBNodeMB()->set(i, children[i]); diff --git a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h index 2afc8c98e7..99671ddc5a 100644 --- a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h +++ b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h @@ -190,6 +190,14 @@ namespace embree template<typename BuildRecord> __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const { +#if defined(DEBUG) + // check that empty children are only at the end of the child list + bool emptyChild = false; + for (size_t i=0; i<num; i++) { + emptyChild |= (children[i] == NodeRef::emptyNode); + assert(emptyChild == (children[i] == 
NodeRef::emptyNode)); + } +#endif QuantizedNode_t* node = ref.quantizedNode(); for (size_t i=0; i<num; i++) node->setRef(i,children[i]); return ref; diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp index d857ff7d95..57f75bfd7e 100644 --- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp @@ -162,7 +162,7 @@ namespace embree template class BVHNStatistics<8>; #endif -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) +#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) template class BVHNStatistics<4>; #endif } diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h index 1ec4fc63fc..17641fa888 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector1.h +++ b/thirdparty/embree/kernels/bvh/node_intersector1.h @@ -5,6 +5,15 @@ #include "node_intersector.h" +#if defined(__AVX2__) +#define __FMA_X4__ +#endif + +#if defined(__aarch64__) +#define __FMA_X4__ +#endif + + namespace embree { namespace isa @@ -29,9 +38,15 @@ namespace embree org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) const Vec3fa ray_org_rdir = ray_org*ray_rdir; +#if !defined(__aarch64__) org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); +#else + //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd + //x86 will use msub + neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z); +#endif #endif nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); nearY = ray_rdir.y >= 0.0f ? 
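
All the aarch64 branches in these intersector hunks follow one algebraic rewrite. The slab test needs slab*rdir - org*rdir; x86 has a fused multiply-subtract of exactly that shape (msub), while the NEON port, as the hunk's own comment explains, precomputes the negated product once per ray and uses a plain fused multiply-add instead. The scalar identity, with an illustrative one-axis ray struct:

    // t = (slab - org) * rdir == slab*rdir - org*rdir == slab*rdir + neg_org_rdir
    struct Ray1D {
        float org, rdir;
        float neg_org_rdir;   // -(org * rdir), computed once at ray setup
    };

    inline float slabDistance(float slab, const Ray1D& r) {
        return slab * r.rdir + r.neg_org_rdir; // madd on aarch64, msub form on x86
    }

One extra multiply and negate at ray setup buys a single fused operation per slab plane on both architectures.
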
2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); @@ -49,8 +64,12 @@ namespace embree org = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); dir = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); -#if defined(__AVX2__) || defined(__ARM_NEON) - org_rdir = org*rdir; +#if defined(__FMA_X4__) +#if !defined(__aarch64__) + org_rdir = org*rdir; +#else + neg_org_rdir = -(org*rdir); +#endif #endif nearX = nearXYZ.x[k]; nearY = nearXYZ.y[k]; @@ -62,8 +81,14 @@ namespace embree Vec3fa org_xyz, dir_xyz; Vec3vf<N> org, dir, rdir; -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if !defined(__aarch64__) Vec3vf<N> org_rdir; +#else + //aarch64 version are keeping negation of the org_rdir and use madd + //x86 uses msub + Vec3vf<N> neg_org_rdir; +#endif #endif size_t nearX, nearY, nearZ; size_t farX, farY, farZ; @@ -404,13 +429,22 @@ namespace embree template<> __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist) { -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; @@ -450,13 +484,23 @@ namespace embree template<> __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist) { -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const 
vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif + #else const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; @@ -522,13 +566,22 @@ namespace embree const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; @@ -537,7 +590,7 @@ namespace embree const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; #endif -#if defined(__AVX2__) && !defined(__AVX512F__) // HSW +#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW const vfloat<N> 
tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool<N> vmask = asInt(tNear) > asInt(tFar); @@ -598,13 +651,22 @@ namespace embree const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); -#if defined (__AVX2__) || defined(__ARM_NEON) +#if defined (__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; @@ -613,7 +675,7 @@ namespace embree const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; #endif -#if defined(__AVX2__) && !defined(__AVX512F__) +#if defined(__FMA_X4__) && !defined(__AVX512F__) const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); #else @@ -687,13 +749,22 @@ namespace embree const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat4 tFarY = msub(upper_y, ray.rdir.y, 
ray.org_rdir.y); const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; @@ -703,7 +774,7 @@ namespace embree const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; #endif -#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW +#if defined(__aarch64__) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool4 vmask = asInt(tNear) > asInt(tFar); @@ -775,13 +846,22 @@ namespace embree const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; @@ -857,13 +937,22 @@ namespace embree const vfloat<N> upper_y = node->dequantizeUpperY(time); const vfloat<N> lower_z = node->dequantizeLowerZ(time); const vfloat<N> upper_z = node->dequantizeUpperZ(time); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif #else const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x; const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y; diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h index 1f7215e5df..cad4e6de2d 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h @@ -75,9 +75,13 @@ namespace embree min_rdir = select(pos_rdir, 
reduced_min_rdir, reduced_max_rdir); max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); +#if defined (__aarch64__) + neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); + neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org)); +#else min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); - +#endif min_dist = reduced_min_dist; max_dist = reduced_max_dist; @@ -95,9 +99,13 @@ namespace embree Vec3fa min_rdir; Vec3fa max_rdir; +#if defined (__aarch64__) + Vec3fa neg_min_org_rdir; + Vec3fa neg_max_org_rdir; +#else Vec3fa min_org_rdir; Vec3fa max_org_rdir; - +#endif float min_dist; float max_dist; }; @@ -191,13 +199,21 @@ namespace embree const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); +#if defined (__aarch64__) + const vfloat<N> fminX = madd(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.neg_min_org_rdir.x)); + const vfloat<N> fminY = madd(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.neg_min_org_rdir.y)); + const vfloat<N> fminZ = madd(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.neg_min_org_rdir.z)); + const vfloat<N> fmaxX = madd(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.neg_max_org_rdir.x)); + const vfloat<N> fmaxY = madd(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.neg_max_org_rdir.y)); + const vfloat<N> fmaxZ = madd(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.neg_max_org_rdir.z)); +#else const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x)); const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y)); const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z)); const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x)); const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y)); const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z)); - +#endif const vfloat<N> fmin = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist)); dist = fmin; const vfloat<N> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist)); diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h index d5498fc5db..4deacd620d 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_packet.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h @@ -39,7 +39,9 @@ namespace embree org = ray_org; dir = ray_dir; rdir = rcp_safe(ray_dir); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + neg_org_rdir = -(org * rdir); +#elif defined(__AVX2__) org_rdir = org * rdir; #endif @@ -55,7 +57,9 @@ namespace embree Vec3vf<K> org; Vec3vf<K> dir; Vec3vf<K> rdir; -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + Vec3vf<K> neg_org_rdir; +#elif defined(__AVX2__) Vec3vf<K> org_rdir; #endif Vec3vi<K> nearXYZ; @@ -119,7 +123,14 @@ namespace embree const TravRayKFast<K>& ray, vfloat<K>& dist) { - #if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = 
madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); @@ -199,7 +210,14 @@ namespace embree const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); @@ -302,7 +320,14 @@ namespace embree const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); @@ -464,7 +489,14 @@ namespace embree const vfloat<N> lower_z = node->dequantizeLowerZ(); const vfloat<N> upper_z = node->dequantizeUpperZ(); - #if defined(__AVX2__) || defined(__ARM_NEON) + #if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); + #elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, 
ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); @@ -549,7 +581,14 @@ namespace embree const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time); const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time); -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h index 55b2c27231..943fd7043f 100644 --- a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h +++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h @@ -32,11 +32,19 @@ namespace embree __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) { rdir = rcp_safe(ray_dir); +#if defined(__aarch64__) + neg_org_rdir = -(ray_org * rdir); +#else org_rdir = ray_org * rdir; +#endif } Vec3vf<K> rdir; +#if defined(__aarch64__) + Vec3vf<K> neg_org_rdir; +#else Vec3vf<K> org_rdir; +#endif vfloat<K> tnear; vfloat<K> tfar; }; @@ -87,12 +95,21 @@ namespace embree const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); +#if defined (__aarch64__) + const vfloat<N> rminX = madd(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k])); + const vfloat<N> rminY = madd(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k])); + const vfloat<N> rminZ = madd(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k])); + const vfloat<N> rmaxX = madd(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k])); + const vfloat<N> rmaxY = madd(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k])); + const vfloat<N> rmaxZ = madd(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k])); +#else const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k])); const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k])); const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k])); +#endif const vfloat<N> rmin = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k])); const vfloat<N> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k])); @@ -113,12 +130,21 @@ namespace embree const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); +#if defined (__aarch64__) + const vfloat<K> rminX = madd(bminX, ray.rdir.x, 
ray.neg_org_rdir.x); + const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z); +#else const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); +#endif const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear); const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h index cc4ea1805b..d24326ce92 100644 --- a/thirdparty/embree/kernels/common/accel.h +++ b/thirdparty/embree/kernels/common/accel.h @@ -332,7 +332,7 @@ namespace embree intersectorN.intersect(this,rayN,N,context); } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) { const vint<4> mask = valid.mask32(); intersect4(&mask,(RTCRayHit4&)ray,context); @@ -388,7 +388,7 @@ namespace embree intersectorN.occluded(this,rayN,N,context); } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) { const vint<4> mask = valid.mask32(); occluded4(&mask,(RTCRay4&)ray,context); diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp index 32a27c560a..111c62083d 100644 --- a/thirdparty/embree/kernels/common/acceln.cpp +++ b/thirdparty/embree/kernels/common/acceln.cpp @@ -97,7 +97,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded4(valid,ray,context); -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); if (unlikely(none(valid0 & hit0))) break; @@ -111,7 +111,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded8(valid,ray,context); -#if defined(__SSE2__) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); @@ -127,7 +127,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded16(valid,ray,context); -#if defined(__SSE2__) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h index 90b184a07b..1b67120c97 100644 --- a/thirdparty/embree/kernels/common/accelset.h +++ b/thirdparty/embree/kernels/common/accelset.h @@ 
-14,21 +14,14 @@ namespace embree struct IntersectFunctionNArguments; struct OccludedFunctionNArguments; - typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); - typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args); - struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments { - IntersectContext* internal_context; Geometry* geometry; - ReportIntersectionFunc report; }; struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments { - IntersectContext* internal_context; Geometry* geometry; - ReportOcclusionFunc report; }; /*! Base class for set of acceleration structures. */ @@ -145,7 +138,7 @@ namespace embree public: /*! Intersects a single ray with the scene. */ - __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.intersect); @@ -159,15 +152,13 @@ namespace embree args.N = 1; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.intersect(&args); } /*! Tests if single ray is occluded by the scene. */ - __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.occluded); @@ -181,16 +172,14 @@ namespace embree args.N = 1; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.occluded(&args); } /*! Intersects a packet of K rays with the scene. */ template<int K> - __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) + __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.intersect); @@ -204,16 +193,14 @@ namespace embree args.N = K; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.intersect(&args); } /*! Tests if a packet of K rays is occluded by the scene. 
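(As with the intersect variants above, the explicit ReportOcclusionFunc
parameter has been dropped from this helper; filtering is now dispatched
through the fixed isa::reportIntersection1 and isa::reportOcclusion1 entry
points, which rtcFilterIntersection and rtcFilterOcclusion call directly
further down in this diff.)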
*/ template<int K> - __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report) + __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) { assert(primID < size()); assert(intersectorN.occluded); @@ -227,9 +214,7 @@ namespace embree args.N = K; args.geomID = geomID; args.primID = primID; - args.internal_context = context; args.geometry = this; - args.report = report; intersectorN.occluded(&args); } diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp index 1a0e1aeed3..38a76225f4 100644 --- a/thirdparty/embree/kernels/common/alloc.cpp +++ b/thirdparty/embree/kernels/common/alloc.cpp @@ -3,6 +3,9 @@ #include "alloc.h" #include "../../common/sys/thread.h" +#if defined(APPLE) && defined(__aarch64__) +#include "../../common/sys/barrier.h" +#endif namespace embree { diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h index 4458e35c24..12769df2c8 100644 --- a/thirdparty/embree/kernels/common/alloc.h +++ b/thirdparty/embree/kernels/common/alloc.h @@ -8,6 +8,10 @@ #include "scene.h" #include "primref.h" +#if defined(APPLE) && defined(__aarch64__) +#include <mutex> +#endif + namespace embree { class FastAllocator @@ -26,7 +30,7 @@ namespace embree public: struct ThreadLocal2; - enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE }; + enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE }; /*! Per thread structure holding the current memory block. */ struct __aligned(64) ThreadLocal @@ -132,7 +136,11 @@ namespace embree { assert(alloc_i); if (alloc.load() == alloc_i) return; +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else Lock<SpinLock> lock(mutex); +#endif //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind if (alloc.load()) { alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); @@ -150,7 +158,11 @@ namespace embree { assert(alloc_i); if (alloc.load() != alloc_i) return; +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else Lock<SpinLock> lock(mutex); +#endif if (alloc.load() != alloc_i) return; // required as a different thread calls unbind alloc.load()->bytesUsed += alloc0.getUsedBytes() + alloc1.getUsedBytes(); alloc.load()->bytesFree += alloc0.getFreeBytes() + alloc1.getFreeBytes(); @@ -161,7 +173,11 @@ namespace embree } public: +#if defined(APPLE) && defined(__aarch64__) + std::mutex mutex; +#else SpinLock mutex; //!< required as unbind is called from other threads +#endif std::atomic<FastAllocator*> alloc; //!< parent allocator ThreadLocal alloc0; ThreadLocal alloc1; @@ -169,7 +185,7 @@ namespace embree FastAllocator (Device* device, bool osAllocation) : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0), - growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC), + growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? 
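/* A side note on the recurring "#if defined(APPLE) && defined(__aarch64__)"
   blocks throughout alloc.h: on Apple silicon the user-space SpinLock is
   swapped for a kernel-arbitrated std::mutex, presumably because busy-waiting
   schedules poorly across that platform's performance/efficiency cores. The
   guard pattern, as a minimal sketch with illustrative names only:

     #if defined(APPLE) && defined(__aarch64__)
       std::mutex alloc_lock;                 // blocks instead of spinning
       // guarded section: std::scoped_lock lock(alloc_lock);
     #else
       SpinLock alloc_lock;
       // guarded section: Lock<SpinLock> lock(alloc_lock);
     #endif
*/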
EMBREE_OS_MALLOC : ALIGNED_MALLOC), primrefarray(device,0) { for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) @@ -206,7 +222,7 @@ namespace embree void setOSallocation(bool flag) { - atype = flag ? OS_MALLOC : ALIGNED_MALLOC; + atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC; } private: @@ -217,7 +233,11 @@ namespace embree ThreadLocal2* alloc = thread_local_allocator2; if (alloc == nullptr) { thread_local_allocator2 = alloc = new ThreadLocal2; +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else Lock<SpinLock> lock(s_thread_local_allocators_lock); +#endif s_thread_local_allocators.push_back(make_unique(alloc)); } return alloc; @@ -227,7 +247,11 @@ namespace embree __forceinline void join(ThreadLocal2* alloc) { +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(s_thread_local_allocators_lock); +#else Lock<SpinLock> lock(thread_local_allocators_lock); +#endif thread_local_allocators.push_back(alloc); } @@ -492,7 +516,11 @@ namespace embree /* parallel block creation in case of no freeBlocks, avoids single global mutex */ if (likely(freeBlocks.load() == nullptr)) { +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(slotMutex[slot]); +#else Lock<SpinLock> lock(slotMutex[slot]); +#endif if (myUsedBlocks == threadUsedBlocks[slot]) { const size_t alignedBytes = (bytes+(align-1)) & ~(align-1); const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes); @@ -505,7 +533,11 @@ namespace embree /* if this fails allocate new block */ { - Lock<SpinLock> lock(mutex); +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else + Lock<SpinLock> lock(mutex); +#endif if (myUsedBlocks == threadUsedBlocks[slot]) { if (freeBlocks.load() != nullptr) { @@ -527,7 +559,11 @@ namespace embree /*! add new block */ void addBlock(void* ptr, ssize_t bytes) { +#if defined(APPLE) && defined(__aarch64__) + std::scoped_lock lock(mutex); +#else Lock<SpinLock> lock(mutex); +#endif const size_t sizeof_Header = offsetof(Block,data[0]); void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); size_t ofs = (size_t) aptr - (size_t) ptr; @@ -613,8 +649,8 @@ namespace embree bytesWasted(alloc->bytesWasted), stat_all(alloc,ANY_TYPE), stat_malloc(alloc,ALIGNED_MALLOC), - stat_4K(alloc,OS_MALLOC,false), - stat_2M(alloc,OS_MALLOC,true), + stat_4K(alloc,EMBREE_OS_MALLOC,false), + stat_2M(alloc,EMBREE_OS_MALLOC,true), stat_shared(alloc,SHARED) {} AllStatistics (size_t bytesUsed, @@ -707,7 +743,7 @@ namespace embree /* We avoid using os_malloc for small blocks as this could * cause a risk of fragmenting the virtual address space and * reach the limit of vm.max_map_count = 65k under Linux. 
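(Related housekeeping in this file: the enum value OS_MALLOC becomes
EMBREE_OS_MALLOC throughout, presumably to keep the unprefixed name from
colliding with identically named symbols defined outside the library.)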
*/ - if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize) + if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize) atype = ALIGNED_MALLOC; /* we need to additionally allocate some header */ @@ -716,7 +752,7 @@ namespace embree bytesReserve = sizeof_Header+bytesReserve; /* consume full 4k pages with using os_malloc */ - if (atype == OS_MALLOC) { + if (atype == EMBREE_OS_MALLOC) { bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1)); bytesReserve = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1)); } @@ -748,11 +784,11 @@ namespace embree return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment); } } - else if (atype == OS_MALLOC) + else if (atype == EMBREE_OS_MALLOC) { if (device) device->memoryMonitor(bytesAllocate,false); bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages); - return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); + return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages); } else assert(false); @@ -796,7 +832,7 @@ namespace embree if (device) device->memoryMonitor(-sizeof_Alloced,true); } - else if (atype == OS_MALLOC) { + else if (atype == EMBREE_OS_MALLOC) { size_t sizeof_This = sizeof_Header+reserveEnd; os_free(this,sizeof_This,huge_pages); if (device) device->memoryMonitor(-sizeof_Alloced,true); @@ -857,7 +893,7 @@ namespace embree bool hasType(AllocationType atype_i, bool huge_pages_i) const { if (atype_i == ANY_TYPE ) return true; - else if (atype == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; + else if (atype == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages; else return atype_i == atype; } @@ -906,7 +942,7 @@ namespace embree void print_block() const { if (atype == ALIGNED_MALLOC) std::cout << "A"; - else if (atype == OS_MALLOC) std::cout << "O"; + else if (atype == EMBREE_OS_MALLOC) std::cout << "O"; else if (atype == SHARED) std::cout << "S"; if (huge_pages) std::cout << "H"; size_t bytesUsed = getBlockUsedBytes(); @@ -936,7 +972,11 @@ namespace embree std::atomic<Block*> freeBlocks; std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; - SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#if defined(APPLE) && defined(__aarch64__) + std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#else + PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS]; +#endif bool use_single_mode; size_t defaultBlockSize; @@ -950,7 +990,11 @@ namespace embree static __thread ThreadLocal2* thread_local_allocator2; static SpinLock s_thread_local_allocators_lock; static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; +#if defined(APPLE) && defined(__aarch64__) + std::mutex thread_local_allocators_lock; +#else SpinLock thread_local_allocators_lock; +#endif std::vector<ThreadLocal2*> thread_local_allocators; AllocationType atype; mvector<PrimRef> primrefarray; //!< primrefarray used to allocate nodes diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp index 068e0c2983..833ec65139 100644 --- a/thirdparty/embree/kernels/common/device.cpp +++ b/thirdparty/embree/kernels/common/device.cpp @@ -66,7 +66,11 @@ namespace embree case CPU::CORE1: frequency_level = FREQUENCY_SIMD128; break; case CPU::XEON_PHI_KNIGHTS_MILL : frequency_level = FREQUENCY_SIMD512; break; case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break; +#if defined(__APPLE__) + case 
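/* Context for the frequency_level case below: this setting steers which SIMD
   width embree prefers at runtime. Wide AVX work can downclock many x86
   parts, so unknown ARM CPUs conservatively keep FREQUENCY_SIMD128, while
   Apple silicon shows no such penalty and gets the 256-bit (double-pumped
   4-wide) paths enabled. */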
CPU::ARM: frequency_level = FREQUENCY_SIMD256; break; // Apple M1 supports high throughput for SIMD4 +#else case CPU::ARM: frequency_level = FREQUENCY_SIMD128; break; +#endif } /* initialize global state */ diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h index 2f9f2e7c94..593990f5b1 100644 --- a/thirdparty/embree/kernels/common/geometry.h +++ b/thirdparty/embree/kernels/common/geometry.h @@ -91,7 +91,7 @@ namespace embree size_t numFilterFunctions; //!< number of geometries with filter functions enabled size_t numTriangles; //!< number of enabled triangles - size_t numMBTriangles; //!< number of enabled motion blured triangles + size_t numMBTriangles; //!< number of enabled motion blurred triangles size_t numQuads; //!< number of enabled quads size_t numMBQuads; //!< number of enabled motion blurred quads size_t numBezierCurves; //!< number of enabled curves @@ -99,7 +99,7 @@ namespace embree size_t numLineSegments; //!< number of enabled line segments size_t numMBLineSegments; //!< number of enabled line motion blurred segments size_t numSubdivPatches; //!< number of enabled subdivision patches - size_t numMBSubdivPatches; //!< number of enabled motion blured subdivision patches + size_t numMBSubdivPatches; //!< number of enabled motion blurred subdivision patches size_t numUserGeometries; //!< number of enabled user geometries size_t numMBUserGeometries; //!< number of enabled motion blurred user geometries size_t numInstancesCheap; //!< number of enabled cheap instances diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h index ae6556336c..9e1132e1a0 100644 --- a/thirdparty/embree/kernels/common/isa.h +++ b/thirdparty/embree/kernels/common/isa.h @@ -44,7 +44,7 @@ namespace embree #define SELECT_SYMBOL_DEFAULT(features,intersector) \ intersector = isa::intersector; -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) #if !defined(EMBREE_TARGET_SIMD4) #define EMBREE_TARGET_SIMD4 #endif diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h index 7b951cc1e8..3c8ee3989c 100644 --- a/thirdparty/embree/kernels/common/ray.h +++ b/thirdparty/embree/kernels/common/ray.h @@ -6,7 +6,7 @@ #include "default.h" #include "instance_stack.h" -// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted +// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted namespace embree { diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp index 94b3819e42..a6ea55bfc4 100644 --- a/thirdparty/embree/kernels/common/rtcore.cpp +++ b/thirdparty/embree/kernels/common/rtcore.cpp @@ -7,6 +7,7 @@ #include "device.h" #include "scene.h" #include "context.h" +#include "../geometry/filter.h" #include "../../include/embree3/rtcore_ray.h" using namespace embree; @@ -482,7 +483,7 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray4* ray4 = (Ray4*) rayhit; + RayHit4* ray4 = (RayHit4*) rayhit; for (size_t i=0; i<4; i++) { if (!valid[i]) continue; RayHit ray1; ray4->get(i,ray1); @@ -513,7 +514,7 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if !defined(EMBREE_RAY_PACKETS) - Ray8* ray8 = (Ray8*) rayhit; + RayHit8* ray8 = (RayHit8*) rayhit; for (size_t i=0; i<8; i++) { if (!valid[i]) continue; RayHit ray1; ray8->get(i,ray1); @@ -546,7 +547,7 @@ RTC_NAMESPACE_BEGIN; IntersectContext context(scene,user_context); #if 
!defined(EMBREE_RAY_PACKETS) - Ray16* ray16 = (Ray16*) rayhit; + RayHit16* ray16 = (RayHit16*) rayhit; for (size_t i=0; i<16; i++) { if (!valid[i]) continue; RayHit ray1; ray16->get(i,ray1); @@ -1097,13 +1098,13 @@ RTC_NAMESPACE_BEGIN; RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) { IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i; - args->report(args,filter_args); + isa::reportIntersection1(args, filter_args); } RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args) { OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i; - args->report(args,filter_args); + isa::reportOcclusion1(args,filter_args); } RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type) @@ -1763,4 +1764,19 @@ RTC_NAMESPACE_BEGIN; return nullptr; } + RTC_API RTCGeometry rtcGetGeometryThreadSafe (RTCScene hscene, unsigned int geomID) + { + Scene* scene = (Scene*) hscene; + RTC_CATCH_BEGIN; + RTC_TRACE(rtcGetGeometryThreadSafe); +#if defined(DEBUG) + RTC_VERIFY_HANDLE(hscene); + RTC_VERIFY_GEOMID(geomID); +#endif + Ref<Geometry> geom = scene->get_locked(geomID); + return (RTCGeometry) geom.ptr; + RTC_CATCH_END2(scene); + return nullptr; + } + RTC_NAMESPACE_END diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h index f8aad7c7cb..ac58a84d6f 100644 --- a/thirdparty/embree/kernels/common/rtcore.h +++ b/thirdparty/embree/kernels/common/rtcore.h @@ -26,56 +26,59 @@ namespace embree /*! Macros used in the rtcore API implementation */ // -- GODOT start -- -// #define RTC_CATCH_BEGIN try { #define RTC_CATCH_BEGIN - -// #define RTC_CATCH_END(device) \ -// } catch (std::bad_alloc&) { \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// } catch (rtcore_error& e) { \ -// Device::process_error(device,e.error,e.what()); \ -// } catch (std::exception& e) { \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// } catch (...) { \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// } #define RTC_CATCH_END(device) - -// #define RTC_CATCH_END2(scene) \ -// } catch (std::bad_alloc&) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// } catch (rtcore_error& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,e.error,e.what()); \ -// } catch (std::exception& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// } catch (...) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// } #define RTC_CATCH_END2(scene) - -// #define RTC_CATCH_END2_FALSE(scene) \ -// } catch (std::bad_alloc&) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -// return false; \ -// } catch (rtcore_error& e) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,e.error,e.what()); \ -// return false; \ -// } catch (std::exception& e) { \ -// Device* device = scene ? 
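/* Background on the Godot-specific rtcore.h block being reshuffled here:
   Godot compiles embree with C++ exceptions disabled, so the upstream
   try/catch error-reporting macros and the rtcore_error exception type are
   stubbed out and errors print and abort() instead (see
   godot-changes-noexcept.patch later in this diff). The change below merely
   moves the disabled originals from line-by-line comments into "#if 0"
   blocks. */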
scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -// return false; \ -// } catch (...) { \ -// Device* device = scene ? scene->device : nullptr; \ -// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -// return false; \ -// } #define RTC_CATCH_END2_FALSE(scene) return false; + +#if 0 +#define RTC_CATCH_BEGIN try { + +#define RTC_CATCH_END(device) \ + } catch (std::bad_alloc&) { \ + Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ + } catch (rtcore_error& e) { \ + Device::process_error(device,e.error,e.what()); \ + } catch (std::exception& e) { \ + Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ + } catch (...) { \ + Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ + } + +#define RTC_CATCH_END2(scene) \ + } catch (std::bad_alloc&) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ + } catch (rtcore_error& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,e.error,e.what()); \ + } catch (std::exception& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ + } catch (...) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ + } + +#define RTC_CATCH_END2_FALSE(scene) \ + } catch (std::bad_alloc&) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ + return false; \ + } catch (rtcore_error& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,e.error,e.what()); \ + return false; \ + } catch (std::exception& e) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ + return false; \ + } catch (...) { \ + Device* device = scene ? scene->device : nullptr; \ + Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ + return false; \ + } +#endif // -- GODOT end -- #define RTC_VERIFY_HANDLE(handle) \ @@ -103,39 +106,35 @@ namespace embree #define RTC_TRACE(x) #endif -// -- GODOT begin -- -// /*! used to throw embree API errors */ -// struct rtcore_error : public std::exception -// { -// __forceinline rtcore_error(RTCError error, const std::string& str) -// : error(error), str(str) {} -// -// ~rtcore_error() throw() {} -// -// const char* what () const throw () { -// return str.c_str(); -// } -// -// RTCError error; -// std::string str; -// }; -// -- GODOT end -- +// -- GODOT start -- +#if 0 + /*! 
used to throw embree API errors */ + struct rtcore_error : public std::exception + { + __forceinline rtcore_error(RTCError error, const std::string& str) + : error(error), str(str) {} + + ~rtcore_error() throw() {} + + const char* what () const throw () { + return str.c_str(); + } + + RTCError error; + std::string str; + }; +#endif #if defined(DEBUG) // only report file and line in debug mode - // -- GODOT begin -- - // #define throw_RTCError(error,str) \ - // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #define throw_RTCError(error,str) \ printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort(); - // -- GODOT end -- + // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #else - // -- GODOT begin -- - // #define throw_RTCError(error,str) \ - // throw rtcore_error(error,str); #define throw_RTCError(error,str) \ abort(); - // -- GODOT end -- + // throw rtcore_error(error,str); #endif +// -- GODOT end -- #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) diff --git a/thirdparty/embree/kernels/common/rtcore_builder.cpp b/thirdparty/embree/kernels/common/rtcore_builder.cpp index 1f1b6f6ddf..29e3bdca20 100644 --- a/thirdparty/embree/kernels/common/rtcore_builder.cpp +++ b/thirdparty/embree/kernels/common/rtcore_builder.cpp @@ -371,7 +371,7 @@ RTC_NAMESPACE_BEGIN bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa)); bvh->allocator.reset(); - /* switch between differnet builders based on quality level */ + /* switch between different builders based on quality level */ if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW) return rtcBuildBVHMorton(arguments); else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM) diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp index 408d7eae6f..65d31d0f81 100644 --- a/thirdparty/embree/kernels/common/scene.cpp +++ b/thirdparty/embree/kernels/common/scene.cpp @@ -629,9 +629,7 @@ namespace embree if (geometry == null) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry"); - if (geometry->isEnabled()) { - setModified (); - } + setModified (); accels_deleteGeometry(unsigned(geomID)); id_pool.deallocate((unsigned)geomID); geometries[geomID] = null; diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h index a5a39e42d4..a1ea45d3c7 100644 --- a/thirdparty/embree/kernels/common/scene_curves.h +++ b/thirdparty/embree/kernels/common/scene_curves.h @@ -452,6 +452,10 @@ namespace embree const Vec3fa n1 = normal(index+1,itime); if (!isvalid(n0) || !isvalid(n1)) return false; + + const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds(); + if (!isvalid(b)) + return false; } } @@ -612,6 +616,10 @@ namespace embree const Vec3fa dn1 = dnormal(index+1,itime); if (!isvalid(dn0) || !isvalid(dn1)) return false; + + const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds(); + if (!isvalid(b)) + return false; } } diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp index 01c862da0c..db6b803041 100644 --- a/thirdparty/embree/kernels/common/state.cpp +++ b/thirdparty/embree/kernels/common/state.cpp @@ -144,7 +144,20 @@ namespace embree } bool State::checkISASupport() { +#if defined(__ARM_NEON) + /* + * NEON CPU type is a mixture of NEON and SSE2 + */ + + bool 
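/* Context for the NEON branch around this point: on ARM builds embree runs
   its SSE2 code paths through NEON translations of the intrinsics, so the
   detected feature set mixes CPU_FEATURE_SSE2 and CPU_FEATURE_NEON; accepting
   either flag keeps this check from rejecting a device that was explicitly
   configured with isa=neon. */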
hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2; + + /* this will be true when explicitly initialize Device with `isa=neon` config */ + bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON; + + return hasSSE2 || hasNEON; +#else return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features; +#endif } void State::verify() @@ -157,8 +170,10 @@ namespace embree * functions */ #if defined(DEBUG) #if defined(EMBREE_TARGET_SSE2) +#if !defined(__ARM_NEON) assert(sse2::getISA() <= SSE2); #endif +#endif #if defined(EMBREE_TARGET_SSE42) assert(sse42::getISA() <= SSE42); #endif diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h index 2bf7e93587..84ac27d103 100644 --- a/thirdparty/embree/kernels/config.h +++ b/thirdparty/embree/kernels/config.h @@ -1,5 +1,4 @@ - -// Copyright 2009-2020 Intel Corporation +// Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 /* #undef EMBREE_RAY_MASK */ @@ -20,6 +19,7 @@ /* #undef EMBREE_COMPACT_POLYS */ #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 +#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE #if defined(EMBREE_GEOMETRY_TRIANGLE) #define IF_ENABLED_TRIS(x) x diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h index 3d8900c2aa..75532f5ae0 100644 --- a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h +++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h @@ -225,7 +225,7 @@ namespace embree /* exit if convergence cannot get proven, but terminate if we are very small */ if (unlikely(!subset(K,x) && !very_small)) return false; - /* solve using newton raphson iteration of convergence is guarenteed */ + /* solve using newton raphson iteration of convergence is guaranteed */ solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J); return true; } diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h index 2d4abd73ac..ed827d583f 100644 --- a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h +++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h @@ -60,7 +60,7 @@ namespace embree const Vec3fa dir = ray.dir; const float length_ray_dir = length(dir); - /* error of curve evaluations is propertional to largest coordinate */ + /* error of curve evaluations is proportional to largest coordinate */ const BBox3ff box = curve.bounds(); const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper))); diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h index 816c066899..ec6fa9c4f3 100644 --- a/thirdparty/embree/kernels/geometry/disc_intersector.h +++ b/thirdparty/embree/kernels/geometry/disc_intersector.h @@ -68,15 +68,15 @@ namespace embree const Vec3vf<M> center = v0.xyz(); const vfloat<M> radius = v0.w; + /* compute ray distance projC0 to hit point with ray oriented plane */ const Vec3vf<M> c0 = center - ray_org; const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar)); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale; // ignore self intersections if (unlikely(none(valid))) return false; - + + /* check if hit point lies inside disc */ const Vec3vf<M> perp = 
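/* Geometry of the disc test at this point, for reference: c0 is the vector
   from ray origin to disc center, projC0 = dot(c0, dir) * rd2 is the ray
   parameter of the ray-oriented plane through the center, and perp is the
   component of c0 orthogonal to the ray, so dot(perp, perp) <= radius^2 means
   the hit lies on the disc. The curve-style projC0 culling removed above is
   replaced by the EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE test added
   below, which instead rejects hits whose ray origin already lies within one
   radius of the center, i.e. dot(c0, c0) <= r2. */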
c0 - projC0 * ray_dir; const vfloat<M> l2 = dot(perp, perp); const vfloat<M> r2 = radius * radius; @@ -84,6 +84,15 @@ namespace embree if (unlikely(none(valid))) return false; + /* We reject hits where the ray origin lies inside the ray + * oriented disc to avoid self intersections. */ +#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE) + const vfloat<M> m2 = dot(c0, c0); + valid &= (m2 > r2); + if (unlikely(none(valid))) + return false; +#endif + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); return epilog(valid, hit); } @@ -152,15 +161,15 @@ namespace embree const Vec3vf<M> center = v0.xyz(); const vfloat<M> radius = v0.w; + /* compute ray distance projC0 to hit point with ray oriented plane */ const Vec3vf<M> c0 = center - ray_org; const vfloat<M> projC0 = dot(c0, ray_dir) * rd2; valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k])); - if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) - valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k]; // ignore self intersections if (unlikely(none(valid))) return false; + /* check if hit point lies inside disc */ const Vec3vf<M> perp = c0 - projC0 * ray_dir; const vfloat<M> l2 = dot(perp, perp); const vfloat<M> r2 = radius * radius; @@ -168,6 +177,15 @@ namespace embree if (unlikely(none(valid))) return false; + /* We reject hits where the ray origin lies inside the ray + * oriented disc to avoid self intersections. */ +#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE) + const vfloat<M> m2 = dot(c0, c0); + valid &= (m2 > r2); + if (unlikely(none(valid))) + return false; +#endif + DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir); return epilog(valid, hit); } diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h index 3b4d924ea7..d64320bf78 100644 --- a/thirdparty/embree/kernels/geometry/filter.h +++ b/thirdparty/embree/kernels/geometry/filter.h @@ -51,20 +51,11 @@ namespace embree __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) { #if defined(EMBREE_FILTER_FUNCTION) - IntersectContext* MAYBE_UNUSED context = args->internal_context; - const Geometry* const geometry = args->geometry; - if (geometry->intersectionFilterN) { - assert(context->scene->hasGeometryFilterFunction()); - geometry->intersectionFilterN(filter_args); - } + if (args->geometry->intersectionFilterN) + args->geometry->intersectionFilterN(filter_args); - //if (args->valid[0] == 0) - // return; - - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(filter_args); - } + if (args->context->filter) + args->context->filter(filter_args); #endif } @@ -105,20 +96,11 @@ namespace embree __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args) { #if defined(EMBREE_FILTER_FUNCTION) - IntersectContext* MAYBE_UNUSED context = args->internal_context; - const Geometry* const geometry = args->geometry; - if (geometry->occlusionFilterN) { - assert(context->scene->hasGeometryFilterFunction()); - geometry->occlusionFilterN(filter_args); - } - - //if (args->valid[0] == 0) - // return false; + if (args->geometry->occlusionFilterN) + args->geometry->occlusionFilterN(filter_args); - if (context->user->filter) { - assert(context->scene->hasContextFilterFunction()); - context->user->filter(filter_args); - } + if (args->context->filter) + 
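/* The filter.h simplification here mirrors the accelset.h change earlier in
   this diff: the callbacks are now fetched straight from args->geometry and
   args->context rather than threaded through the removed internal_context and
   report members, and the scene->hasGeometryFilterFunction() and
   hasContextFilterFunction() assertions go away with them. */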
args->context->filter(filter_args); #endif } diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h index 11ceb2f7fe..e4ad01852f 100644 --- a/thirdparty/embree/kernels/geometry/object_intersector.h +++ b/thirdparty/embree/kernels/geometry/object_intersector.h @@ -32,7 +32,7 @@ namespace embree return; #endif - accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1); + accel->intersect(ray,prim.geomID(),prim.primID(),context); } static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim) @@ -44,7 +44,7 @@ namespace embree return false; #endif - accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + accel->occluded(ray,prim.geomID(),prim.primID(),context); return ray.tfar < 0.0f; } @@ -89,7 +89,7 @@ namespace embree valid &= (ray.mask & accel->mask) != 0; if (none(valid)) return; #endif - accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1); + accel->intersect(valid,ray,prim.geomID(),prim.primID(),context); } static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim) @@ -102,7 +102,7 @@ namespace embree valid &= (ray.mask & accel->mask) != 0; if (none(valid)) return false; #endif - accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1); + accel->occluded(valid,ray,prim.geomID(),prim.primID(),context); return ray.tfar < 0.0f; } diff --git a/thirdparty/embree/kernels/geometry/quadv.h b/thirdparty/embree/kernels/geometry/quadv.h index 2137356ff2..514e519b0c 100644 --- a/thirdparty/embree/kernels/geometry/quadv.h +++ b/thirdparty/embree/kernels/geometry/quadv.h @@ -152,7 +152,7 @@ namespace embree Vec3vf<M> v0; // 1st vertex of the quads Vec3vf<M> v1; // 2nd vertex of the quads Vec3vf<M> v2; // 3rd vertex of the quads - Vec3vf<M> v3; // 4rd vertex of the quads + Vec3vf<M> v3; // 4th vertex of the quads private: vuint<M> geomIDs; // geometry ID vuint<M> primIDs; // primitive ID diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h index 0e9393442b..764ff93fec 100644 --- a/thirdparty/embree/kernels/geometry/roundline_intersector.h +++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h @@ -19,7 +19,7 @@ For multiple connected round linear curve segments this construction yield a proper shape when viewed from the outside. 
Using the - following CSG we can also handle the interiour in most common cases: + following CSG we can also handle the interior in most common cases: round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) = cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr) @@ -431,7 +431,7 @@ namespace embree Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP Inserting the definition of w0 and dw and refactoring - yield a furhter scaled Ng'': + yield a further scaled Ng'': Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h index ad5fee2e4e..e241073812 100644 --- a/thirdparty/embree/kernels/geometry/subgrid_intersector.h +++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h @@ -264,8 +264,8 @@ namespace embree const Vec3vf<K> p2 = vtx[i*4+2]; const Vec3vf<K> p3 = vtx[i*4+3]; STAT3(shadow.trav_prims,1,popcnt(valid0),K); - if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - break; + pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)); + if (none(valid0)) break; } return !valid0; } @@ -408,10 +408,8 @@ namespace embree const Vec3vf<K> p2 = vtx[i*4+2]; const Vec3vf<K> p3 = vtx[i*4+3]; STAT3(shadow.trav_prims,1,popcnt(valid0),K); - //if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - if (pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i))) - - break; + pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)); + if (none(valid0)) break; } return !valid0; } diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h index 470e15f03e..39d50e2354 100644 --- a/thirdparty/embree/kernels/hash.h +++ b/thirdparty/embree/kernels/hash.h @@ -1,5 +1,4 @@ - -// Copyright 2009-2020 Intel Corporation +// Copyright 2009-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd" +#define RTC_HASH "698442324ccddd11725fb8875275dc1384f7fb40" diff --git a/thirdparty/embree/kernels/subdiv/bezier_patch.h b/thirdparty/embree/kernels/subdiv/bezier_patch.h index 2ff03902a7..0a2aef321f 100644 --- a/thirdparty/embree/kernels/subdiv/bezier_patch.h +++ b/thirdparty/embree/kernels/subdiv/bezier_patch.h @@ -94,7 +94,7 @@ namespace embree matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1); matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); - /* compute buttom edge control points */ + /* compute bottom edge control points */ matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1); matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2); diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h index e5ad5dadfe..eab91d9ee6 100644 --- a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h +++ b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h @@ -388,7 +388,7 @@ namespace embree return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n); } - /* gets limit tangent in the direction of egde vtx -> ring[0] */ + /* gets limit tangent in the direction of edge vtx -> ring[0] */ __forceinline Vertex getLimitTangent() const { if 
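/* A note ahead of the catmullrom_curve.h hunk below: a Catmull-Rom segment
   interpolates its two inner control points, C(0) = v1 and C(1) = v2, so
   begin() and end() can return v1 and v2 directly. The replaced expressions,
   madd(1.0f/6.0f, v0, madd(2.0f/3.0f, v1, (1.0f/6.0f)*v2)) and its shifted
   twin, are the uniform B-spline endpoint blends (v0 + 4*v1 + v2)/6, which is
   not the point a Catmull-Rom segment actually starts or ends at. */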
(unlikely(std::isinf(vertex_crease_weight))) @@ -429,7 +429,7 @@ namespace embree return sigma * (alpha + beta); } - /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */ __forceinline Vertex getSecondLimitTangent() const { if (unlikely(std::isinf(vertex_crease_weight))) @@ -763,7 +763,7 @@ namespace embree } - /* gets limit tangent in the direction of egde vtx -> ring[0] */ + /* gets limit tangent in the direction of edge vtx -> ring[0] */ __forceinline Vertex getLimitTangent() const { CatmullClark1Ring cc_vtx; @@ -779,7 +779,7 @@ namespace embree return 2.0f * cc_vtx.getLimitTangent(); } - /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */ + /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */ __forceinline Vertex getSecondLimitTangent() const { CatmullClark1Ring cc_vtx; diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h index 74fc4c1230..9532287d98 100644 --- a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h +++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h @@ -8,7 +8,7 @@ /* - Implements Catmul Rom curves with control points p0, p1, p2, p3. At + Implements Catmull-Rom curves with control points p0, p1, p2, p3. At t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1 the curve goes through p2 with tangent (p3-p2)/2. @@ -91,11 +91,11 @@ namespace embree : v0(v0), v1(v1), v2(v2), v3(v3) {} __forceinline Vertex begin() const { - return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2)); + return v1; } __forceinline Vertex end() const { - return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3)); + return v2; } __forceinline Vertex center() const { diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h index f8e8a25f35..dcdb101d7c 100644 --- a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h +++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h @@ -81,29 +81,29 @@ namespace embree { SourceCurve<Vec3ff> vcurve = center; SourceCurve<Vec3fa> ncurve = normal; - + /* here we construct a patch which follows the curve l(t) = * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */ const Vec3ff p0 = vcurve.eval(0.0f); const Vec3ff dp0 = vcurve.eval_du(0.0f); - const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); + //const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); // ddp0 is assumed to be 0 const Vec3fa n0 = ncurve.eval(0.0f); const Vec3fa dn0 = ncurve.eval_du(0.0f); const Vec3ff p1 = vcurve.eval(1.0f); const Vec3ff dp1 = vcurve.eval_du(1.0f); - const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); + //const Vec3ff ddp1 = vcurve.eval_dudu(1.0f); // ddp1 is assumed to be 0 const Vec3fa n1 = ncurve.eval(1.0f); const Vec3fa dn1 = ncurve.eval_du(1.0f); const Vec3fa bt0 = cross(n0,dp0); - const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0); + const Vec3fa dbt0 = cross(dn0,dp0);// + cross(n0,ddp0); const Vec3fa bt1 = cross(n1,dp1); - const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1); + const Vec3fa dbt1 = cross(dn1,dp1);// + cross(n1,ddp1); const Vec3fa k0 = normalize(bt0); const Vec3fa dk0 = dnormalize(bt0,dbt0); diff --git a/thirdparty/embree/patches/godot-changes-android.patch b/thirdparty/embree/patches/godot-changes-android.patch deleted file mode 100644 index a27f924bde..0000000000 --- a/thirdparty/embree/patches/godot-changes-android.patch +++ /dev/null @@ -1,103 +0,0 @@ -diff --git 
a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp -index ba97dc227b..1679599608 100644 ---- a/thirdparty/embree/common/sys/sysinfo.cpp -+++ b/thirdparty/embree/common/sys/sysinfo.cpp -@@ -618,7 +618,10 @@ namespace embree - static int nThreads = -1; - if (nThreads != -1) return nThreads; - --#if defined(__MACOSX__) -+// -- GODOT start -- -+// #if defined(__MACOSX__) -+#if defined(__MACOSX__) || defined(__ANDROID__) -+// -- GODOT end -- - nThreads = sysconf(_SC_NPROCESSORS_ONLN); // does not work in Linux LXC container - assert(nThreads); - #else -diff --git a/thirdparty/embree/common/sys/thread.cpp b/thirdparty/embree/common/sys/thread.cpp -index a7827e18f7..f4014be89b 100644 ---- a/thirdparty/embree/common/sys/thread.cpp -+++ b/thirdparty/embree/common/sys/thread.cpp -@@ -158,7 +158,9 @@ namespace embree - /// Linux Platform - //////////////////////////////////////////////////////////////////////////////// - --#if defined(__LINUX__) -+// -- GODOT start -- -+#if defined(__LINUX__) && !defined(__ANDROID__) -+// -- GODOT end -- - - #include <fstream> - #include <sstream> -@@ -247,6 +249,28 @@ namespace embree - } - #endif - -+// -- GODOT start -- -+//////////////////////////////////////////////////////////////////////////////// -+/// Android Platform -+//////////////////////////////////////////////////////////////////////////////// -+ -+#if defined(__ANDROID__) -+ -+namespace embree -+{ -+ /*! set affinity of the calling thread */ -+ void setAffinity(ssize_t affinity) -+ { -+ cpu_set_t cset; -+ CPU_ZERO(&cset); -+ CPU_SET(affinity, &cset); -+ -+ sched_setaffinity(0, sizeof(cset), &cset); -+ } -+} -+#endif -+// -- GODOT end -- -+ - //////////////////////////////////////////////////////////////////////////////// - /// FreeBSD Platform - //////////////////////////////////////////////////////////////////////////////// -@@ -355,7 +379,9 @@ namespace embree - pthread_attr_destroy(&attr); - - /* set affinity */ --#if defined(__LINUX__) -+// -- GODOT start -- -+#if defined(__LINUX__) && !defined(__ANDROID__) -+// -- GODOT end -- - if (threadID >= 0) { - cpu_set_t cset; - CPU_ZERO(&cset); -@@ -370,7 +396,16 @@ namespace embree - CPU_SET(threadID, &cset); - pthread_setaffinity_np(*tid, sizeof(cset), &cset); - } -+// -- GODOT start -- -+#elif defined(__ANDROID__) -+ if (threadID >= 0) { -+ cpu_set_t cset; -+ CPU_ZERO(&cset); -+ CPU_SET(threadID, &cset); -+ sched_setaffinity(pthread_gettid_np(*tid), sizeof(cset), &cset); -+ } - #endif -+// -- GODOT end -- - - return thread_t(tid); - } -@@ -389,8 +424,14 @@ namespace embree - - /*! destroy a hardware thread by its handle */ - void destroyThread(thread_t tid) { -+// -- GODOT start -- -+#if defined(__ANDROID__) -+ FATAL("Can't destroy threads on Android."); -+#else - pthread_cancel(*(pthread_t*)tid); - delete (pthread_t*)tid; -+#endif -+// -- GODOT end -- - } - - /*! 
creates thread local storage */ diff --git a/thirdparty/embree/patches/godot-changes-misc.patch b/thirdparty/embree/patches/godot-changes-misc.patch deleted file mode 100644 index 8bf0d9fa97..0000000000 --- a/thirdparty/embree/patches/godot-changes-misc.patch +++ /dev/null @@ -1,105 +0,0 @@ -diff --git a/thirdparty/embree/common/sys/intrinsics.h b/thirdparty/embree/common/sys/intrinsics.h -index 79729c87ab..ed8dd7d40a 100644 ---- a/thirdparty/embree/common/sys/intrinsics.h -+++ b/thirdparty/embree/common/sys/intrinsics.h -@@ -34,8 +34,14 @@ - #endif - - #if defined(__WIN32__) --# define NOMINMAX --# include <windows.h> -+// -- GODOT start -- -+#if !defined(NOMINMAX) -+// -- GODOT end -- -+#define NOMINMAX -+// -- GODOT start -- -+#endif -+#include "windows.h" -+// -- GODOT end -- - #endif - - /* normally defined in pmmintrin.h, but we always need this */ -diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h -index 3fc5e99b8d..697e07bb86 100644 ---- a/thirdparty/embree/common/sys/platform.h -+++ b/thirdparty/embree/common/sys/platform.h -@@ -99,7 +99,9 @@ - #define dll_import - #endif - --#ifdef __WIN32__ -+// -- GODOT start -- -+#if defined(__WIN32__) && !defined(__MINGW32__) -+// -- GODOT end -- - #if !defined(__noinline) - #define __noinline __declspec(noinline) - #endif -@@ -149,6 +151,9 @@ - #define DELETED = delete - #endif - -+// -- GODOT start -- -+#if !defined(likely) -+// -- GODOT end -- - #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) - #define likely(expr) (expr) - #define unlikely(expr) (expr) -@@ -156,6 +161,9 @@ - #define likely(expr) __builtin_expect((bool)(expr),true ) - #define unlikely(expr) __builtin_expect((bool)(expr),false) - #endif -+// -- GODOT start -- -+#endif -+// -- GODOT end -- - - //////////////////////////////////////////////////////////////////////////////// - /// Error handling and debugging -diff --git a/thirdparty/embree/common/sys/sysinfo.cpp b/thirdparty/embree/common/sys/sysinfo.cpp -index ba97dc227b..f1a59e511e 100644 ---- a/thirdparty/embree/common/sys/sysinfo.cpp -+++ b/thirdparty/embree/common/sys/sysinfo.cpp -@@ -248,7 +248,9 @@ namespace embree - #if defined(__X86_ASM__) - __noinline int64_t get_xcr0() - { --#if defined (__WIN32__) -+// -- GODOT start -- -+#if defined (__WIN32__) && !defined (__MINGW32__) -+// -- GODOT end -- - int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32 - xcr0 = _xgetbv(0); - return xcr0; -diff --git a/thirdparty/embree/include/embree3/rtcore_common.h b/thirdparty/embree/include/embree3/rtcore_common.h -index 9c14b28745..4857e1e05e 100644 ---- a/thirdparty/embree/include/embree3/rtcore_common.h -+++ b/thirdparty/embree/include/embree3/rtcore_common.h -@@ -19,7 +19,9 @@ typedef int ssize_t; - #endif - #endif - --#ifdef _WIN32 -+// -- GODOT start -- -+#if defined(_WIN32) && defined(_MSC_VER) -+// -- GODOT end -- - # define RTC_ALIGN(...) __declspec(align(__VA_ARGS__)) - #else - # define RTC_ALIGN(...) 
__attribute__((aligned(__VA_ARGS__))) -diff --git a/thirdparty/embree/common/tasking/taskschedulertbb.h b/thirdparty/embree/common/tasking/taskschedulertbb.h -index 3fd15816e9..35bd49849f 100644 ---- a/thirdparty/embree/common/tasking/taskschedulertbb.h -+++ b/thirdparty/embree/common/tasking/taskschedulertbb.h -@@ -12,7 +12,13 @@ - #include "../sys/ref.h" - - #if defined(__WIN32__) -+// -- GODOT start -- -+#if !defined(NOMINMAX) -+// -- GODOT end -- - # define NOMINMAX -+// -- GODOT start -- -+#endif -+// -- GODOT end -- - #endif - - // We need to define these to avoid implicit linkage against -
\ No newline at end of file diff --git a/thirdparty/embree/patches/godot-changes-noexcept.patch b/thirdparty/embree/patches/godot-changes-noexcept.patch index 598a7f2ddc..84169c36e4 100644 --- a/thirdparty/embree/patches/godot-changes-noexcept.patch +++ b/thirdparty/embree/patches/godot-changes-noexcept.patch @@ -1,5 +1,5 @@ diff --git a/thirdparty/embree/common/algorithms/parallel_for.h b/thirdparty/embree/common/algorithms/parallel_for.h -index f052d8b468..645681ac63 100644 +index f2969a88f1..6d411e4852 100644 --- a/thirdparty/embree/common/algorithms/parallel_for.h +++ b/thirdparty/embree/common/algorithms/parallel_for.h @@ -21,7 +21,10 @@ namespace embree @@ -12,9 +12,9 @@ index f052d8b468..645681ac63 100644 + abort(); + // -- GODOT end -- } - #elif defined(TASKING_TBB) -@@ -31,13 +34,19 @@ namespace embree + #if TBB_INTERFACE_VERSION >= 12002 +@@ -30,13 +33,19 @@ namespace embree func(i); },context); if (context.is_group_execution_cancelled()) @@ -36,7 +36,7 @@ index f052d8b468..645681ac63 100644 #endif #elif defined(TASKING_PPL) -@@ -57,7 +66,10 @@ namespace embree +@@ -56,7 +65,10 @@ namespace embree #if defined(TASKING_INTERNAL) TaskScheduler::spawn(first,last,minStepSize,func); if (!TaskScheduler::wait()) @@ -48,7 +48,7 @@ index f052d8b468..645681ac63 100644 #elif defined(TASKING_TBB) #if TBB_INTERFACE_VERSION >= 12002 -@@ -66,13 +78,19 @@ namespace embree +@@ -65,13 +77,19 @@ namespace embree func(range<Index>(r.begin(),r.end())); },context); if (context.is_group_execution_cancelled()) @@ -70,7 +70,7 @@ index f052d8b468..645681ac63 100644 #endif #elif defined(TASKING_PPL) -@@ -104,13 +122,19 @@ namespace embree +@@ -103,13 +121,19 @@ namespace embree func(i); },tbb::simple_partitioner(),context); if (context.is_group_execution_cancelled()) @@ -92,7 +92,7 @@ index f052d8b468..645681ac63 100644 #endif } -@@ -125,13 +149,19 @@ namespace embree +@@ -124,13 +148,19 @@ namespace embree func(i); },ap,context); if (context.is_group_execution_cancelled()) @@ -115,7 +115,7 @@ index f052d8b468..645681ac63 100644 } diff --git a/thirdparty/embree/common/algorithms/parallel_reduce.h b/thirdparty/embree/common/algorithms/parallel_reduce.h -index f42ae2ec50..8271372ea4 100644 +index 1a94aad8c4..cd0078f2e6 100644 --- a/thirdparty/embree/common/algorithms/parallel_reduce.h +++ b/thirdparty/embree/common/algorithms/parallel_reduce.h @@ -58,15 +58,19 @@ namespace embree @@ -247,10 +247,10 @@ index 1bc30fe9a5..abdd269069 100644 /* hint for transparent huge pages (THP) */ diff --git a/thirdparty/embree/common/sys/platform.h b/thirdparty/embree/common/sys/platform.h -index 8a6d9fa0a9..697e07bb86 100644 +index be3ec36436..728bf6ed7d 100644 --- a/thirdparty/embree/common/sys/platform.h +++ b/thirdparty/embree/common/sys/platform.h -@@ -179,11 +179,19 @@ +@@ -178,11 +178,19 @@ #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl #if defined(DEBUG) // only report file and line in debug mode @@ -351,7 +351,7 @@ index dca835a716..ad438588a3 100644 bool TaskScheduler::steal_from_other_threads(Thread& thread) diff --git a/thirdparty/embree/common/tasking/taskschedulerinternal.h b/thirdparty/embree/common/tasking/taskschedulerinternal.h -index c766a0bb6a..8fa6bb12fa 100644 +index 61a0e57c5b..6cc2495195 100644 --- a/thirdparty/embree/common/tasking/taskschedulerinternal.h +++ b/thirdparty/embree/common/tasking/taskschedulerinternal.h @@ -123,7 +123,10 @@ namespace embree @@ 
-391,7 +391,7 @@ index c766a0bb6a..8fa6bb12fa 100644 /*! steals a task from a different thread */ bool steal_from_other_threads(Thread& thread); diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp -index d8da78eed7..d857ff7d95 100644 +index 40f9043736..57f75bfd7e 100644 --- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp +++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp @@ -150,7 +150,10 @@ namespace embree @@ -407,10 +407,10 @@ index d8da78eed7..d857ff7d95 100644 return s; } diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp -index 74e9fb335c..94b3819e42 100644 +index 95a94319ec..a6ea55bfc4 100644 --- a/thirdparty/embree/kernels/common/rtcore.cpp +++ b/thirdparty/embree/kernels/common/rtcore.cpp -@@ -197,7 +197,10 @@ RTC_NAMESPACE_BEGIN; +@@ -198,7 +198,10 @@ RTC_NAMESPACE_BEGIN; if (quality != RTC_BUILD_QUALITY_LOW && quality != RTC_BUILD_QUALITY_MEDIUM && quality != RTC_BUILD_QUALITY_HIGH) @@ -422,7 +422,7 @@ index 74e9fb335c..94b3819e42 100644 scene->setBuildQuality(quality); RTC_CATCH_END2(scene); } -@@ -1350,7 +1353,10 @@ RTC_NAMESPACE_BEGIN; +@@ -1351,7 +1354,10 @@ RTC_NAMESPACE_BEGIN; quality != RTC_BUILD_QUALITY_MEDIUM && quality != RTC_BUILD_QUALITY_HIGH && quality != RTC_BUILD_QUALITY_REFIT) @@ -435,172 +435,67 @@ index 74e9fb335c..94b3819e42 100644 RTC_CATCH_END2(geometry); } diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h -index 4e4b24e9c2..373e49a689 100644 +index 4e4b24e9c2..ac58a84d6f 100644 --- a/thirdparty/embree/kernels/common/rtcore.h +++ b/thirdparty/embree/kernels/common/rtcore.h -@@ -25,52 +25,58 @@ namespace embree +@@ -25,6 +25,13 @@ namespace embree #endif /*! Macros used in the rtcore API implementation */ --#define RTC_CATCH_BEGIN try { +// -- GODOT start -- -+// #define RTC_CATCH_BEGIN try { +#define RTC_CATCH_BEGIN - --#define RTC_CATCH_END(device) \ -- } catch (std::bad_alloc&) { \ -- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -- } catch (rtcore_error& e) { \ -- Device::process_error(device,e.error,e.what()); \ -- } catch (std::exception& e) { \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -- } catch (...) { \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -- } -+// #define RTC_CATCH_END(device) \ -+// } catch (std::bad_alloc&) { \ -+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -+// } catch (rtcore_error& e) { \ -+// Device::process_error(device,e.error,e.what()); \ -+// } catch (std::exception& e) { \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -+// } catch (...) { \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -+// } +#define RTC_CATCH_END(device) - --#define RTC_CATCH_END2(scene) \ -- } catch (std::bad_alloc&) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -- } catch (rtcore_error& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,e.error,e.what()); \ -- } catch (std::exception& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -- } catch (...) { \ -- Device* device = scene ? 
scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -- } -+// #define RTC_CATCH_END2(scene) \ -+// } catch (std::bad_alloc&) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -+// } catch (rtcore_error& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,e.error,e.what()); \ -+// } catch (std::exception& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -+// } catch (...) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -+// } +#define RTC_CATCH_END2(scene) - --#define RTC_CATCH_END2_FALSE(scene) \ -- } catch (std::bad_alloc&) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -- return false; \ -- } catch (rtcore_error& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,e.error,e.what()); \ -- return false; \ -- } catch (std::exception& e) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -- return false; \ -- } catch (...) { \ -- Device* device = scene ? scene->device : nullptr; \ -- Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -- return false; \ -- } -+// #define RTC_CATCH_END2_FALSE(scene) \ -+// } catch (std::bad_alloc&) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory"); \ -+// return false; \ -+// } catch (rtcore_error& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,e.error,e.what()); \ -+// return false; \ -+// } catch (std::exception& e) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,e.what()); \ -+// return false; \ -+// } catch (...) { \ -+// Device* device = scene ? scene->device : nullptr; \ -+// Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ -+// return false; \ -+// } +#define RTC_CATCH_END2_FALSE(scene) return false; ++ ++#if 0 + #define RTC_CATCH_BEGIN try { + + #define RTC_CATCH_END(device) \ +@@ -71,6 +78,8 @@ namespace embree + Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \ + return false; \ + } ++#endif +// -- GODOT end -- #define RTC_VERIFY_HANDLE(handle) \ if (handle == nullptr) { \ -@@ -97,28 +103,38 @@ namespace embree +@@ -97,6 +106,8 @@ namespace embree #define RTC_TRACE(x) #endif -- /*! used to throw embree API errors */ -- struct rtcore_error : public std::exception -- { -- __forceinline rtcore_error(RTCError error, const std::string& str) -- : error(error), str(str) {} -- -- ~rtcore_error() throw() {} -- -- const char* what () const throw () { -- return str.c_str(); -- } -- -- RTCError error; -- std::string str; -- }; -+// -- GODOT begin -- -+// /*! 
used to throw embree API errors */ -+// struct rtcore_error : public std::exception -+// { -+// __forceinline rtcore_error(RTCError error, const std::string& str) -+// : error(error), str(str) {} -+// -+// ~rtcore_error() throw() {} -+// -+// const char* what () const throw () { -+// return str.c_str(); -+// } -+// -+// RTCError error; -+// std::string str; -+// }; -+// -- GODOT end -- ++// -- GODOT start -- ++#if 0 + /*! used to throw embree API errors */ + struct rtcore_error : public std::exception + { +@@ -112,14 +123,18 @@ namespace embree + RTCError error; + std::string str; + }; ++#endif #if defined(DEBUG) // only report file and line in debug mode -+ // -- GODOT begin -- -+ // #define throw_RTCError(error,str) \ -+ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #define throw_RTCError(error,str) \ - throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); + printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort(); -+ // -- GODOT end -- ++ // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)); #else -+ // -- GODOT begin -- -+ // #define throw_RTCError(error,str) \ -+ // throw rtcore_error(error,str); #define throw_RTCError(error,str) \ - throw rtcore_error(error,str); + abort(); -+ // -- GODOT end -- ++ // throw rtcore_error(error,str); #endif ++// -- GODOT end -- #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \ + (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp -index 0149055f2c..408d7eae6f 100644 +index ad1916c54e..65d31d0f81 100644 --- a/thirdparty/embree/kernels/common/scene.cpp +++ b/thirdparty/embree/kernels/common/scene.cpp -@@ -792,16 +792,18 @@ namespace embree +@@ -790,16 +790,18 @@ namespace embree } /* initiate build */ diff --git a/thirdparty/embree/patches/godot-changes-ubsan.patch b/thirdparty/embree/patches/godot-changes-ubsan.patch deleted file mode 100644 index 1336246f0d..0000000000 --- a/thirdparty/embree/patches/godot-changes-ubsan.patch +++ /dev/null @@ -1,24 +0,0 @@ -diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp -index bb4fc81dfe..d279dc4993 100644 ---- a/thirdparty/embree/kernels/builders/primrefgen.cpp -+++ b/thirdparty/embree/kernels/builders/primrefgen.cpp -@@ -184,6 +184,9 @@ namespace embree - - // special variants for grid meshes - -+// -- GODOT start -- -+#if defined(EMBREE_GEOMETRY_GRID) -+// -- GODOT end -- - PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids) - { - PrimInfo pinfo(empty); -@@ -293,6 +296,9 @@ namespace embree - - return pinfo; - } -+// -- GODOT start -- -+#endif -+// -- GODOT end -- - - // ==================================================================================================== - // ==================================================================================================== diff --git a/thirdparty/glad/LICENSE b/thirdparty/glad/LICENSE index b6e2ca25b0..4965a6bffc 100644 --- a/thirdparty/glad/LICENSE +++ b/thirdparty/glad/LICENSE @@ -1,20 +1,63 @@ -The MIT License (MIT) - -Copyright (c) 2013-2018 David Herberth - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software 
without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +The glad source code: + + The MIT License (MIT) + + Copyright (c) 2013-2022 David Herberth + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +The Khronos Specifications: + + Copyright (c) 2013-2020 The Khronos Group Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +The EGL Specification and various headers: + + Copyright (c) 2007-2016 The Khronos Group Inc. + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and/or associated documentation files (the + "Materials"), to deal in the Materials without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Materials, and to + permit persons to whom the Materials are furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Materials. + + THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. diff --git a/thirdparty/glad/gl.c b/thirdparty/glad/gl.c new file mode 100644 index 0000000000..6be716284c --- /dev/null +++ b/thirdparty/glad/gl.c @@ -0,0 +1,1995 @@ +/** + * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0 + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <glad/gl.h> + +#ifndef GLAD_IMPL_UTIL_C_ +#define GLAD_IMPL_UTIL_C_ + +#ifdef _MSC_VER +#define GLAD_IMPL_UTIL_SSCANF sscanf_s +#else +#define GLAD_IMPL_UTIL_SSCANF sscanf +#endif + +#endif /* GLAD_IMPL_UTIL_C_ */ + +#ifdef __cplusplus +extern "C" { +#endif + + + +int GLAD_GL_VERSION_1_0 = 0; +int GLAD_GL_VERSION_1_1 = 0; +int GLAD_GL_VERSION_1_2 = 0; +int GLAD_GL_VERSION_1_3 = 0; +int GLAD_GL_VERSION_1_4 = 0; +int GLAD_GL_VERSION_1_5 = 0; +int GLAD_GL_VERSION_2_0 = 0; +int GLAD_GL_VERSION_2_1 = 0; +int GLAD_GL_VERSION_3_0 = 0; +int GLAD_GL_VERSION_3_1 = 0; +int GLAD_GL_VERSION_3_2 = 0; +int GLAD_GL_VERSION_3_3 = 0; +int GLAD_GL_ARB_debug_output = 0; +int GLAD_GL_ARB_framebuffer_object = 0; +int GLAD_GL_EXT_framebuffer_blit = 0; +int GLAD_GL_EXT_framebuffer_multisample = 0; +int GLAD_GL_EXT_framebuffer_object = 0; +int GLAD_GL_OVR_multiview = 0; +int GLAD_GL_OVR_multiview2 = 0; + + + +PFNGLACCUMPROC glad_glAccum = NULL; +PFNGLACTIVETEXTUREPROC glad_glActiveTexture = NULL; +PFNGLALPHAFUNCPROC glad_glAlphaFunc = NULL; +PFNGLARETEXTURESRESIDENTPROC glad_glAreTexturesResident = NULL; +PFNGLARRAYELEMENTPROC glad_glArrayElement = NULL; +PFNGLATTACHSHADERPROC glad_glAttachShader = NULL; +PFNGLBEGINPROC glad_glBegin = NULL; +PFNGLBEGINCONDITIONALRENDERPROC glad_glBeginConditionalRender = NULL; +PFNGLBEGINQUERYPROC glad_glBeginQuery = NULL; +PFNGLBEGINTRANSFORMFEEDBACKPROC glad_glBeginTransformFeedback = NULL; +PFNGLBINDATTRIBLOCATIONPROC glad_glBindAttribLocation = NULL; +PFNGLBINDBUFFERPROC glad_glBindBuffer = NULL; +PFNGLBINDBUFFERBASEPROC glad_glBindBufferBase = NULL; +PFNGLBINDBUFFERRANGEPROC glad_glBindBufferRange = NULL; +PFNGLBINDFRAGDATALOCATIONPROC glad_glBindFragDataLocation = NULL; +PFNGLBINDFRAGDATALOCATIONINDEXEDPROC glad_glBindFragDataLocationIndexed = NULL; +PFNGLBINDFRAMEBUFFERPROC glad_glBindFramebuffer = NULL; +PFNGLBINDFRAMEBUFFEREXTPROC glad_glBindFramebufferEXT = NULL; +PFNGLBINDRENDERBUFFERPROC glad_glBindRenderbuffer = NULL; +PFNGLBINDRENDERBUFFEREXTPROC glad_glBindRenderbufferEXT = NULL; +PFNGLBINDSAMPLERPROC glad_glBindSampler = NULL; +PFNGLBINDTEXTUREPROC glad_glBindTexture = NULL; +PFNGLBINDVERTEXARRAYPROC glad_glBindVertexArray = NULL; +PFNGLBITMAPPROC glad_glBitmap = NULL; +PFNGLBLENDCOLORPROC glad_glBlendColor = NULL; +PFNGLBLENDEQUATIONPROC glad_glBlendEquation = NULL; +PFNGLBLENDEQUATIONSEPARATEPROC glad_glBlendEquationSeparate = NULL; +PFNGLBLENDFUNCPROC glad_glBlendFunc = NULL; +PFNGLBLENDFUNCSEPARATEPROC glad_glBlendFuncSeparate = NULL; +PFNGLBLITFRAMEBUFFERPROC glad_glBlitFramebuffer = NULL; +PFNGLBLITFRAMEBUFFEREXTPROC glad_glBlitFramebufferEXT = NULL; +PFNGLBUFFERDATAPROC glad_glBufferData = NULL; +PFNGLBUFFERSUBDATAPROC glad_glBufferSubData = NULL; +PFNGLCALLLISTPROC glad_glCallList = NULL; +PFNGLCALLLISTSPROC glad_glCallLists = NULL; +PFNGLCHECKFRAMEBUFFERSTATUSPROC glad_glCheckFramebufferStatus = NULL; +PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC glad_glCheckFramebufferStatusEXT = NULL; 
+PFNGLCLAMPCOLORPROC glad_glClampColor = NULL; +PFNGLCLEARPROC glad_glClear = NULL; +PFNGLCLEARACCUMPROC glad_glClearAccum = NULL; +PFNGLCLEARBUFFERFIPROC glad_glClearBufferfi = NULL; +PFNGLCLEARBUFFERFVPROC glad_glClearBufferfv = NULL; +PFNGLCLEARBUFFERIVPROC glad_glClearBufferiv = NULL; +PFNGLCLEARBUFFERUIVPROC glad_glClearBufferuiv = NULL; +PFNGLCLEARCOLORPROC glad_glClearColor = NULL; +PFNGLCLEARDEPTHPROC glad_glClearDepth = NULL; +PFNGLCLEARINDEXPROC glad_glClearIndex = NULL; +PFNGLCLEARSTENCILPROC glad_glClearStencil = NULL; +PFNGLCLIENTACTIVETEXTUREPROC glad_glClientActiveTexture = NULL; +PFNGLCLIENTWAITSYNCPROC glad_glClientWaitSync = NULL; +PFNGLCLIPPLANEPROC glad_glClipPlane = NULL; +PFNGLCOLOR3BPROC glad_glColor3b = NULL; +PFNGLCOLOR3BVPROC glad_glColor3bv = NULL; +PFNGLCOLOR3DPROC glad_glColor3d = NULL; +PFNGLCOLOR3DVPROC glad_glColor3dv = NULL; +PFNGLCOLOR3FPROC glad_glColor3f = NULL; +PFNGLCOLOR3FVPROC glad_glColor3fv = NULL; +PFNGLCOLOR3IPROC glad_glColor3i = NULL; +PFNGLCOLOR3IVPROC glad_glColor3iv = NULL; +PFNGLCOLOR3SPROC glad_glColor3s = NULL; +PFNGLCOLOR3SVPROC glad_glColor3sv = NULL; +PFNGLCOLOR3UBPROC glad_glColor3ub = NULL; +PFNGLCOLOR3UBVPROC glad_glColor3ubv = NULL; +PFNGLCOLOR3UIPROC glad_glColor3ui = NULL; +PFNGLCOLOR3UIVPROC glad_glColor3uiv = NULL; +PFNGLCOLOR3USPROC glad_glColor3us = NULL; +PFNGLCOLOR3USVPROC glad_glColor3usv = NULL; +PFNGLCOLOR4BPROC glad_glColor4b = NULL; +PFNGLCOLOR4BVPROC glad_glColor4bv = NULL; +PFNGLCOLOR4DPROC glad_glColor4d = NULL; +PFNGLCOLOR4DVPROC glad_glColor4dv = NULL; +PFNGLCOLOR4FPROC glad_glColor4f = NULL; +PFNGLCOLOR4FVPROC glad_glColor4fv = NULL; +PFNGLCOLOR4IPROC glad_glColor4i = NULL; +PFNGLCOLOR4IVPROC glad_glColor4iv = NULL; +PFNGLCOLOR4SPROC glad_glColor4s = NULL; +PFNGLCOLOR4SVPROC glad_glColor4sv = NULL; +PFNGLCOLOR4UBPROC glad_glColor4ub = NULL; +PFNGLCOLOR4UBVPROC glad_glColor4ubv = NULL; +PFNGLCOLOR4UIPROC glad_glColor4ui = NULL; +PFNGLCOLOR4UIVPROC glad_glColor4uiv = NULL; +PFNGLCOLOR4USPROC glad_glColor4us = NULL; +PFNGLCOLOR4USVPROC glad_glColor4usv = NULL; +PFNGLCOLORMASKPROC glad_glColorMask = NULL; +PFNGLCOLORMASKIPROC glad_glColorMaski = NULL; +PFNGLCOLORMATERIALPROC glad_glColorMaterial = NULL; +PFNGLCOLORP3UIPROC glad_glColorP3ui = NULL; +PFNGLCOLORP3UIVPROC glad_glColorP3uiv = NULL; +PFNGLCOLORP4UIPROC glad_glColorP4ui = NULL; +PFNGLCOLORP4UIVPROC glad_glColorP4uiv = NULL; +PFNGLCOLORPOINTERPROC glad_glColorPointer = NULL; +PFNGLCOMPILESHADERPROC glad_glCompileShader = NULL; +PFNGLCOMPRESSEDTEXIMAGE1DPROC glad_glCompressedTexImage1D = NULL; +PFNGLCOMPRESSEDTEXIMAGE2DPROC glad_glCompressedTexImage2D = NULL; +PFNGLCOMPRESSEDTEXIMAGE3DPROC glad_glCompressedTexImage3D = NULL; +PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC glad_glCompressedTexSubImage1D = NULL; +PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC glad_glCompressedTexSubImage2D = NULL; +PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC glad_glCompressedTexSubImage3D = NULL; +PFNGLCOPYBUFFERSUBDATAPROC glad_glCopyBufferSubData = NULL; +PFNGLCOPYPIXELSPROC glad_glCopyPixels = NULL; +PFNGLCOPYTEXIMAGE1DPROC glad_glCopyTexImage1D = NULL; +PFNGLCOPYTEXIMAGE2DPROC glad_glCopyTexImage2D = NULL; +PFNGLCOPYTEXSUBIMAGE1DPROC glad_glCopyTexSubImage1D = NULL; +PFNGLCOPYTEXSUBIMAGE2DPROC glad_glCopyTexSubImage2D = NULL; +PFNGLCOPYTEXSUBIMAGE3DPROC glad_glCopyTexSubImage3D = NULL; +PFNGLCREATEPROGRAMPROC glad_glCreateProgram = NULL; +PFNGLCREATESHADERPROC glad_glCreateShader = NULL; +PFNGLCULLFACEPROC glad_glCullFace = NULL; +PFNGLDEBUGMESSAGECALLBACKARBPROC glad_glDebugMessageCallbackARB = NULL; 
+PFNGLDEBUGMESSAGECONTROLARBPROC glad_glDebugMessageControlARB = NULL; +PFNGLDEBUGMESSAGEINSERTARBPROC glad_glDebugMessageInsertARB = NULL; +PFNGLDELETEBUFFERSPROC glad_glDeleteBuffers = NULL; +PFNGLDELETEFRAMEBUFFERSPROC glad_glDeleteFramebuffers = NULL; +PFNGLDELETEFRAMEBUFFERSEXTPROC glad_glDeleteFramebuffersEXT = NULL; +PFNGLDELETELISTSPROC glad_glDeleteLists = NULL; +PFNGLDELETEPROGRAMPROC glad_glDeleteProgram = NULL; +PFNGLDELETEQUERIESPROC glad_glDeleteQueries = NULL; +PFNGLDELETERENDERBUFFERSPROC glad_glDeleteRenderbuffers = NULL; +PFNGLDELETERENDERBUFFERSEXTPROC glad_glDeleteRenderbuffersEXT = NULL; +PFNGLDELETESAMPLERSPROC glad_glDeleteSamplers = NULL; +PFNGLDELETESHADERPROC glad_glDeleteShader = NULL; +PFNGLDELETESYNCPROC glad_glDeleteSync = NULL; +PFNGLDELETETEXTURESPROC glad_glDeleteTextures = NULL; +PFNGLDELETEVERTEXARRAYSPROC glad_glDeleteVertexArrays = NULL; +PFNGLDEPTHFUNCPROC glad_glDepthFunc = NULL; +PFNGLDEPTHMASKPROC glad_glDepthMask = NULL; +PFNGLDEPTHRANGEPROC glad_glDepthRange = NULL; +PFNGLDETACHSHADERPROC glad_glDetachShader = NULL; +PFNGLDISABLEPROC glad_glDisable = NULL; +PFNGLDISABLECLIENTSTATEPROC glad_glDisableClientState = NULL; +PFNGLDISABLEVERTEXATTRIBARRAYPROC glad_glDisableVertexAttribArray = NULL; +PFNGLDISABLEIPROC glad_glDisablei = NULL; +PFNGLDRAWARRAYSPROC glad_glDrawArrays = NULL; +PFNGLDRAWARRAYSINSTANCEDPROC glad_glDrawArraysInstanced = NULL; +PFNGLDRAWBUFFERPROC glad_glDrawBuffer = NULL; +PFNGLDRAWBUFFERSPROC glad_glDrawBuffers = NULL; +PFNGLDRAWELEMENTSPROC glad_glDrawElements = NULL; +PFNGLDRAWELEMENTSBASEVERTEXPROC glad_glDrawElementsBaseVertex = NULL; +PFNGLDRAWELEMENTSINSTANCEDPROC glad_glDrawElementsInstanced = NULL; +PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC glad_glDrawElementsInstancedBaseVertex = NULL; +PFNGLDRAWPIXELSPROC glad_glDrawPixels = NULL; +PFNGLDRAWRANGEELEMENTSPROC glad_glDrawRangeElements = NULL; +PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC glad_glDrawRangeElementsBaseVertex = NULL; +PFNGLEDGEFLAGPROC glad_glEdgeFlag = NULL; +PFNGLEDGEFLAGPOINTERPROC glad_glEdgeFlagPointer = NULL; +PFNGLEDGEFLAGVPROC glad_glEdgeFlagv = NULL; +PFNGLENABLEPROC glad_glEnable = NULL; +PFNGLENABLECLIENTSTATEPROC glad_glEnableClientState = NULL; +PFNGLENABLEVERTEXATTRIBARRAYPROC glad_glEnableVertexAttribArray = NULL; +PFNGLENABLEIPROC glad_glEnablei = NULL; +PFNGLENDPROC glad_glEnd = NULL; +PFNGLENDCONDITIONALRENDERPROC glad_glEndConditionalRender = NULL; +PFNGLENDLISTPROC glad_glEndList = NULL; +PFNGLENDQUERYPROC glad_glEndQuery = NULL; +PFNGLENDTRANSFORMFEEDBACKPROC glad_glEndTransformFeedback = NULL; +PFNGLEVALCOORD1DPROC glad_glEvalCoord1d = NULL; +PFNGLEVALCOORD1DVPROC glad_glEvalCoord1dv = NULL; +PFNGLEVALCOORD1FPROC glad_glEvalCoord1f = NULL; +PFNGLEVALCOORD1FVPROC glad_glEvalCoord1fv = NULL; +PFNGLEVALCOORD2DPROC glad_glEvalCoord2d = NULL; +PFNGLEVALCOORD2DVPROC glad_glEvalCoord2dv = NULL; +PFNGLEVALCOORD2FPROC glad_glEvalCoord2f = NULL; +PFNGLEVALCOORD2FVPROC glad_glEvalCoord2fv = NULL; +PFNGLEVALMESH1PROC glad_glEvalMesh1 = NULL; +PFNGLEVALMESH2PROC glad_glEvalMesh2 = NULL; +PFNGLEVALPOINT1PROC glad_glEvalPoint1 = NULL; +PFNGLEVALPOINT2PROC glad_glEvalPoint2 = NULL; +PFNGLFEEDBACKBUFFERPROC glad_glFeedbackBuffer = NULL; +PFNGLFENCESYNCPROC glad_glFenceSync = NULL; +PFNGLFINISHPROC glad_glFinish = NULL; +PFNGLFLUSHPROC glad_glFlush = NULL; +PFNGLFLUSHMAPPEDBUFFERRANGEPROC glad_glFlushMappedBufferRange = NULL; +PFNGLFOGCOORDPOINTERPROC glad_glFogCoordPointer = NULL; +PFNGLFOGCOORDDPROC glad_glFogCoordd = NULL; +PFNGLFOGCOORDDVPROC 
glad_glFogCoorddv = NULL; +PFNGLFOGCOORDFPROC glad_glFogCoordf = NULL; +PFNGLFOGCOORDFVPROC glad_glFogCoordfv = NULL; +PFNGLFOGFPROC glad_glFogf = NULL; +PFNGLFOGFVPROC glad_glFogfv = NULL; +PFNGLFOGIPROC glad_glFogi = NULL; +PFNGLFOGIVPROC glad_glFogiv = NULL; +PFNGLFRAMEBUFFERRENDERBUFFERPROC glad_glFramebufferRenderbuffer = NULL; +PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC glad_glFramebufferRenderbufferEXT = NULL; +PFNGLFRAMEBUFFERTEXTUREPROC glad_glFramebufferTexture = NULL; +PFNGLFRAMEBUFFERTEXTURE1DPROC glad_glFramebufferTexture1D = NULL; +PFNGLFRAMEBUFFERTEXTURE1DEXTPROC glad_glFramebufferTexture1DEXT = NULL; +PFNGLFRAMEBUFFERTEXTURE2DPROC glad_glFramebufferTexture2D = NULL; +PFNGLFRAMEBUFFERTEXTURE2DEXTPROC glad_glFramebufferTexture2DEXT = NULL; +PFNGLFRAMEBUFFERTEXTURE3DPROC glad_glFramebufferTexture3D = NULL; +PFNGLFRAMEBUFFERTEXTURE3DEXTPROC glad_glFramebufferTexture3DEXT = NULL; +PFNGLFRAMEBUFFERTEXTURELAYERPROC glad_glFramebufferTextureLayer = NULL; +PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC glad_glFramebufferTextureMultiviewOVR = NULL; +PFNGLFRONTFACEPROC glad_glFrontFace = NULL; +PFNGLFRUSTUMPROC glad_glFrustum = NULL; +PFNGLGENBUFFERSPROC glad_glGenBuffers = NULL; +PFNGLGENFRAMEBUFFERSPROC glad_glGenFramebuffers = NULL; +PFNGLGENFRAMEBUFFERSEXTPROC glad_glGenFramebuffersEXT = NULL; +PFNGLGENLISTSPROC glad_glGenLists = NULL; +PFNGLGENQUERIESPROC glad_glGenQueries = NULL; +PFNGLGENRENDERBUFFERSPROC glad_glGenRenderbuffers = NULL; +PFNGLGENRENDERBUFFERSEXTPROC glad_glGenRenderbuffersEXT = NULL; +PFNGLGENSAMPLERSPROC glad_glGenSamplers = NULL; +PFNGLGENTEXTURESPROC glad_glGenTextures = NULL; +PFNGLGENVERTEXARRAYSPROC glad_glGenVertexArrays = NULL; +PFNGLGENERATEMIPMAPPROC glad_glGenerateMipmap = NULL; +PFNGLGENERATEMIPMAPEXTPROC glad_glGenerateMipmapEXT = NULL; +PFNGLGETACTIVEATTRIBPROC glad_glGetActiveAttrib = NULL; +PFNGLGETACTIVEUNIFORMPROC glad_glGetActiveUniform = NULL; +PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC glad_glGetActiveUniformBlockName = NULL; +PFNGLGETACTIVEUNIFORMBLOCKIVPROC glad_glGetActiveUniformBlockiv = NULL; +PFNGLGETACTIVEUNIFORMNAMEPROC glad_glGetActiveUniformName = NULL; +PFNGLGETACTIVEUNIFORMSIVPROC glad_glGetActiveUniformsiv = NULL; +PFNGLGETATTACHEDSHADERSPROC glad_glGetAttachedShaders = NULL; +PFNGLGETATTRIBLOCATIONPROC glad_glGetAttribLocation = NULL; +PFNGLGETBOOLEANI_VPROC glad_glGetBooleani_v = NULL; +PFNGLGETBOOLEANVPROC glad_glGetBooleanv = NULL; +PFNGLGETBUFFERPARAMETERI64VPROC glad_glGetBufferParameteri64v = NULL; +PFNGLGETBUFFERPARAMETERIVPROC glad_glGetBufferParameteriv = NULL; +PFNGLGETBUFFERPOINTERVPROC glad_glGetBufferPointerv = NULL; +PFNGLGETBUFFERSUBDATAPROC glad_glGetBufferSubData = NULL; +PFNGLGETCLIPPLANEPROC glad_glGetClipPlane = NULL; +PFNGLGETCOMPRESSEDTEXIMAGEPROC glad_glGetCompressedTexImage = NULL; +PFNGLGETDEBUGMESSAGELOGARBPROC glad_glGetDebugMessageLogARB = NULL; +PFNGLGETDOUBLEVPROC glad_glGetDoublev = NULL; +PFNGLGETERRORPROC glad_glGetError = NULL; +PFNGLGETFLOATVPROC glad_glGetFloatv = NULL; +PFNGLGETFRAGDATAINDEXPROC glad_glGetFragDataIndex = NULL; +PFNGLGETFRAGDATALOCATIONPROC glad_glGetFragDataLocation = NULL; +PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC glad_glGetFramebufferAttachmentParameteriv = NULL; +PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC glad_glGetFramebufferAttachmentParameterivEXT = NULL; +PFNGLGETINTEGER64I_VPROC glad_glGetInteger64i_v = NULL; +PFNGLGETINTEGER64VPROC glad_glGetInteger64v = NULL; +PFNGLGETINTEGERI_VPROC glad_glGetIntegeri_v = NULL; +PFNGLGETINTEGERVPROC glad_glGetIntegerv = NULL; 
+PFNGLGETLIGHTFVPROC glad_glGetLightfv = NULL; +PFNGLGETLIGHTIVPROC glad_glGetLightiv = NULL; +PFNGLGETMAPDVPROC glad_glGetMapdv = NULL; +PFNGLGETMAPFVPROC glad_glGetMapfv = NULL; +PFNGLGETMAPIVPROC glad_glGetMapiv = NULL; +PFNGLGETMATERIALFVPROC glad_glGetMaterialfv = NULL; +PFNGLGETMATERIALIVPROC glad_glGetMaterialiv = NULL; +PFNGLGETMULTISAMPLEFVPROC glad_glGetMultisamplefv = NULL; +PFNGLGETPIXELMAPFVPROC glad_glGetPixelMapfv = NULL; +PFNGLGETPIXELMAPUIVPROC glad_glGetPixelMapuiv = NULL; +PFNGLGETPIXELMAPUSVPROC glad_glGetPixelMapusv = NULL; +PFNGLGETPOINTERVPROC glad_glGetPointerv = NULL; +PFNGLGETPOLYGONSTIPPLEPROC glad_glGetPolygonStipple = NULL; +PFNGLGETPROGRAMINFOLOGPROC glad_glGetProgramInfoLog = NULL; +PFNGLGETPROGRAMIVPROC glad_glGetProgramiv = NULL; +PFNGLGETQUERYOBJECTI64VPROC glad_glGetQueryObjecti64v = NULL; +PFNGLGETQUERYOBJECTIVPROC glad_glGetQueryObjectiv = NULL; +PFNGLGETQUERYOBJECTUI64VPROC glad_glGetQueryObjectui64v = NULL; +PFNGLGETQUERYOBJECTUIVPROC glad_glGetQueryObjectuiv = NULL; +PFNGLGETQUERYIVPROC glad_glGetQueryiv = NULL; +PFNGLGETRENDERBUFFERPARAMETERIVPROC glad_glGetRenderbufferParameteriv = NULL; +PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC glad_glGetRenderbufferParameterivEXT = NULL; +PFNGLGETSAMPLERPARAMETERIIVPROC glad_glGetSamplerParameterIiv = NULL; +PFNGLGETSAMPLERPARAMETERIUIVPROC glad_glGetSamplerParameterIuiv = NULL; +PFNGLGETSAMPLERPARAMETERFVPROC glad_glGetSamplerParameterfv = NULL; +PFNGLGETSAMPLERPARAMETERIVPROC glad_glGetSamplerParameteriv = NULL; +PFNGLGETSHADERINFOLOGPROC glad_glGetShaderInfoLog = NULL; +PFNGLGETSHADERSOURCEPROC glad_glGetShaderSource = NULL; +PFNGLGETSHADERIVPROC glad_glGetShaderiv = NULL; +PFNGLGETSTRINGPROC glad_glGetString = NULL; +PFNGLGETSTRINGIPROC glad_glGetStringi = NULL; +PFNGLGETSYNCIVPROC glad_glGetSynciv = NULL; +PFNGLGETTEXENVFVPROC glad_glGetTexEnvfv = NULL; +PFNGLGETTEXENVIVPROC glad_glGetTexEnviv = NULL; +PFNGLGETTEXGENDVPROC glad_glGetTexGendv = NULL; +PFNGLGETTEXGENFVPROC glad_glGetTexGenfv = NULL; +PFNGLGETTEXGENIVPROC glad_glGetTexGeniv = NULL; +PFNGLGETTEXIMAGEPROC glad_glGetTexImage = NULL; +PFNGLGETTEXLEVELPARAMETERFVPROC glad_glGetTexLevelParameterfv = NULL; +PFNGLGETTEXLEVELPARAMETERIVPROC glad_glGetTexLevelParameteriv = NULL; +PFNGLGETTEXPARAMETERIIVPROC glad_glGetTexParameterIiv = NULL; +PFNGLGETTEXPARAMETERIUIVPROC glad_glGetTexParameterIuiv = NULL; +PFNGLGETTEXPARAMETERFVPROC glad_glGetTexParameterfv = NULL; +PFNGLGETTEXPARAMETERIVPROC glad_glGetTexParameteriv = NULL; +PFNGLGETTRANSFORMFEEDBACKVARYINGPROC glad_glGetTransformFeedbackVarying = NULL; +PFNGLGETUNIFORMBLOCKINDEXPROC glad_glGetUniformBlockIndex = NULL; +PFNGLGETUNIFORMINDICESPROC glad_glGetUniformIndices = NULL; +PFNGLGETUNIFORMLOCATIONPROC glad_glGetUniformLocation = NULL; +PFNGLGETUNIFORMFVPROC glad_glGetUniformfv = NULL; +PFNGLGETUNIFORMIVPROC glad_glGetUniformiv = NULL; +PFNGLGETUNIFORMUIVPROC glad_glGetUniformuiv = NULL; +PFNGLGETVERTEXATTRIBIIVPROC glad_glGetVertexAttribIiv = NULL; +PFNGLGETVERTEXATTRIBIUIVPROC glad_glGetVertexAttribIuiv = NULL; +PFNGLGETVERTEXATTRIBPOINTERVPROC glad_glGetVertexAttribPointerv = NULL; +PFNGLGETVERTEXATTRIBDVPROC glad_glGetVertexAttribdv = NULL; +PFNGLGETVERTEXATTRIBFVPROC glad_glGetVertexAttribfv = NULL; +PFNGLGETVERTEXATTRIBIVPROC glad_glGetVertexAttribiv = NULL; +PFNGLHINTPROC glad_glHint = NULL; +PFNGLINDEXMASKPROC glad_glIndexMask = NULL; +PFNGLINDEXPOINTERPROC glad_glIndexPointer = NULL; +PFNGLINDEXDPROC glad_glIndexd = NULL; +PFNGLINDEXDVPROC glad_glIndexdv = NULL; +PFNGLINDEXFPROC glad_glIndexf 
= NULL; +PFNGLINDEXFVPROC glad_glIndexfv = NULL; +PFNGLINDEXIPROC glad_glIndexi = NULL; +PFNGLINDEXIVPROC glad_glIndexiv = NULL; +PFNGLINDEXSPROC glad_glIndexs = NULL; +PFNGLINDEXSVPROC glad_glIndexsv = NULL; +PFNGLINDEXUBPROC glad_glIndexub = NULL; +PFNGLINDEXUBVPROC glad_glIndexubv = NULL; +PFNGLINITNAMESPROC glad_glInitNames = NULL; +PFNGLINTERLEAVEDARRAYSPROC glad_glInterleavedArrays = NULL; +PFNGLISBUFFERPROC glad_glIsBuffer = NULL; +PFNGLISENABLEDPROC glad_glIsEnabled = NULL; +PFNGLISENABLEDIPROC glad_glIsEnabledi = NULL; +PFNGLISFRAMEBUFFERPROC glad_glIsFramebuffer = NULL; +PFNGLISFRAMEBUFFEREXTPROC glad_glIsFramebufferEXT = NULL; +PFNGLISLISTPROC glad_glIsList = NULL; +PFNGLISPROGRAMPROC glad_glIsProgram = NULL; +PFNGLISQUERYPROC glad_glIsQuery = NULL; +PFNGLISRENDERBUFFERPROC glad_glIsRenderbuffer = NULL; +PFNGLISRENDERBUFFEREXTPROC glad_glIsRenderbufferEXT = NULL; +PFNGLISSAMPLERPROC glad_glIsSampler = NULL; +PFNGLISSHADERPROC glad_glIsShader = NULL; +PFNGLISSYNCPROC glad_glIsSync = NULL; +PFNGLISTEXTUREPROC glad_glIsTexture = NULL; +PFNGLISVERTEXARRAYPROC glad_glIsVertexArray = NULL; +PFNGLLIGHTMODELFPROC glad_glLightModelf = NULL; +PFNGLLIGHTMODELFVPROC glad_glLightModelfv = NULL; +PFNGLLIGHTMODELIPROC glad_glLightModeli = NULL; +PFNGLLIGHTMODELIVPROC glad_glLightModeliv = NULL; +PFNGLLIGHTFPROC glad_glLightf = NULL; +PFNGLLIGHTFVPROC glad_glLightfv = NULL; +PFNGLLIGHTIPROC glad_glLighti = NULL; +PFNGLLIGHTIVPROC glad_glLightiv = NULL; +PFNGLLINESTIPPLEPROC glad_glLineStipple = NULL; +PFNGLLINEWIDTHPROC glad_glLineWidth = NULL; +PFNGLLINKPROGRAMPROC glad_glLinkProgram = NULL; +PFNGLLISTBASEPROC glad_glListBase = NULL; +PFNGLLOADIDENTITYPROC glad_glLoadIdentity = NULL; +PFNGLLOADMATRIXDPROC glad_glLoadMatrixd = NULL; +PFNGLLOADMATRIXFPROC glad_glLoadMatrixf = NULL; +PFNGLLOADNAMEPROC glad_glLoadName = NULL; +PFNGLLOADTRANSPOSEMATRIXDPROC glad_glLoadTransposeMatrixd = NULL; +PFNGLLOADTRANSPOSEMATRIXFPROC glad_glLoadTransposeMatrixf = NULL; +PFNGLLOGICOPPROC glad_glLogicOp = NULL; +PFNGLMAP1DPROC glad_glMap1d = NULL; +PFNGLMAP1FPROC glad_glMap1f = NULL; +PFNGLMAP2DPROC glad_glMap2d = NULL; +PFNGLMAP2FPROC glad_glMap2f = NULL; +PFNGLMAPBUFFERPROC glad_glMapBuffer = NULL; +PFNGLMAPBUFFERRANGEPROC glad_glMapBufferRange = NULL; +PFNGLMAPGRID1DPROC glad_glMapGrid1d = NULL; +PFNGLMAPGRID1FPROC glad_glMapGrid1f = NULL; +PFNGLMAPGRID2DPROC glad_glMapGrid2d = NULL; +PFNGLMAPGRID2FPROC glad_glMapGrid2f = NULL; +PFNGLMATERIALFPROC glad_glMaterialf = NULL; +PFNGLMATERIALFVPROC glad_glMaterialfv = NULL; +PFNGLMATERIALIPROC glad_glMateriali = NULL; +PFNGLMATERIALIVPROC glad_glMaterialiv = NULL; +PFNGLMATRIXMODEPROC glad_glMatrixMode = NULL; +PFNGLMULTMATRIXDPROC glad_glMultMatrixd = NULL; +PFNGLMULTMATRIXFPROC glad_glMultMatrixf = NULL; +PFNGLMULTTRANSPOSEMATRIXDPROC glad_glMultTransposeMatrixd = NULL; +PFNGLMULTTRANSPOSEMATRIXFPROC glad_glMultTransposeMatrixf = NULL; +PFNGLMULTIDRAWARRAYSPROC glad_glMultiDrawArrays = NULL; +PFNGLMULTIDRAWELEMENTSPROC glad_glMultiDrawElements = NULL; +PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC glad_glMultiDrawElementsBaseVertex = NULL; +PFNGLMULTITEXCOORD1DPROC glad_glMultiTexCoord1d = NULL; +PFNGLMULTITEXCOORD1DVPROC glad_glMultiTexCoord1dv = NULL; +PFNGLMULTITEXCOORD1FPROC glad_glMultiTexCoord1f = NULL; +PFNGLMULTITEXCOORD1FVPROC glad_glMultiTexCoord1fv = NULL; +PFNGLMULTITEXCOORD1IPROC glad_glMultiTexCoord1i = NULL; +PFNGLMULTITEXCOORD1IVPROC glad_glMultiTexCoord1iv = NULL; +PFNGLMULTITEXCOORD1SPROC glad_glMultiTexCoord1s = NULL; +PFNGLMULTITEXCOORD1SVPROC 
glad_glMultiTexCoord1sv = NULL; +PFNGLMULTITEXCOORD2DPROC glad_glMultiTexCoord2d = NULL; +PFNGLMULTITEXCOORD2DVPROC glad_glMultiTexCoord2dv = NULL; +PFNGLMULTITEXCOORD2FPROC glad_glMultiTexCoord2f = NULL; +PFNGLMULTITEXCOORD2FVPROC glad_glMultiTexCoord2fv = NULL; +PFNGLMULTITEXCOORD2IPROC glad_glMultiTexCoord2i = NULL; +PFNGLMULTITEXCOORD2IVPROC glad_glMultiTexCoord2iv = NULL; +PFNGLMULTITEXCOORD2SPROC glad_glMultiTexCoord2s = NULL; +PFNGLMULTITEXCOORD2SVPROC glad_glMultiTexCoord2sv = NULL; +PFNGLMULTITEXCOORD3DPROC glad_glMultiTexCoord3d = NULL; +PFNGLMULTITEXCOORD3DVPROC glad_glMultiTexCoord3dv = NULL; +PFNGLMULTITEXCOORD3FPROC glad_glMultiTexCoord3f = NULL; +PFNGLMULTITEXCOORD3FVPROC glad_glMultiTexCoord3fv = NULL; +PFNGLMULTITEXCOORD3IPROC glad_glMultiTexCoord3i = NULL; +PFNGLMULTITEXCOORD3IVPROC glad_glMultiTexCoord3iv = NULL; +PFNGLMULTITEXCOORD3SPROC glad_glMultiTexCoord3s = NULL; +PFNGLMULTITEXCOORD3SVPROC glad_glMultiTexCoord3sv = NULL; +PFNGLMULTITEXCOORD4DPROC glad_glMultiTexCoord4d = NULL; +PFNGLMULTITEXCOORD4DVPROC glad_glMultiTexCoord4dv = NULL; +PFNGLMULTITEXCOORD4FPROC glad_glMultiTexCoord4f = NULL; +PFNGLMULTITEXCOORD4FVPROC glad_glMultiTexCoord4fv = NULL; +PFNGLMULTITEXCOORD4IPROC glad_glMultiTexCoord4i = NULL; +PFNGLMULTITEXCOORD4IVPROC glad_glMultiTexCoord4iv = NULL; +PFNGLMULTITEXCOORD4SPROC glad_glMultiTexCoord4s = NULL; +PFNGLMULTITEXCOORD4SVPROC glad_glMultiTexCoord4sv = NULL; +PFNGLMULTITEXCOORDP1UIPROC glad_glMultiTexCoordP1ui = NULL; +PFNGLMULTITEXCOORDP1UIVPROC glad_glMultiTexCoordP1uiv = NULL; +PFNGLMULTITEXCOORDP2UIPROC glad_glMultiTexCoordP2ui = NULL; +PFNGLMULTITEXCOORDP2UIVPROC glad_glMultiTexCoordP2uiv = NULL; +PFNGLMULTITEXCOORDP3UIPROC glad_glMultiTexCoordP3ui = NULL; +PFNGLMULTITEXCOORDP3UIVPROC glad_glMultiTexCoordP3uiv = NULL; +PFNGLMULTITEXCOORDP4UIPROC glad_glMultiTexCoordP4ui = NULL; +PFNGLMULTITEXCOORDP4UIVPROC glad_glMultiTexCoordP4uiv = NULL; +PFNGLNEWLISTPROC glad_glNewList = NULL; +PFNGLNORMAL3BPROC glad_glNormal3b = NULL; +PFNGLNORMAL3BVPROC glad_glNormal3bv = NULL; +PFNGLNORMAL3DPROC glad_glNormal3d = NULL; +PFNGLNORMAL3DVPROC glad_glNormal3dv = NULL; +PFNGLNORMAL3FPROC glad_glNormal3f = NULL; +PFNGLNORMAL3FVPROC glad_glNormal3fv = NULL; +PFNGLNORMAL3IPROC glad_glNormal3i = NULL; +PFNGLNORMAL3IVPROC glad_glNormal3iv = NULL; +PFNGLNORMAL3SPROC glad_glNormal3s = NULL; +PFNGLNORMAL3SVPROC glad_glNormal3sv = NULL; +PFNGLNORMALP3UIPROC glad_glNormalP3ui = NULL; +PFNGLNORMALP3UIVPROC glad_glNormalP3uiv = NULL; +PFNGLNORMALPOINTERPROC glad_glNormalPointer = NULL; +PFNGLORTHOPROC glad_glOrtho = NULL; +PFNGLPASSTHROUGHPROC glad_glPassThrough = NULL; +PFNGLPIXELMAPFVPROC glad_glPixelMapfv = NULL; +PFNGLPIXELMAPUIVPROC glad_glPixelMapuiv = NULL; +PFNGLPIXELMAPUSVPROC glad_glPixelMapusv = NULL; +PFNGLPIXELSTOREFPROC glad_glPixelStoref = NULL; +PFNGLPIXELSTOREIPROC glad_glPixelStorei = NULL; +PFNGLPIXELTRANSFERFPROC glad_glPixelTransferf = NULL; +PFNGLPIXELTRANSFERIPROC glad_glPixelTransferi = NULL; +PFNGLPIXELZOOMPROC glad_glPixelZoom = NULL; +PFNGLPOINTPARAMETERFPROC glad_glPointParameterf = NULL; +PFNGLPOINTPARAMETERFVPROC glad_glPointParameterfv = NULL; +PFNGLPOINTPARAMETERIPROC glad_glPointParameteri = NULL; +PFNGLPOINTPARAMETERIVPROC glad_glPointParameteriv = NULL; +PFNGLPOINTSIZEPROC glad_glPointSize = NULL; +PFNGLPOLYGONMODEPROC glad_glPolygonMode = NULL; +PFNGLPOLYGONOFFSETPROC glad_glPolygonOffset = NULL; +PFNGLPOLYGONSTIPPLEPROC glad_glPolygonStipple = NULL; +PFNGLPOPATTRIBPROC glad_glPopAttrib = NULL; +PFNGLPOPCLIENTATTRIBPROC 
glad_glPopClientAttrib = NULL; +PFNGLPOPMATRIXPROC glad_glPopMatrix = NULL; +PFNGLPOPNAMEPROC glad_glPopName = NULL; +PFNGLPRIMITIVERESTARTINDEXPROC glad_glPrimitiveRestartIndex = NULL; +PFNGLPRIORITIZETEXTURESPROC glad_glPrioritizeTextures = NULL; +PFNGLPROVOKINGVERTEXPROC glad_glProvokingVertex = NULL; +PFNGLPUSHATTRIBPROC glad_glPushAttrib = NULL; +PFNGLPUSHCLIENTATTRIBPROC glad_glPushClientAttrib = NULL; +PFNGLPUSHMATRIXPROC glad_glPushMatrix = NULL; +PFNGLPUSHNAMEPROC glad_glPushName = NULL; +PFNGLQUERYCOUNTERPROC glad_glQueryCounter = NULL; +PFNGLRASTERPOS2DPROC glad_glRasterPos2d = NULL; +PFNGLRASTERPOS2DVPROC glad_glRasterPos2dv = NULL; +PFNGLRASTERPOS2FPROC glad_glRasterPos2f = NULL; +PFNGLRASTERPOS2FVPROC glad_glRasterPos2fv = NULL; +PFNGLRASTERPOS2IPROC glad_glRasterPos2i = NULL; +PFNGLRASTERPOS2IVPROC glad_glRasterPos2iv = NULL; +PFNGLRASTERPOS2SPROC glad_glRasterPos2s = NULL; +PFNGLRASTERPOS2SVPROC glad_glRasterPos2sv = NULL; +PFNGLRASTERPOS3DPROC glad_glRasterPos3d = NULL; +PFNGLRASTERPOS3DVPROC glad_glRasterPos3dv = NULL; +PFNGLRASTERPOS3FPROC glad_glRasterPos3f = NULL; +PFNGLRASTERPOS3FVPROC glad_glRasterPos3fv = NULL; +PFNGLRASTERPOS3IPROC glad_glRasterPos3i = NULL; +PFNGLRASTERPOS3IVPROC glad_glRasterPos3iv = NULL; +PFNGLRASTERPOS3SPROC glad_glRasterPos3s = NULL; +PFNGLRASTERPOS3SVPROC glad_glRasterPos3sv = NULL; +PFNGLRASTERPOS4DPROC glad_glRasterPos4d = NULL; +PFNGLRASTERPOS4DVPROC glad_glRasterPos4dv = NULL; +PFNGLRASTERPOS4FPROC glad_glRasterPos4f = NULL; +PFNGLRASTERPOS4FVPROC glad_glRasterPos4fv = NULL; +PFNGLRASTERPOS4IPROC glad_glRasterPos4i = NULL; +PFNGLRASTERPOS4IVPROC glad_glRasterPos4iv = NULL; +PFNGLRASTERPOS4SPROC glad_glRasterPos4s = NULL; +PFNGLRASTERPOS4SVPROC glad_glRasterPos4sv = NULL; +PFNGLREADBUFFERPROC glad_glReadBuffer = NULL; +PFNGLREADPIXELSPROC glad_glReadPixels = NULL; +PFNGLRECTDPROC glad_glRectd = NULL; +PFNGLRECTDVPROC glad_glRectdv = NULL; +PFNGLRECTFPROC glad_glRectf = NULL; +PFNGLRECTFVPROC glad_glRectfv = NULL; +PFNGLRECTIPROC glad_glRecti = NULL; +PFNGLRECTIVPROC glad_glRectiv = NULL; +PFNGLRECTSPROC glad_glRects = NULL; +PFNGLRECTSVPROC glad_glRectsv = NULL; +PFNGLRENDERMODEPROC glad_glRenderMode = NULL; +PFNGLRENDERBUFFERSTORAGEPROC glad_glRenderbufferStorage = NULL; +PFNGLRENDERBUFFERSTORAGEEXTPROC glad_glRenderbufferStorageEXT = NULL; +PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC glad_glRenderbufferStorageMultisample = NULL; +PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC glad_glRenderbufferStorageMultisampleEXT = NULL; +PFNGLROTATEDPROC glad_glRotated = NULL; +PFNGLROTATEFPROC glad_glRotatef = NULL; +PFNGLSAMPLECOVERAGEPROC glad_glSampleCoverage = NULL; +PFNGLSAMPLEMASKIPROC glad_glSampleMaski = NULL; +PFNGLSAMPLERPARAMETERIIVPROC glad_glSamplerParameterIiv = NULL; +PFNGLSAMPLERPARAMETERIUIVPROC glad_glSamplerParameterIuiv = NULL; +PFNGLSAMPLERPARAMETERFPROC glad_glSamplerParameterf = NULL; +PFNGLSAMPLERPARAMETERFVPROC glad_glSamplerParameterfv = NULL; +PFNGLSAMPLERPARAMETERIPROC glad_glSamplerParameteri = NULL; +PFNGLSAMPLERPARAMETERIVPROC glad_glSamplerParameteriv = NULL; +PFNGLSCALEDPROC glad_glScaled = NULL; +PFNGLSCALEFPROC glad_glScalef = NULL; +PFNGLSCISSORPROC glad_glScissor = NULL; +PFNGLSECONDARYCOLOR3BPROC glad_glSecondaryColor3b = NULL; +PFNGLSECONDARYCOLOR3BVPROC glad_glSecondaryColor3bv = NULL; +PFNGLSECONDARYCOLOR3DPROC glad_glSecondaryColor3d = NULL; +PFNGLSECONDARYCOLOR3DVPROC glad_glSecondaryColor3dv = NULL; +PFNGLSECONDARYCOLOR3FPROC glad_glSecondaryColor3f = NULL; +PFNGLSECONDARYCOLOR3FVPROC glad_glSecondaryColor3fv = 
NULL; +PFNGLSECONDARYCOLOR3IPROC glad_glSecondaryColor3i = NULL; +PFNGLSECONDARYCOLOR3IVPROC glad_glSecondaryColor3iv = NULL; +PFNGLSECONDARYCOLOR3SPROC glad_glSecondaryColor3s = NULL; +PFNGLSECONDARYCOLOR3SVPROC glad_glSecondaryColor3sv = NULL; +PFNGLSECONDARYCOLOR3UBPROC glad_glSecondaryColor3ub = NULL; +PFNGLSECONDARYCOLOR3UBVPROC glad_glSecondaryColor3ubv = NULL; +PFNGLSECONDARYCOLOR3UIPROC glad_glSecondaryColor3ui = NULL; +PFNGLSECONDARYCOLOR3UIVPROC glad_glSecondaryColor3uiv = NULL; +PFNGLSECONDARYCOLOR3USPROC glad_glSecondaryColor3us = NULL; +PFNGLSECONDARYCOLOR3USVPROC glad_glSecondaryColor3usv = NULL; +PFNGLSECONDARYCOLORP3UIPROC glad_glSecondaryColorP3ui = NULL; +PFNGLSECONDARYCOLORP3UIVPROC glad_glSecondaryColorP3uiv = NULL; +PFNGLSECONDARYCOLORPOINTERPROC glad_glSecondaryColorPointer = NULL; +PFNGLSELECTBUFFERPROC glad_glSelectBuffer = NULL; +PFNGLSHADEMODELPROC glad_glShadeModel = NULL; +PFNGLSHADERSOURCEPROC glad_glShaderSource = NULL; +PFNGLSTENCILFUNCPROC glad_glStencilFunc = NULL; +PFNGLSTENCILFUNCSEPARATEPROC glad_glStencilFuncSeparate = NULL; +PFNGLSTENCILMASKPROC glad_glStencilMask = NULL; +PFNGLSTENCILMASKSEPARATEPROC glad_glStencilMaskSeparate = NULL; +PFNGLSTENCILOPPROC glad_glStencilOp = NULL; +PFNGLSTENCILOPSEPARATEPROC glad_glStencilOpSeparate = NULL; +PFNGLTEXBUFFERPROC glad_glTexBuffer = NULL; +PFNGLTEXCOORD1DPROC glad_glTexCoord1d = NULL; +PFNGLTEXCOORD1DVPROC glad_glTexCoord1dv = NULL; +PFNGLTEXCOORD1FPROC glad_glTexCoord1f = NULL; +PFNGLTEXCOORD1FVPROC glad_glTexCoord1fv = NULL; +PFNGLTEXCOORD1IPROC glad_glTexCoord1i = NULL; +PFNGLTEXCOORD1IVPROC glad_glTexCoord1iv = NULL; +PFNGLTEXCOORD1SPROC glad_glTexCoord1s = NULL; +PFNGLTEXCOORD1SVPROC glad_glTexCoord1sv = NULL; +PFNGLTEXCOORD2DPROC glad_glTexCoord2d = NULL; +PFNGLTEXCOORD2DVPROC glad_glTexCoord2dv = NULL; +PFNGLTEXCOORD2FPROC glad_glTexCoord2f = NULL; +PFNGLTEXCOORD2FVPROC glad_glTexCoord2fv = NULL; +PFNGLTEXCOORD2IPROC glad_glTexCoord2i = NULL; +PFNGLTEXCOORD2IVPROC glad_glTexCoord2iv = NULL; +PFNGLTEXCOORD2SPROC glad_glTexCoord2s = NULL; +PFNGLTEXCOORD2SVPROC glad_glTexCoord2sv = NULL; +PFNGLTEXCOORD3DPROC glad_glTexCoord3d = NULL; +PFNGLTEXCOORD3DVPROC glad_glTexCoord3dv = NULL; +PFNGLTEXCOORD3FPROC glad_glTexCoord3f = NULL; +PFNGLTEXCOORD3FVPROC glad_glTexCoord3fv = NULL; +PFNGLTEXCOORD3IPROC glad_glTexCoord3i = NULL; +PFNGLTEXCOORD3IVPROC glad_glTexCoord3iv = NULL; +PFNGLTEXCOORD3SPROC glad_glTexCoord3s = NULL; +PFNGLTEXCOORD3SVPROC glad_glTexCoord3sv = NULL; +PFNGLTEXCOORD4DPROC glad_glTexCoord4d = NULL; +PFNGLTEXCOORD4DVPROC glad_glTexCoord4dv = NULL; +PFNGLTEXCOORD4FPROC glad_glTexCoord4f = NULL; +PFNGLTEXCOORD4FVPROC glad_glTexCoord4fv = NULL; +PFNGLTEXCOORD4IPROC glad_glTexCoord4i = NULL; +PFNGLTEXCOORD4IVPROC glad_glTexCoord4iv = NULL; +PFNGLTEXCOORD4SPROC glad_glTexCoord4s = NULL; +PFNGLTEXCOORD4SVPROC glad_glTexCoord4sv = NULL; +PFNGLTEXCOORDP1UIPROC glad_glTexCoordP1ui = NULL; +PFNGLTEXCOORDP1UIVPROC glad_glTexCoordP1uiv = NULL; +PFNGLTEXCOORDP2UIPROC glad_glTexCoordP2ui = NULL; +PFNGLTEXCOORDP2UIVPROC glad_glTexCoordP2uiv = NULL; +PFNGLTEXCOORDP3UIPROC glad_glTexCoordP3ui = NULL; +PFNGLTEXCOORDP3UIVPROC glad_glTexCoordP3uiv = NULL; +PFNGLTEXCOORDP4UIPROC glad_glTexCoordP4ui = NULL; +PFNGLTEXCOORDP4UIVPROC glad_glTexCoordP4uiv = NULL; +PFNGLTEXCOORDPOINTERPROC glad_glTexCoordPointer = NULL; +PFNGLTEXENVFPROC glad_glTexEnvf = NULL; +PFNGLTEXENVFVPROC glad_glTexEnvfv = NULL; +PFNGLTEXENVIPROC glad_glTexEnvi = NULL; +PFNGLTEXENVIVPROC glad_glTexEnviv = NULL; +PFNGLTEXGENDPROC glad_glTexGend = 
NULL; +PFNGLTEXGENDVPROC glad_glTexGendv = NULL; +PFNGLTEXGENFPROC glad_glTexGenf = NULL; +PFNGLTEXGENFVPROC glad_glTexGenfv = NULL; +PFNGLTEXGENIPROC glad_glTexGeni = NULL; +PFNGLTEXGENIVPROC glad_glTexGeniv = NULL; +PFNGLTEXIMAGE1DPROC glad_glTexImage1D = NULL; +PFNGLTEXIMAGE2DPROC glad_glTexImage2D = NULL; +PFNGLTEXIMAGE2DMULTISAMPLEPROC glad_glTexImage2DMultisample = NULL; +PFNGLTEXIMAGE3DPROC glad_glTexImage3D = NULL; +PFNGLTEXIMAGE3DMULTISAMPLEPROC glad_glTexImage3DMultisample = NULL; +PFNGLTEXPARAMETERIIVPROC glad_glTexParameterIiv = NULL; +PFNGLTEXPARAMETERIUIVPROC glad_glTexParameterIuiv = NULL; +PFNGLTEXPARAMETERFPROC glad_glTexParameterf = NULL; +PFNGLTEXPARAMETERFVPROC glad_glTexParameterfv = NULL; +PFNGLTEXPARAMETERIPROC glad_glTexParameteri = NULL; +PFNGLTEXPARAMETERIVPROC glad_glTexParameteriv = NULL; +PFNGLTEXSUBIMAGE1DPROC glad_glTexSubImage1D = NULL; +PFNGLTEXSUBIMAGE2DPROC glad_glTexSubImage2D = NULL; +PFNGLTEXSUBIMAGE3DPROC glad_glTexSubImage3D = NULL; +PFNGLTRANSFORMFEEDBACKVARYINGSPROC glad_glTransformFeedbackVaryings = NULL; +PFNGLTRANSLATEDPROC glad_glTranslated = NULL; +PFNGLTRANSLATEFPROC glad_glTranslatef = NULL; +PFNGLUNIFORM1FPROC glad_glUniform1f = NULL; +PFNGLUNIFORM1FVPROC glad_glUniform1fv = NULL; +PFNGLUNIFORM1IPROC glad_glUniform1i = NULL; +PFNGLUNIFORM1IVPROC glad_glUniform1iv = NULL; +PFNGLUNIFORM1UIPROC glad_glUniform1ui = NULL; +PFNGLUNIFORM1UIVPROC glad_glUniform1uiv = NULL; +PFNGLUNIFORM2FPROC glad_glUniform2f = NULL; +PFNGLUNIFORM2FVPROC glad_glUniform2fv = NULL; +PFNGLUNIFORM2IPROC glad_glUniform2i = NULL; +PFNGLUNIFORM2IVPROC glad_glUniform2iv = NULL; +PFNGLUNIFORM2UIPROC glad_glUniform2ui = NULL; +PFNGLUNIFORM2UIVPROC glad_glUniform2uiv = NULL; +PFNGLUNIFORM3FPROC glad_glUniform3f = NULL; +PFNGLUNIFORM3FVPROC glad_glUniform3fv = NULL; +PFNGLUNIFORM3IPROC glad_glUniform3i = NULL; +PFNGLUNIFORM3IVPROC glad_glUniform3iv = NULL; +PFNGLUNIFORM3UIPROC glad_glUniform3ui = NULL; +PFNGLUNIFORM3UIVPROC glad_glUniform3uiv = NULL; +PFNGLUNIFORM4FPROC glad_glUniform4f = NULL; +PFNGLUNIFORM4FVPROC glad_glUniform4fv = NULL; +PFNGLUNIFORM4IPROC glad_glUniform4i = NULL; +PFNGLUNIFORM4IVPROC glad_glUniform4iv = NULL; +PFNGLUNIFORM4UIPROC glad_glUniform4ui = NULL; +PFNGLUNIFORM4UIVPROC glad_glUniform4uiv = NULL; +PFNGLUNIFORMBLOCKBINDINGPROC glad_glUniformBlockBinding = NULL; +PFNGLUNIFORMMATRIX2FVPROC glad_glUniformMatrix2fv = NULL; +PFNGLUNIFORMMATRIX2X3FVPROC glad_glUniformMatrix2x3fv = NULL; +PFNGLUNIFORMMATRIX2X4FVPROC glad_glUniformMatrix2x4fv = NULL; +PFNGLUNIFORMMATRIX3FVPROC glad_glUniformMatrix3fv = NULL; +PFNGLUNIFORMMATRIX3X2FVPROC glad_glUniformMatrix3x2fv = NULL; +PFNGLUNIFORMMATRIX3X4FVPROC glad_glUniformMatrix3x4fv = NULL; +PFNGLUNIFORMMATRIX4FVPROC glad_glUniformMatrix4fv = NULL; +PFNGLUNIFORMMATRIX4X2FVPROC glad_glUniformMatrix4x2fv = NULL; +PFNGLUNIFORMMATRIX4X3FVPROC glad_glUniformMatrix4x3fv = NULL; +PFNGLUNMAPBUFFERPROC glad_glUnmapBuffer = NULL; +PFNGLUSEPROGRAMPROC glad_glUseProgram = NULL; +PFNGLVALIDATEPROGRAMPROC glad_glValidateProgram = NULL; +PFNGLVERTEX2DPROC glad_glVertex2d = NULL; +PFNGLVERTEX2DVPROC glad_glVertex2dv = NULL; +PFNGLVERTEX2FPROC glad_glVertex2f = NULL; +PFNGLVERTEX2FVPROC glad_glVertex2fv = NULL; +PFNGLVERTEX2IPROC glad_glVertex2i = NULL; +PFNGLVERTEX2IVPROC glad_glVertex2iv = NULL; +PFNGLVERTEX2SPROC glad_glVertex2s = NULL; +PFNGLVERTEX2SVPROC glad_glVertex2sv = NULL; +PFNGLVERTEX3DPROC glad_glVertex3d = NULL; +PFNGLVERTEX3DVPROC glad_glVertex3dv = NULL; +PFNGLVERTEX3FPROC glad_glVertex3f = NULL; +PFNGLVERTEX3FVPROC 
glad_glVertex3fv = NULL; +PFNGLVERTEX3IPROC glad_glVertex3i = NULL; +PFNGLVERTEX3IVPROC glad_glVertex3iv = NULL; +PFNGLVERTEX3SPROC glad_glVertex3s = NULL; +PFNGLVERTEX3SVPROC glad_glVertex3sv = NULL; +PFNGLVERTEX4DPROC glad_glVertex4d = NULL; +PFNGLVERTEX4DVPROC glad_glVertex4dv = NULL; +PFNGLVERTEX4FPROC glad_glVertex4f = NULL; +PFNGLVERTEX4FVPROC glad_glVertex4fv = NULL; +PFNGLVERTEX4IPROC glad_glVertex4i = NULL; +PFNGLVERTEX4IVPROC glad_glVertex4iv = NULL; +PFNGLVERTEX4SPROC glad_glVertex4s = NULL; +PFNGLVERTEX4SVPROC glad_glVertex4sv = NULL; +PFNGLVERTEXATTRIB1DPROC glad_glVertexAttrib1d = NULL; +PFNGLVERTEXATTRIB1DVPROC glad_glVertexAttrib1dv = NULL; +PFNGLVERTEXATTRIB1FPROC glad_glVertexAttrib1f = NULL; +PFNGLVERTEXATTRIB1FVPROC glad_glVertexAttrib1fv = NULL; +PFNGLVERTEXATTRIB1SPROC glad_glVertexAttrib1s = NULL; +PFNGLVERTEXATTRIB1SVPROC glad_glVertexAttrib1sv = NULL; +PFNGLVERTEXATTRIB2DPROC glad_glVertexAttrib2d = NULL; +PFNGLVERTEXATTRIB2DVPROC glad_glVertexAttrib2dv = NULL; +PFNGLVERTEXATTRIB2FPROC glad_glVertexAttrib2f = NULL; +PFNGLVERTEXATTRIB2FVPROC glad_glVertexAttrib2fv = NULL; +PFNGLVERTEXATTRIB2SPROC glad_glVertexAttrib2s = NULL; +PFNGLVERTEXATTRIB2SVPROC glad_glVertexAttrib2sv = NULL; +PFNGLVERTEXATTRIB3DPROC glad_glVertexAttrib3d = NULL; +PFNGLVERTEXATTRIB3DVPROC glad_glVertexAttrib3dv = NULL; +PFNGLVERTEXATTRIB3FPROC glad_glVertexAttrib3f = NULL; +PFNGLVERTEXATTRIB3FVPROC glad_glVertexAttrib3fv = NULL; +PFNGLVERTEXATTRIB3SPROC glad_glVertexAttrib3s = NULL; +PFNGLVERTEXATTRIB3SVPROC glad_glVertexAttrib3sv = NULL; +PFNGLVERTEXATTRIB4NBVPROC glad_glVertexAttrib4Nbv = NULL; +PFNGLVERTEXATTRIB4NIVPROC glad_glVertexAttrib4Niv = NULL; +PFNGLVERTEXATTRIB4NSVPROC glad_glVertexAttrib4Nsv = NULL; +PFNGLVERTEXATTRIB4NUBPROC glad_glVertexAttrib4Nub = NULL; +PFNGLVERTEXATTRIB4NUBVPROC glad_glVertexAttrib4Nubv = NULL; +PFNGLVERTEXATTRIB4NUIVPROC glad_glVertexAttrib4Nuiv = NULL; +PFNGLVERTEXATTRIB4NUSVPROC glad_glVertexAttrib4Nusv = NULL; +PFNGLVERTEXATTRIB4BVPROC glad_glVertexAttrib4bv = NULL; +PFNGLVERTEXATTRIB4DPROC glad_glVertexAttrib4d = NULL; +PFNGLVERTEXATTRIB4DVPROC glad_glVertexAttrib4dv = NULL; +PFNGLVERTEXATTRIB4FPROC glad_glVertexAttrib4f = NULL; +PFNGLVERTEXATTRIB4FVPROC glad_glVertexAttrib4fv = NULL; +PFNGLVERTEXATTRIB4IVPROC glad_glVertexAttrib4iv = NULL; +PFNGLVERTEXATTRIB4SPROC glad_glVertexAttrib4s = NULL; +PFNGLVERTEXATTRIB4SVPROC glad_glVertexAttrib4sv = NULL; +PFNGLVERTEXATTRIB4UBVPROC glad_glVertexAttrib4ubv = NULL; +PFNGLVERTEXATTRIB4UIVPROC glad_glVertexAttrib4uiv = NULL; +PFNGLVERTEXATTRIB4USVPROC glad_glVertexAttrib4usv = NULL; +PFNGLVERTEXATTRIBDIVISORPROC glad_glVertexAttribDivisor = NULL; +PFNGLVERTEXATTRIBI1IPROC glad_glVertexAttribI1i = NULL; +PFNGLVERTEXATTRIBI1IVPROC glad_glVertexAttribI1iv = NULL; +PFNGLVERTEXATTRIBI1UIPROC glad_glVertexAttribI1ui = NULL; +PFNGLVERTEXATTRIBI1UIVPROC glad_glVertexAttribI1uiv = NULL; +PFNGLVERTEXATTRIBI2IPROC glad_glVertexAttribI2i = NULL; +PFNGLVERTEXATTRIBI2IVPROC glad_glVertexAttribI2iv = NULL; +PFNGLVERTEXATTRIBI2UIPROC glad_glVertexAttribI2ui = NULL; +PFNGLVERTEXATTRIBI2UIVPROC glad_glVertexAttribI2uiv = NULL; +PFNGLVERTEXATTRIBI3IPROC glad_glVertexAttribI3i = NULL; +PFNGLVERTEXATTRIBI3IVPROC glad_glVertexAttribI3iv = NULL; +PFNGLVERTEXATTRIBI3UIPROC glad_glVertexAttribI3ui = NULL; +PFNGLVERTEXATTRIBI3UIVPROC glad_glVertexAttribI3uiv = NULL; +PFNGLVERTEXATTRIBI4BVPROC glad_glVertexAttribI4bv = NULL; +PFNGLVERTEXATTRIBI4IPROC glad_glVertexAttribI4i = NULL; +PFNGLVERTEXATTRIBI4IVPROC glad_glVertexAttribI4iv = NULL; 
+PFNGLVERTEXATTRIBI4SVPROC glad_glVertexAttribI4sv = NULL; +PFNGLVERTEXATTRIBI4UBVPROC glad_glVertexAttribI4ubv = NULL; +PFNGLVERTEXATTRIBI4UIPROC glad_glVertexAttribI4ui = NULL; +PFNGLVERTEXATTRIBI4UIVPROC glad_glVertexAttribI4uiv = NULL; +PFNGLVERTEXATTRIBI4USVPROC glad_glVertexAttribI4usv = NULL; +PFNGLVERTEXATTRIBIPOINTERPROC glad_glVertexAttribIPointer = NULL; +PFNGLVERTEXATTRIBP1UIPROC glad_glVertexAttribP1ui = NULL; +PFNGLVERTEXATTRIBP1UIVPROC glad_glVertexAttribP1uiv = NULL; +PFNGLVERTEXATTRIBP2UIPROC glad_glVertexAttribP2ui = NULL; +PFNGLVERTEXATTRIBP2UIVPROC glad_glVertexAttribP2uiv = NULL; +PFNGLVERTEXATTRIBP3UIPROC glad_glVertexAttribP3ui = NULL; +PFNGLVERTEXATTRIBP3UIVPROC glad_glVertexAttribP3uiv = NULL; +PFNGLVERTEXATTRIBP4UIPROC glad_glVertexAttribP4ui = NULL; +PFNGLVERTEXATTRIBP4UIVPROC glad_glVertexAttribP4uiv = NULL; +PFNGLVERTEXATTRIBPOINTERPROC glad_glVertexAttribPointer = NULL; +PFNGLVERTEXP2UIPROC glad_glVertexP2ui = NULL; +PFNGLVERTEXP2UIVPROC glad_glVertexP2uiv = NULL; +PFNGLVERTEXP3UIPROC glad_glVertexP3ui = NULL; +PFNGLVERTEXP3UIVPROC glad_glVertexP3uiv = NULL; +PFNGLVERTEXP4UIPROC glad_glVertexP4ui = NULL; +PFNGLVERTEXP4UIVPROC glad_glVertexP4uiv = NULL; +PFNGLVERTEXPOINTERPROC glad_glVertexPointer = NULL; +PFNGLVIEWPORTPROC glad_glViewport = NULL; +PFNGLWAITSYNCPROC glad_glWaitSync = NULL; +PFNGLWINDOWPOS2DPROC glad_glWindowPos2d = NULL; +PFNGLWINDOWPOS2DVPROC glad_glWindowPos2dv = NULL; +PFNGLWINDOWPOS2FPROC glad_glWindowPos2f = NULL; +PFNGLWINDOWPOS2FVPROC glad_glWindowPos2fv = NULL; +PFNGLWINDOWPOS2IPROC glad_glWindowPos2i = NULL; +PFNGLWINDOWPOS2IVPROC glad_glWindowPos2iv = NULL; +PFNGLWINDOWPOS2SPROC glad_glWindowPos2s = NULL; +PFNGLWINDOWPOS2SVPROC glad_glWindowPos2sv = NULL; +PFNGLWINDOWPOS3DPROC glad_glWindowPos3d = NULL; +PFNGLWINDOWPOS3DVPROC glad_glWindowPos3dv = NULL; +PFNGLWINDOWPOS3FPROC glad_glWindowPos3f = NULL; +PFNGLWINDOWPOS3FVPROC glad_glWindowPos3fv = NULL; +PFNGLWINDOWPOS3IPROC glad_glWindowPos3i = NULL; +PFNGLWINDOWPOS3IVPROC glad_glWindowPos3iv = NULL; +PFNGLWINDOWPOS3SPROC glad_glWindowPos3s = NULL; +PFNGLWINDOWPOS3SVPROC glad_glWindowPos3sv = NULL; + + +static void glad_gl_load_GL_VERSION_1_0( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_1_0) return; + glad_glAccum = (PFNGLACCUMPROC) load(userptr, "glAccum"); + glad_glAlphaFunc = (PFNGLALPHAFUNCPROC) load(userptr, "glAlphaFunc"); + glad_glBegin = (PFNGLBEGINPROC) load(userptr, "glBegin"); + glad_glBitmap = (PFNGLBITMAPPROC) load(userptr, "glBitmap"); + glad_glBlendFunc = (PFNGLBLENDFUNCPROC) load(userptr, "glBlendFunc"); + glad_glCallList = (PFNGLCALLLISTPROC) load(userptr, "glCallList"); + glad_glCallLists = (PFNGLCALLLISTSPROC) load(userptr, "glCallLists"); + glad_glClear = (PFNGLCLEARPROC) load(userptr, "glClear"); + glad_glClearAccum = (PFNGLCLEARACCUMPROC) load(userptr, "glClearAccum"); + glad_glClearColor = (PFNGLCLEARCOLORPROC) load(userptr, "glClearColor"); + glad_glClearDepth = (PFNGLCLEARDEPTHPROC) load(userptr, "glClearDepth"); + glad_glClearIndex = (PFNGLCLEARINDEXPROC) load(userptr, "glClearIndex"); + glad_glClearStencil = (PFNGLCLEARSTENCILPROC) load(userptr, "glClearStencil"); + glad_glClipPlane = (PFNGLCLIPPLANEPROC) load(userptr, "glClipPlane"); + glad_glColor3b = (PFNGLCOLOR3BPROC) load(userptr, "glColor3b"); + glad_glColor3bv = (PFNGLCOLOR3BVPROC) load(userptr, "glColor3bv"); + glad_glColor3d = (PFNGLCOLOR3DPROC) load(userptr, "glColor3d"); + glad_glColor3dv = (PFNGLCOLOR3DVPROC) load(userptr, "glColor3dv"); + glad_glColor3f = (PFNGLCOLOR3FPROC) 
load(userptr, "glColor3f"); + glad_glColor3fv = (PFNGLCOLOR3FVPROC) load(userptr, "glColor3fv"); + glad_glColor3i = (PFNGLCOLOR3IPROC) load(userptr, "glColor3i"); + glad_glColor3iv = (PFNGLCOLOR3IVPROC) load(userptr, "glColor3iv"); + glad_glColor3s = (PFNGLCOLOR3SPROC) load(userptr, "glColor3s"); + glad_glColor3sv = (PFNGLCOLOR3SVPROC) load(userptr, "glColor3sv"); + glad_glColor3ub = (PFNGLCOLOR3UBPROC) load(userptr, "glColor3ub"); + glad_glColor3ubv = (PFNGLCOLOR3UBVPROC) load(userptr, "glColor3ubv"); + glad_glColor3ui = (PFNGLCOLOR3UIPROC) load(userptr, "glColor3ui"); + glad_glColor3uiv = (PFNGLCOLOR3UIVPROC) load(userptr, "glColor3uiv"); + glad_glColor3us = (PFNGLCOLOR3USPROC) load(userptr, "glColor3us"); + glad_glColor3usv = (PFNGLCOLOR3USVPROC) load(userptr, "glColor3usv"); + glad_glColor4b = (PFNGLCOLOR4BPROC) load(userptr, "glColor4b"); + glad_glColor4bv = (PFNGLCOLOR4BVPROC) load(userptr, "glColor4bv"); + glad_glColor4d = (PFNGLCOLOR4DPROC) load(userptr, "glColor4d"); + glad_glColor4dv = (PFNGLCOLOR4DVPROC) load(userptr, "glColor4dv"); + glad_glColor4f = (PFNGLCOLOR4FPROC) load(userptr, "glColor4f"); + glad_glColor4fv = (PFNGLCOLOR4FVPROC) load(userptr, "glColor4fv"); + glad_glColor4i = (PFNGLCOLOR4IPROC) load(userptr, "glColor4i"); + glad_glColor4iv = (PFNGLCOLOR4IVPROC) load(userptr, "glColor4iv"); + glad_glColor4s = (PFNGLCOLOR4SPROC) load(userptr, "glColor4s"); + glad_glColor4sv = (PFNGLCOLOR4SVPROC) load(userptr, "glColor4sv"); + glad_glColor4ub = (PFNGLCOLOR4UBPROC) load(userptr, "glColor4ub"); + glad_glColor4ubv = (PFNGLCOLOR4UBVPROC) load(userptr, "glColor4ubv"); + glad_glColor4ui = (PFNGLCOLOR4UIPROC) load(userptr, "glColor4ui"); + glad_glColor4uiv = (PFNGLCOLOR4UIVPROC) load(userptr, "glColor4uiv"); + glad_glColor4us = (PFNGLCOLOR4USPROC) load(userptr, "glColor4us"); + glad_glColor4usv = (PFNGLCOLOR4USVPROC) load(userptr, "glColor4usv"); + glad_glColorMask = (PFNGLCOLORMASKPROC) load(userptr, "glColorMask"); + glad_glColorMaterial = (PFNGLCOLORMATERIALPROC) load(userptr, "glColorMaterial"); + glad_glCopyPixels = (PFNGLCOPYPIXELSPROC) load(userptr, "glCopyPixels"); + glad_glCullFace = (PFNGLCULLFACEPROC) load(userptr, "glCullFace"); + glad_glDeleteLists = (PFNGLDELETELISTSPROC) load(userptr, "glDeleteLists"); + glad_glDepthFunc = (PFNGLDEPTHFUNCPROC) load(userptr, "glDepthFunc"); + glad_glDepthMask = (PFNGLDEPTHMASKPROC) load(userptr, "glDepthMask"); + glad_glDepthRange = (PFNGLDEPTHRANGEPROC) load(userptr, "glDepthRange"); + glad_glDisable = (PFNGLDISABLEPROC) load(userptr, "glDisable"); + glad_glDrawBuffer = (PFNGLDRAWBUFFERPROC) load(userptr, "glDrawBuffer"); + glad_glDrawPixels = (PFNGLDRAWPIXELSPROC) load(userptr, "glDrawPixels"); + glad_glEdgeFlag = (PFNGLEDGEFLAGPROC) load(userptr, "glEdgeFlag"); + glad_glEdgeFlagv = (PFNGLEDGEFLAGVPROC) load(userptr, "glEdgeFlagv"); + glad_glEnable = (PFNGLENABLEPROC) load(userptr, "glEnable"); + glad_glEnd = (PFNGLENDPROC) load(userptr, "glEnd"); + glad_glEndList = (PFNGLENDLISTPROC) load(userptr, "glEndList"); + glad_glEvalCoord1d = (PFNGLEVALCOORD1DPROC) load(userptr, "glEvalCoord1d"); + glad_glEvalCoord1dv = (PFNGLEVALCOORD1DVPROC) load(userptr, "glEvalCoord1dv"); + glad_glEvalCoord1f = (PFNGLEVALCOORD1FPROC) load(userptr, "glEvalCoord1f"); + glad_glEvalCoord1fv = (PFNGLEVALCOORD1FVPROC) load(userptr, "glEvalCoord1fv"); + glad_glEvalCoord2d = (PFNGLEVALCOORD2DPROC) load(userptr, "glEvalCoord2d"); + glad_glEvalCoord2dv = (PFNGLEVALCOORD2DVPROC) load(userptr, "glEvalCoord2dv"); + glad_glEvalCoord2f = (PFNGLEVALCOORD2FPROC) 
load(userptr, "glEvalCoord2f"); + glad_glEvalCoord2fv = (PFNGLEVALCOORD2FVPROC) load(userptr, "glEvalCoord2fv"); + glad_glEvalMesh1 = (PFNGLEVALMESH1PROC) load(userptr, "glEvalMesh1"); + glad_glEvalMesh2 = (PFNGLEVALMESH2PROC) load(userptr, "glEvalMesh2"); + glad_glEvalPoint1 = (PFNGLEVALPOINT1PROC) load(userptr, "glEvalPoint1"); + glad_glEvalPoint2 = (PFNGLEVALPOINT2PROC) load(userptr, "glEvalPoint2"); + glad_glFeedbackBuffer = (PFNGLFEEDBACKBUFFERPROC) load(userptr, "glFeedbackBuffer"); + glad_glFinish = (PFNGLFINISHPROC) load(userptr, "glFinish"); + glad_glFlush = (PFNGLFLUSHPROC) load(userptr, "glFlush"); + glad_glFogf = (PFNGLFOGFPROC) load(userptr, "glFogf"); + glad_glFogfv = (PFNGLFOGFVPROC) load(userptr, "glFogfv"); + glad_glFogi = (PFNGLFOGIPROC) load(userptr, "glFogi"); + glad_glFogiv = (PFNGLFOGIVPROC) load(userptr, "glFogiv"); + glad_glFrontFace = (PFNGLFRONTFACEPROC) load(userptr, "glFrontFace"); + glad_glFrustum = (PFNGLFRUSTUMPROC) load(userptr, "glFrustum"); + glad_glGenLists = (PFNGLGENLISTSPROC) load(userptr, "glGenLists"); + glad_glGetBooleanv = (PFNGLGETBOOLEANVPROC) load(userptr, "glGetBooleanv"); + glad_glGetClipPlane = (PFNGLGETCLIPPLANEPROC) load(userptr, "glGetClipPlane"); + glad_glGetDoublev = (PFNGLGETDOUBLEVPROC) load(userptr, "glGetDoublev"); + glad_glGetError = (PFNGLGETERRORPROC) load(userptr, "glGetError"); + glad_glGetFloatv = (PFNGLGETFLOATVPROC) load(userptr, "glGetFloatv"); + glad_glGetIntegerv = (PFNGLGETINTEGERVPROC) load(userptr, "glGetIntegerv"); + glad_glGetLightfv = (PFNGLGETLIGHTFVPROC) load(userptr, "glGetLightfv"); + glad_glGetLightiv = (PFNGLGETLIGHTIVPROC) load(userptr, "glGetLightiv"); + glad_glGetMapdv = (PFNGLGETMAPDVPROC) load(userptr, "glGetMapdv"); + glad_glGetMapfv = (PFNGLGETMAPFVPROC) load(userptr, "glGetMapfv"); + glad_glGetMapiv = (PFNGLGETMAPIVPROC) load(userptr, "glGetMapiv"); + glad_glGetMaterialfv = (PFNGLGETMATERIALFVPROC) load(userptr, "glGetMaterialfv"); + glad_glGetMaterialiv = (PFNGLGETMATERIALIVPROC) load(userptr, "glGetMaterialiv"); + glad_glGetPixelMapfv = (PFNGLGETPIXELMAPFVPROC) load(userptr, "glGetPixelMapfv"); + glad_glGetPixelMapuiv = (PFNGLGETPIXELMAPUIVPROC) load(userptr, "glGetPixelMapuiv"); + glad_glGetPixelMapusv = (PFNGLGETPIXELMAPUSVPROC) load(userptr, "glGetPixelMapusv"); + glad_glGetPolygonStipple = (PFNGLGETPOLYGONSTIPPLEPROC) load(userptr, "glGetPolygonStipple"); + glad_glGetString = (PFNGLGETSTRINGPROC) load(userptr, "glGetString"); + glad_glGetTexEnvfv = (PFNGLGETTEXENVFVPROC) load(userptr, "glGetTexEnvfv"); + glad_glGetTexEnviv = (PFNGLGETTEXENVIVPROC) load(userptr, "glGetTexEnviv"); + glad_glGetTexGendv = (PFNGLGETTEXGENDVPROC) load(userptr, "glGetTexGendv"); + glad_glGetTexGenfv = (PFNGLGETTEXGENFVPROC) load(userptr, "glGetTexGenfv"); + glad_glGetTexGeniv = (PFNGLGETTEXGENIVPROC) load(userptr, "glGetTexGeniv"); + glad_glGetTexImage = (PFNGLGETTEXIMAGEPROC) load(userptr, "glGetTexImage"); + glad_glGetTexLevelParameterfv = (PFNGLGETTEXLEVELPARAMETERFVPROC) load(userptr, "glGetTexLevelParameterfv"); + glad_glGetTexLevelParameteriv = (PFNGLGETTEXLEVELPARAMETERIVPROC) load(userptr, "glGetTexLevelParameteriv"); + glad_glGetTexParameterfv = (PFNGLGETTEXPARAMETERFVPROC) load(userptr, "glGetTexParameterfv"); + glad_glGetTexParameteriv = (PFNGLGETTEXPARAMETERIVPROC) load(userptr, "glGetTexParameteriv"); + glad_glHint = (PFNGLHINTPROC) load(userptr, "glHint"); + glad_glIndexMask = (PFNGLINDEXMASKPROC) load(userptr, "glIndexMask"); + glad_glIndexd = (PFNGLINDEXDPROC) load(userptr, "glIndexd"); + glad_glIndexdv = 
(PFNGLINDEXDVPROC) load(userptr, "glIndexdv"); + glad_glIndexf = (PFNGLINDEXFPROC) load(userptr, "glIndexf"); + glad_glIndexfv = (PFNGLINDEXFVPROC) load(userptr, "glIndexfv"); + glad_glIndexi = (PFNGLINDEXIPROC) load(userptr, "glIndexi"); + glad_glIndexiv = (PFNGLINDEXIVPROC) load(userptr, "glIndexiv"); + glad_glIndexs = (PFNGLINDEXSPROC) load(userptr, "glIndexs"); + glad_glIndexsv = (PFNGLINDEXSVPROC) load(userptr, "glIndexsv"); + glad_glInitNames = (PFNGLINITNAMESPROC) load(userptr, "glInitNames"); + glad_glIsEnabled = (PFNGLISENABLEDPROC) load(userptr, "glIsEnabled"); + glad_glIsList = (PFNGLISLISTPROC) load(userptr, "glIsList"); + glad_glLightModelf = (PFNGLLIGHTMODELFPROC) load(userptr, "glLightModelf"); + glad_glLightModelfv = (PFNGLLIGHTMODELFVPROC) load(userptr, "glLightModelfv"); + glad_glLightModeli = (PFNGLLIGHTMODELIPROC) load(userptr, "glLightModeli"); + glad_glLightModeliv = (PFNGLLIGHTMODELIVPROC) load(userptr, "glLightModeliv"); + glad_glLightf = (PFNGLLIGHTFPROC) load(userptr, "glLightf"); + glad_glLightfv = (PFNGLLIGHTFVPROC) load(userptr, "glLightfv"); + glad_glLighti = (PFNGLLIGHTIPROC) load(userptr, "glLighti"); + glad_glLightiv = (PFNGLLIGHTIVPROC) load(userptr, "glLightiv"); + glad_glLineStipple = (PFNGLLINESTIPPLEPROC) load(userptr, "glLineStipple"); + glad_glLineWidth = (PFNGLLINEWIDTHPROC) load(userptr, "glLineWidth"); + glad_glListBase = (PFNGLLISTBASEPROC) load(userptr, "glListBase"); + glad_glLoadIdentity = (PFNGLLOADIDENTITYPROC) load(userptr, "glLoadIdentity"); + glad_glLoadMatrixd = (PFNGLLOADMATRIXDPROC) load(userptr, "glLoadMatrixd"); + glad_glLoadMatrixf = (PFNGLLOADMATRIXFPROC) load(userptr, "glLoadMatrixf"); + glad_glLoadName = (PFNGLLOADNAMEPROC) load(userptr, "glLoadName"); + glad_glLogicOp = (PFNGLLOGICOPPROC) load(userptr, "glLogicOp"); + glad_glMap1d = (PFNGLMAP1DPROC) load(userptr, "glMap1d"); + glad_glMap1f = (PFNGLMAP1FPROC) load(userptr, "glMap1f"); + glad_glMap2d = (PFNGLMAP2DPROC) load(userptr, "glMap2d"); + glad_glMap2f = (PFNGLMAP2FPROC) load(userptr, "glMap2f"); + glad_glMapGrid1d = (PFNGLMAPGRID1DPROC) load(userptr, "glMapGrid1d"); + glad_glMapGrid1f = (PFNGLMAPGRID1FPROC) load(userptr, "glMapGrid1f"); + glad_glMapGrid2d = (PFNGLMAPGRID2DPROC) load(userptr, "glMapGrid2d"); + glad_glMapGrid2f = (PFNGLMAPGRID2FPROC) load(userptr, "glMapGrid2f"); + glad_glMaterialf = (PFNGLMATERIALFPROC) load(userptr, "glMaterialf"); + glad_glMaterialfv = (PFNGLMATERIALFVPROC) load(userptr, "glMaterialfv"); + glad_glMateriali = (PFNGLMATERIALIPROC) load(userptr, "glMateriali"); + glad_glMaterialiv = (PFNGLMATERIALIVPROC) load(userptr, "glMaterialiv"); + glad_glMatrixMode = (PFNGLMATRIXMODEPROC) load(userptr, "glMatrixMode"); + glad_glMultMatrixd = (PFNGLMULTMATRIXDPROC) load(userptr, "glMultMatrixd"); + glad_glMultMatrixf = (PFNGLMULTMATRIXFPROC) load(userptr, "glMultMatrixf"); + glad_glNewList = (PFNGLNEWLISTPROC) load(userptr, "glNewList"); + glad_glNormal3b = (PFNGLNORMAL3BPROC) load(userptr, "glNormal3b"); + glad_glNormal3bv = (PFNGLNORMAL3BVPROC) load(userptr, "glNormal3bv"); + glad_glNormal3d = (PFNGLNORMAL3DPROC) load(userptr, "glNormal3d"); + glad_glNormal3dv = (PFNGLNORMAL3DVPROC) load(userptr, "glNormal3dv"); + glad_glNormal3f = (PFNGLNORMAL3FPROC) load(userptr, "glNormal3f"); + glad_glNormal3fv = (PFNGLNORMAL3FVPROC) load(userptr, "glNormal3fv"); + glad_glNormal3i = (PFNGLNORMAL3IPROC) load(userptr, "glNormal3i"); + glad_glNormal3iv = (PFNGLNORMAL3IVPROC) load(userptr, "glNormal3iv"); + glad_glNormal3s = (PFNGLNORMAL3SPROC) load(userptr, 
"glNormal3s"); + glad_glNormal3sv = (PFNGLNORMAL3SVPROC) load(userptr, "glNormal3sv"); + glad_glOrtho = (PFNGLORTHOPROC) load(userptr, "glOrtho"); + glad_glPassThrough = (PFNGLPASSTHROUGHPROC) load(userptr, "glPassThrough"); + glad_glPixelMapfv = (PFNGLPIXELMAPFVPROC) load(userptr, "glPixelMapfv"); + glad_glPixelMapuiv = (PFNGLPIXELMAPUIVPROC) load(userptr, "glPixelMapuiv"); + glad_glPixelMapusv = (PFNGLPIXELMAPUSVPROC) load(userptr, "glPixelMapusv"); + glad_glPixelStoref = (PFNGLPIXELSTOREFPROC) load(userptr, "glPixelStoref"); + glad_glPixelStorei = (PFNGLPIXELSTOREIPROC) load(userptr, "glPixelStorei"); + glad_glPixelTransferf = (PFNGLPIXELTRANSFERFPROC) load(userptr, "glPixelTransferf"); + glad_glPixelTransferi = (PFNGLPIXELTRANSFERIPROC) load(userptr, "glPixelTransferi"); + glad_glPixelZoom = (PFNGLPIXELZOOMPROC) load(userptr, "glPixelZoom"); + glad_glPointSize = (PFNGLPOINTSIZEPROC) load(userptr, "glPointSize"); + glad_glPolygonMode = (PFNGLPOLYGONMODEPROC) load(userptr, "glPolygonMode"); + glad_glPolygonStipple = (PFNGLPOLYGONSTIPPLEPROC) load(userptr, "glPolygonStipple"); + glad_glPopAttrib = (PFNGLPOPATTRIBPROC) load(userptr, "glPopAttrib"); + glad_glPopMatrix = (PFNGLPOPMATRIXPROC) load(userptr, "glPopMatrix"); + glad_glPopName = (PFNGLPOPNAMEPROC) load(userptr, "glPopName"); + glad_glPushAttrib = (PFNGLPUSHATTRIBPROC) load(userptr, "glPushAttrib"); + glad_glPushMatrix = (PFNGLPUSHMATRIXPROC) load(userptr, "glPushMatrix"); + glad_glPushName = (PFNGLPUSHNAMEPROC) load(userptr, "glPushName"); + glad_glRasterPos2d = (PFNGLRASTERPOS2DPROC) load(userptr, "glRasterPos2d"); + glad_glRasterPos2dv = (PFNGLRASTERPOS2DVPROC) load(userptr, "glRasterPos2dv"); + glad_glRasterPos2f = (PFNGLRASTERPOS2FPROC) load(userptr, "glRasterPos2f"); + glad_glRasterPos2fv = (PFNGLRASTERPOS2FVPROC) load(userptr, "glRasterPos2fv"); + glad_glRasterPos2i = (PFNGLRASTERPOS2IPROC) load(userptr, "glRasterPos2i"); + glad_glRasterPos2iv = (PFNGLRASTERPOS2IVPROC) load(userptr, "glRasterPos2iv"); + glad_glRasterPos2s = (PFNGLRASTERPOS2SPROC) load(userptr, "glRasterPos2s"); + glad_glRasterPos2sv = (PFNGLRASTERPOS2SVPROC) load(userptr, "glRasterPos2sv"); + glad_glRasterPos3d = (PFNGLRASTERPOS3DPROC) load(userptr, "glRasterPos3d"); + glad_glRasterPos3dv = (PFNGLRASTERPOS3DVPROC) load(userptr, "glRasterPos3dv"); + glad_glRasterPos3f = (PFNGLRASTERPOS3FPROC) load(userptr, "glRasterPos3f"); + glad_glRasterPos3fv = (PFNGLRASTERPOS3FVPROC) load(userptr, "glRasterPos3fv"); + glad_glRasterPos3i = (PFNGLRASTERPOS3IPROC) load(userptr, "glRasterPos3i"); + glad_glRasterPos3iv = (PFNGLRASTERPOS3IVPROC) load(userptr, "glRasterPos3iv"); + glad_glRasterPos3s = (PFNGLRASTERPOS3SPROC) load(userptr, "glRasterPos3s"); + glad_glRasterPos3sv = (PFNGLRASTERPOS3SVPROC) load(userptr, "glRasterPos3sv"); + glad_glRasterPos4d = (PFNGLRASTERPOS4DPROC) load(userptr, "glRasterPos4d"); + glad_glRasterPos4dv = (PFNGLRASTERPOS4DVPROC) load(userptr, "glRasterPos4dv"); + glad_glRasterPos4f = (PFNGLRASTERPOS4FPROC) load(userptr, "glRasterPos4f"); + glad_glRasterPos4fv = (PFNGLRASTERPOS4FVPROC) load(userptr, "glRasterPos4fv"); + glad_glRasterPos4i = (PFNGLRASTERPOS4IPROC) load(userptr, "glRasterPos4i"); + glad_glRasterPos4iv = (PFNGLRASTERPOS4IVPROC) load(userptr, "glRasterPos4iv"); + glad_glRasterPos4s = (PFNGLRASTERPOS4SPROC) load(userptr, "glRasterPos4s"); + glad_glRasterPos4sv = (PFNGLRASTERPOS4SVPROC) load(userptr, "glRasterPos4sv"); + glad_glReadBuffer = (PFNGLREADBUFFERPROC) load(userptr, "glReadBuffer"); + glad_glReadPixels = (PFNGLREADPIXELSPROC) 
load(userptr, "glReadPixels"); + glad_glRectd = (PFNGLRECTDPROC) load(userptr, "glRectd"); + glad_glRectdv = (PFNGLRECTDVPROC) load(userptr, "glRectdv"); + glad_glRectf = (PFNGLRECTFPROC) load(userptr, "glRectf"); + glad_glRectfv = (PFNGLRECTFVPROC) load(userptr, "glRectfv"); + glad_glRecti = (PFNGLRECTIPROC) load(userptr, "glRecti"); + glad_glRectiv = (PFNGLRECTIVPROC) load(userptr, "glRectiv"); + glad_glRects = (PFNGLRECTSPROC) load(userptr, "glRects"); + glad_glRectsv = (PFNGLRECTSVPROC) load(userptr, "glRectsv"); + glad_glRenderMode = (PFNGLRENDERMODEPROC) load(userptr, "glRenderMode"); + glad_glRotated = (PFNGLROTATEDPROC) load(userptr, "glRotated"); + glad_glRotatef = (PFNGLROTATEFPROC) load(userptr, "glRotatef"); + glad_glScaled = (PFNGLSCALEDPROC) load(userptr, "glScaled"); + glad_glScalef = (PFNGLSCALEFPROC) load(userptr, "glScalef"); + glad_glScissor = (PFNGLSCISSORPROC) load(userptr, "glScissor"); + glad_glSelectBuffer = (PFNGLSELECTBUFFERPROC) load(userptr, "glSelectBuffer"); + glad_glShadeModel = (PFNGLSHADEMODELPROC) load(userptr, "glShadeModel"); + glad_glStencilFunc = (PFNGLSTENCILFUNCPROC) load(userptr, "glStencilFunc"); + glad_glStencilMask = (PFNGLSTENCILMASKPROC) load(userptr, "glStencilMask"); + glad_glStencilOp = (PFNGLSTENCILOPPROC) load(userptr, "glStencilOp"); + glad_glTexCoord1d = (PFNGLTEXCOORD1DPROC) load(userptr, "glTexCoord1d"); + glad_glTexCoord1dv = (PFNGLTEXCOORD1DVPROC) load(userptr, "glTexCoord1dv"); + glad_glTexCoord1f = (PFNGLTEXCOORD1FPROC) load(userptr, "glTexCoord1f"); + glad_glTexCoord1fv = (PFNGLTEXCOORD1FVPROC) load(userptr, "glTexCoord1fv"); + glad_glTexCoord1i = (PFNGLTEXCOORD1IPROC) load(userptr, "glTexCoord1i"); + glad_glTexCoord1iv = (PFNGLTEXCOORD1IVPROC) load(userptr, "glTexCoord1iv"); + glad_glTexCoord1s = (PFNGLTEXCOORD1SPROC) load(userptr, "glTexCoord1s"); + glad_glTexCoord1sv = (PFNGLTEXCOORD1SVPROC) load(userptr, "glTexCoord1sv"); + glad_glTexCoord2d = (PFNGLTEXCOORD2DPROC) load(userptr, "glTexCoord2d"); + glad_glTexCoord2dv = (PFNGLTEXCOORD2DVPROC) load(userptr, "glTexCoord2dv"); + glad_glTexCoord2f = (PFNGLTEXCOORD2FPROC) load(userptr, "glTexCoord2f"); + glad_glTexCoord2fv = (PFNGLTEXCOORD2FVPROC) load(userptr, "glTexCoord2fv"); + glad_glTexCoord2i = (PFNGLTEXCOORD2IPROC) load(userptr, "glTexCoord2i"); + glad_glTexCoord2iv = (PFNGLTEXCOORD2IVPROC) load(userptr, "glTexCoord2iv"); + glad_glTexCoord2s = (PFNGLTEXCOORD2SPROC) load(userptr, "glTexCoord2s"); + glad_glTexCoord2sv = (PFNGLTEXCOORD2SVPROC) load(userptr, "glTexCoord2sv"); + glad_glTexCoord3d = (PFNGLTEXCOORD3DPROC) load(userptr, "glTexCoord3d"); + glad_glTexCoord3dv = (PFNGLTEXCOORD3DVPROC) load(userptr, "glTexCoord3dv"); + glad_glTexCoord3f = (PFNGLTEXCOORD3FPROC) load(userptr, "glTexCoord3f"); + glad_glTexCoord3fv = (PFNGLTEXCOORD3FVPROC) load(userptr, "glTexCoord3fv"); + glad_glTexCoord3i = (PFNGLTEXCOORD3IPROC) load(userptr, "glTexCoord3i"); + glad_glTexCoord3iv = (PFNGLTEXCOORD3IVPROC) load(userptr, "glTexCoord3iv"); + glad_glTexCoord3s = (PFNGLTEXCOORD3SPROC) load(userptr, "glTexCoord3s"); + glad_glTexCoord3sv = (PFNGLTEXCOORD3SVPROC) load(userptr, "glTexCoord3sv"); + glad_glTexCoord4d = (PFNGLTEXCOORD4DPROC) load(userptr, "glTexCoord4d"); + glad_glTexCoord4dv = (PFNGLTEXCOORD4DVPROC) load(userptr, "glTexCoord4dv"); + glad_glTexCoord4f = (PFNGLTEXCOORD4FPROC) load(userptr, "glTexCoord4f"); + glad_glTexCoord4fv = (PFNGLTEXCOORD4FVPROC) load(userptr, "glTexCoord4fv"); + glad_glTexCoord4i = (PFNGLTEXCOORD4IPROC) load(userptr, "glTexCoord4i"); + glad_glTexCoord4iv = 
(PFNGLTEXCOORD4IVPROC) load(userptr, "glTexCoord4iv"); + glad_glTexCoord4s = (PFNGLTEXCOORD4SPROC) load(userptr, "glTexCoord4s"); + glad_glTexCoord4sv = (PFNGLTEXCOORD4SVPROC) load(userptr, "glTexCoord4sv"); + glad_glTexEnvf = (PFNGLTEXENVFPROC) load(userptr, "glTexEnvf"); + glad_glTexEnvfv = (PFNGLTEXENVFVPROC) load(userptr, "glTexEnvfv"); + glad_glTexEnvi = (PFNGLTEXENVIPROC) load(userptr, "glTexEnvi"); + glad_glTexEnviv = (PFNGLTEXENVIVPROC) load(userptr, "glTexEnviv"); + glad_glTexGend = (PFNGLTEXGENDPROC) load(userptr, "glTexGend"); + glad_glTexGendv = (PFNGLTEXGENDVPROC) load(userptr, "glTexGendv"); + glad_glTexGenf = (PFNGLTEXGENFPROC) load(userptr, "glTexGenf"); + glad_glTexGenfv = (PFNGLTEXGENFVPROC) load(userptr, "glTexGenfv"); + glad_glTexGeni = (PFNGLTEXGENIPROC) load(userptr, "glTexGeni"); + glad_glTexGeniv = (PFNGLTEXGENIVPROC) load(userptr, "glTexGeniv"); + glad_glTexImage1D = (PFNGLTEXIMAGE1DPROC) load(userptr, "glTexImage1D"); + glad_glTexImage2D = (PFNGLTEXIMAGE2DPROC) load(userptr, "glTexImage2D"); + glad_glTexParameterf = (PFNGLTEXPARAMETERFPROC) load(userptr, "glTexParameterf"); + glad_glTexParameterfv = (PFNGLTEXPARAMETERFVPROC) load(userptr, "glTexParameterfv"); + glad_glTexParameteri = (PFNGLTEXPARAMETERIPROC) load(userptr, "glTexParameteri"); + glad_glTexParameteriv = (PFNGLTEXPARAMETERIVPROC) load(userptr, "glTexParameteriv"); + glad_glTranslated = (PFNGLTRANSLATEDPROC) load(userptr, "glTranslated"); + glad_glTranslatef = (PFNGLTRANSLATEFPROC) load(userptr, "glTranslatef"); + glad_glVertex2d = (PFNGLVERTEX2DPROC) load(userptr, "glVertex2d"); + glad_glVertex2dv = (PFNGLVERTEX2DVPROC) load(userptr, "glVertex2dv"); + glad_glVertex2f = (PFNGLVERTEX2FPROC) load(userptr, "glVertex2f"); + glad_glVertex2fv = (PFNGLVERTEX2FVPROC) load(userptr, "glVertex2fv"); + glad_glVertex2i = (PFNGLVERTEX2IPROC) load(userptr, "glVertex2i"); + glad_glVertex2iv = (PFNGLVERTEX2IVPROC) load(userptr, "glVertex2iv"); + glad_glVertex2s = (PFNGLVERTEX2SPROC) load(userptr, "glVertex2s"); + glad_glVertex2sv = (PFNGLVERTEX2SVPROC) load(userptr, "glVertex2sv"); + glad_glVertex3d = (PFNGLVERTEX3DPROC) load(userptr, "glVertex3d"); + glad_glVertex3dv = (PFNGLVERTEX3DVPROC) load(userptr, "glVertex3dv"); + glad_glVertex3f = (PFNGLVERTEX3FPROC) load(userptr, "glVertex3f"); + glad_glVertex3fv = (PFNGLVERTEX3FVPROC) load(userptr, "glVertex3fv"); + glad_glVertex3i = (PFNGLVERTEX3IPROC) load(userptr, "glVertex3i"); + glad_glVertex3iv = (PFNGLVERTEX3IVPROC) load(userptr, "glVertex3iv"); + glad_glVertex3s = (PFNGLVERTEX3SPROC) load(userptr, "glVertex3s"); + glad_glVertex3sv = (PFNGLVERTEX3SVPROC) load(userptr, "glVertex3sv"); + glad_glVertex4d = (PFNGLVERTEX4DPROC) load(userptr, "glVertex4d"); + glad_glVertex4dv = (PFNGLVERTEX4DVPROC) load(userptr, "glVertex4dv"); + glad_glVertex4f = (PFNGLVERTEX4FPROC) load(userptr, "glVertex4f"); + glad_glVertex4fv = (PFNGLVERTEX4FVPROC) load(userptr, "glVertex4fv"); + glad_glVertex4i = (PFNGLVERTEX4IPROC) load(userptr, "glVertex4i"); + glad_glVertex4iv = (PFNGLVERTEX4IVPROC) load(userptr, "glVertex4iv"); + glad_glVertex4s = (PFNGLVERTEX4SPROC) load(userptr, "glVertex4s"); + glad_glVertex4sv = (PFNGLVERTEX4SVPROC) load(userptr, "glVertex4sv"); + glad_glViewport = (PFNGLVIEWPORTPROC) load(userptr, "glViewport"); +} +static void glad_gl_load_GL_VERSION_1_1( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_1_1) return; + glad_glAreTexturesResident = (PFNGLARETEXTURESRESIDENTPROC) load(userptr, "glAreTexturesResident"); + glad_glArrayElement = 
(PFNGLARRAYELEMENTPROC) load(userptr, "glArrayElement"); + glad_glBindTexture = (PFNGLBINDTEXTUREPROC) load(userptr, "glBindTexture"); + glad_glColorPointer = (PFNGLCOLORPOINTERPROC) load(userptr, "glColorPointer"); + glad_glCopyTexImage1D = (PFNGLCOPYTEXIMAGE1DPROC) load(userptr, "glCopyTexImage1D"); + glad_glCopyTexImage2D = (PFNGLCOPYTEXIMAGE2DPROC) load(userptr, "glCopyTexImage2D"); + glad_glCopyTexSubImage1D = (PFNGLCOPYTEXSUBIMAGE1DPROC) load(userptr, "glCopyTexSubImage1D"); + glad_glCopyTexSubImage2D = (PFNGLCOPYTEXSUBIMAGE2DPROC) load(userptr, "glCopyTexSubImage2D"); + glad_glDeleteTextures = (PFNGLDELETETEXTURESPROC) load(userptr, "glDeleteTextures"); + glad_glDisableClientState = (PFNGLDISABLECLIENTSTATEPROC) load(userptr, "glDisableClientState"); + glad_glDrawArrays = (PFNGLDRAWARRAYSPROC) load(userptr, "glDrawArrays"); + glad_glDrawElements = (PFNGLDRAWELEMENTSPROC) load(userptr, "glDrawElements"); + glad_glEdgeFlagPointer = (PFNGLEDGEFLAGPOINTERPROC) load(userptr, "glEdgeFlagPointer"); + glad_glEnableClientState = (PFNGLENABLECLIENTSTATEPROC) load(userptr, "glEnableClientState"); + glad_glGenTextures = (PFNGLGENTEXTURESPROC) load(userptr, "glGenTextures"); + glad_glGetPointerv = (PFNGLGETPOINTERVPROC) load(userptr, "glGetPointerv"); + glad_glIndexPointer = (PFNGLINDEXPOINTERPROC) load(userptr, "glIndexPointer"); + glad_glIndexub = (PFNGLINDEXUBPROC) load(userptr, "glIndexub"); + glad_glIndexubv = (PFNGLINDEXUBVPROC) load(userptr, "glIndexubv"); + glad_glInterleavedArrays = (PFNGLINTERLEAVEDARRAYSPROC) load(userptr, "glInterleavedArrays"); + glad_glIsTexture = (PFNGLISTEXTUREPROC) load(userptr, "glIsTexture"); + glad_glNormalPointer = (PFNGLNORMALPOINTERPROC) load(userptr, "glNormalPointer"); + glad_glPolygonOffset = (PFNGLPOLYGONOFFSETPROC) load(userptr, "glPolygonOffset"); + glad_glPopClientAttrib = (PFNGLPOPCLIENTATTRIBPROC) load(userptr, "glPopClientAttrib"); + glad_glPrioritizeTextures = (PFNGLPRIORITIZETEXTURESPROC) load(userptr, "glPrioritizeTextures"); + glad_glPushClientAttrib = (PFNGLPUSHCLIENTATTRIBPROC) load(userptr, "glPushClientAttrib"); + glad_glTexCoordPointer = (PFNGLTEXCOORDPOINTERPROC) load(userptr, "glTexCoordPointer"); + glad_glTexSubImage1D = (PFNGLTEXSUBIMAGE1DPROC) load(userptr, "glTexSubImage1D"); + glad_glTexSubImage2D = (PFNGLTEXSUBIMAGE2DPROC) load(userptr, "glTexSubImage2D"); + glad_glVertexPointer = (PFNGLVERTEXPOINTERPROC) load(userptr, "glVertexPointer"); +} +static void glad_gl_load_GL_VERSION_1_2( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_1_2) return; + glad_glCopyTexSubImage3D = (PFNGLCOPYTEXSUBIMAGE3DPROC) load(userptr, "glCopyTexSubImage3D"); + glad_glDrawRangeElements = (PFNGLDRAWRANGEELEMENTSPROC) load(userptr, "glDrawRangeElements"); + glad_glTexImage3D = (PFNGLTEXIMAGE3DPROC) load(userptr, "glTexImage3D"); + glad_glTexSubImage3D = (PFNGLTEXSUBIMAGE3DPROC) load(userptr, "glTexSubImage3D"); +} +static void glad_gl_load_GL_VERSION_1_3( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_1_3) return; + glad_glActiveTexture = (PFNGLACTIVETEXTUREPROC) load(userptr, "glActiveTexture"); + glad_glClientActiveTexture = (PFNGLCLIENTACTIVETEXTUREPROC) load(userptr, "glClientActiveTexture"); + glad_glCompressedTexImage1D = (PFNGLCOMPRESSEDTEXIMAGE1DPROC) load(userptr, "glCompressedTexImage1D"); + glad_glCompressedTexImage2D = (PFNGLCOMPRESSEDTEXIMAGE2DPROC) load(userptr, "glCompressedTexImage2D"); + glad_glCompressedTexImage3D = (PFNGLCOMPRESSEDTEXIMAGE3DPROC) load(userptr, "glCompressedTexImage3D"); + 
glad_glCompressedTexSubImage1D = (PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC) load(userptr, "glCompressedTexSubImage1D"); + glad_glCompressedTexSubImage2D = (PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC) load(userptr, "glCompressedTexSubImage2D"); + glad_glCompressedTexSubImage3D = (PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC) load(userptr, "glCompressedTexSubImage3D"); + glad_glGetCompressedTexImage = (PFNGLGETCOMPRESSEDTEXIMAGEPROC) load(userptr, "glGetCompressedTexImage"); + glad_glLoadTransposeMatrixd = (PFNGLLOADTRANSPOSEMATRIXDPROC) load(userptr, "glLoadTransposeMatrixd"); + glad_glLoadTransposeMatrixf = (PFNGLLOADTRANSPOSEMATRIXFPROC) load(userptr, "glLoadTransposeMatrixf"); + glad_glMultTransposeMatrixd = (PFNGLMULTTRANSPOSEMATRIXDPROC) load(userptr, "glMultTransposeMatrixd"); + glad_glMultTransposeMatrixf = (PFNGLMULTTRANSPOSEMATRIXFPROC) load(userptr, "glMultTransposeMatrixf"); + glad_glMultiTexCoord1d = (PFNGLMULTITEXCOORD1DPROC) load(userptr, "glMultiTexCoord1d"); + glad_glMultiTexCoord1dv = (PFNGLMULTITEXCOORD1DVPROC) load(userptr, "glMultiTexCoord1dv"); + glad_glMultiTexCoord1f = (PFNGLMULTITEXCOORD1FPROC) load(userptr, "glMultiTexCoord1f"); + glad_glMultiTexCoord1fv = (PFNGLMULTITEXCOORD1FVPROC) load(userptr, "glMultiTexCoord1fv"); + glad_glMultiTexCoord1i = (PFNGLMULTITEXCOORD1IPROC) load(userptr, "glMultiTexCoord1i"); + glad_glMultiTexCoord1iv = (PFNGLMULTITEXCOORD1IVPROC) load(userptr, "glMultiTexCoord1iv"); + glad_glMultiTexCoord1s = (PFNGLMULTITEXCOORD1SPROC) load(userptr, "glMultiTexCoord1s"); + glad_glMultiTexCoord1sv = (PFNGLMULTITEXCOORD1SVPROC) load(userptr, "glMultiTexCoord1sv"); + glad_glMultiTexCoord2d = (PFNGLMULTITEXCOORD2DPROC) load(userptr, "glMultiTexCoord2d"); + glad_glMultiTexCoord2dv = (PFNGLMULTITEXCOORD2DVPROC) load(userptr, "glMultiTexCoord2dv"); + glad_glMultiTexCoord2f = (PFNGLMULTITEXCOORD2FPROC) load(userptr, "glMultiTexCoord2f"); + glad_glMultiTexCoord2fv = (PFNGLMULTITEXCOORD2FVPROC) load(userptr, "glMultiTexCoord2fv"); + glad_glMultiTexCoord2i = (PFNGLMULTITEXCOORD2IPROC) load(userptr, "glMultiTexCoord2i"); + glad_glMultiTexCoord2iv = (PFNGLMULTITEXCOORD2IVPROC) load(userptr, "glMultiTexCoord2iv"); + glad_glMultiTexCoord2s = (PFNGLMULTITEXCOORD2SPROC) load(userptr, "glMultiTexCoord2s"); + glad_glMultiTexCoord2sv = (PFNGLMULTITEXCOORD2SVPROC) load(userptr, "glMultiTexCoord2sv"); + glad_glMultiTexCoord3d = (PFNGLMULTITEXCOORD3DPROC) load(userptr, "glMultiTexCoord3d"); + glad_glMultiTexCoord3dv = (PFNGLMULTITEXCOORD3DVPROC) load(userptr, "glMultiTexCoord3dv"); + glad_glMultiTexCoord3f = (PFNGLMULTITEXCOORD3FPROC) load(userptr, "glMultiTexCoord3f"); + glad_glMultiTexCoord3fv = (PFNGLMULTITEXCOORD3FVPROC) load(userptr, "glMultiTexCoord3fv"); + glad_glMultiTexCoord3i = (PFNGLMULTITEXCOORD3IPROC) load(userptr, "glMultiTexCoord3i"); + glad_glMultiTexCoord3iv = (PFNGLMULTITEXCOORD3IVPROC) load(userptr, "glMultiTexCoord3iv"); + glad_glMultiTexCoord3s = (PFNGLMULTITEXCOORD3SPROC) load(userptr, "glMultiTexCoord3s"); + glad_glMultiTexCoord3sv = (PFNGLMULTITEXCOORD3SVPROC) load(userptr, "glMultiTexCoord3sv"); + glad_glMultiTexCoord4d = (PFNGLMULTITEXCOORD4DPROC) load(userptr, "glMultiTexCoord4d"); + glad_glMultiTexCoord4dv = (PFNGLMULTITEXCOORD4DVPROC) load(userptr, "glMultiTexCoord4dv"); + glad_glMultiTexCoord4f = (PFNGLMULTITEXCOORD4FPROC) load(userptr, "glMultiTexCoord4f"); + glad_glMultiTexCoord4fv = (PFNGLMULTITEXCOORD4FVPROC) load(userptr, "glMultiTexCoord4fv"); + glad_glMultiTexCoord4i = (PFNGLMULTITEXCOORD4IPROC) load(userptr, "glMultiTexCoord4i"); + glad_glMultiTexCoord4iv = 
(PFNGLMULTITEXCOORD4IVPROC) load(userptr, "glMultiTexCoord4iv"); + glad_glMultiTexCoord4s = (PFNGLMULTITEXCOORD4SPROC) load(userptr, "glMultiTexCoord4s"); + glad_glMultiTexCoord4sv = (PFNGLMULTITEXCOORD4SVPROC) load(userptr, "glMultiTexCoord4sv"); + glad_glSampleCoverage = (PFNGLSAMPLECOVERAGEPROC) load(userptr, "glSampleCoverage"); +} +static void glad_gl_load_GL_VERSION_1_4( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_1_4) return; + glad_glBlendColor = (PFNGLBLENDCOLORPROC) load(userptr, "glBlendColor"); + glad_glBlendEquation = (PFNGLBLENDEQUATIONPROC) load(userptr, "glBlendEquation"); + glad_glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEPROC) load(userptr, "glBlendFuncSeparate"); + glad_glFogCoordPointer = (PFNGLFOGCOORDPOINTERPROC) load(userptr, "glFogCoordPointer"); + glad_glFogCoordd = (PFNGLFOGCOORDDPROC) load(userptr, "glFogCoordd"); + glad_glFogCoorddv = (PFNGLFOGCOORDDVPROC) load(userptr, "glFogCoorddv"); + glad_glFogCoordf = (PFNGLFOGCOORDFPROC) load(userptr, "glFogCoordf"); + glad_glFogCoordfv = (PFNGLFOGCOORDFVPROC) load(userptr, "glFogCoordfv"); + glad_glMultiDrawArrays = (PFNGLMULTIDRAWARRAYSPROC) load(userptr, "glMultiDrawArrays"); + glad_glMultiDrawElements = (PFNGLMULTIDRAWELEMENTSPROC) load(userptr, "glMultiDrawElements"); + glad_glPointParameterf = (PFNGLPOINTPARAMETERFPROC) load(userptr, "glPointParameterf"); + glad_glPointParameterfv = (PFNGLPOINTPARAMETERFVPROC) load(userptr, "glPointParameterfv"); + glad_glPointParameteri = (PFNGLPOINTPARAMETERIPROC) load(userptr, "glPointParameteri"); + glad_glPointParameteriv = (PFNGLPOINTPARAMETERIVPROC) load(userptr, "glPointParameteriv"); + glad_glSecondaryColor3b = (PFNGLSECONDARYCOLOR3BPROC) load(userptr, "glSecondaryColor3b"); + glad_glSecondaryColor3bv = (PFNGLSECONDARYCOLOR3BVPROC) load(userptr, "glSecondaryColor3bv"); + glad_glSecondaryColor3d = (PFNGLSECONDARYCOLOR3DPROC) load(userptr, "glSecondaryColor3d"); + glad_glSecondaryColor3dv = (PFNGLSECONDARYCOLOR3DVPROC) load(userptr, "glSecondaryColor3dv"); + glad_glSecondaryColor3f = (PFNGLSECONDARYCOLOR3FPROC) load(userptr, "glSecondaryColor3f"); + glad_glSecondaryColor3fv = (PFNGLSECONDARYCOLOR3FVPROC) load(userptr, "glSecondaryColor3fv"); + glad_glSecondaryColor3i = (PFNGLSECONDARYCOLOR3IPROC) load(userptr, "glSecondaryColor3i"); + glad_glSecondaryColor3iv = (PFNGLSECONDARYCOLOR3IVPROC) load(userptr, "glSecondaryColor3iv"); + glad_glSecondaryColor3s = (PFNGLSECONDARYCOLOR3SPROC) load(userptr, "glSecondaryColor3s"); + glad_glSecondaryColor3sv = (PFNGLSECONDARYCOLOR3SVPROC) load(userptr, "glSecondaryColor3sv"); + glad_glSecondaryColor3ub = (PFNGLSECONDARYCOLOR3UBPROC) load(userptr, "glSecondaryColor3ub"); + glad_glSecondaryColor3ubv = (PFNGLSECONDARYCOLOR3UBVPROC) load(userptr, "glSecondaryColor3ubv"); + glad_glSecondaryColor3ui = (PFNGLSECONDARYCOLOR3UIPROC) load(userptr, "glSecondaryColor3ui"); + glad_glSecondaryColor3uiv = (PFNGLSECONDARYCOLOR3UIVPROC) load(userptr, "glSecondaryColor3uiv"); + glad_glSecondaryColor3us = (PFNGLSECONDARYCOLOR3USPROC) load(userptr, "glSecondaryColor3us"); + glad_glSecondaryColor3usv = (PFNGLSECONDARYCOLOR3USVPROC) load(userptr, "glSecondaryColor3usv"); + glad_glSecondaryColorPointer = (PFNGLSECONDARYCOLORPOINTERPROC) load(userptr, "glSecondaryColorPointer"); + glad_glWindowPos2d = (PFNGLWINDOWPOS2DPROC) load(userptr, "glWindowPos2d"); + glad_glWindowPos2dv = (PFNGLWINDOWPOS2DVPROC) load(userptr, "glWindowPos2dv"); + glad_glWindowPos2f = (PFNGLWINDOWPOS2FPROC) load(userptr, "glWindowPos2f"); + glad_glWindowPos2fv = 
(PFNGLWINDOWPOS2FVPROC) load(userptr, "glWindowPos2fv"); + glad_glWindowPos2i = (PFNGLWINDOWPOS2IPROC) load(userptr, "glWindowPos2i"); + glad_glWindowPos2iv = (PFNGLWINDOWPOS2IVPROC) load(userptr, "glWindowPos2iv"); + glad_glWindowPos2s = (PFNGLWINDOWPOS2SPROC) load(userptr, "glWindowPos2s"); + glad_glWindowPos2sv = (PFNGLWINDOWPOS2SVPROC) load(userptr, "glWindowPos2sv"); + glad_glWindowPos3d = (PFNGLWINDOWPOS3DPROC) load(userptr, "glWindowPos3d"); + glad_glWindowPos3dv = (PFNGLWINDOWPOS3DVPROC) load(userptr, "glWindowPos3dv"); + glad_glWindowPos3f = (PFNGLWINDOWPOS3FPROC) load(userptr, "glWindowPos3f"); + glad_glWindowPos3fv = (PFNGLWINDOWPOS3FVPROC) load(userptr, "glWindowPos3fv"); + glad_glWindowPos3i = (PFNGLWINDOWPOS3IPROC) load(userptr, "glWindowPos3i"); + glad_glWindowPos3iv = (PFNGLWINDOWPOS3IVPROC) load(userptr, "glWindowPos3iv"); + glad_glWindowPos3s = (PFNGLWINDOWPOS3SPROC) load(userptr, "glWindowPos3s"); + glad_glWindowPos3sv = (PFNGLWINDOWPOS3SVPROC) load(userptr, "glWindowPos3sv"); +} +static void glad_gl_load_GL_VERSION_1_5( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_1_5) return; + glad_glBeginQuery = (PFNGLBEGINQUERYPROC) load(userptr, "glBeginQuery"); + glad_glBindBuffer = (PFNGLBINDBUFFERPROC) load(userptr, "glBindBuffer"); + glad_glBufferData = (PFNGLBUFFERDATAPROC) load(userptr, "glBufferData"); + glad_glBufferSubData = (PFNGLBUFFERSUBDATAPROC) load(userptr, "glBufferSubData"); + glad_glDeleteBuffers = (PFNGLDELETEBUFFERSPROC) load(userptr, "glDeleteBuffers"); + glad_glDeleteQueries = (PFNGLDELETEQUERIESPROC) load(userptr, "glDeleteQueries"); + glad_glEndQuery = (PFNGLENDQUERYPROC) load(userptr, "glEndQuery"); + glad_glGenBuffers = (PFNGLGENBUFFERSPROC) load(userptr, "glGenBuffers"); + glad_glGenQueries = (PFNGLGENQUERIESPROC) load(userptr, "glGenQueries"); + glad_glGetBufferParameteriv = (PFNGLGETBUFFERPARAMETERIVPROC) load(userptr, "glGetBufferParameteriv"); + glad_glGetBufferPointerv = (PFNGLGETBUFFERPOINTERVPROC) load(userptr, "glGetBufferPointerv"); + glad_glGetBufferSubData = (PFNGLGETBUFFERSUBDATAPROC) load(userptr, "glGetBufferSubData"); + glad_glGetQueryObjectiv = (PFNGLGETQUERYOBJECTIVPROC) load(userptr, "glGetQueryObjectiv"); + glad_glGetQueryObjectuiv = (PFNGLGETQUERYOBJECTUIVPROC) load(userptr, "glGetQueryObjectuiv"); + glad_glGetQueryiv = (PFNGLGETQUERYIVPROC) load(userptr, "glGetQueryiv"); + glad_glIsBuffer = (PFNGLISBUFFERPROC) load(userptr, "glIsBuffer"); + glad_glIsQuery = (PFNGLISQUERYPROC) load(userptr, "glIsQuery"); + glad_glMapBuffer = (PFNGLMAPBUFFERPROC) load(userptr, "glMapBuffer"); + glad_glUnmapBuffer = (PFNGLUNMAPBUFFERPROC) load(userptr, "glUnmapBuffer"); +} +static void glad_gl_load_GL_VERSION_2_0( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_2_0) return; + glad_glAttachShader = (PFNGLATTACHSHADERPROC) load(userptr, "glAttachShader"); + glad_glBindAttribLocation = (PFNGLBINDATTRIBLOCATIONPROC) load(userptr, "glBindAttribLocation"); + glad_glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEPROC) load(userptr, "glBlendEquationSeparate"); + glad_glCompileShader = (PFNGLCOMPILESHADERPROC) load(userptr, "glCompileShader"); + glad_glCreateProgram = (PFNGLCREATEPROGRAMPROC) load(userptr, "glCreateProgram"); + glad_glCreateShader = (PFNGLCREATESHADERPROC) load(userptr, "glCreateShader"); + glad_glDeleteProgram = (PFNGLDELETEPROGRAMPROC) load(userptr, "glDeleteProgram"); + glad_glDeleteShader = (PFNGLDELETESHADERPROC) load(userptr, "glDeleteShader"); + glad_glDetachShader = (PFNGLDETACHSHADERPROC) 
load(userptr, "glDetachShader"); + glad_glDisableVertexAttribArray = (PFNGLDISABLEVERTEXATTRIBARRAYPROC) load(userptr, "glDisableVertexAttribArray"); + glad_glDrawBuffers = (PFNGLDRAWBUFFERSPROC) load(userptr, "glDrawBuffers"); + glad_glEnableVertexAttribArray = (PFNGLENABLEVERTEXATTRIBARRAYPROC) load(userptr, "glEnableVertexAttribArray"); + glad_glGetActiveAttrib = (PFNGLGETACTIVEATTRIBPROC) load(userptr, "glGetActiveAttrib"); + glad_glGetActiveUniform = (PFNGLGETACTIVEUNIFORMPROC) load(userptr, "glGetActiveUniform"); + glad_glGetAttachedShaders = (PFNGLGETATTACHEDSHADERSPROC) load(userptr, "glGetAttachedShaders"); + glad_glGetAttribLocation = (PFNGLGETATTRIBLOCATIONPROC) load(userptr, "glGetAttribLocation"); + glad_glGetProgramInfoLog = (PFNGLGETPROGRAMINFOLOGPROC) load(userptr, "glGetProgramInfoLog"); + glad_glGetProgramiv = (PFNGLGETPROGRAMIVPROC) load(userptr, "glGetProgramiv"); + glad_glGetShaderInfoLog = (PFNGLGETSHADERINFOLOGPROC) load(userptr, "glGetShaderInfoLog"); + glad_glGetShaderSource = (PFNGLGETSHADERSOURCEPROC) load(userptr, "glGetShaderSource"); + glad_glGetShaderiv = (PFNGLGETSHADERIVPROC) load(userptr, "glGetShaderiv"); + glad_glGetUniformLocation = (PFNGLGETUNIFORMLOCATIONPROC) load(userptr, "glGetUniformLocation"); + glad_glGetUniformfv = (PFNGLGETUNIFORMFVPROC) load(userptr, "glGetUniformfv"); + glad_glGetUniformiv = (PFNGLGETUNIFORMIVPROC) load(userptr, "glGetUniformiv"); + glad_glGetVertexAttribPointerv = (PFNGLGETVERTEXATTRIBPOINTERVPROC) load(userptr, "glGetVertexAttribPointerv"); + glad_glGetVertexAttribdv = (PFNGLGETVERTEXATTRIBDVPROC) load(userptr, "glGetVertexAttribdv"); + glad_glGetVertexAttribfv = (PFNGLGETVERTEXATTRIBFVPROC) load(userptr, "glGetVertexAttribfv"); + glad_glGetVertexAttribiv = (PFNGLGETVERTEXATTRIBIVPROC) load(userptr, "glGetVertexAttribiv"); + glad_glIsProgram = (PFNGLISPROGRAMPROC) load(userptr, "glIsProgram"); + glad_glIsShader = (PFNGLISSHADERPROC) load(userptr, "glIsShader"); + glad_glLinkProgram = (PFNGLLINKPROGRAMPROC) load(userptr, "glLinkProgram"); + glad_glShaderSource = (PFNGLSHADERSOURCEPROC) load(userptr, "glShaderSource"); + glad_glStencilFuncSeparate = (PFNGLSTENCILFUNCSEPARATEPROC) load(userptr, "glStencilFuncSeparate"); + glad_glStencilMaskSeparate = (PFNGLSTENCILMASKSEPARATEPROC) load(userptr, "glStencilMaskSeparate"); + glad_glStencilOpSeparate = (PFNGLSTENCILOPSEPARATEPROC) load(userptr, "glStencilOpSeparate"); + glad_glUniform1f = (PFNGLUNIFORM1FPROC) load(userptr, "glUniform1f"); + glad_glUniform1fv = (PFNGLUNIFORM1FVPROC) load(userptr, "glUniform1fv"); + glad_glUniform1i = (PFNGLUNIFORM1IPROC) load(userptr, "glUniform1i"); + glad_glUniform1iv = (PFNGLUNIFORM1IVPROC) load(userptr, "glUniform1iv"); + glad_glUniform2f = (PFNGLUNIFORM2FPROC) load(userptr, "glUniform2f"); + glad_glUniform2fv = (PFNGLUNIFORM2FVPROC) load(userptr, "glUniform2fv"); + glad_glUniform2i = (PFNGLUNIFORM2IPROC) load(userptr, "glUniform2i"); + glad_glUniform2iv = (PFNGLUNIFORM2IVPROC) load(userptr, "glUniform2iv"); + glad_glUniform3f = (PFNGLUNIFORM3FPROC) load(userptr, "glUniform3f"); + glad_glUniform3fv = (PFNGLUNIFORM3FVPROC) load(userptr, "glUniform3fv"); + glad_glUniform3i = (PFNGLUNIFORM3IPROC) load(userptr, "glUniform3i"); + glad_glUniform3iv = (PFNGLUNIFORM3IVPROC) load(userptr, "glUniform3iv"); + glad_glUniform4f = (PFNGLUNIFORM4FPROC) load(userptr, "glUniform4f"); + glad_glUniform4fv = (PFNGLUNIFORM4FVPROC) load(userptr, "glUniform4fv"); + glad_glUniform4i = (PFNGLUNIFORM4IPROC) load(userptr, "glUniform4i"); + glad_glUniform4iv = 
(PFNGLUNIFORM4IVPROC) load(userptr, "glUniform4iv"); + glad_glUniformMatrix2fv = (PFNGLUNIFORMMATRIX2FVPROC) load(userptr, "glUniformMatrix2fv"); + glad_glUniformMatrix3fv = (PFNGLUNIFORMMATRIX3FVPROC) load(userptr, "glUniformMatrix3fv"); + glad_glUniformMatrix4fv = (PFNGLUNIFORMMATRIX4FVPROC) load(userptr, "glUniformMatrix4fv"); + glad_glUseProgram = (PFNGLUSEPROGRAMPROC) load(userptr, "glUseProgram"); + glad_glValidateProgram = (PFNGLVALIDATEPROGRAMPROC) load(userptr, "glValidateProgram"); + glad_glVertexAttrib1d = (PFNGLVERTEXATTRIB1DPROC) load(userptr, "glVertexAttrib1d"); + glad_glVertexAttrib1dv = (PFNGLVERTEXATTRIB1DVPROC) load(userptr, "glVertexAttrib1dv"); + glad_glVertexAttrib1f = (PFNGLVERTEXATTRIB1FPROC) load(userptr, "glVertexAttrib1f"); + glad_glVertexAttrib1fv = (PFNGLVERTEXATTRIB1FVPROC) load(userptr, "glVertexAttrib1fv"); + glad_glVertexAttrib1s = (PFNGLVERTEXATTRIB1SPROC) load(userptr, "glVertexAttrib1s"); + glad_glVertexAttrib1sv = (PFNGLVERTEXATTRIB1SVPROC) load(userptr, "glVertexAttrib1sv"); + glad_glVertexAttrib2d = (PFNGLVERTEXATTRIB2DPROC) load(userptr, "glVertexAttrib2d"); + glad_glVertexAttrib2dv = (PFNGLVERTEXATTRIB2DVPROC) load(userptr, "glVertexAttrib2dv"); + glad_glVertexAttrib2f = (PFNGLVERTEXATTRIB2FPROC) load(userptr, "glVertexAttrib2f"); + glad_glVertexAttrib2fv = (PFNGLVERTEXATTRIB2FVPROC) load(userptr, "glVertexAttrib2fv"); + glad_glVertexAttrib2s = (PFNGLVERTEXATTRIB2SPROC) load(userptr, "glVertexAttrib2s"); + glad_glVertexAttrib2sv = (PFNGLVERTEXATTRIB2SVPROC) load(userptr, "glVertexAttrib2sv"); + glad_glVertexAttrib3d = (PFNGLVERTEXATTRIB3DPROC) load(userptr, "glVertexAttrib3d"); + glad_glVertexAttrib3dv = (PFNGLVERTEXATTRIB3DVPROC) load(userptr, "glVertexAttrib3dv"); + glad_glVertexAttrib3f = (PFNGLVERTEXATTRIB3FPROC) load(userptr, "glVertexAttrib3f"); + glad_glVertexAttrib3fv = (PFNGLVERTEXATTRIB3FVPROC) load(userptr, "glVertexAttrib3fv"); + glad_glVertexAttrib3s = (PFNGLVERTEXATTRIB3SPROC) load(userptr, "glVertexAttrib3s"); + glad_glVertexAttrib3sv = (PFNGLVERTEXATTRIB3SVPROC) load(userptr, "glVertexAttrib3sv"); + glad_glVertexAttrib4Nbv = (PFNGLVERTEXATTRIB4NBVPROC) load(userptr, "glVertexAttrib4Nbv"); + glad_glVertexAttrib4Niv = (PFNGLVERTEXATTRIB4NIVPROC) load(userptr, "glVertexAttrib4Niv"); + glad_glVertexAttrib4Nsv = (PFNGLVERTEXATTRIB4NSVPROC) load(userptr, "glVertexAttrib4Nsv"); + glad_glVertexAttrib4Nub = (PFNGLVERTEXATTRIB4NUBPROC) load(userptr, "glVertexAttrib4Nub"); + glad_glVertexAttrib4Nubv = (PFNGLVERTEXATTRIB4NUBVPROC) load(userptr, "glVertexAttrib4Nubv"); + glad_glVertexAttrib4Nuiv = (PFNGLVERTEXATTRIB4NUIVPROC) load(userptr, "glVertexAttrib4Nuiv"); + glad_glVertexAttrib4Nusv = (PFNGLVERTEXATTRIB4NUSVPROC) load(userptr, "glVertexAttrib4Nusv"); + glad_glVertexAttrib4bv = (PFNGLVERTEXATTRIB4BVPROC) load(userptr, "glVertexAttrib4bv"); + glad_glVertexAttrib4d = (PFNGLVERTEXATTRIB4DPROC) load(userptr, "glVertexAttrib4d"); + glad_glVertexAttrib4dv = (PFNGLVERTEXATTRIB4DVPROC) load(userptr, "glVertexAttrib4dv"); + glad_glVertexAttrib4f = (PFNGLVERTEXATTRIB4FPROC) load(userptr, "glVertexAttrib4f"); + glad_glVertexAttrib4fv = (PFNGLVERTEXATTRIB4FVPROC) load(userptr, "glVertexAttrib4fv"); + glad_glVertexAttrib4iv = (PFNGLVERTEXATTRIB4IVPROC) load(userptr, "glVertexAttrib4iv"); + glad_glVertexAttrib4s = (PFNGLVERTEXATTRIB4SPROC) load(userptr, "glVertexAttrib4s"); + glad_glVertexAttrib4sv = (PFNGLVERTEXATTRIB4SVPROC) load(userptr, "glVertexAttrib4sv"); + glad_glVertexAttrib4ubv = (PFNGLVERTEXATTRIB4UBVPROC) load(userptr, "glVertexAttrib4ubv"); 
+ glad_glVertexAttrib4uiv = (PFNGLVERTEXATTRIB4UIVPROC) load(userptr, "glVertexAttrib4uiv"); + glad_glVertexAttrib4usv = (PFNGLVERTEXATTRIB4USVPROC) load(userptr, "glVertexAttrib4usv"); + glad_glVertexAttribPointer = (PFNGLVERTEXATTRIBPOINTERPROC) load(userptr, "glVertexAttribPointer"); +} +static void glad_gl_load_GL_VERSION_2_1( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_2_1) return; + glad_glUniformMatrix2x3fv = (PFNGLUNIFORMMATRIX2X3FVPROC) load(userptr, "glUniformMatrix2x3fv"); + glad_glUniformMatrix2x4fv = (PFNGLUNIFORMMATRIX2X4FVPROC) load(userptr, "glUniformMatrix2x4fv"); + glad_glUniformMatrix3x2fv = (PFNGLUNIFORMMATRIX3X2FVPROC) load(userptr, "glUniformMatrix3x2fv"); + glad_glUniformMatrix3x4fv = (PFNGLUNIFORMMATRIX3X4FVPROC) load(userptr, "glUniformMatrix3x4fv"); + glad_glUniformMatrix4x2fv = (PFNGLUNIFORMMATRIX4X2FVPROC) load(userptr, "glUniformMatrix4x2fv"); + glad_glUniformMatrix4x3fv = (PFNGLUNIFORMMATRIX4X3FVPROC) load(userptr, "glUniformMatrix4x3fv"); +} +static void glad_gl_load_GL_VERSION_3_0( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_3_0) return; + glad_glBeginConditionalRender = (PFNGLBEGINCONDITIONALRENDERPROC) load(userptr, "glBeginConditionalRender"); + glad_glBeginTransformFeedback = (PFNGLBEGINTRANSFORMFEEDBACKPROC) load(userptr, "glBeginTransformFeedback"); + glad_glBindBufferBase = (PFNGLBINDBUFFERBASEPROC) load(userptr, "glBindBufferBase"); + glad_glBindBufferRange = (PFNGLBINDBUFFERRANGEPROC) load(userptr, "glBindBufferRange"); + glad_glBindFragDataLocation = (PFNGLBINDFRAGDATALOCATIONPROC) load(userptr, "glBindFragDataLocation"); + glad_glBindFramebuffer = (PFNGLBINDFRAMEBUFFERPROC) load(userptr, "glBindFramebuffer"); + glad_glBindRenderbuffer = (PFNGLBINDRENDERBUFFERPROC) load(userptr, "glBindRenderbuffer"); + glad_glBindVertexArray = (PFNGLBINDVERTEXARRAYPROC) load(userptr, "glBindVertexArray"); + glad_glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERPROC) load(userptr, "glBlitFramebuffer"); + glad_glCheckFramebufferStatus = (PFNGLCHECKFRAMEBUFFERSTATUSPROC) load(userptr, "glCheckFramebufferStatus"); + glad_glClampColor = (PFNGLCLAMPCOLORPROC) load(userptr, "glClampColor"); + glad_glClearBufferfi = (PFNGLCLEARBUFFERFIPROC) load(userptr, "glClearBufferfi"); + glad_glClearBufferfv = (PFNGLCLEARBUFFERFVPROC) load(userptr, "glClearBufferfv"); + glad_glClearBufferiv = (PFNGLCLEARBUFFERIVPROC) load(userptr, "glClearBufferiv"); + glad_glClearBufferuiv = (PFNGLCLEARBUFFERUIVPROC) load(userptr, "glClearBufferuiv"); + glad_glColorMaski = (PFNGLCOLORMASKIPROC) load(userptr, "glColorMaski"); + glad_glDeleteFramebuffers = (PFNGLDELETEFRAMEBUFFERSPROC) load(userptr, "glDeleteFramebuffers"); + glad_glDeleteRenderbuffers = (PFNGLDELETERENDERBUFFERSPROC) load(userptr, "glDeleteRenderbuffers"); + glad_glDeleteVertexArrays = (PFNGLDELETEVERTEXARRAYSPROC) load(userptr, "glDeleteVertexArrays"); + glad_glDisablei = (PFNGLDISABLEIPROC) load(userptr, "glDisablei"); + glad_glEnablei = (PFNGLENABLEIPROC) load(userptr, "glEnablei"); + glad_glEndConditionalRender = (PFNGLENDCONDITIONALRENDERPROC) load(userptr, "glEndConditionalRender"); + glad_glEndTransformFeedback = (PFNGLENDTRANSFORMFEEDBACKPROC) load(userptr, "glEndTransformFeedback"); + glad_glFlushMappedBufferRange = (PFNGLFLUSHMAPPEDBUFFERRANGEPROC) load(userptr, "glFlushMappedBufferRange"); + glad_glFramebufferRenderbuffer = (PFNGLFRAMEBUFFERRENDERBUFFERPROC) load(userptr, "glFramebufferRenderbuffer"); + glad_glFramebufferTexture1D = (PFNGLFRAMEBUFFERTEXTURE1DPROC) load(userptr, 
"glFramebufferTexture1D"); + glad_glFramebufferTexture2D = (PFNGLFRAMEBUFFERTEXTURE2DPROC) load(userptr, "glFramebufferTexture2D"); + glad_glFramebufferTexture3D = (PFNGLFRAMEBUFFERTEXTURE3DPROC) load(userptr, "glFramebufferTexture3D"); + glad_glFramebufferTextureLayer = (PFNGLFRAMEBUFFERTEXTURELAYERPROC) load(userptr, "glFramebufferTextureLayer"); + glad_glGenFramebuffers = (PFNGLGENFRAMEBUFFERSPROC) load(userptr, "glGenFramebuffers"); + glad_glGenRenderbuffers = (PFNGLGENRENDERBUFFERSPROC) load(userptr, "glGenRenderbuffers"); + glad_glGenVertexArrays = (PFNGLGENVERTEXARRAYSPROC) load(userptr, "glGenVertexArrays"); + glad_glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC) load(userptr, "glGenerateMipmap"); + glad_glGetBooleani_v = (PFNGLGETBOOLEANI_VPROC) load(userptr, "glGetBooleani_v"); + glad_glGetFragDataLocation = (PFNGLGETFRAGDATALOCATIONPROC) load(userptr, "glGetFragDataLocation"); + glad_glGetFramebufferAttachmentParameteriv = (PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC) load(userptr, "glGetFramebufferAttachmentParameteriv"); + glad_glGetIntegeri_v = (PFNGLGETINTEGERI_VPROC) load(userptr, "glGetIntegeri_v"); + glad_glGetRenderbufferParameteriv = (PFNGLGETRENDERBUFFERPARAMETERIVPROC) load(userptr, "glGetRenderbufferParameteriv"); + glad_glGetStringi = (PFNGLGETSTRINGIPROC) load(userptr, "glGetStringi"); + glad_glGetTexParameterIiv = (PFNGLGETTEXPARAMETERIIVPROC) load(userptr, "glGetTexParameterIiv"); + glad_glGetTexParameterIuiv = (PFNGLGETTEXPARAMETERIUIVPROC) load(userptr, "glGetTexParameterIuiv"); + glad_glGetTransformFeedbackVarying = (PFNGLGETTRANSFORMFEEDBACKVARYINGPROC) load(userptr, "glGetTransformFeedbackVarying"); + glad_glGetUniformuiv = (PFNGLGETUNIFORMUIVPROC) load(userptr, "glGetUniformuiv"); + glad_glGetVertexAttribIiv = (PFNGLGETVERTEXATTRIBIIVPROC) load(userptr, "glGetVertexAttribIiv"); + glad_glGetVertexAttribIuiv = (PFNGLGETVERTEXATTRIBIUIVPROC) load(userptr, "glGetVertexAttribIuiv"); + glad_glIsEnabledi = (PFNGLISENABLEDIPROC) load(userptr, "glIsEnabledi"); + glad_glIsFramebuffer = (PFNGLISFRAMEBUFFERPROC) load(userptr, "glIsFramebuffer"); + glad_glIsRenderbuffer = (PFNGLISRENDERBUFFERPROC) load(userptr, "glIsRenderbuffer"); + glad_glIsVertexArray = (PFNGLISVERTEXARRAYPROC) load(userptr, "glIsVertexArray"); + glad_glMapBufferRange = (PFNGLMAPBUFFERRANGEPROC) load(userptr, "glMapBufferRange"); + glad_glRenderbufferStorage = (PFNGLRENDERBUFFERSTORAGEPROC) load(userptr, "glRenderbufferStorage"); + glad_glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC) load(userptr, "glRenderbufferStorageMultisample"); + glad_glTexParameterIiv = (PFNGLTEXPARAMETERIIVPROC) load(userptr, "glTexParameterIiv"); + glad_glTexParameterIuiv = (PFNGLTEXPARAMETERIUIVPROC) load(userptr, "glTexParameterIuiv"); + glad_glTransformFeedbackVaryings = (PFNGLTRANSFORMFEEDBACKVARYINGSPROC) load(userptr, "glTransformFeedbackVaryings"); + glad_glUniform1ui = (PFNGLUNIFORM1UIPROC) load(userptr, "glUniform1ui"); + glad_glUniform1uiv = (PFNGLUNIFORM1UIVPROC) load(userptr, "glUniform1uiv"); + glad_glUniform2ui = (PFNGLUNIFORM2UIPROC) load(userptr, "glUniform2ui"); + glad_glUniform2uiv = (PFNGLUNIFORM2UIVPROC) load(userptr, "glUniform2uiv"); + glad_glUniform3ui = (PFNGLUNIFORM3UIPROC) load(userptr, "glUniform3ui"); + glad_glUniform3uiv = (PFNGLUNIFORM3UIVPROC) load(userptr, "glUniform3uiv"); + glad_glUniform4ui = (PFNGLUNIFORM4UIPROC) load(userptr, "glUniform4ui"); + glad_glUniform4uiv = (PFNGLUNIFORM4UIVPROC) load(userptr, "glUniform4uiv"); + glad_glVertexAttribI1i = 
(PFNGLVERTEXATTRIBI1IPROC) load(userptr, "glVertexAttribI1i"); + glad_glVertexAttribI1iv = (PFNGLVERTEXATTRIBI1IVPROC) load(userptr, "glVertexAttribI1iv"); + glad_glVertexAttribI1ui = (PFNGLVERTEXATTRIBI1UIPROC) load(userptr, "glVertexAttribI1ui"); + glad_glVertexAttribI1uiv = (PFNGLVERTEXATTRIBI1UIVPROC) load(userptr, "glVertexAttribI1uiv"); + glad_glVertexAttribI2i = (PFNGLVERTEXATTRIBI2IPROC) load(userptr, "glVertexAttribI2i"); + glad_glVertexAttribI2iv = (PFNGLVERTEXATTRIBI2IVPROC) load(userptr, "glVertexAttribI2iv"); + glad_glVertexAttribI2ui = (PFNGLVERTEXATTRIBI2UIPROC) load(userptr, "glVertexAttribI2ui"); + glad_glVertexAttribI2uiv = (PFNGLVERTEXATTRIBI2UIVPROC) load(userptr, "glVertexAttribI2uiv"); + glad_glVertexAttribI3i = (PFNGLVERTEXATTRIBI3IPROC) load(userptr, "glVertexAttribI3i"); + glad_glVertexAttribI3iv = (PFNGLVERTEXATTRIBI3IVPROC) load(userptr, "glVertexAttribI3iv"); + glad_glVertexAttribI3ui = (PFNGLVERTEXATTRIBI3UIPROC) load(userptr, "glVertexAttribI3ui"); + glad_glVertexAttribI3uiv = (PFNGLVERTEXATTRIBI3UIVPROC) load(userptr, "glVertexAttribI3uiv"); + glad_glVertexAttribI4bv = (PFNGLVERTEXATTRIBI4BVPROC) load(userptr, "glVertexAttribI4bv"); + glad_glVertexAttribI4i = (PFNGLVERTEXATTRIBI4IPROC) load(userptr, "glVertexAttribI4i"); + glad_glVertexAttribI4iv = (PFNGLVERTEXATTRIBI4IVPROC) load(userptr, "glVertexAttribI4iv"); + glad_glVertexAttribI4sv = (PFNGLVERTEXATTRIBI4SVPROC) load(userptr, "glVertexAttribI4sv"); + glad_glVertexAttribI4ubv = (PFNGLVERTEXATTRIBI4UBVPROC) load(userptr, "glVertexAttribI4ubv"); + glad_glVertexAttribI4ui = (PFNGLVERTEXATTRIBI4UIPROC) load(userptr, "glVertexAttribI4ui"); + glad_glVertexAttribI4uiv = (PFNGLVERTEXATTRIBI4UIVPROC) load(userptr, "glVertexAttribI4uiv"); + glad_glVertexAttribI4usv = (PFNGLVERTEXATTRIBI4USVPROC) load(userptr, "glVertexAttribI4usv"); + glad_glVertexAttribIPointer = (PFNGLVERTEXATTRIBIPOINTERPROC) load(userptr, "glVertexAttribIPointer"); +} +static void glad_gl_load_GL_VERSION_3_1( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_3_1) return; + glad_glBindBufferBase = (PFNGLBINDBUFFERBASEPROC) load(userptr, "glBindBufferBase"); + glad_glBindBufferRange = (PFNGLBINDBUFFERRANGEPROC) load(userptr, "glBindBufferRange"); + glad_glCopyBufferSubData = (PFNGLCOPYBUFFERSUBDATAPROC) load(userptr, "glCopyBufferSubData"); + glad_glDrawArraysInstanced = (PFNGLDRAWARRAYSINSTANCEDPROC) load(userptr, "glDrawArraysInstanced"); + glad_glDrawElementsInstanced = (PFNGLDRAWELEMENTSINSTANCEDPROC) load(userptr, "glDrawElementsInstanced"); + glad_glGetActiveUniformBlockName = (PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC) load(userptr, "glGetActiveUniformBlockName"); + glad_glGetActiveUniformBlockiv = (PFNGLGETACTIVEUNIFORMBLOCKIVPROC) load(userptr, "glGetActiveUniformBlockiv"); + glad_glGetActiveUniformName = (PFNGLGETACTIVEUNIFORMNAMEPROC) load(userptr, "glGetActiveUniformName"); + glad_glGetActiveUniformsiv = (PFNGLGETACTIVEUNIFORMSIVPROC) load(userptr, "glGetActiveUniformsiv"); + glad_glGetIntegeri_v = (PFNGLGETINTEGERI_VPROC) load(userptr, "glGetIntegeri_v"); + glad_glGetUniformBlockIndex = (PFNGLGETUNIFORMBLOCKINDEXPROC) load(userptr, "glGetUniformBlockIndex"); + glad_glGetUniformIndices = (PFNGLGETUNIFORMINDICESPROC) load(userptr, "glGetUniformIndices"); + glad_glPrimitiveRestartIndex = (PFNGLPRIMITIVERESTARTINDEXPROC) load(userptr, "glPrimitiveRestartIndex"); + glad_glTexBuffer = (PFNGLTEXBUFFERPROC) load(userptr, "glTexBuffer"); + glad_glUniformBlockBinding = (PFNGLUNIFORMBLOCKBINDINGPROC) load(userptr, 
"glUniformBlockBinding"); +} +static void glad_gl_load_GL_VERSION_3_2( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_3_2) return; + glad_glClientWaitSync = (PFNGLCLIENTWAITSYNCPROC) load(userptr, "glClientWaitSync"); + glad_glDeleteSync = (PFNGLDELETESYNCPROC) load(userptr, "glDeleteSync"); + glad_glDrawElementsBaseVertex = (PFNGLDRAWELEMENTSBASEVERTEXPROC) load(userptr, "glDrawElementsBaseVertex"); + glad_glDrawElementsInstancedBaseVertex = (PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC) load(userptr, "glDrawElementsInstancedBaseVertex"); + glad_glDrawRangeElementsBaseVertex = (PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC) load(userptr, "glDrawRangeElementsBaseVertex"); + glad_glFenceSync = (PFNGLFENCESYNCPROC) load(userptr, "glFenceSync"); + glad_glFramebufferTexture = (PFNGLFRAMEBUFFERTEXTUREPROC) load(userptr, "glFramebufferTexture"); + glad_glGetBufferParameteri64v = (PFNGLGETBUFFERPARAMETERI64VPROC) load(userptr, "glGetBufferParameteri64v"); + glad_glGetInteger64i_v = (PFNGLGETINTEGER64I_VPROC) load(userptr, "glGetInteger64i_v"); + glad_glGetInteger64v = (PFNGLGETINTEGER64VPROC) load(userptr, "glGetInteger64v"); + glad_glGetMultisamplefv = (PFNGLGETMULTISAMPLEFVPROC) load(userptr, "glGetMultisamplefv"); + glad_glGetSynciv = (PFNGLGETSYNCIVPROC) load(userptr, "glGetSynciv"); + glad_glIsSync = (PFNGLISSYNCPROC) load(userptr, "glIsSync"); + glad_glMultiDrawElementsBaseVertex = (PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC) load(userptr, "glMultiDrawElementsBaseVertex"); + glad_glProvokingVertex = (PFNGLPROVOKINGVERTEXPROC) load(userptr, "glProvokingVertex"); + glad_glSampleMaski = (PFNGLSAMPLEMASKIPROC) load(userptr, "glSampleMaski"); + glad_glTexImage2DMultisample = (PFNGLTEXIMAGE2DMULTISAMPLEPROC) load(userptr, "glTexImage2DMultisample"); + glad_glTexImage3DMultisample = (PFNGLTEXIMAGE3DMULTISAMPLEPROC) load(userptr, "glTexImage3DMultisample"); + glad_glWaitSync = (PFNGLWAITSYNCPROC) load(userptr, "glWaitSync"); +} +static void glad_gl_load_GL_VERSION_3_3( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_VERSION_3_3) return; + glad_glBindFragDataLocationIndexed = (PFNGLBINDFRAGDATALOCATIONINDEXEDPROC) load(userptr, "glBindFragDataLocationIndexed"); + glad_glBindSampler = (PFNGLBINDSAMPLERPROC) load(userptr, "glBindSampler"); + glad_glColorP3ui = (PFNGLCOLORP3UIPROC) load(userptr, "glColorP3ui"); + glad_glColorP3uiv = (PFNGLCOLORP3UIVPROC) load(userptr, "glColorP3uiv"); + glad_glColorP4ui = (PFNGLCOLORP4UIPROC) load(userptr, "glColorP4ui"); + glad_glColorP4uiv = (PFNGLCOLORP4UIVPROC) load(userptr, "glColorP4uiv"); + glad_glDeleteSamplers = (PFNGLDELETESAMPLERSPROC) load(userptr, "glDeleteSamplers"); + glad_glGenSamplers = (PFNGLGENSAMPLERSPROC) load(userptr, "glGenSamplers"); + glad_glGetFragDataIndex = (PFNGLGETFRAGDATAINDEXPROC) load(userptr, "glGetFragDataIndex"); + glad_glGetQueryObjecti64v = (PFNGLGETQUERYOBJECTI64VPROC) load(userptr, "glGetQueryObjecti64v"); + glad_glGetQueryObjectui64v = (PFNGLGETQUERYOBJECTUI64VPROC) load(userptr, "glGetQueryObjectui64v"); + glad_glGetSamplerParameterIiv = (PFNGLGETSAMPLERPARAMETERIIVPROC) load(userptr, "glGetSamplerParameterIiv"); + glad_glGetSamplerParameterIuiv = (PFNGLGETSAMPLERPARAMETERIUIVPROC) load(userptr, "glGetSamplerParameterIuiv"); + glad_glGetSamplerParameterfv = (PFNGLGETSAMPLERPARAMETERFVPROC) load(userptr, "glGetSamplerParameterfv"); + glad_glGetSamplerParameteriv = (PFNGLGETSAMPLERPARAMETERIVPROC) load(userptr, "glGetSamplerParameteriv"); + glad_glIsSampler = (PFNGLISSAMPLERPROC) load(userptr, "glIsSampler"); + 
glad_glMultiTexCoordP1ui = (PFNGLMULTITEXCOORDP1UIPROC) load(userptr, "glMultiTexCoordP1ui"); + glad_glMultiTexCoordP1uiv = (PFNGLMULTITEXCOORDP1UIVPROC) load(userptr, "glMultiTexCoordP1uiv"); + glad_glMultiTexCoordP2ui = (PFNGLMULTITEXCOORDP2UIPROC) load(userptr, "glMultiTexCoordP2ui"); + glad_glMultiTexCoordP2uiv = (PFNGLMULTITEXCOORDP2UIVPROC) load(userptr, "glMultiTexCoordP2uiv"); + glad_glMultiTexCoordP3ui = (PFNGLMULTITEXCOORDP3UIPROC) load(userptr, "glMultiTexCoordP3ui"); + glad_glMultiTexCoordP3uiv = (PFNGLMULTITEXCOORDP3UIVPROC) load(userptr, "glMultiTexCoordP3uiv"); + glad_glMultiTexCoordP4ui = (PFNGLMULTITEXCOORDP4UIPROC) load(userptr, "glMultiTexCoordP4ui"); + glad_glMultiTexCoordP4uiv = (PFNGLMULTITEXCOORDP4UIVPROC) load(userptr, "glMultiTexCoordP4uiv"); + glad_glNormalP3ui = (PFNGLNORMALP3UIPROC) load(userptr, "glNormalP3ui"); + glad_glNormalP3uiv = (PFNGLNORMALP3UIVPROC) load(userptr, "glNormalP3uiv"); + glad_glQueryCounter = (PFNGLQUERYCOUNTERPROC) load(userptr, "glQueryCounter"); + glad_glSamplerParameterIiv = (PFNGLSAMPLERPARAMETERIIVPROC) load(userptr, "glSamplerParameterIiv"); + glad_glSamplerParameterIuiv = (PFNGLSAMPLERPARAMETERIUIVPROC) load(userptr, "glSamplerParameterIuiv"); + glad_glSamplerParameterf = (PFNGLSAMPLERPARAMETERFPROC) load(userptr, "glSamplerParameterf"); + glad_glSamplerParameterfv = (PFNGLSAMPLERPARAMETERFVPROC) load(userptr, "glSamplerParameterfv"); + glad_glSamplerParameteri = (PFNGLSAMPLERPARAMETERIPROC) load(userptr, "glSamplerParameteri"); + glad_glSamplerParameteriv = (PFNGLSAMPLERPARAMETERIVPROC) load(userptr, "glSamplerParameteriv"); + glad_glSecondaryColorP3ui = (PFNGLSECONDARYCOLORP3UIPROC) load(userptr, "glSecondaryColorP3ui"); + glad_glSecondaryColorP3uiv = (PFNGLSECONDARYCOLORP3UIVPROC) load(userptr, "glSecondaryColorP3uiv"); + glad_glTexCoordP1ui = (PFNGLTEXCOORDP1UIPROC) load(userptr, "glTexCoordP1ui"); + glad_glTexCoordP1uiv = (PFNGLTEXCOORDP1UIVPROC) load(userptr, "glTexCoordP1uiv"); + glad_glTexCoordP2ui = (PFNGLTEXCOORDP2UIPROC) load(userptr, "glTexCoordP2ui"); + glad_glTexCoordP2uiv = (PFNGLTEXCOORDP2UIVPROC) load(userptr, "glTexCoordP2uiv"); + glad_glTexCoordP3ui = (PFNGLTEXCOORDP3UIPROC) load(userptr, "glTexCoordP3ui"); + glad_glTexCoordP3uiv = (PFNGLTEXCOORDP3UIVPROC) load(userptr, "glTexCoordP3uiv"); + glad_glTexCoordP4ui = (PFNGLTEXCOORDP4UIPROC) load(userptr, "glTexCoordP4ui"); + glad_glTexCoordP4uiv = (PFNGLTEXCOORDP4UIVPROC) load(userptr, "glTexCoordP4uiv"); + glad_glVertexAttribDivisor = (PFNGLVERTEXATTRIBDIVISORPROC) load(userptr, "glVertexAttribDivisor"); + glad_glVertexAttribP1ui = (PFNGLVERTEXATTRIBP1UIPROC) load(userptr, "glVertexAttribP1ui"); + glad_glVertexAttribP1uiv = (PFNGLVERTEXATTRIBP1UIVPROC) load(userptr, "glVertexAttribP1uiv"); + glad_glVertexAttribP2ui = (PFNGLVERTEXATTRIBP2UIPROC) load(userptr, "glVertexAttribP2ui"); + glad_glVertexAttribP2uiv = (PFNGLVERTEXATTRIBP2UIVPROC) load(userptr, "glVertexAttribP2uiv"); + glad_glVertexAttribP3ui = (PFNGLVERTEXATTRIBP3UIPROC) load(userptr, "glVertexAttribP3ui"); + glad_glVertexAttribP3uiv = (PFNGLVERTEXATTRIBP3UIVPROC) load(userptr, "glVertexAttribP3uiv"); + glad_glVertexAttribP4ui = (PFNGLVERTEXATTRIBP4UIPROC) load(userptr, "glVertexAttribP4ui"); + glad_glVertexAttribP4uiv = (PFNGLVERTEXATTRIBP4UIVPROC) load(userptr, "glVertexAttribP4uiv"); + glad_glVertexP2ui = (PFNGLVERTEXP2UIPROC) load(userptr, "glVertexP2ui"); + glad_glVertexP2uiv = (PFNGLVERTEXP2UIVPROC) load(userptr, "glVertexP2uiv"); + glad_glVertexP3ui = (PFNGLVERTEXP3UIPROC) load(userptr, 
"glVertexP3ui"); + glad_glVertexP3uiv = (PFNGLVERTEXP3UIVPROC) load(userptr, "glVertexP3uiv"); + glad_glVertexP4ui = (PFNGLVERTEXP4UIPROC) load(userptr, "glVertexP4ui"); + glad_glVertexP4uiv = (PFNGLVERTEXP4UIVPROC) load(userptr, "glVertexP4uiv"); +} +static void glad_gl_load_GL_ARB_debug_output( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_ARB_debug_output) return; + glad_glDebugMessageCallbackARB = (PFNGLDEBUGMESSAGECALLBACKARBPROC) load(userptr, "glDebugMessageCallbackARB"); + glad_glDebugMessageControlARB = (PFNGLDEBUGMESSAGECONTROLARBPROC) load(userptr, "glDebugMessageControlARB"); + glad_glDebugMessageInsertARB = (PFNGLDEBUGMESSAGEINSERTARBPROC) load(userptr, "glDebugMessageInsertARB"); + glad_glGetDebugMessageLogARB = (PFNGLGETDEBUGMESSAGELOGARBPROC) load(userptr, "glGetDebugMessageLogARB"); +} +static void glad_gl_load_GL_ARB_framebuffer_object( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_ARB_framebuffer_object) return; + glad_glBindFramebuffer = (PFNGLBINDFRAMEBUFFERPROC) load(userptr, "glBindFramebuffer"); + glad_glBindRenderbuffer = (PFNGLBINDRENDERBUFFERPROC) load(userptr, "glBindRenderbuffer"); + glad_glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERPROC) load(userptr, "glBlitFramebuffer"); + glad_glCheckFramebufferStatus = (PFNGLCHECKFRAMEBUFFERSTATUSPROC) load(userptr, "glCheckFramebufferStatus"); + glad_glDeleteFramebuffers = (PFNGLDELETEFRAMEBUFFERSPROC) load(userptr, "glDeleteFramebuffers"); + glad_glDeleteRenderbuffers = (PFNGLDELETERENDERBUFFERSPROC) load(userptr, "glDeleteRenderbuffers"); + glad_glFramebufferRenderbuffer = (PFNGLFRAMEBUFFERRENDERBUFFERPROC) load(userptr, "glFramebufferRenderbuffer"); + glad_glFramebufferTexture1D = (PFNGLFRAMEBUFFERTEXTURE1DPROC) load(userptr, "glFramebufferTexture1D"); + glad_glFramebufferTexture2D = (PFNGLFRAMEBUFFERTEXTURE2DPROC) load(userptr, "glFramebufferTexture2D"); + glad_glFramebufferTexture3D = (PFNGLFRAMEBUFFERTEXTURE3DPROC) load(userptr, "glFramebufferTexture3D"); + glad_glFramebufferTextureLayer = (PFNGLFRAMEBUFFERTEXTURELAYERPROC) load(userptr, "glFramebufferTextureLayer"); + glad_glGenFramebuffers = (PFNGLGENFRAMEBUFFERSPROC) load(userptr, "glGenFramebuffers"); + glad_glGenRenderbuffers = (PFNGLGENRENDERBUFFERSPROC) load(userptr, "glGenRenderbuffers"); + glad_glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC) load(userptr, "glGenerateMipmap"); + glad_glGetFramebufferAttachmentParameteriv = (PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC) load(userptr, "glGetFramebufferAttachmentParameteriv"); + glad_glGetRenderbufferParameteriv = (PFNGLGETRENDERBUFFERPARAMETERIVPROC) load(userptr, "glGetRenderbufferParameteriv"); + glad_glIsFramebuffer = (PFNGLISFRAMEBUFFERPROC) load(userptr, "glIsFramebuffer"); + glad_glIsRenderbuffer = (PFNGLISRENDERBUFFERPROC) load(userptr, "glIsRenderbuffer"); + glad_glRenderbufferStorage = (PFNGLRENDERBUFFERSTORAGEPROC) load(userptr, "glRenderbufferStorage"); + glad_glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC) load(userptr, "glRenderbufferStorageMultisample"); +} +static void glad_gl_load_GL_EXT_framebuffer_blit( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_EXT_framebuffer_blit) return; + glad_glBlitFramebufferEXT = (PFNGLBLITFRAMEBUFFEREXTPROC) load(userptr, "glBlitFramebufferEXT"); +} +static void glad_gl_load_GL_EXT_framebuffer_multisample( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_EXT_framebuffer_multisample) return; + glad_glRenderbufferStorageMultisampleEXT = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC) load(userptr, 
"glRenderbufferStorageMultisampleEXT"); +} +static void glad_gl_load_GL_EXT_framebuffer_object( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_EXT_framebuffer_object) return; + glad_glBindFramebufferEXT = (PFNGLBINDFRAMEBUFFEREXTPROC) load(userptr, "glBindFramebufferEXT"); + glad_glBindRenderbufferEXT = (PFNGLBINDRENDERBUFFEREXTPROC) load(userptr, "glBindRenderbufferEXT"); + glad_glCheckFramebufferStatusEXT = (PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC) load(userptr, "glCheckFramebufferStatusEXT"); + glad_glDeleteFramebuffersEXT = (PFNGLDELETEFRAMEBUFFERSEXTPROC) load(userptr, "glDeleteFramebuffersEXT"); + glad_glDeleteRenderbuffersEXT = (PFNGLDELETERENDERBUFFERSEXTPROC) load(userptr, "glDeleteRenderbuffersEXT"); + glad_glFramebufferRenderbufferEXT = (PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC) load(userptr, "glFramebufferRenderbufferEXT"); + glad_glFramebufferTexture1DEXT = (PFNGLFRAMEBUFFERTEXTURE1DEXTPROC) load(userptr, "glFramebufferTexture1DEXT"); + glad_glFramebufferTexture2DEXT = (PFNGLFRAMEBUFFERTEXTURE2DEXTPROC) load(userptr, "glFramebufferTexture2DEXT"); + glad_glFramebufferTexture3DEXT = (PFNGLFRAMEBUFFERTEXTURE3DEXTPROC) load(userptr, "glFramebufferTexture3DEXT"); + glad_glGenFramebuffersEXT = (PFNGLGENFRAMEBUFFERSEXTPROC) load(userptr, "glGenFramebuffersEXT"); + glad_glGenRenderbuffersEXT = (PFNGLGENRENDERBUFFERSEXTPROC) load(userptr, "glGenRenderbuffersEXT"); + glad_glGenerateMipmapEXT = (PFNGLGENERATEMIPMAPEXTPROC) load(userptr, "glGenerateMipmapEXT"); + glad_glGetFramebufferAttachmentParameterivEXT = (PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC) load(userptr, "glGetFramebufferAttachmentParameterivEXT"); + glad_glGetRenderbufferParameterivEXT = (PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC) load(userptr, "glGetRenderbufferParameterivEXT"); + glad_glIsFramebufferEXT = (PFNGLISFRAMEBUFFEREXTPROC) load(userptr, "glIsFramebufferEXT"); + glad_glIsRenderbufferEXT = (PFNGLISRENDERBUFFEREXTPROC) load(userptr, "glIsRenderbufferEXT"); + glad_glRenderbufferStorageEXT = (PFNGLRENDERBUFFERSTORAGEEXTPROC) load(userptr, "glRenderbufferStorageEXT"); +} +static void glad_gl_load_GL_OVR_multiview( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GL_OVR_multiview) return; + glad_glFramebufferTextureMultiviewOVR = (PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC) load(userptr, "glFramebufferTextureMultiviewOVR"); +} + + + +#if defined(GL_ES_VERSION_3_0) || defined(GL_VERSION_3_0) +#define GLAD_GL_IS_SOME_NEW_VERSION 1 +#else +#define GLAD_GL_IS_SOME_NEW_VERSION 0 +#endif + +static int glad_gl_get_extensions( int version, const char **out_exts, unsigned int *out_num_exts_i, char ***out_exts_i) { +#if GLAD_GL_IS_SOME_NEW_VERSION + if(GLAD_VERSION_MAJOR(version) < 3) { +#else + GLAD_UNUSED(version); + GLAD_UNUSED(out_num_exts_i); + GLAD_UNUSED(out_exts_i); +#endif + if (glad_glGetString == NULL) { + return 0; + } + *out_exts = (const char *)glad_glGetString(GL_EXTENSIONS); +#if GLAD_GL_IS_SOME_NEW_VERSION + } else { + unsigned int index = 0; + unsigned int num_exts_i = 0; + char **exts_i = NULL; + if (glad_glGetStringi == NULL || glad_glGetIntegerv == NULL) { + return 0; + } + glad_glGetIntegerv(GL_NUM_EXTENSIONS, (int*) &num_exts_i); + if (num_exts_i > 0) { + exts_i = (char **) malloc(num_exts_i * (sizeof *exts_i)); + } + if (exts_i == NULL) { + return 0; + } + for(index = 0; index < num_exts_i; index++) { + const char *gl_str_tmp = (const char*) glad_glGetStringi(GL_EXTENSIONS, index); + size_t len = strlen(gl_str_tmp) + 1; + + char *local_str = (char*) malloc(len * sizeof(char)); + if(local_str != NULL) 
{ + memcpy(local_str, gl_str_tmp, len * sizeof(char)); + } + + exts_i[index] = local_str; + } + + *out_num_exts_i = num_exts_i; + *out_exts_i = exts_i; + } +#endif + return 1; +} +static void glad_gl_free_extensions(char **exts_i, unsigned int num_exts_i) { + if (exts_i != NULL) { + unsigned int index; + for(index = 0; index < num_exts_i; index++) { + free((void *) (exts_i[index])); + } + free((void *)exts_i); + exts_i = NULL; + } +} +static int glad_gl_has_extension(int version, const char *exts, unsigned int num_exts_i, char **exts_i, const char *ext) { + if(GLAD_VERSION_MAJOR(version) < 3 || !GLAD_GL_IS_SOME_NEW_VERSION) { + const char *extensions; + const char *loc; + const char *terminator; + extensions = exts; + if(extensions == NULL || ext == NULL) { + return 0; + } + while(1) { + loc = strstr(extensions, ext); + if(loc == NULL) { + return 0; + } + terminator = loc + strlen(ext); + if((loc == extensions || *(loc - 1) == ' ') && + (*terminator == ' ' || *terminator == '\0')) { + return 1; + } + extensions = terminator; + } + } else { + unsigned int index; + for(index = 0; index < num_exts_i; index++) { + const char *e = exts_i[index]; + if(strcmp(e, ext) == 0) { + return 1; + } + } + } + return 0; +} + +static GLADapiproc glad_gl_get_proc_from_userptr(void *userptr, const char* name) { + return (GLAD_GNUC_EXTENSION (GLADapiproc (*)(const char *name)) userptr)(name); +} + +static int glad_gl_find_extensions_gl( int version) { + const char *exts = NULL; + unsigned int num_exts_i = 0; + char **exts_i = NULL; + if (!glad_gl_get_extensions(version, &exts, &num_exts_i, &exts_i)) return 0; + + GLAD_GL_ARB_debug_output = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_ARB_debug_output"); + GLAD_GL_ARB_framebuffer_object = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_ARB_framebuffer_object"); + GLAD_GL_EXT_framebuffer_blit = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_EXT_framebuffer_blit"); + GLAD_GL_EXT_framebuffer_multisample = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_EXT_framebuffer_multisample"); + GLAD_GL_EXT_framebuffer_object = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_EXT_framebuffer_object"); + GLAD_GL_OVR_multiview = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_OVR_multiview"); + GLAD_GL_OVR_multiview2 = glad_gl_has_extension(version, exts, num_exts_i, exts_i, "GL_OVR_multiview2"); + + glad_gl_free_extensions(exts_i, num_exts_i); + + return 1; +} + +static int glad_gl_find_core_gl(void) { + int i; + const char* version; + const char* prefixes[] = { + "OpenGL ES-CM ", + "OpenGL ES-CL ", + "OpenGL ES ", + "OpenGL SC ", + NULL + }; + int major = 0; + int minor = 0; + version = (const char*) glad_glGetString(GL_VERSION); + if (!version) return 0; + for (i = 0; prefixes[i]; i++) { + const size_t length = strlen(prefixes[i]); + if (strncmp(version, prefixes[i], length) == 0) { + version += length; + break; + } + } + + GLAD_IMPL_UTIL_SSCANF(version, "%d.%d", &major, &minor); + + GLAD_GL_VERSION_1_0 = (major == 1 && minor >= 0) || major > 1; + GLAD_GL_VERSION_1_1 = (major == 1 && minor >= 1) || major > 1; + GLAD_GL_VERSION_1_2 = (major == 1 && minor >= 2) || major > 1; + GLAD_GL_VERSION_1_3 = (major == 1 && minor >= 3) || major > 1; + GLAD_GL_VERSION_1_4 = (major == 1 && minor >= 4) || major > 1; + GLAD_GL_VERSION_1_5 = (major == 1 && minor >= 5) || major > 1; + GLAD_GL_VERSION_2_0 = (major == 2 && minor >= 0) || major > 2; + GLAD_GL_VERSION_2_1 = (major == 2 && minor >= 1) || major > 2; + 
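/* Editor's note: each GLAD_GL_VERSION_x_y flag below follows the same
   pattern: it is set when the context reports at least version x.y,
   i.e. (major == x && minor >= y) || major > x. */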
GLAD_GL_VERSION_3_0 = (major == 3 && minor >= 0) || major > 3; + GLAD_GL_VERSION_3_1 = (major == 3 && minor >= 1) || major > 3; + GLAD_GL_VERSION_3_2 = (major == 3 && minor >= 2) || major > 3; + GLAD_GL_VERSION_3_3 = (major == 3 && minor >= 3) || major > 3; + + return GLAD_MAKE_VERSION(major, minor); +} + +int gladLoadGLUserPtr( GLADuserptrloadfunc load, void *userptr) { + int version; + + glad_glGetString = (PFNGLGETSTRINGPROC) load(userptr, "glGetString"); + if(glad_glGetString == NULL) return 0; + if(glad_glGetString(GL_VERSION) == NULL) return 0; + version = glad_gl_find_core_gl(); + + glad_gl_load_GL_VERSION_1_0(load, userptr); + glad_gl_load_GL_VERSION_1_1(load, userptr); + glad_gl_load_GL_VERSION_1_2(load, userptr); + glad_gl_load_GL_VERSION_1_3(load, userptr); + glad_gl_load_GL_VERSION_1_4(load, userptr); + glad_gl_load_GL_VERSION_1_5(load, userptr); + glad_gl_load_GL_VERSION_2_0(load, userptr); + glad_gl_load_GL_VERSION_2_1(load, userptr); + glad_gl_load_GL_VERSION_3_0(load, userptr); + glad_gl_load_GL_VERSION_3_1(load, userptr); + glad_gl_load_GL_VERSION_3_2(load, userptr); + glad_gl_load_GL_VERSION_3_3(load, userptr); + + if (!glad_gl_find_extensions_gl(version)) return 0; + glad_gl_load_GL_ARB_debug_output(load, userptr); + glad_gl_load_GL_ARB_framebuffer_object(load, userptr); + glad_gl_load_GL_EXT_framebuffer_blit(load, userptr); + glad_gl_load_GL_EXT_framebuffer_multisample(load, userptr); + glad_gl_load_GL_EXT_framebuffer_object(load, userptr); + glad_gl_load_GL_OVR_multiview(load, userptr); + + + + return version; +} + + +int gladLoadGL( GLADloadfunc load) { + return gladLoadGLUserPtr( glad_gl_get_proc_from_userptr, GLAD_GNUC_EXTENSION (void*) load); +} + + + + + +#ifdef GLAD_GL + +#ifndef GLAD_LOADER_LIBRARY_C_ +#define GLAD_LOADER_LIBRARY_C_ + +#include <stddef.h> +#include <stdlib.h> + +#if GLAD_PLATFORM_WIN32 +#include <windows.h> +#else +#include <dlfcn.h> +#endif + + +static void* glad_get_dlopen_handle(const char *lib_names[], int length) { + void *handle = NULL; + int i; + + for (i = 0; i < length; ++i) { +#if GLAD_PLATFORM_WIN32 + #if GLAD_PLATFORM_UWP + size_t buffer_size = (strlen(lib_names[i]) + 1) * sizeof(WCHAR); + LPWSTR buffer = (LPWSTR) malloc(buffer_size); + if (buffer != NULL) { + int ret = MultiByteToWideChar(CP_ACP, 0, lib_names[i], -1, buffer, buffer_size); + if (ret != 0) { + handle = (void*) LoadPackagedLibrary(buffer, 0); + } + free((void*) buffer); + } + #else + handle = (void*) LoadLibraryA(lib_names[i]); + #endif +#else + handle = dlopen(lib_names[i], RTLD_LAZY | RTLD_LOCAL); +#endif + if (handle != NULL) { + return handle; + } + } + + return NULL; +} + +static void glad_close_dlopen_handle(void* handle) { + if (handle != NULL) { +#if GLAD_PLATFORM_WIN32 + FreeLibrary((HMODULE) handle); +#else + dlclose(handle); +#endif + } +} + +static GLADapiproc glad_dlsym_handle(void* handle, const char *name) { + if (handle == NULL) { + return NULL; + } + +#if GLAD_PLATFORM_WIN32 + return (GLADapiproc) GetProcAddress((HMODULE) handle, name); +#else + return GLAD_GNUC_EXTENSION (GLADapiproc) dlsym(handle, name); +#endif +} + +#endif /* GLAD_LOADER_LIBRARY_C_ */ + +typedef void* (GLAD_API_PTR *GLADglprocaddrfunc)(const char*); +struct _glad_gl_userptr { + void *handle; + GLADglprocaddrfunc gl_get_proc_address_ptr; +}; + +static GLADapiproc glad_gl_get_proc(void *vuserptr, const char *name) { + struct _glad_gl_userptr userptr = *(struct _glad_gl_userptr*) vuserptr; + GLADapiproc result = NULL; + + if(userptr.gl_get_proc_address_ptr != NULL) { + result = 
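/* Editor's note: extension and post-1.0 entry points must come from the
   platform resolver queried next (wglGetProcAddress on Windows,
   glXGetProcAddressARB on X11), while the dlsym/GetProcAddress fallback
   on the library handle covers core functions that resolver may not
   return, notably the OpenGL 1.1 core set exported directly by
   opengl32.dll. */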
GLAD_GNUC_EXTENSION (GLADapiproc) userptr.gl_get_proc_address_ptr(name); + } + if(result == NULL) { + result = glad_dlsym_handle(userptr.handle, name); + } + + return result; +} + +static void* _glad_GL_loader_handle = NULL; + +static void* glad_gl_dlopen_handle(void) { +#if GLAD_PLATFORM_APPLE + static const char *NAMES[] = { + "../Frameworks/OpenGL.framework/OpenGL", + "/Library/Frameworks/OpenGL.framework/OpenGL", + "/System/Library/Frameworks/OpenGL.framework/OpenGL", + "/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL" + }; +#elif GLAD_PLATFORM_WIN32 + static const char *NAMES[] = {"opengl32.dll"}; +#else + static const char *NAMES[] = { + #if defined(__CYGWIN__) + "libGL-1.so", + #endif + "libGL.so.1", + "libGL.so" + }; +#endif + + if (_glad_GL_loader_handle == NULL) { + _glad_GL_loader_handle = glad_get_dlopen_handle(NAMES, sizeof(NAMES) / sizeof(NAMES[0])); + } + + return _glad_GL_loader_handle; +} + +static struct _glad_gl_userptr glad_gl_build_userptr(void *handle) { + struct _glad_gl_userptr userptr; + + userptr.handle = handle; +#if GLAD_PLATFORM_APPLE || defined(__HAIKU__) + userptr.gl_get_proc_address_ptr = NULL; +#elif GLAD_PLATFORM_WIN32 + userptr.gl_get_proc_address_ptr = + (GLADglprocaddrfunc) glad_dlsym_handle(handle, "wglGetProcAddress"); +#else + userptr.gl_get_proc_address_ptr = + (GLADglprocaddrfunc) glad_dlsym_handle(handle, "glXGetProcAddressARB"); +#endif + + return userptr; +} + +int gladLoaderLoadGL(void) { + int version = 0; + void *handle; + int did_load = 0; + struct _glad_gl_userptr userptr; + + did_load = _glad_GL_loader_handle == NULL; + handle = glad_gl_dlopen_handle(); + if (handle) { + userptr = glad_gl_build_userptr(handle); + + version = gladLoadGLUserPtr(glad_gl_get_proc, &userptr); + + if (did_load) { + gladLoaderUnloadGL(); + } + } + + return version; +} + + + +void gladLoaderUnloadGL(void) { + if (_glad_GL_loader_handle != NULL) { + glad_close_dlopen_handle(_glad_GL_loader_handle); + _glad_GL_loader_handle = NULL; + } +} + +#endif /* GLAD_GL */ + +#ifdef __cplusplus +} +#endif diff --git a/thirdparty/glad/glad.c b/thirdparty/glad/glad.c deleted file mode 100644 index 4b20178ef7..0000000000 --- a/thirdparty/glad/glad.c +++ /dev/null @@ -1,1951 +0,0 @@ -/* - - OpenGL loader generated by glad 0.1.36 on Sun Sep 4 15:50:32 2022. 
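   [Editor's note: a minimal usage sketch of the API change, assuming a
   GLFW host application (glfwGetProcAddress is an assumed loader, not
   something this diff prescribes): with the glad 2 loader above, an
   application calls gladLoadGL(glfwGetProcAddress) or the self-loading
   gladLoaderLoadGL(); with this deleted 0.1.x loader it called
   gladLoadGLLoader((GLADloadproc) glfwGetProcAddress) or the
   parameterless gladLoadGL(void) shown below.]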
- - Language/Generator: C/C++ - Specification: gl - APIs: gl=3.3 - Profile: compatibility - Extensions: - GL_ARB_debug_output, - GL_ARB_framebuffer_object, - GL_EXT_framebuffer_blit, - GL_EXT_framebuffer_multisample, - GL_EXT_framebuffer_object, - GL_OVR_multiview, - GL_OVR_multiview2 - Loader: True - Local files: False - Omit khrplatform: False - Reproducible: False - - Commandline: - --profile="compatibility" --api="gl=3.3" --generator="c" --spec="gl" --extensions="GL_ARB_debug_output,GL_ARB_framebuffer_object,GL_EXT_framebuffer_blit,GL_EXT_framebuffer_multisample,GL_EXT_framebuffer_object,GL_OVR_multiview,GL_OVR_multiview2" - Online: - https://glad.dav1d.de/#profile=compatibility&language=c&specification=gl&loader=on&api=gl%3D3.3&extensions=GL_ARB_debug_output&extensions=GL_ARB_framebuffer_object&extensions=GL_EXT_framebuffer_blit&extensions=GL_EXT_framebuffer_multisample&extensions=GL_EXT_framebuffer_object&extensions=GL_OVR_multiview&extensions=GL_OVR_multiview2 -*/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <glad/glad.h> - -static void* get_proc(const char *namez); - -#if defined(_WIN32) || defined(__CYGWIN__) -#ifndef _WINDOWS_ -#undef APIENTRY -#endif -#include <windows.h> -static HMODULE libGL; - -typedef void* (APIENTRYP PFNWGLGETPROCADDRESSPROC_PRIVATE)(const char*); -static PFNWGLGETPROCADDRESSPROC_PRIVATE gladGetProcAddressPtr; - -#ifdef _MSC_VER -#ifdef __has_include - #if __has_include(<winapifamily.h>) - #define HAVE_WINAPIFAMILY 1 - #endif -#elif _MSC_VER >= 1700 && !_USING_V110_SDK71_ - #define HAVE_WINAPIFAMILY 1 -#endif -#endif - -#ifdef HAVE_WINAPIFAMILY - #include <winapifamily.h> - #if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) - #define IS_UWP 1 - #endif -#endif - -static -int open_gl(void) { -#ifndef IS_UWP - libGL = LoadLibraryW(L"opengl32.dll"); - if(libGL != NULL) { - void (* tmp)(void); - tmp = (void(*)(void)) GetProcAddress(libGL, "wglGetProcAddress"); - gladGetProcAddressPtr = (PFNWGLGETPROCADDRESSPROC_PRIVATE) tmp; - return gladGetProcAddressPtr != NULL; - } -#endif - - return 0; -} - -static -void close_gl(void) { - if(libGL != NULL) { - FreeLibrary((HMODULE) libGL); - libGL = NULL; - } -} -#else -#include <dlfcn.h> -static void* libGL; - -#if !defined(__APPLE__) && !defined(__HAIKU__) -typedef void* (APIENTRYP PFNGLXGETPROCADDRESSPROC_PRIVATE)(const char*); -static PFNGLXGETPROCADDRESSPROC_PRIVATE gladGetProcAddressPtr; -#endif - -static -int open_gl(void) { -#ifdef __APPLE__ - static const char *NAMES[] = { - "../Frameworks/OpenGL.framework/OpenGL", - "/Library/Frameworks/OpenGL.framework/OpenGL", - "/System/Library/Frameworks/OpenGL.framework/OpenGL", - "/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL" - }; -#else - static const char *NAMES[] = {"libGL.so.1", "libGL.so"}; -#endif - - unsigned int index = 0; - for(index = 0; index < (sizeof(NAMES) / sizeof(NAMES[0])); index++) { - libGL = dlopen(NAMES[index], RTLD_NOW | RTLD_GLOBAL); - - if(libGL != NULL) { -#if defined(__APPLE__) || defined(__HAIKU__) - return 1; -#else - gladGetProcAddressPtr = (PFNGLXGETPROCADDRESSPROC_PRIVATE)dlsym(libGL, - "glXGetProcAddressARB"); - return gladGetProcAddressPtr != NULL; -#endif - } - } - - return 0; -} - -static -void close_gl(void) { - if(libGL != NULL) { - dlclose(libGL); - libGL = NULL; - } -} -#endif - -static -void* get_proc(const char *namez) { - void* result = NULL; - if(libGL == NULL) return NULL; - -#if !defined(__APPLE__) && !defined(__HAIKU__) - 
if(gladGetProcAddressPtr != NULL) { - result = gladGetProcAddressPtr(namez); - } -#endif - if(result == NULL) { -#if defined(_WIN32) || defined(__CYGWIN__) - result = (void*)GetProcAddress((HMODULE) libGL, namez); -#else - result = dlsym(libGL, namez); -#endif - } - - return result; -} - -int gladLoadGL(void) { - int status = 0; - - if(open_gl()) { - status = gladLoadGLLoader(&get_proc); - close_gl(); - } - - return status; -} - -struct gladGLversionStruct GLVersion = { 0, 0 }; - -#if defined(GL_ES_VERSION_3_0) || defined(GL_VERSION_3_0) -#define _GLAD_IS_SOME_NEW_VERSION 1 -#endif - -static int max_loaded_major; -static int max_loaded_minor; - -static const char *exts = NULL; -static int num_exts_i = 0; -static char **exts_i = NULL; - -static int get_exts(void) { -#ifdef _GLAD_IS_SOME_NEW_VERSION - if(max_loaded_major < 3) { -#endif - exts = (const char *)glGetString(GL_EXTENSIONS); -#ifdef _GLAD_IS_SOME_NEW_VERSION - } else { - unsigned int index; - - num_exts_i = 0; - glGetIntegerv(GL_NUM_EXTENSIONS, &num_exts_i); - if (num_exts_i > 0) { - exts_i = (char **)malloc((size_t)num_exts_i * (sizeof *exts_i)); - } - - if (exts_i == NULL) { - return 0; - } - - for(index = 0; index < (unsigned)num_exts_i; index++) { - const char *gl_str_tmp = (const char*)glGetStringi(GL_EXTENSIONS, index); - size_t len = strlen(gl_str_tmp); - - char *local_str = (char*)malloc((len+1) * sizeof(char)); - if(local_str != NULL) { - memcpy(local_str, gl_str_tmp, (len+1) * sizeof(char)); - } - exts_i[index] = local_str; - } - } -#endif - return 1; -} - -static void free_exts(void) { - if (exts_i != NULL) { - int index; - for(index = 0; index < num_exts_i; index++) { - free((char *)exts_i[index]); - } - free((void *)exts_i); - exts_i = NULL; - } -} - -static int has_ext(const char *ext) { -#ifdef _GLAD_IS_SOME_NEW_VERSION - if(max_loaded_major < 3) { -#endif - const char *extensions; - const char *loc; - const char *terminator; - extensions = exts; - if(extensions == NULL || ext == NULL) { - return 0; - } - - while(1) { - loc = strstr(extensions, ext); - if(loc == NULL) { - return 0; - } - - terminator = loc + strlen(ext); - if((loc == extensions || *(loc - 1) == ' ') && - (*terminator == ' ' || *terminator == '\0')) { - return 1; - } - extensions = terminator; - } -#ifdef _GLAD_IS_SOME_NEW_VERSION - } else { - int index; - if(exts_i == NULL) return 0; - for(index = 0; index < num_exts_i; index++) { - const char *e = exts_i[index]; - - if(exts_i[index] != NULL && strcmp(e, ext) == 0) { - return 1; - } - } - } -#endif - - return 0; -} -int GLAD_GL_VERSION_1_0 = 0; -int GLAD_GL_VERSION_1_1 = 0; -int GLAD_GL_VERSION_1_2 = 0; -int GLAD_GL_VERSION_1_3 = 0; -int GLAD_GL_VERSION_1_4 = 0; -int GLAD_GL_VERSION_1_5 = 0; -int GLAD_GL_VERSION_2_0 = 0; -int GLAD_GL_VERSION_2_1 = 0; -int GLAD_GL_VERSION_3_0 = 0; -int GLAD_GL_VERSION_3_1 = 0; -int GLAD_GL_VERSION_3_2 = 0; -int GLAD_GL_VERSION_3_3 = 0; -PFNGLACCUMPROC glad_glAccum = NULL; -PFNGLACTIVETEXTUREPROC glad_glActiveTexture = NULL; -PFNGLALPHAFUNCPROC glad_glAlphaFunc = NULL; -PFNGLARETEXTURESRESIDENTPROC glad_glAreTexturesResident = NULL; -PFNGLARRAYELEMENTPROC glad_glArrayElement = NULL; -PFNGLATTACHSHADERPROC glad_glAttachShader = NULL; -PFNGLBEGINPROC glad_glBegin = NULL; -PFNGLBEGINCONDITIONALRENDERPROC glad_glBeginConditionalRender = NULL; -PFNGLBEGINQUERYPROC glad_glBeginQuery = NULL; -PFNGLBEGINTRANSFORMFEEDBACKPROC glad_glBeginTransformFeedback = NULL; -PFNGLBINDATTRIBLOCATIONPROC glad_glBindAttribLocation = NULL; -PFNGLBINDBUFFERPROC glad_glBindBuffer = NULL; 
-PFNGLBINDBUFFERBASEPROC glad_glBindBufferBase = NULL; -PFNGLBINDBUFFERRANGEPROC glad_glBindBufferRange = NULL; -PFNGLBINDFRAGDATALOCATIONPROC glad_glBindFragDataLocation = NULL; -PFNGLBINDFRAGDATALOCATIONINDEXEDPROC glad_glBindFragDataLocationIndexed = NULL; -PFNGLBINDFRAMEBUFFERPROC glad_glBindFramebuffer = NULL; -PFNGLBINDRENDERBUFFERPROC glad_glBindRenderbuffer = NULL; -PFNGLBINDSAMPLERPROC glad_glBindSampler = NULL; -PFNGLBINDTEXTUREPROC glad_glBindTexture = NULL; -PFNGLBINDVERTEXARRAYPROC glad_glBindVertexArray = NULL; -PFNGLBITMAPPROC glad_glBitmap = NULL; -PFNGLBLENDCOLORPROC glad_glBlendColor = NULL; -PFNGLBLENDEQUATIONPROC glad_glBlendEquation = NULL; -PFNGLBLENDEQUATIONSEPARATEPROC glad_glBlendEquationSeparate = NULL; -PFNGLBLENDFUNCPROC glad_glBlendFunc = NULL; -PFNGLBLENDFUNCSEPARATEPROC glad_glBlendFuncSeparate = NULL; -PFNGLBLITFRAMEBUFFERPROC glad_glBlitFramebuffer = NULL; -PFNGLBUFFERDATAPROC glad_glBufferData = NULL; -PFNGLBUFFERSUBDATAPROC glad_glBufferSubData = NULL; -PFNGLCALLLISTPROC glad_glCallList = NULL; -PFNGLCALLLISTSPROC glad_glCallLists = NULL; -PFNGLCHECKFRAMEBUFFERSTATUSPROC glad_glCheckFramebufferStatus = NULL; -PFNGLCLAMPCOLORPROC glad_glClampColor = NULL; -PFNGLCLEARPROC glad_glClear = NULL; -PFNGLCLEARACCUMPROC glad_glClearAccum = NULL; -PFNGLCLEARBUFFERFIPROC glad_glClearBufferfi = NULL; -PFNGLCLEARBUFFERFVPROC glad_glClearBufferfv = NULL; -PFNGLCLEARBUFFERIVPROC glad_glClearBufferiv = NULL; -PFNGLCLEARBUFFERUIVPROC glad_glClearBufferuiv = NULL; -PFNGLCLEARCOLORPROC glad_glClearColor = NULL; -PFNGLCLEARDEPTHPROC glad_glClearDepth = NULL; -PFNGLCLEARINDEXPROC glad_glClearIndex = NULL; -PFNGLCLEARSTENCILPROC glad_glClearStencil = NULL; -PFNGLCLIENTACTIVETEXTUREPROC glad_glClientActiveTexture = NULL; -PFNGLCLIENTWAITSYNCPROC glad_glClientWaitSync = NULL; -PFNGLCLIPPLANEPROC glad_glClipPlane = NULL; -PFNGLCOLOR3BPROC glad_glColor3b = NULL; -PFNGLCOLOR3BVPROC glad_glColor3bv = NULL; -PFNGLCOLOR3DPROC glad_glColor3d = NULL; -PFNGLCOLOR3DVPROC glad_glColor3dv = NULL; -PFNGLCOLOR3FPROC glad_glColor3f = NULL; -PFNGLCOLOR3FVPROC glad_glColor3fv = NULL; -PFNGLCOLOR3IPROC glad_glColor3i = NULL; -PFNGLCOLOR3IVPROC glad_glColor3iv = NULL; -PFNGLCOLOR3SPROC glad_glColor3s = NULL; -PFNGLCOLOR3SVPROC glad_glColor3sv = NULL; -PFNGLCOLOR3UBPROC glad_glColor3ub = NULL; -PFNGLCOLOR3UBVPROC glad_glColor3ubv = NULL; -PFNGLCOLOR3UIPROC glad_glColor3ui = NULL; -PFNGLCOLOR3UIVPROC glad_glColor3uiv = NULL; -PFNGLCOLOR3USPROC glad_glColor3us = NULL; -PFNGLCOLOR3USVPROC glad_glColor3usv = NULL; -PFNGLCOLOR4BPROC glad_glColor4b = NULL; -PFNGLCOLOR4BVPROC glad_glColor4bv = NULL; -PFNGLCOLOR4DPROC glad_glColor4d = NULL; -PFNGLCOLOR4DVPROC glad_glColor4dv = NULL; -PFNGLCOLOR4FPROC glad_glColor4f = NULL; -PFNGLCOLOR4FVPROC glad_glColor4fv = NULL; -PFNGLCOLOR4IPROC glad_glColor4i = NULL; -PFNGLCOLOR4IVPROC glad_glColor4iv = NULL; -PFNGLCOLOR4SPROC glad_glColor4s = NULL; -PFNGLCOLOR4SVPROC glad_glColor4sv = NULL; -PFNGLCOLOR4UBPROC glad_glColor4ub = NULL; -PFNGLCOLOR4UBVPROC glad_glColor4ubv = NULL; -PFNGLCOLOR4UIPROC glad_glColor4ui = NULL; -PFNGLCOLOR4UIVPROC glad_glColor4uiv = NULL; -PFNGLCOLOR4USPROC glad_glColor4us = NULL; -PFNGLCOLOR4USVPROC glad_glColor4usv = NULL; -PFNGLCOLORMASKPROC glad_glColorMask = NULL; -PFNGLCOLORMASKIPROC glad_glColorMaski = NULL; -PFNGLCOLORMATERIALPROC glad_glColorMaterial = NULL; -PFNGLCOLORP3UIPROC glad_glColorP3ui = NULL; -PFNGLCOLORP3UIVPROC glad_glColorP3uiv = NULL; -PFNGLCOLORP4UIPROC glad_glColorP4ui = NULL; -PFNGLCOLORP4UIVPROC glad_glColorP4uiv = 
NULL; -PFNGLCOLORPOINTERPROC glad_glColorPointer = NULL; -PFNGLCOMPILESHADERPROC glad_glCompileShader = NULL; -PFNGLCOMPRESSEDTEXIMAGE1DPROC glad_glCompressedTexImage1D = NULL; -PFNGLCOMPRESSEDTEXIMAGE2DPROC glad_glCompressedTexImage2D = NULL; -PFNGLCOMPRESSEDTEXIMAGE3DPROC glad_glCompressedTexImage3D = NULL; -PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC glad_glCompressedTexSubImage1D = NULL; -PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC glad_glCompressedTexSubImage2D = NULL; -PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC glad_glCompressedTexSubImage3D = NULL; -PFNGLCOPYBUFFERSUBDATAPROC glad_glCopyBufferSubData = NULL; -PFNGLCOPYPIXELSPROC glad_glCopyPixels = NULL; -PFNGLCOPYTEXIMAGE1DPROC glad_glCopyTexImage1D = NULL; -PFNGLCOPYTEXIMAGE2DPROC glad_glCopyTexImage2D = NULL; -PFNGLCOPYTEXSUBIMAGE1DPROC glad_glCopyTexSubImage1D = NULL; -PFNGLCOPYTEXSUBIMAGE2DPROC glad_glCopyTexSubImage2D = NULL; -PFNGLCOPYTEXSUBIMAGE3DPROC glad_glCopyTexSubImage3D = NULL; -PFNGLCREATEPROGRAMPROC glad_glCreateProgram = NULL; -PFNGLCREATESHADERPROC glad_glCreateShader = NULL; -PFNGLCULLFACEPROC glad_glCullFace = NULL; -PFNGLDELETEBUFFERSPROC glad_glDeleteBuffers = NULL; -PFNGLDELETEFRAMEBUFFERSPROC glad_glDeleteFramebuffers = NULL; -PFNGLDELETELISTSPROC glad_glDeleteLists = NULL; -PFNGLDELETEPROGRAMPROC glad_glDeleteProgram = NULL; -PFNGLDELETEQUERIESPROC glad_glDeleteQueries = NULL; -PFNGLDELETERENDERBUFFERSPROC glad_glDeleteRenderbuffers = NULL; -PFNGLDELETESAMPLERSPROC glad_glDeleteSamplers = NULL; -PFNGLDELETESHADERPROC glad_glDeleteShader = NULL; -PFNGLDELETESYNCPROC glad_glDeleteSync = NULL; -PFNGLDELETETEXTURESPROC glad_glDeleteTextures = NULL; -PFNGLDELETEVERTEXARRAYSPROC glad_glDeleteVertexArrays = NULL; -PFNGLDEPTHFUNCPROC glad_glDepthFunc = NULL; -PFNGLDEPTHMASKPROC glad_glDepthMask = NULL; -PFNGLDEPTHRANGEPROC glad_glDepthRange = NULL; -PFNGLDETACHSHADERPROC glad_glDetachShader = NULL; -PFNGLDISABLEPROC glad_glDisable = NULL; -PFNGLDISABLECLIENTSTATEPROC glad_glDisableClientState = NULL; -PFNGLDISABLEVERTEXATTRIBARRAYPROC glad_glDisableVertexAttribArray = NULL; -PFNGLDISABLEIPROC glad_glDisablei = NULL; -PFNGLDRAWARRAYSPROC glad_glDrawArrays = NULL; -PFNGLDRAWARRAYSINSTANCEDPROC glad_glDrawArraysInstanced = NULL; -PFNGLDRAWBUFFERPROC glad_glDrawBuffer = NULL; -PFNGLDRAWBUFFERSPROC glad_glDrawBuffers = NULL; -PFNGLDRAWELEMENTSPROC glad_glDrawElements = NULL; -PFNGLDRAWELEMENTSBASEVERTEXPROC glad_glDrawElementsBaseVertex = NULL; -PFNGLDRAWELEMENTSINSTANCEDPROC glad_glDrawElementsInstanced = NULL; -PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC glad_glDrawElementsInstancedBaseVertex = NULL; -PFNGLDRAWPIXELSPROC glad_glDrawPixels = NULL; -PFNGLDRAWRANGEELEMENTSPROC glad_glDrawRangeElements = NULL; -PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC glad_glDrawRangeElementsBaseVertex = NULL; -PFNGLEDGEFLAGPROC glad_glEdgeFlag = NULL; -PFNGLEDGEFLAGPOINTERPROC glad_glEdgeFlagPointer = NULL; -PFNGLEDGEFLAGVPROC glad_glEdgeFlagv = NULL; -PFNGLENABLEPROC glad_glEnable = NULL; -PFNGLENABLECLIENTSTATEPROC glad_glEnableClientState = NULL; -PFNGLENABLEVERTEXATTRIBARRAYPROC glad_glEnableVertexAttribArray = NULL; -PFNGLENABLEIPROC glad_glEnablei = NULL; -PFNGLENDPROC glad_glEnd = NULL; -PFNGLENDCONDITIONALRENDERPROC glad_glEndConditionalRender = NULL; -PFNGLENDLISTPROC glad_glEndList = NULL; -PFNGLENDQUERYPROC glad_glEndQuery = NULL; -PFNGLENDTRANSFORMFEEDBACKPROC glad_glEndTransformFeedback = NULL; -PFNGLEVALCOORD1DPROC glad_glEvalCoord1d = NULL; -PFNGLEVALCOORD1DVPROC glad_glEvalCoord1dv = NULL; -PFNGLEVALCOORD1FPROC glad_glEvalCoord1f = NULL; -PFNGLEVALCOORD1FVPROC 
glad_glEvalCoord1fv = NULL; -PFNGLEVALCOORD2DPROC glad_glEvalCoord2d = NULL; -PFNGLEVALCOORD2DVPROC glad_glEvalCoord2dv = NULL; -PFNGLEVALCOORD2FPROC glad_glEvalCoord2f = NULL; -PFNGLEVALCOORD2FVPROC glad_glEvalCoord2fv = NULL; -PFNGLEVALMESH1PROC glad_glEvalMesh1 = NULL; -PFNGLEVALMESH2PROC glad_glEvalMesh2 = NULL; -PFNGLEVALPOINT1PROC glad_glEvalPoint1 = NULL; -PFNGLEVALPOINT2PROC glad_glEvalPoint2 = NULL; -PFNGLFEEDBACKBUFFERPROC glad_glFeedbackBuffer = NULL; -PFNGLFENCESYNCPROC glad_glFenceSync = NULL; -PFNGLFINISHPROC glad_glFinish = NULL; -PFNGLFLUSHPROC glad_glFlush = NULL; -PFNGLFLUSHMAPPEDBUFFERRANGEPROC glad_glFlushMappedBufferRange = NULL; -PFNGLFOGCOORDPOINTERPROC glad_glFogCoordPointer = NULL; -PFNGLFOGCOORDDPROC glad_glFogCoordd = NULL; -PFNGLFOGCOORDDVPROC glad_glFogCoorddv = NULL; -PFNGLFOGCOORDFPROC glad_glFogCoordf = NULL; -PFNGLFOGCOORDFVPROC glad_glFogCoordfv = NULL; -PFNGLFOGFPROC glad_glFogf = NULL; -PFNGLFOGFVPROC glad_glFogfv = NULL; -PFNGLFOGIPROC glad_glFogi = NULL; -PFNGLFOGIVPROC glad_glFogiv = NULL; -PFNGLFRAMEBUFFERRENDERBUFFERPROC glad_glFramebufferRenderbuffer = NULL; -PFNGLFRAMEBUFFERTEXTUREPROC glad_glFramebufferTexture = NULL; -PFNGLFRAMEBUFFERTEXTURE1DPROC glad_glFramebufferTexture1D = NULL; -PFNGLFRAMEBUFFERTEXTURE2DPROC glad_glFramebufferTexture2D = NULL; -PFNGLFRAMEBUFFERTEXTURE3DPROC glad_glFramebufferTexture3D = NULL; -PFNGLFRAMEBUFFERTEXTURELAYERPROC glad_glFramebufferTextureLayer = NULL; -PFNGLFRONTFACEPROC glad_glFrontFace = NULL; -PFNGLFRUSTUMPROC glad_glFrustum = NULL; -PFNGLGENBUFFERSPROC glad_glGenBuffers = NULL; -PFNGLGENFRAMEBUFFERSPROC glad_glGenFramebuffers = NULL; -PFNGLGENLISTSPROC glad_glGenLists = NULL; -PFNGLGENQUERIESPROC glad_glGenQueries = NULL; -PFNGLGENRENDERBUFFERSPROC glad_glGenRenderbuffers = NULL; -PFNGLGENSAMPLERSPROC glad_glGenSamplers = NULL; -PFNGLGENTEXTURESPROC glad_glGenTextures = NULL; -PFNGLGENVERTEXARRAYSPROC glad_glGenVertexArrays = NULL; -PFNGLGENERATEMIPMAPPROC glad_glGenerateMipmap = NULL; -PFNGLGETACTIVEATTRIBPROC glad_glGetActiveAttrib = NULL; -PFNGLGETACTIVEUNIFORMPROC glad_glGetActiveUniform = NULL; -PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC glad_glGetActiveUniformBlockName = NULL; -PFNGLGETACTIVEUNIFORMBLOCKIVPROC glad_glGetActiveUniformBlockiv = NULL; -PFNGLGETACTIVEUNIFORMNAMEPROC glad_glGetActiveUniformName = NULL; -PFNGLGETACTIVEUNIFORMSIVPROC glad_glGetActiveUniformsiv = NULL; -PFNGLGETATTACHEDSHADERSPROC glad_glGetAttachedShaders = NULL; -PFNGLGETATTRIBLOCATIONPROC glad_glGetAttribLocation = NULL; -PFNGLGETBOOLEANI_VPROC glad_glGetBooleani_v = NULL; -PFNGLGETBOOLEANVPROC glad_glGetBooleanv = NULL; -PFNGLGETBUFFERPARAMETERI64VPROC glad_glGetBufferParameteri64v = NULL; -PFNGLGETBUFFERPARAMETERIVPROC glad_glGetBufferParameteriv = NULL; -PFNGLGETBUFFERPOINTERVPROC glad_glGetBufferPointerv = NULL; -PFNGLGETBUFFERSUBDATAPROC glad_glGetBufferSubData = NULL; -PFNGLGETCLIPPLANEPROC glad_glGetClipPlane = NULL; -PFNGLGETCOMPRESSEDTEXIMAGEPROC glad_glGetCompressedTexImage = NULL; -PFNGLGETDOUBLEVPROC glad_glGetDoublev = NULL; -PFNGLGETERRORPROC glad_glGetError = NULL; -PFNGLGETFLOATVPROC glad_glGetFloatv = NULL; -PFNGLGETFRAGDATAINDEXPROC glad_glGetFragDataIndex = NULL; -PFNGLGETFRAGDATALOCATIONPROC glad_glGetFragDataLocation = NULL; -PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC glad_glGetFramebufferAttachmentParameteriv = NULL; -PFNGLGETINTEGER64I_VPROC glad_glGetInteger64i_v = NULL; -PFNGLGETINTEGER64VPROC glad_glGetInteger64v = NULL; -PFNGLGETINTEGERI_VPROC glad_glGetIntegeri_v = NULL; -PFNGLGETINTEGERVPROC 
glad_glGetIntegerv = NULL; -PFNGLGETLIGHTFVPROC glad_glGetLightfv = NULL; -PFNGLGETLIGHTIVPROC glad_glGetLightiv = NULL; -PFNGLGETMAPDVPROC glad_glGetMapdv = NULL; -PFNGLGETMAPFVPROC glad_glGetMapfv = NULL; -PFNGLGETMAPIVPROC glad_glGetMapiv = NULL; -PFNGLGETMATERIALFVPROC glad_glGetMaterialfv = NULL; -PFNGLGETMATERIALIVPROC glad_glGetMaterialiv = NULL; -PFNGLGETMULTISAMPLEFVPROC glad_glGetMultisamplefv = NULL; -PFNGLGETPIXELMAPFVPROC glad_glGetPixelMapfv = NULL; -PFNGLGETPIXELMAPUIVPROC glad_glGetPixelMapuiv = NULL; -PFNGLGETPIXELMAPUSVPROC glad_glGetPixelMapusv = NULL; -PFNGLGETPOINTERVPROC glad_glGetPointerv = NULL; -PFNGLGETPOLYGONSTIPPLEPROC glad_glGetPolygonStipple = NULL; -PFNGLGETPROGRAMINFOLOGPROC glad_glGetProgramInfoLog = NULL; -PFNGLGETPROGRAMIVPROC glad_glGetProgramiv = NULL; -PFNGLGETQUERYOBJECTI64VPROC glad_glGetQueryObjecti64v = NULL; -PFNGLGETQUERYOBJECTIVPROC glad_glGetQueryObjectiv = NULL; -PFNGLGETQUERYOBJECTUI64VPROC glad_glGetQueryObjectui64v = NULL; -PFNGLGETQUERYOBJECTUIVPROC glad_glGetQueryObjectuiv = NULL; -PFNGLGETQUERYIVPROC glad_glGetQueryiv = NULL; -PFNGLGETRENDERBUFFERPARAMETERIVPROC glad_glGetRenderbufferParameteriv = NULL; -PFNGLGETSAMPLERPARAMETERIIVPROC glad_glGetSamplerParameterIiv = NULL; -PFNGLGETSAMPLERPARAMETERIUIVPROC glad_glGetSamplerParameterIuiv = NULL; -PFNGLGETSAMPLERPARAMETERFVPROC glad_glGetSamplerParameterfv = NULL; -PFNGLGETSAMPLERPARAMETERIVPROC glad_glGetSamplerParameteriv = NULL; -PFNGLGETSHADERINFOLOGPROC glad_glGetShaderInfoLog = NULL; -PFNGLGETSHADERSOURCEPROC glad_glGetShaderSource = NULL; -PFNGLGETSHADERIVPROC glad_glGetShaderiv = NULL; -PFNGLGETSTRINGPROC glad_glGetString = NULL; -PFNGLGETSTRINGIPROC glad_glGetStringi = NULL; -PFNGLGETSYNCIVPROC glad_glGetSynciv = NULL; -PFNGLGETTEXENVFVPROC glad_glGetTexEnvfv = NULL; -PFNGLGETTEXENVIVPROC glad_glGetTexEnviv = NULL; -PFNGLGETTEXGENDVPROC glad_glGetTexGendv = NULL; -PFNGLGETTEXGENFVPROC glad_glGetTexGenfv = NULL; -PFNGLGETTEXGENIVPROC glad_glGetTexGeniv = NULL; -PFNGLGETTEXIMAGEPROC glad_glGetTexImage = NULL; -PFNGLGETTEXLEVELPARAMETERFVPROC glad_glGetTexLevelParameterfv = NULL; -PFNGLGETTEXLEVELPARAMETERIVPROC glad_glGetTexLevelParameteriv = NULL; -PFNGLGETTEXPARAMETERIIVPROC glad_glGetTexParameterIiv = NULL; -PFNGLGETTEXPARAMETERIUIVPROC glad_glGetTexParameterIuiv = NULL; -PFNGLGETTEXPARAMETERFVPROC glad_glGetTexParameterfv = NULL; -PFNGLGETTEXPARAMETERIVPROC glad_glGetTexParameteriv = NULL; -PFNGLGETTRANSFORMFEEDBACKVARYINGPROC glad_glGetTransformFeedbackVarying = NULL; -PFNGLGETUNIFORMBLOCKINDEXPROC glad_glGetUniformBlockIndex = NULL; -PFNGLGETUNIFORMINDICESPROC glad_glGetUniformIndices = NULL; -PFNGLGETUNIFORMLOCATIONPROC glad_glGetUniformLocation = NULL; -PFNGLGETUNIFORMFVPROC glad_glGetUniformfv = NULL; -PFNGLGETUNIFORMIVPROC glad_glGetUniformiv = NULL; -PFNGLGETUNIFORMUIVPROC glad_glGetUniformuiv = NULL; -PFNGLGETVERTEXATTRIBIIVPROC glad_glGetVertexAttribIiv = NULL; -PFNGLGETVERTEXATTRIBIUIVPROC glad_glGetVertexAttribIuiv = NULL; -PFNGLGETVERTEXATTRIBPOINTERVPROC glad_glGetVertexAttribPointerv = NULL; -PFNGLGETVERTEXATTRIBDVPROC glad_glGetVertexAttribdv = NULL; -PFNGLGETVERTEXATTRIBFVPROC glad_glGetVertexAttribfv = NULL; -PFNGLGETVERTEXATTRIBIVPROC glad_glGetVertexAttribiv = NULL; -PFNGLHINTPROC glad_glHint = NULL; -PFNGLINDEXMASKPROC glad_glIndexMask = NULL; -PFNGLINDEXPOINTERPROC glad_glIndexPointer = NULL; -PFNGLINDEXDPROC glad_glIndexd = NULL; -PFNGLINDEXDVPROC glad_glIndexdv = NULL; -PFNGLINDEXFPROC glad_glIndexf = NULL; -PFNGLINDEXFVPROC glad_glIndexfv = NULL; 
-PFNGLINDEXIPROC glad_glIndexi = NULL; -PFNGLINDEXIVPROC glad_glIndexiv = NULL; -PFNGLINDEXSPROC glad_glIndexs = NULL; -PFNGLINDEXSVPROC glad_glIndexsv = NULL; -PFNGLINDEXUBPROC glad_glIndexub = NULL; -PFNGLINDEXUBVPROC glad_glIndexubv = NULL; -PFNGLINITNAMESPROC glad_glInitNames = NULL; -PFNGLINTERLEAVEDARRAYSPROC glad_glInterleavedArrays = NULL; -PFNGLISBUFFERPROC glad_glIsBuffer = NULL; -PFNGLISENABLEDPROC glad_glIsEnabled = NULL; -PFNGLISENABLEDIPROC glad_glIsEnabledi = NULL; -PFNGLISFRAMEBUFFERPROC glad_glIsFramebuffer = NULL; -PFNGLISLISTPROC glad_glIsList = NULL; -PFNGLISPROGRAMPROC glad_glIsProgram = NULL; -PFNGLISQUERYPROC glad_glIsQuery = NULL; -PFNGLISRENDERBUFFERPROC glad_glIsRenderbuffer = NULL; -PFNGLISSAMPLERPROC glad_glIsSampler = NULL; -PFNGLISSHADERPROC glad_glIsShader = NULL; -PFNGLISSYNCPROC glad_glIsSync = NULL; -PFNGLISTEXTUREPROC glad_glIsTexture = NULL; -PFNGLISVERTEXARRAYPROC glad_glIsVertexArray = NULL; -PFNGLLIGHTMODELFPROC glad_glLightModelf = NULL; -PFNGLLIGHTMODELFVPROC glad_glLightModelfv = NULL; -PFNGLLIGHTMODELIPROC glad_glLightModeli = NULL; -PFNGLLIGHTMODELIVPROC glad_glLightModeliv = NULL; -PFNGLLIGHTFPROC glad_glLightf = NULL; -PFNGLLIGHTFVPROC glad_glLightfv = NULL; -PFNGLLIGHTIPROC glad_glLighti = NULL; -PFNGLLIGHTIVPROC glad_glLightiv = NULL; -PFNGLLINESTIPPLEPROC glad_glLineStipple = NULL; -PFNGLLINEWIDTHPROC glad_glLineWidth = NULL; -PFNGLLINKPROGRAMPROC glad_glLinkProgram = NULL; -PFNGLLISTBASEPROC glad_glListBase = NULL; -PFNGLLOADIDENTITYPROC glad_glLoadIdentity = NULL; -PFNGLLOADMATRIXDPROC glad_glLoadMatrixd = NULL; -PFNGLLOADMATRIXFPROC glad_glLoadMatrixf = NULL; -PFNGLLOADNAMEPROC glad_glLoadName = NULL; -PFNGLLOADTRANSPOSEMATRIXDPROC glad_glLoadTransposeMatrixd = NULL; -PFNGLLOADTRANSPOSEMATRIXFPROC glad_glLoadTransposeMatrixf = NULL; -PFNGLLOGICOPPROC glad_glLogicOp = NULL; -PFNGLMAP1DPROC glad_glMap1d = NULL; -PFNGLMAP1FPROC glad_glMap1f = NULL; -PFNGLMAP2DPROC glad_glMap2d = NULL; -PFNGLMAP2FPROC glad_glMap2f = NULL; -PFNGLMAPBUFFERPROC glad_glMapBuffer = NULL; -PFNGLMAPBUFFERRANGEPROC glad_glMapBufferRange = NULL; -PFNGLMAPGRID1DPROC glad_glMapGrid1d = NULL; -PFNGLMAPGRID1FPROC glad_glMapGrid1f = NULL; -PFNGLMAPGRID2DPROC glad_glMapGrid2d = NULL; -PFNGLMAPGRID2FPROC glad_glMapGrid2f = NULL; -PFNGLMATERIALFPROC glad_glMaterialf = NULL; -PFNGLMATERIALFVPROC glad_glMaterialfv = NULL; -PFNGLMATERIALIPROC glad_glMateriali = NULL; -PFNGLMATERIALIVPROC glad_glMaterialiv = NULL; -PFNGLMATRIXMODEPROC glad_glMatrixMode = NULL; -PFNGLMULTMATRIXDPROC glad_glMultMatrixd = NULL; -PFNGLMULTMATRIXFPROC glad_glMultMatrixf = NULL; -PFNGLMULTTRANSPOSEMATRIXDPROC glad_glMultTransposeMatrixd = NULL; -PFNGLMULTTRANSPOSEMATRIXFPROC glad_glMultTransposeMatrixf = NULL; -PFNGLMULTIDRAWARRAYSPROC glad_glMultiDrawArrays = NULL; -PFNGLMULTIDRAWELEMENTSPROC glad_glMultiDrawElements = NULL; -PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC glad_glMultiDrawElementsBaseVertex = NULL; -PFNGLMULTITEXCOORD1DPROC glad_glMultiTexCoord1d = NULL; -PFNGLMULTITEXCOORD1DVPROC glad_glMultiTexCoord1dv = NULL; -PFNGLMULTITEXCOORD1FPROC glad_glMultiTexCoord1f = NULL; -PFNGLMULTITEXCOORD1FVPROC glad_glMultiTexCoord1fv = NULL; -PFNGLMULTITEXCOORD1IPROC glad_glMultiTexCoord1i = NULL; -PFNGLMULTITEXCOORD1IVPROC glad_glMultiTexCoord1iv = NULL; -PFNGLMULTITEXCOORD1SPROC glad_glMultiTexCoord1s = NULL; -PFNGLMULTITEXCOORD1SVPROC glad_glMultiTexCoord1sv = NULL; -PFNGLMULTITEXCOORD2DPROC glad_glMultiTexCoord2d = NULL; -PFNGLMULTITEXCOORD2DVPROC glad_glMultiTexCoord2dv = NULL; -PFNGLMULTITEXCOORD2FPROC 
glad_glMultiTexCoord2f = NULL; -PFNGLMULTITEXCOORD2FVPROC glad_glMultiTexCoord2fv = NULL; -PFNGLMULTITEXCOORD2IPROC glad_glMultiTexCoord2i = NULL; -PFNGLMULTITEXCOORD2IVPROC glad_glMultiTexCoord2iv = NULL; -PFNGLMULTITEXCOORD2SPROC glad_glMultiTexCoord2s = NULL; -PFNGLMULTITEXCOORD2SVPROC glad_glMultiTexCoord2sv = NULL; -PFNGLMULTITEXCOORD3DPROC glad_glMultiTexCoord3d = NULL; -PFNGLMULTITEXCOORD3DVPROC glad_glMultiTexCoord3dv = NULL; -PFNGLMULTITEXCOORD3FPROC glad_glMultiTexCoord3f = NULL; -PFNGLMULTITEXCOORD3FVPROC glad_glMultiTexCoord3fv = NULL; -PFNGLMULTITEXCOORD3IPROC glad_glMultiTexCoord3i = NULL; -PFNGLMULTITEXCOORD3IVPROC glad_glMultiTexCoord3iv = NULL; -PFNGLMULTITEXCOORD3SPROC glad_glMultiTexCoord3s = NULL; -PFNGLMULTITEXCOORD3SVPROC glad_glMultiTexCoord3sv = NULL; -PFNGLMULTITEXCOORD4DPROC glad_glMultiTexCoord4d = NULL; -PFNGLMULTITEXCOORD4DVPROC glad_glMultiTexCoord4dv = NULL; -PFNGLMULTITEXCOORD4FPROC glad_glMultiTexCoord4f = NULL; -PFNGLMULTITEXCOORD4FVPROC glad_glMultiTexCoord4fv = NULL; -PFNGLMULTITEXCOORD4IPROC glad_glMultiTexCoord4i = NULL; -PFNGLMULTITEXCOORD4IVPROC glad_glMultiTexCoord4iv = NULL; -PFNGLMULTITEXCOORD4SPROC glad_glMultiTexCoord4s = NULL; -PFNGLMULTITEXCOORD4SVPROC glad_glMultiTexCoord4sv = NULL; -PFNGLMULTITEXCOORDP1UIPROC glad_glMultiTexCoordP1ui = NULL; -PFNGLMULTITEXCOORDP1UIVPROC glad_glMultiTexCoordP1uiv = NULL; -PFNGLMULTITEXCOORDP2UIPROC glad_glMultiTexCoordP2ui = NULL; -PFNGLMULTITEXCOORDP2UIVPROC glad_glMultiTexCoordP2uiv = NULL; -PFNGLMULTITEXCOORDP3UIPROC glad_glMultiTexCoordP3ui = NULL; -PFNGLMULTITEXCOORDP3UIVPROC glad_glMultiTexCoordP3uiv = NULL; -PFNGLMULTITEXCOORDP4UIPROC glad_glMultiTexCoordP4ui = NULL; -PFNGLMULTITEXCOORDP4UIVPROC glad_glMultiTexCoordP4uiv = NULL; -PFNGLNEWLISTPROC glad_glNewList = NULL; -PFNGLNORMAL3BPROC glad_glNormal3b = NULL; -PFNGLNORMAL3BVPROC glad_glNormal3bv = NULL; -PFNGLNORMAL3DPROC glad_glNormal3d = NULL; -PFNGLNORMAL3DVPROC glad_glNormal3dv = NULL; -PFNGLNORMAL3FPROC glad_glNormal3f = NULL; -PFNGLNORMAL3FVPROC glad_glNormal3fv = NULL; -PFNGLNORMAL3IPROC glad_glNormal3i = NULL; -PFNGLNORMAL3IVPROC glad_glNormal3iv = NULL; -PFNGLNORMAL3SPROC glad_glNormal3s = NULL; -PFNGLNORMAL3SVPROC glad_glNormal3sv = NULL; -PFNGLNORMALP3UIPROC glad_glNormalP3ui = NULL; -PFNGLNORMALP3UIVPROC glad_glNormalP3uiv = NULL; -PFNGLNORMALPOINTERPROC glad_glNormalPointer = NULL; -PFNGLORTHOPROC glad_glOrtho = NULL; -PFNGLPASSTHROUGHPROC glad_glPassThrough = NULL; -PFNGLPIXELMAPFVPROC glad_glPixelMapfv = NULL; -PFNGLPIXELMAPUIVPROC glad_glPixelMapuiv = NULL; -PFNGLPIXELMAPUSVPROC glad_glPixelMapusv = NULL; -PFNGLPIXELSTOREFPROC glad_glPixelStoref = NULL; -PFNGLPIXELSTOREIPROC glad_glPixelStorei = NULL; -PFNGLPIXELTRANSFERFPROC glad_glPixelTransferf = NULL; -PFNGLPIXELTRANSFERIPROC glad_glPixelTransferi = NULL; -PFNGLPIXELZOOMPROC glad_glPixelZoom = NULL; -PFNGLPOINTPARAMETERFPROC glad_glPointParameterf = NULL; -PFNGLPOINTPARAMETERFVPROC glad_glPointParameterfv = NULL; -PFNGLPOINTPARAMETERIPROC glad_glPointParameteri = NULL; -PFNGLPOINTPARAMETERIVPROC glad_glPointParameteriv = NULL; -PFNGLPOINTSIZEPROC glad_glPointSize = NULL; -PFNGLPOLYGONMODEPROC glad_glPolygonMode = NULL; -PFNGLPOLYGONOFFSETPROC glad_glPolygonOffset = NULL; -PFNGLPOLYGONSTIPPLEPROC glad_glPolygonStipple = NULL; -PFNGLPOPATTRIBPROC glad_glPopAttrib = NULL; -PFNGLPOPCLIENTATTRIBPROC glad_glPopClientAttrib = NULL; -PFNGLPOPMATRIXPROC glad_glPopMatrix = NULL; -PFNGLPOPNAMEPROC glad_glPopName = NULL; -PFNGLPRIMITIVERESTARTINDEXPROC glad_glPrimitiveRestartIndex = NULL; 
-PFNGLPRIORITIZETEXTURESPROC glad_glPrioritizeTextures = NULL; -PFNGLPROVOKINGVERTEXPROC glad_glProvokingVertex = NULL; -PFNGLPUSHATTRIBPROC glad_glPushAttrib = NULL; -PFNGLPUSHCLIENTATTRIBPROC glad_glPushClientAttrib = NULL; -PFNGLPUSHMATRIXPROC glad_glPushMatrix = NULL; -PFNGLPUSHNAMEPROC glad_glPushName = NULL; -PFNGLQUERYCOUNTERPROC glad_glQueryCounter = NULL; -PFNGLRASTERPOS2DPROC glad_glRasterPos2d = NULL; -PFNGLRASTERPOS2DVPROC glad_glRasterPos2dv = NULL; -PFNGLRASTERPOS2FPROC glad_glRasterPos2f = NULL; -PFNGLRASTERPOS2FVPROC glad_glRasterPos2fv = NULL; -PFNGLRASTERPOS2IPROC glad_glRasterPos2i = NULL; -PFNGLRASTERPOS2IVPROC glad_glRasterPos2iv = NULL; -PFNGLRASTERPOS2SPROC glad_glRasterPos2s = NULL; -PFNGLRASTERPOS2SVPROC glad_glRasterPos2sv = NULL; -PFNGLRASTERPOS3DPROC glad_glRasterPos3d = NULL; -PFNGLRASTERPOS3DVPROC glad_glRasterPos3dv = NULL; -PFNGLRASTERPOS3FPROC glad_glRasterPos3f = NULL; -PFNGLRASTERPOS3FVPROC glad_glRasterPos3fv = NULL; -PFNGLRASTERPOS3IPROC glad_glRasterPos3i = NULL; -PFNGLRASTERPOS3IVPROC glad_glRasterPos3iv = NULL; -PFNGLRASTERPOS3SPROC glad_glRasterPos3s = NULL; -PFNGLRASTERPOS3SVPROC glad_glRasterPos3sv = NULL; -PFNGLRASTERPOS4DPROC glad_glRasterPos4d = NULL; -PFNGLRASTERPOS4DVPROC glad_glRasterPos4dv = NULL; -PFNGLRASTERPOS4FPROC glad_glRasterPos4f = NULL; -PFNGLRASTERPOS4FVPROC glad_glRasterPos4fv = NULL; -PFNGLRASTERPOS4IPROC glad_glRasterPos4i = NULL; -PFNGLRASTERPOS4IVPROC glad_glRasterPos4iv = NULL; -PFNGLRASTERPOS4SPROC glad_glRasterPos4s = NULL; -PFNGLRASTERPOS4SVPROC glad_glRasterPos4sv = NULL; -PFNGLREADBUFFERPROC glad_glReadBuffer = NULL; -PFNGLREADPIXELSPROC glad_glReadPixels = NULL; -PFNGLRECTDPROC glad_glRectd = NULL; -PFNGLRECTDVPROC glad_glRectdv = NULL; -PFNGLRECTFPROC glad_glRectf = NULL; -PFNGLRECTFVPROC glad_glRectfv = NULL; -PFNGLRECTIPROC glad_glRecti = NULL; -PFNGLRECTIVPROC glad_glRectiv = NULL; -PFNGLRECTSPROC glad_glRects = NULL; -PFNGLRECTSVPROC glad_glRectsv = NULL; -PFNGLRENDERMODEPROC glad_glRenderMode = NULL; -PFNGLRENDERBUFFERSTORAGEPROC glad_glRenderbufferStorage = NULL; -PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC glad_glRenderbufferStorageMultisample = NULL; -PFNGLROTATEDPROC glad_glRotated = NULL; -PFNGLROTATEFPROC glad_glRotatef = NULL; -PFNGLSAMPLECOVERAGEPROC glad_glSampleCoverage = NULL; -PFNGLSAMPLEMASKIPROC glad_glSampleMaski = NULL; -PFNGLSAMPLERPARAMETERIIVPROC glad_glSamplerParameterIiv = NULL; -PFNGLSAMPLERPARAMETERIUIVPROC glad_glSamplerParameterIuiv = NULL; -PFNGLSAMPLERPARAMETERFPROC glad_glSamplerParameterf = NULL; -PFNGLSAMPLERPARAMETERFVPROC glad_glSamplerParameterfv = NULL; -PFNGLSAMPLERPARAMETERIPROC glad_glSamplerParameteri = NULL; -PFNGLSAMPLERPARAMETERIVPROC glad_glSamplerParameteriv = NULL; -PFNGLSCALEDPROC glad_glScaled = NULL; -PFNGLSCALEFPROC glad_glScalef = NULL; -PFNGLSCISSORPROC glad_glScissor = NULL; -PFNGLSECONDARYCOLOR3BPROC glad_glSecondaryColor3b = NULL; -PFNGLSECONDARYCOLOR3BVPROC glad_glSecondaryColor3bv = NULL; -PFNGLSECONDARYCOLOR3DPROC glad_glSecondaryColor3d = NULL; -PFNGLSECONDARYCOLOR3DVPROC glad_glSecondaryColor3dv = NULL; -PFNGLSECONDARYCOLOR3FPROC glad_glSecondaryColor3f = NULL; -PFNGLSECONDARYCOLOR3FVPROC glad_glSecondaryColor3fv = NULL; -PFNGLSECONDARYCOLOR3IPROC glad_glSecondaryColor3i = NULL; -PFNGLSECONDARYCOLOR3IVPROC glad_glSecondaryColor3iv = NULL; -PFNGLSECONDARYCOLOR3SPROC glad_glSecondaryColor3s = NULL; -PFNGLSECONDARYCOLOR3SVPROC glad_glSecondaryColor3sv = NULL; -PFNGLSECONDARYCOLOR3UBPROC glad_glSecondaryColor3ub = NULL; -PFNGLSECONDARYCOLOR3UBVPROC 
glad_glSecondaryColor3ubv = NULL; -PFNGLSECONDARYCOLOR3UIPROC glad_glSecondaryColor3ui = NULL; -PFNGLSECONDARYCOLOR3UIVPROC glad_glSecondaryColor3uiv = NULL; -PFNGLSECONDARYCOLOR3USPROC glad_glSecondaryColor3us = NULL; -PFNGLSECONDARYCOLOR3USVPROC glad_glSecondaryColor3usv = NULL; -PFNGLSECONDARYCOLORP3UIPROC glad_glSecondaryColorP3ui = NULL; -PFNGLSECONDARYCOLORP3UIVPROC glad_glSecondaryColorP3uiv = NULL; -PFNGLSECONDARYCOLORPOINTERPROC glad_glSecondaryColorPointer = NULL; -PFNGLSELECTBUFFERPROC glad_glSelectBuffer = NULL; -PFNGLSHADEMODELPROC glad_glShadeModel = NULL; -PFNGLSHADERSOURCEPROC glad_glShaderSource = NULL; -PFNGLSTENCILFUNCPROC glad_glStencilFunc = NULL; -PFNGLSTENCILFUNCSEPARATEPROC glad_glStencilFuncSeparate = NULL; -PFNGLSTENCILMASKPROC glad_glStencilMask = NULL; -PFNGLSTENCILMASKSEPARATEPROC glad_glStencilMaskSeparate = NULL; -PFNGLSTENCILOPPROC glad_glStencilOp = NULL; -PFNGLSTENCILOPSEPARATEPROC glad_glStencilOpSeparate = NULL; -PFNGLTEXBUFFERPROC glad_glTexBuffer = NULL; -PFNGLTEXCOORD1DPROC glad_glTexCoord1d = NULL; -PFNGLTEXCOORD1DVPROC glad_glTexCoord1dv = NULL; -PFNGLTEXCOORD1FPROC glad_glTexCoord1f = NULL; -PFNGLTEXCOORD1FVPROC glad_glTexCoord1fv = NULL; -PFNGLTEXCOORD1IPROC glad_glTexCoord1i = NULL; -PFNGLTEXCOORD1IVPROC glad_glTexCoord1iv = NULL; -PFNGLTEXCOORD1SPROC glad_glTexCoord1s = NULL; -PFNGLTEXCOORD1SVPROC glad_glTexCoord1sv = NULL; -PFNGLTEXCOORD2DPROC glad_glTexCoord2d = NULL; -PFNGLTEXCOORD2DVPROC glad_glTexCoord2dv = NULL; -PFNGLTEXCOORD2FPROC glad_glTexCoord2f = NULL; -PFNGLTEXCOORD2FVPROC glad_glTexCoord2fv = NULL; -PFNGLTEXCOORD2IPROC glad_glTexCoord2i = NULL; -PFNGLTEXCOORD2IVPROC glad_glTexCoord2iv = NULL; -PFNGLTEXCOORD2SPROC glad_glTexCoord2s = NULL; -PFNGLTEXCOORD2SVPROC glad_glTexCoord2sv = NULL; -PFNGLTEXCOORD3DPROC glad_glTexCoord3d = NULL; -PFNGLTEXCOORD3DVPROC glad_glTexCoord3dv = NULL; -PFNGLTEXCOORD3FPROC glad_glTexCoord3f = NULL; -PFNGLTEXCOORD3FVPROC glad_glTexCoord3fv = NULL; -PFNGLTEXCOORD3IPROC glad_glTexCoord3i = NULL; -PFNGLTEXCOORD3IVPROC glad_glTexCoord3iv = NULL; -PFNGLTEXCOORD3SPROC glad_glTexCoord3s = NULL; -PFNGLTEXCOORD3SVPROC glad_glTexCoord3sv = NULL; -PFNGLTEXCOORD4DPROC glad_glTexCoord4d = NULL; -PFNGLTEXCOORD4DVPROC glad_glTexCoord4dv = NULL; -PFNGLTEXCOORD4FPROC glad_glTexCoord4f = NULL; -PFNGLTEXCOORD4FVPROC glad_glTexCoord4fv = NULL; -PFNGLTEXCOORD4IPROC glad_glTexCoord4i = NULL; -PFNGLTEXCOORD4IVPROC glad_glTexCoord4iv = NULL; -PFNGLTEXCOORD4SPROC glad_glTexCoord4s = NULL; -PFNGLTEXCOORD4SVPROC glad_glTexCoord4sv = NULL; -PFNGLTEXCOORDP1UIPROC glad_glTexCoordP1ui = NULL; -PFNGLTEXCOORDP1UIVPROC glad_glTexCoordP1uiv = NULL; -PFNGLTEXCOORDP2UIPROC glad_glTexCoordP2ui = NULL; -PFNGLTEXCOORDP2UIVPROC glad_glTexCoordP2uiv = NULL; -PFNGLTEXCOORDP3UIPROC glad_glTexCoordP3ui = NULL; -PFNGLTEXCOORDP3UIVPROC glad_glTexCoordP3uiv = NULL; -PFNGLTEXCOORDP4UIPROC glad_glTexCoordP4ui = NULL; -PFNGLTEXCOORDP4UIVPROC glad_glTexCoordP4uiv = NULL; -PFNGLTEXCOORDPOINTERPROC glad_glTexCoordPointer = NULL; -PFNGLTEXENVFPROC glad_glTexEnvf = NULL; -PFNGLTEXENVFVPROC glad_glTexEnvfv = NULL; -PFNGLTEXENVIPROC glad_glTexEnvi = NULL; -PFNGLTEXENVIVPROC glad_glTexEnviv = NULL; -PFNGLTEXGENDPROC glad_glTexGend = NULL; -PFNGLTEXGENDVPROC glad_glTexGendv = NULL; -PFNGLTEXGENFPROC glad_glTexGenf = NULL; -PFNGLTEXGENFVPROC glad_glTexGenfv = NULL; -PFNGLTEXGENIPROC glad_glTexGeni = NULL; -PFNGLTEXGENIVPROC glad_glTexGeniv = NULL; -PFNGLTEXIMAGE1DPROC glad_glTexImage1D = NULL; -PFNGLTEXIMAGE2DPROC glad_glTexImage2D = NULL; 
-PFNGLTEXIMAGE2DMULTISAMPLEPROC glad_glTexImage2DMultisample = NULL; -PFNGLTEXIMAGE3DPROC glad_glTexImage3D = NULL; -PFNGLTEXIMAGE3DMULTISAMPLEPROC glad_glTexImage3DMultisample = NULL; -PFNGLTEXPARAMETERIIVPROC glad_glTexParameterIiv = NULL; -PFNGLTEXPARAMETERIUIVPROC glad_glTexParameterIuiv = NULL; -PFNGLTEXPARAMETERFPROC glad_glTexParameterf = NULL; -PFNGLTEXPARAMETERFVPROC glad_glTexParameterfv = NULL; -PFNGLTEXPARAMETERIPROC glad_glTexParameteri = NULL; -PFNGLTEXPARAMETERIVPROC glad_glTexParameteriv = NULL; -PFNGLTEXSUBIMAGE1DPROC glad_glTexSubImage1D = NULL; -PFNGLTEXSUBIMAGE2DPROC glad_glTexSubImage2D = NULL; -PFNGLTEXSUBIMAGE3DPROC glad_glTexSubImage3D = NULL; -PFNGLTRANSFORMFEEDBACKVARYINGSPROC glad_glTransformFeedbackVaryings = NULL; -PFNGLTRANSLATEDPROC glad_glTranslated = NULL; -PFNGLTRANSLATEFPROC glad_glTranslatef = NULL; -PFNGLUNIFORM1FPROC glad_glUniform1f = NULL; -PFNGLUNIFORM1FVPROC glad_glUniform1fv = NULL; -PFNGLUNIFORM1IPROC glad_glUniform1i = NULL; -PFNGLUNIFORM1IVPROC glad_glUniform1iv = NULL; -PFNGLUNIFORM1UIPROC glad_glUniform1ui = NULL; -PFNGLUNIFORM1UIVPROC glad_glUniform1uiv = NULL; -PFNGLUNIFORM2FPROC glad_glUniform2f = NULL; -PFNGLUNIFORM2FVPROC glad_glUniform2fv = NULL; -PFNGLUNIFORM2IPROC glad_glUniform2i = NULL; -PFNGLUNIFORM2IVPROC glad_glUniform2iv = NULL; -PFNGLUNIFORM2UIPROC glad_glUniform2ui = NULL; -PFNGLUNIFORM2UIVPROC glad_glUniform2uiv = NULL; -PFNGLUNIFORM3FPROC glad_glUniform3f = NULL; -PFNGLUNIFORM3FVPROC glad_glUniform3fv = NULL; -PFNGLUNIFORM3IPROC glad_glUniform3i = NULL; -PFNGLUNIFORM3IVPROC glad_glUniform3iv = NULL; -PFNGLUNIFORM3UIPROC glad_glUniform3ui = NULL; -PFNGLUNIFORM3UIVPROC glad_glUniform3uiv = NULL; -PFNGLUNIFORM4FPROC glad_glUniform4f = NULL; -PFNGLUNIFORM4FVPROC glad_glUniform4fv = NULL; -PFNGLUNIFORM4IPROC glad_glUniform4i = NULL; -PFNGLUNIFORM4IVPROC glad_glUniform4iv = NULL; -PFNGLUNIFORM4UIPROC glad_glUniform4ui = NULL; -PFNGLUNIFORM4UIVPROC glad_glUniform4uiv = NULL; -PFNGLUNIFORMBLOCKBINDINGPROC glad_glUniformBlockBinding = NULL; -PFNGLUNIFORMMATRIX2FVPROC glad_glUniformMatrix2fv = NULL; -PFNGLUNIFORMMATRIX2X3FVPROC glad_glUniformMatrix2x3fv = NULL; -PFNGLUNIFORMMATRIX2X4FVPROC glad_glUniformMatrix2x4fv = NULL; -PFNGLUNIFORMMATRIX3FVPROC glad_glUniformMatrix3fv = NULL; -PFNGLUNIFORMMATRIX3X2FVPROC glad_glUniformMatrix3x2fv = NULL; -PFNGLUNIFORMMATRIX3X4FVPROC glad_glUniformMatrix3x4fv = NULL; -PFNGLUNIFORMMATRIX4FVPROC glad_glUniformMatrix4fv = NULL; -PFNGLUNIFORMMATRIX4X2FVPROC glad_glUniformMatrix4x2fv = NULL; -PFNGLUNIFORMMATRIX4X3FVPROC glad_glUniformMatrix4x3fv = NULL; -PFNGLUNMAPBUFFERPROC glad_glUnmapBuffer = NULL; -PFNGLUSEPROGRAMPROC glad_glUseProgram = NULL; -PFNGLVALIDATEPROGRAMPROC glad_glValidateProgram = NULL; -PFNGLVERTEX2DPROC glad_glVertex2d = NULL; -PFNGLVERTEX2DVPROC glad_glVertex2dv = NULL; -PFNGLVERTEX2FPROC glad_glVertex2f = NULL; -PFNGLVERTEX2FVPROC glad_glVertex2fv = NULL; -PFNGLVERTEX2IPROC glad_glVertex2i = NULL; -PFNGLVERTEX2IVPROC glad_glVertex2iv = NULL; -PFNGLVERTEX2SPROC glad_glVertex2s = NULL; -PFNGLVERTEX2SVPROC glad_glVertex2sv = NULL; -PFNGLVERTEX3DPROC glad_glVertex3d = NULL; -PFNGLVERTEX3DVPROC glad_glVertex3dv = NULL; -PFNGLVERTEX3FPROC glad_glVertex3f = NULL; -PFNGLVERTEX3FVPROC glad_glVertex3fv = NULL; -PFNGLVERTEX3IPROC glad_glVertex3i = NULL; -PFNGLVERTEX3IVPROC glad_glVertex3iv = NULL; -PFNGLVERTEX3SPROC glad_glVertex3s = NULL; -PFNGLVERTEX3SVPROC glad_glVertex3sv = NULL; -PFNGLVERTEX4DPROC glad_glVertex4d = NULL; -PFNGLVERTEX4DVPROC glad_glVertex4dv = NULL; -PFNGLVERTEX4FPROC 
glad_glVertex4f = NULL; -PFNGLVERTEX4FVPROC glad_glVertex4fv = NULL; -PFNGLVERTEX4IPROC glad_glVertex4i = NULL; -PFNGLVERTEX4IVPROC glad_glVertex4iv = NULL; -PFNGLVERTEX4SPROC glad_glVertex4s = NULL; -PFNGLVERTEX4SVPROC glad_glVertex4sv = NULL; -PFNGLVERTEXATTRIB1DPROC glad_glVertexAttrib1d = NULL; -PFNGLVERTEXATTRIB1DVPROC glad_glVertexAttrib1dv = NULL; -PFNGLVERTEXATTRIB1FPROC glad_glVertexAttrib1f = NULL; -PFNGLVERTEXATTRIB1FVPROC glad_glVertexAttrib1fv = NULL; -PFNGLVERTEXATTRIB1SPROC glad_glVertexAttrib1s = NULL; -PFNGLVERTEXATTRIB1SVPROC glad_glVertexAttrib1sv = NULL; -PFNGLVERTEXATTRIB2DPROC glad_glVertexAttrib2d = NULL; -PFNGLVERTEXATTRIB2DVPROC glad_glVertexAttrib2dv = NULL; -PFNGLVERTEXATTRIB2FPROC glad_glVertexAttrib2f = NULL; -PFNGLVERTEXATTRIB2FVPROC glad_glVertexAttrib2fv = NULL; -PFNGLVERTEXATTRIB2SPROC glad_glVertexAttrib2s = NULL; -PFNGLVERTEXATTRIB2SVPROC glad_glVertexAttrib2sv = NULL; -PFNGLVERTEXATTRIB3DPROC glad_glVertexAttrib3d = NULL; -PFNGLVERTEXATTRIB3DVPROC glad_glVertexAttrib3dv = NULL; -PFNGLVERTEXATTRIB3FPROC glad_glVertexAttrib3f = NULL; -PFNGLVERTEXATTRIB3FVPROC glad_glVertexAttrib3fv = NULL; -PFNGLVERTEXATTRIB3SPROC glad_glVertexAttrib3s = NULL; -PFNGLVERTEXATTRIB3SVPROC glad_glVertexAttrib3sv = NULL; -PFNGLVERTEXATTRIB4NBVPROC glad_glVertexAttrib4Nbv = NULL; -PFNGLVERTEXATTRIB4NIVPROC glad_glVertexAttrib4Niv = NULL; -PFNGLVERTEXATTRIB4NSVPROC glad_glVertexAttrib4Nsv = NULL; -PFNGLVERTEXATTRIB4NUBPROC glad_glVertexAttrib4Nub = NULL; -PFNGLVERTEXATTRIB4NUBVPROC glad_glVertexAttrib4Nubv = NULL; -PFNGLVERTEXATTRIB4NUIVPROC glad_glVertexAttrib4Nuiv = NULL; -PFNGLVERTEXATTRIB4NUSVPROC glad_glVertexAttrib4Nusv = NULL; -PFNGLVERTEXATTRIB4BVPROC glad_glVertexAttrib4bv = NULL; -PFNGLVERTEXATTRIB4DPROC glad_glVertexAttrib4d = NULL; -PFNGLVERTEXATTRIB4DVPROC glad_glVertexAttrib4dv = NULL; -PFNGLVERTEXATTRIB4FPROC glad_glVertexAttrib4f = NULL; -PFNGLVERTEXATTRIB4FVPROC glad_glVertexAttrib4fv = NULL; -PFNGLVERTEXATTRIB4IVPROC glad_glVertexAttrib4iv = NULL; -PFNGLVERTEXATTRIB4SPROC glad_glVertexAttrib4s = NULL; -PFNGLVERTEXATTRIB4SVPROC glad_glVertexAttrib4sv = NULL; -PFNGLVERTEXATTRIB4UBVPROC glad_glVertexAttrib4ubv = NULL; -PFNGLVERTEXATTRIB4UIVPROC glad_glVertexAttrib4uiv = NULL; -PFNGLVERTEXATTRIB4USVPROC glad_glVertexAttrib4usv = NULL; -PFNGLVERTEXATTRIBDIVISORPROC glad_glVertexAttribDivisor = NULL; -PFNGLVERTEXATTRIBI1IPROC glad_glVertexAttribI1i = NULL; -PFNGLVERTEXATTRIBI1IVPROC glad_glVertexAttribI1iv = NULL; -PFNGLVERTEXATTRIBI1UIPROC glad_glVertexAttribI1ui = NULL; -PFNGLVERTEXATTRIBI1UIVPROC glad_glVertexAttribI1uiv = NULL; -PFNGLVERTEXATTRIBI2IPROC glad_glVertexAttribI2i = NULL; -PFNGLVERTEXATTRIBI2IVPROC glad_glVertexAttribI2iv = NULL; -PFNGLVERTEXATTRIBI2UIPROC glad_glVertexAttribI2ui = NULL; -PFNGLVERTEXATTRIBI2UIVPROC glad_glVertexAttribI2uiv = NULL; -PFNGLVERTEXATTRIBI3IPROC glad_glVertexAttribI3i = NULL; -PFNGLVERTEXATTRIBI3IVPROC glad_glVertexAttribI3iv = NULL; -PFNGLVERTEXATTRIBI3UIPROC glad_glVertexAttribI3ui = NULL; -PFNGLVERTEXATTRIBI3UIVPROC glad_glVertexAttribI3uiv = NULL; -PFNGLVERTEXATTRIBI4BVPROC glad_glVertexAttribI4bv = NULL; -PFNGLVERTEXATTRIBI4IPROC glad_glVertexAttribI4i = NULL; -PFNGLVERTEXATTRIBI4IVPROC glad_glVertexAttribI4iv = NULL; -PFNGLVERTEXATTRIBI4SVPROC glad_glVertexAttribI4sv = NULL; -PFNGLVERTEXATTRIBI4UBVPROC glad_glVertexAttribI4ubv = NULL; -PFNGLVERTEXATTRIBI4UIPROC glad_glVertexAttribI4ui = NULL; -PFNGLVERTEXATTRIBI4UIVPROC glad_glVertexAttribI4uiv = NULL; -PFNGLVERTEXATTRIBI4USVPROC glad_glVertexAttribI4usv = NULL; 
-PFNGLVERTEXATTRIBIPOINTERPROC glad_glVertexAttribIPointer = NULL; -PFNGLVERTEXATTRIBP1UIPROC glad_glVertexAttribP1ui = NULL; -PFNGLVERTEXATTRIBP1UIVPROC glad_glVertexAttribP1uiv = NULL; -PFNGLVERTEXATTRIBP2UIPROC glad_glVertexAttribP2ui = NULL; -PFNGLVERTEXATTRIBP2UIVPROC glad_glVertexAttribP2uiv = NULL; -PFNGLVERTEXATTRIBP3UIPROC glad_glVertexAttribP3ui = NULL; -PFNGLVERTEXATTRIBP3UIVPROC glad_glVertexAttribP3uiv = NULL; -PFNGLVERTEXATTRIBP4UIPROC glad_glVertexAttribP4ui = NULL; -PFNGLVERTEXATTRIBP4UIVPROC glad_glVertexAttribP4uiv = NULL; -PFNGLVERTEXATTRIBPOINTERPROC glad_glVertexAttribPointer = NULL; -PFNGLVERTEXP2UIPROC glad_glVertexP2ui = NULL; -PFNGLVERTEXP2UIVPROC glad_glVertexP2uiv = NULL; -PFNGLVERTEXP3UIPROC glad_glVertexP3ui = NULL; -PFNGLVERTEXP3UIVPROC glad_glVertexP3uiv = NULL; -PFNGLVERTEXP4UIPROC glad_glVertexP4ui = NULL; -PFNGLVERTEXP4UIVPROC glad_glVertexP4uiv = NULL; -PFNGLVERTEXPOINTERPROC glad_glVertexPointer = NULL; -PFNGLVIEWPORTPROC glad_glViewport = NULL; -PFNGLWAITSYNCPROC glad_glWaitSync = NULL; -PFNGLWINDOWPOS2DPROC glad_glWindowPos2d = NULL; -PFNGLWINDOWPOS2DVPROC glad_glWindowPos2dv = NULL; -PFNGLWINDOWPOS2FPROC glad_glWindowPos2f = NULL; -PFNGLWINDOWPOS2FVPROC glad_glWindowPos2fv = NULL; -PFNGLWINDOWPOS2IPROC glad_glWindowPos2i = NULL; -PFNGLWINDOWPOS2IVPROC glad_glWindowPos2iv = NULL; -PFNGLWINDOWPOS2SPROC glad_glWindowPos2s = NULL; -PFNGLWINDOWPOS2SVPROC glad_glWindowPos2sv = NULL; -PFNGLWINDOWPOS3DPROC glad_glWindowPos3d = NULL; -PFNGLWINDOWPOS3DVPROC glad_glWindowPos3dv = NULL; -PFNGLWINDOWPOS3FPROC glad_glWindowPos3f = NULL; -PFNGLWINDOWPOS3FVPROC glad_glWindowPos3fv = NULL; -PFNGLWINDOWPOS3IPROC glad_glWindowPos3i = NULL; -PFNGLWINDOWPOS3IVPROC glad_glWindowPos3iv = NULL; -PFNGLWINDOWPOS3SPROC glad_glWindowPos3s = NULL; -PFNGLWINDOWPOS3SVPROC glad_glWindowPos3sv = NULL; -int GLAD_GL_ARB_debug_output = 0; -int GLAD_GL_ARB_framebuffer_object = 0; -int GLAD_GL_EXT_framebuffer_blit = 0; -int GLAD_GL_EXT_framebuffer_multisample = 0; -int GLAD_GL_EXT_framebuffer_object = 0; -int GLAD_GL_OVR_multiview = 0; -int GLAD_GL_OVR_multiview2 = 0; -PFNGLDEBUGMESSAGECONTROLARBPROC glad_glDebugMessageControlARB = NULL; -PFNGLDEBUGMESSAGEINSERTARBPROC glad_glDebugMessageInsertARB = NULL; -PFNGLDEBUGMESSAGECALLBACKARBPROC glad_glDebugMessageCallbackARB = NULL; -PFNGLGETDEBUGMESSAGELOGARBPROC glad_glGetDebugMessageLogARB = NULL; -PFNGLBLITFRAMEBUFFEREXTPROC glad_glBlitFramebufferEXT = NULL; -PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC glad_glRenderbufferStorageMultisampleEXT = NULL; -PFNGLISRENDERBUFFEREXTPROC glad_glIsRenderbufferEXT = NULL; -PFNGLBINDRENDERBUFFEREXTPROC glad_glBindRenderbufferEXT = NULL; -PFNGLDELETERENDERBUFFERSEXTPROC glad_glDeleteRenderbuffersEXT = NULL; -PFNGLGENRENDERBUFFERSEXTPROC glad_glGenRenderbuffersEXT = NULL; -PFNGLRENDERBUFFERSTORAGEEXTPROC glad_glRenderbufferStorageEXT = NULL; -PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC glad_glGetRenderbufferParameterivEXT = NULL; -PFNGLISFRAMEBUFFEREXTPROC glad_glIsFramebufferEXT = NULL; -PFNGLBINDFRAMEBUFFEREXTPROC glad_glBindFramebufferEXT = NULL; -PFNGLDELETEFRAMEBUFFERSEXTPROC glad_glDeleteFramebuffersEXT = NULL; -PFNGLGENFRAMEBUFFERSEXTPROC glad_glGenFramebuffersEXT = NULL; -PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC glad_glCheckFramebufferStatusEXT = NULL; -PFNGLFRAMEBUFFERTEXTURE1DEXTPROC glad_glFramebufferTexture1DEXT = NULL; -PFNGLFRAMEBUFFERTEXTURE2DEXTPROC glad_glFramebufferTexture2DEXT = NULL; -PFNGLFRAMEBUFFERTEXTURE3DEXTPROC glad_glFramebufferTexture3DEXT = NULL; -PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC 
glad_glFramebufferRenderbufferEXT = NULL; -PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC glad_glGetFramebufferAttachmentParameterivEXT = NULL; -PFNGLGENERATEMIPMAPEXTPROC glad_glGenerateMipmapEXT = NULL; -PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC glad_glFramebufferTextureMultiviewOVR = NULL; -static void load_GL_VERSION_1_0(GLADloadproc load) { - if(!GLAD_GL_VERSION_1_0) return; - glad_glCullFace = (PFNGLCULLFACEPROC)load("glCullFace"); - glad_glFrontFace = (PFNGLFRONTFACEPROC)load("glFrontFace"); - glad_glHint = (PFNGLHINTPROC)load("glHint"); - glad_glLineWidth = (PFNGLLINEWIDTHPROC)load("glLineWidth"); - glad_glPointSize = (PFNGLPOINTSIZEPROC)load("glPointSize"); - glad_glPolygonMode = (PFNGLPOLYGONMODEPROC)load("glPolygonMode"); - glad_glScissor = (PFNGLSCISSORPROC)load("glScissor"); - glad_glTexParameterf = (PFNGLTEXPARAMETERFPROC)load("glTexParameterf"); - glad_glTexParameterfv = (PFNGLTEXPARAMETERFVPROC)load("glTexParameterfv"); - glad_glTexParameteri = (PFNGLTEXPARAMETERIPROC)load("glTexParameteri"); - glad_glTexParameteriv = (PFNGLTEXPARAMETERIVPROC)load("glTexParameteriv"); - glad_glTexImage1D = (PFNGLTEXIMAGE1DPROC)load("glTexImage1D"); - glad_glTexImage2D = (PFNGLTEXIMAGE2DPROC)load("glTexImage2D"); - glad_glDrawBuffer = (PFNGLDRAWBUFFERPROC)load("glDrawBuffer"); - glad_glClear = (PFNGLCLEARPROC)load("glClear"); - glad_glClearColor = (PFNGLCLEARCOLORPROC)load("glClearColor"); - glad_glClearStencil = (PFNGLCLEARSTENCILPROC)load("glClearStencil"); - glad_glClearDepth = (PFNGLCLEARDEPTHPROC)load("glClearDepth"); - glad_glStencilMask = (PFNGLSTENCILMASKPROC)load("glStencilMask"); - glad_glColorMask = (PFNGLCOLORMASKPROC)load("glColorMask"); - glad_glDepthMask = (PFNGLDEPTHMASKPROC)load("glDepthMask"); - glad_glDisable = (PFNGLDISABLEPROC)load("glDisable"); - glad_glEnable = (PFNGLENABLEPROC)load("glEnable"); - glad_glFinish = (PFNGLFINISHPROC)load("glFinish"); - glad_glFlush = (PFNGLFLUSHPROC)load("glFlush"); - glad_glBlendFunc = (PFNGLBLENDFUNCPROC)load("glBlendFunc"); - glad_glLogicOp = (PFNGLLOGICOPPROC)load("glLogicOp"); - glad_glStencilFunc = (PFNGLSTENCILFUNCPROC)load("glStencilFunc"); - glad_glStencilOp = (PFNGLSTENCILOPPROC)load("glStencilOp"); - glad_glDepthFunc = (PFNGLDEPTHFUNCPROC)load("glDepthFunc"); - glad_glPixelStoref = (PFNGLPIXELSTOREFPROC)load("glPixelStoref"); - glad_glPixelStorei = (PFNGLPIXELSTOREIPROC)load("glPixelStorei"); - glad_glReadBuffer = (PFNGLREADBUFFERPROC)load("glReadBuffer"); - glad_glReadPixels = (PFNGLREADPIXELSPROC)load("glReadPixels"); - glad_glGetBooleanv = (PFNGLGETBOOLEANVPROC)load("glGetBooleanv"); - glad_glGetDoublev = (PFNGLGETDOUBLEVPROC)load("glGetDoublev"); - glad_glGetError = (PFNGLGETERRORPROC)load("glGetError"); - glad_glGetFloatv = (PFNGLGETFLOATVPROC)load("glGetFloatv"); - glad_glGetIntegerv = (PFNGLGETINTEGERVPROC)load("glGetIntegerv"); - glad_glGetString = (PFNGLGETSTRINGPROC)load("glGetString"); - glad_glGetTexImage = (PFNGLGETTEXIMAGEPROC)load("glGetTexImage"); - glad_glGetTexParameterfv = (PFNGLGETTEXPARAMETERFVPROC)load("glGetTexParameterfv"); - glad_glGetTexParameteriv = (PFNGLGETTEXPARAMETERIVPROC)load("glGetTexParameteriv"); - glad_glGetTexLevelParameterfv = (PFNGLGETTEXLEVELPARAMETERFVPROC)load("glGetTexLevelParameterfv"); - glad_glGetTexLevelParameteriv = (PFNGLGETTEXLEVELPARAMETERIVPROC)load("glGetTexLevelParameteriv"); - glad_glIsEnabled = (PFNGLISENABLEDPROC)load("glIsEnabled"); - glad_glDepthRange = (PFNGLDEPTHRANGEPROC)load("glDepthRange"); - glad_glViewport = (PFNGLVIEWPORTPROC)load("glViewport"); - glad_glNewList 
= (PFNGLNEWLISTPROC)load("glNewList"); - glad_glEndList = (PFNGLENDLISTPROC)load("glEndList"); - glad_glCallList = (PFNGLCALLLISTPROC)load("glCallList"); - glad_glCallLists = (PFNGLCALLLISTSPROC)load("glCallLists"); - glad_glDeleteLists = (PFNGLDELETELISTSPROC)load("glDeleteLists"); - glad_glGenLists = (PFNGLGENLISTSPROC)load("glGenLists"); - glad_glListBase = (PFNGLLISTBASEPROC)load("glListBase"); - glad_glBegin = (PFNGLBEGINPROC)load("glBegin"); - glad_glBitmap = (PFNGLBITMAPPROC)load("glBitmap"); - glad_glColor3b = (PFNGLCOLOR3BPROC)load("glColor3b"); - glad_glColor3bv = (PFNGLCOLOR3BVPROC)load("glColor3bv"); - glad_glColor3d = (PFNGLCOLOR3DPROC)load("glColor3d"); - glad_glColor3dv = (PFNGLCOLOR3DVPROC)load("glColor3dv"); - glad_glColor3f = (PFNGLCOLOR3FPROC)load("glColor3f"); - glad_glColor3fv = (PFNGLCOLOR3FVPROC)load("glColor3fv"); - glad_glColor3i = (PFNGLCOLOR3IPROC)load("glColor3i"); - glad_glColor3iv = (PFNGLCOLOR3IVPROC)load("glColor3iv"); - glad_glColor3s = (PFNGLCOLOR3SPROC)load("glColor3s"); - glad_glColor3sv = (PFNGLCOLOR3SVPROC)load("glColor3sv"); - glad_glColor3ub = (PFNGLCOLOR3UBPROC)load("glColor3ub"); - glad_glColor3ubv = (PFNGLCOLOR3UBVPROC)load("glColor3ubv"); - glad_glColor3ui = (PFNGLCOLOR3UIPROC)load("glColor3ui"); - glad_glColor3uiv = (PFNGLCOLOR3UIVPROC)load("glColor3uiv"); - glad_glColor3us = (PFNGLCOLOR3USPROC)load("glColor3us"); - glad_glColor3usv = (PFNGLCOLOR3USVPROC)load("glColor3usv"); - glad_glColor4b = (PFNGLCOLOR4BPROC)load("glColor4b"); - glad_glColor4bv = (PFNGLCOLOR4BVPROC)load("glColor4bv"); - glad_glColor4d = (PFNGLCOLOR4DPROC)load("glColor4d"); - glad_glColor4dv = (PFNGLCOLOR4DVPROC)load("glColor4dv"); - glad_glColor4f = (PFNGLCOLOR4FPROC)load("glColor4f"); - glad_glColor4fv = (PFNGLCOLOR4FVPROC)load("glColor4fv"); - glad_glColor4i = (PFNGLCOLOR4IPROC)load("glColor4i"); - glad_glColor4iv = (PFNGLCOLOR4IVPROC)load("glColor4iv"); - glad_glColor4s = (PFNGLCOLOR4SPROC)load("glColor4s"); - glad_glColor4sv = (PFNGLCOLOR4SVPROC)load("glColor4sv"); - glad_glColor4ub = (PFNGLCOLOR4UBPROC)load("glColor4ub"); - glad_glColor4ubv = (PFNGLCOLOR4UBVPROC)load("glColor4ubv"); - glad_glColor4ui = (PFNGLCOLOR4UIPROC)load("glColor4ui"); - glad_glColor4uiv = (PFNGLCOLOR4UIVPROC)load("glColor4uiv"); - glad_glColor4us = (PFNGLCOLOR4USPROC)load("glColor4us"); - glad_glColor4usv = (PFNGLCOLOR4USVPROC)load("glColor4usv"); - glad_glEdgeFlag = (PFNGLEDGEFLAGPROC)load("glEdgeFlag"); - glad_glEdgeFlagv = (PFNGLEDGEFLAGVPROC)load("glEdgeFlagv"); - glad_glEnd = (PFNGLENDPROC)load("glEnd"); - glad_glIndexd = (PFNGLINDEXDPROC)load("glIndexd"); - glad_glIndexdv = (PFNGLINDEXDVPROC)load("glIndexdv"); - glad_glIndexf = (PFNGLINDEXFPROC)load("glIndexf"); - glad_glIndexfv = (PFNGLINDEXFVPROC)load("glIndexfv"); - glad_glIndexi = (PFNGLINDEXIPROC)load("glIndexi"); - glad_glIndexiv = (PFNGLINDEXIVPROC)load("glIndexiv"); - glad_glIndexs = (PFNGLINDEXSPROC)load("glIndexs"); - glad_glIndexsv = (PFNGLINDEXSVPROC)load("glIndexsv"); - glad_glNormal3b = (PFNGLNORMAL3BPROC)load("glNormal3b"); - glad_glNormal3bv = (PFNGLNORMAL3BVPROC)load("glNormal3bv"); - glad_glNormal3d = (PFNGLNORMAL3DPROC)load("glNormal3d"); - glad_glNormal3dv = (PFNGLNORMAL3DVPROC)load("glNormal3dv"); - glad_glNormal3f = (PFNGLNORMAL3FPROC)load("glNormal3f"); - glad_glNormal3fv = (PFNGLNORMAL3FVPROC)load("glNormal3fv"); - glad_glNormal3i = (PFNGLNORMAL3IPROC)load("glNormal3i"); - glad_glNormal3iv = (PFNGLNORMAL3IVPROC)load("glNormal3iv"); - glad_glNormal3s = (PFNGLNORMAL3SPROC)load("glNormal3s"); - glad_glNormal3sv = 
(PFNGLNORMAL3SVPROC)load("glNormal3sv"); - glad_glRasterPos2d = (PFNGLRASTERPOS2DPROC)load("glRasterPos2d"); - glad_glRasterPos2dv = (PFNGLRASTERPOS2DVPROC)load("glRasterPos2dv"); - glad_glRasterPos2f = (PFNGLRASTERPOS2FPROC)load("glRasterPos2f"); - glad_glRasterPos2fv = (PFNGLRASTERPOS2FVPROC)load("glRasterPos2fv"); - glad_glRasterPos2i = (PFNGLRASTERPOS2IPROC)load("glRasterPos2i"); - glad_glRasterPos2iv = (PFNGLRASTERPOS2IVPROC)load("glRasterPos2iv"); - glad_glRasterPos2s = (PFNGLRASTERPOS2SPROC)load("glRasterPos2s"); - glad_glRasterPos2sv = (PFNGLRASTERPOS2SVPROC)load("glRasterPos2sv"); - glad_glRasterPos3d = (PFNGLRASTERPOS3DPROC)load("glRasterPos3d"); - glad_glRasterPos3dv = (PFNGLRASTERPOS3DVPROC)load("glRasterPos3dv"); - glad_glRasterPos3f = (PFNGLRASTERPOS3FPROC)load("glRasterPos3f"); - glad_glRasterPos3fv = (PFNGLRASTERPOS3FVPROC)load("glRasterPos3fv"); - glad_glRasterPos3i = (PFNGLRASTERPOS3IPROC)load("glRasterPos3i"); - glad_glRasterPos3iv = (PFNGLRASTERPOS3IVPROC)load("glRasterPos3iv"); - glad_glRasterPos3s = (PFNGLRASTERPOS3SPROC)load("glRasterPos3s"); - glad_glRasterPos3sv = (PFNGLRASTERPOS3SVPROC)load("glRasterPos3sv"); - glad_glRasterPos4d = (PFNGLRASTERPOS4DPROC)load("glRasterPos4d"); - glad_glRasterPos4dv = (PFNGLRASTERPOS4DVPROC)load("glRasterPos4dv"); - glad_glRasterPos4f = (PFNGLRASTERPOS4FPROC)load("glRasterPos4f"); - glad_glRasterPos4fv = (PFNGLRASTERPOS4FVPROC)load("glRasterPos4fv"); - glad_glRasterPos4i = (PFNGLRASTERPOS4IPROC)load("glRasterPos4i"); - glad_glRasterPos4iv = (PFNGLRASTERPOS4IVPROC)load("glRasterPos4iv"); - glad_glRasterPos4s = (PFNGLRASTERPOS4SPROC)load("glRasterPos4s"); - glad_glRasterPos4sv = (PFNGLRASTERPOS4SVPROC)load("glRasterPos4sv"); - glad_glRectd = (PFNGLRECTDPROC)load("glRectd"); - glad_glRectdv = (PFNGLRECTDVPROC)load("glRectdv"); - glad_glRectf = (PFNGLRECTFPROC)load("glRectf"); - glad_glRectfv = (PFNGLRECTFVPROC)load("glRectfv"); - glad_glRecti = (PFNGLRECTIPROC)load("glRecti"); - glad_glRectiv = (PFNGLRECTIVPROC)load("glRectiv"); - glad_glRects = (PFNGLRECTSPROC)load("glRects"); - glad_glRectsv = (PFNGLRECTSVPROC)load("glRectsv"); - glad_glTexCoord1d = (PFNGLTEXCOORD1DPROC)load("glTexCoord1d"); - glad_glTexCoord1dv = (PFNGLTEXCOORD1DVPROC)load("glTexCoord1dv"); - glad_glTexCoord1f = (PFNGLTEXCOORD1FPROC)load("glTexCoord1f"); - glad_glTexCoord1fv = (PFNGLTEXCOORD1FVPROC)load("glTexCoord1fv"); - glad_glTexCoord1i = (PFNGLTEXCOORD1IPROC)load("glTexCoord1i"); - glad_glTexCoord1iv = (PFNGLTEXCOORD1IVPROC)load("glTexCoord1iv"); - glad_glTexCoord1s = (PFNGLTEXCOORD1SPROC)load("glTexCoord1s"); - glad_glTexCoord1sv = (PFNGLTEXCOORD1SVPROC)load("glTexCoord1sv"); - glad_glTexCoord2d = (PFNGLTEXCOORD2DPROC)load("glTexCoord2d"); - glad_glTexCoord2dv = (PFNGLTEXCOORD2DVPROC)load("glTexCoord2dv"); - glad_glTexCoord2f = (PFNGLTEXCOORD2FPROC)load("glTexCoord2f"); - glad_glTexCoord2fv = (PFNGLTEXCOORD2FVPROC)load("glTexCoord2fv"); - glad_glTexCoord2i = (PFNGLTEXCOORD2IPROC)load("glTexCoord2i"); - glad_glTexCoord2iv = (PFNGLTEXCOORD2IVPROC)load("glTexCoord2iv"); - glad_glTexCoord2s = (PFNGLTEXCOORD2SPROC)load("glTexCoord2s"); - glad_glTexCoord2sv = (PFNGLTEXCOORD2SVPROC)load("glTexCoord2sv"); - glad_glTexCoord3d = (PFNGLTEXCOORD3DPROC)load("glTexCoord3d"); - glad_glTexCoord3dv = (PFNGLTEXCOORD3DVPROC)load("glTexCoord3dv"); - glad_glTexCoord3f = (PFNGLTEXCOORD3FPROC)load("glTexCoord3f"); - glad_glTexCoord3fv = (PFNGLTEXCOORD3FVPROC)load("glTexCoord3fv"); - glad_glTexCoord3i = (PFNGLTEXCOORD3IPROC)load("glTexCoord3i"); - glad_glTexCoord3iv = 
(PFNGLTEXCOORD3IVPROC)load("glTexCoord3iv"); - glad_glTexCoord3s = (PFNGLTEXCOORD3SPROC)load("glTexCoord3s"); - glad_glTexCoord3sv = (PFNGLTEXCOORD3SVPROC)load("glTexCoord3sv"); - glad_glTexCoord4d = (PFNGLTEXCOORD4DPROC)load("glTexCoord4d"); - glad_glTexCoord4dv = (PFNGLTEXCOORD4DVPROC)load("glTexCoord4dv"); - glad_glTexCoord4f = (PFNGLTEXCOORD4FPROC)load("glTexCoord4f"); - glad_glTexCoord4fv = (PFNGLTEXCOORD4FVPROC)load("glTexCoord4fv"); - glad_glTexCoord4i = (PFNGLTEXCOORD4IPROC)load("glTexCoord4i"); - glad_glTexCoord4iv = (PFNGLTEXCOORD4IVPROC)load("glTexCoord4iv"); - glad_glTexCoord4s = (PFNGLTEXCOORD4SPROC)load("glTexCoord4s"); - glad_glTexCoord4sv = (PFNGLTEXCOORD4SVPROC)load("glTexCoord4sv"); - glad_glVertex2d = (PFNGLVERTEX2DPROC)load("glVertex2d"); - glad_glVertex2dv = (PFNGLVERTEX2DVPROC)load("glVertex2dv"); - glad_glVertex2f = (PFNGLVERTEX2FPROC)load("glVertex2f"); - glad_glVertex2fv = (PFNGLVERTEX2FVPROC)load("glVertex2fv"); - glad_glVertex2i = (PFNGLVERTEX2IPROC)load("glVertex2i"); - glad_glVertex2iv = (PFNGLVERTEX2IVPROC)load("glVertex2iv"); - glad_glVertex2s = (PFNGLVERTEX2SPROC)load("glVertex2s"); - glad_glVertex2sv = (PFNGLVERTEX2SVPROC)load("glVertex2sv"); - glad_glVertex3d = (PFNGLVERTEX3DPROC)load("glVertex3d"); - glad_glVertex3dv = (PFNGLVERTEX3DVPROC)load("glVertex3dv"); - glad_glVertex3f = (PFNGLVERTEX3FPROC)load("glVertex3f"); - glad_glVertex3fv = (PFNGLVERTEX3FVPROC)load("glVertex3fv"); - glad_glVertex3i = (PFNGLVERTEX3IPROC)load("glVertex3i"); - glad_glVertex3iv = (PFNGLVERTEX3IVPROC)load("glVertex3iv"); - glad_glVertex3s = (PFNGLVERTEX3SPROC)load("glVertex3s"); - glad_glVertex3sv = (PFNGLVERTEX3SVPROC)load("glVertex3sv"); - glad_glVertex4d = (PFNGLVERTEX4DPROC)load("glVertex4d"); - glad_glVertex4dv = (PFNGLVERTEX4DVPROC)load("glVertex4dv"); - glad_glVertex4f = (PFNGLVERTEX4FPROC)load("glVertex4f"); - glad_glVertex4fv = (PFNGLVERTEX4FVPROC)load("glVertex4fv"); - glad_glVertex4i = (PFNGLVERTEX4IPROC)load("glVertex4i"); - glad_glVertex4iv = (PFNGLVERTEX4IVPROC)load("glVertex4iv"); - glad_glVertex4s = (PFNGLVERTEX4SPROC)load("glVertex4s"); - glad_glVertex4sv = (PFNGLVERTEX4SVPROC)load("glVertex4sv"); - glad_glClipPlane = (PFNGLCLIPPLANEPROC)load("glClipPlane"); - glad_glColorMaterial = (PFNGLCOLORMATERIALPROC)load("glColorMaterial"); - glad_glFogf = (PFNGLFOGFPROC)load("glFogf"); - glad_glFogfv = (PFNGLFOGFVPROC)load("glFogfv"); - glad_glFogi = (PFNGLFOGIPROC)load("glFogi"); - glad_glFogiv = (PFNGLFOGIVPROC)load("glFogiv"); - glad_glLightf = (PFNGLLIGHTFPROC)load("glLightf"); - glad_glLightfv = (PFNGLLIGHTFVPROC)load("glLightfv"); - glad_glLighti = (PFNGLLIGHTIPROC)load("glLighti"); - glad_glLightiv = (PFNGLLIGHTIVPROC)load("glLightiv"); - glad_glLightModelf = (PFNGLLIGHTMODELFPROC)load("glLightModelf"); - glad_glLightModelfv = (PFNGLLIGHTMODELFVPROC)load("glLightModelfv"); - glad_glLightModeli = (PFNGLLIGHTMODELIPROC)load("glLightModeli"); - glad_glLightModeliv = (PFNGLLIGHTMODELIVPROC)load("glLightModeliv"); - glad_glLineStipple = (PFNGLLINESTIPPLEPROC)load("glLineStipple"); - glad_glMaterialf = (PFNGLMATERIALFPROC)load("glMaterialf"); - glad_glMaterialfv = (PFNGLMATERIALFVPROC)load("glMaterialfv"); - glad_glMateriali = (PFNGLMATERIALIPROC)load("glMateriali"); - glad_glMaterialiv = (PFNGLMATERIALIVPROC)load("glMaterialiv"); - glad_glPolygonStipple = (PFNGLPOLYGONSTIPPLEPROC)load("glPolygonStipple"); - glad_glShadeModel = (PFNGLSHADEMODELPROC)load("glShadeModel"); - glad_glTexEnvf = (PFNGLTEXENVFPROC)load("glTexEnvf"); - glad_glTexEnvfv = 
(PFNGLTEXENVFVPROC)load("glTexEnvfv"); - glad_glTexEnvi = (PFNGLTEXENVIPROC)load("glTexEnvi"); - glad_glTexEnviv = (PFNGLTEXENVIVPROC)load("glTexEnviv"); - glad_glTexGend = (PFNGLTEXGENDPROC)load("glTexGend"); - glad_glTexGendv = (PFNGLTEXGENDVPROC)load("glTexGendv"); - glad_glTexGenf = (PFNGLTEXGENFPROC)load("glTexGenf"); - glad_glTexGenfv = (PFNGLTEXGENFVPROC)load("glTexGenfv"); - glad_glTexGeni = (PFNGLTEXGENIPROC)load("glTexGeni"); - glad_glTexGeniv = (PFNGLTEXGENIVPROC)load("glTexGeniv"); - glad_glFeedbackBuffer = (PFNGLFEEDBACKBUFFERPROC)load("glFeedbackBuffer"); - glad_glSelectBuffer = (PFNGLSELECTBUFFERPROC)load("glSelectBuffer"); - glad_glRenderMode = (PFNGLRENDERMODEPROC)load("glRenderMode"); - glad_glInitNames = (PFNGLINITNAMESPROC)load("glInitNames"); - glad_glLoadName = (PFNGLLOADNAMEPROC)load("glLoadName"); - glad_glPassThrough = (PFNGLPASSTHROUGHPROC)load("glPassThrough"); - glad_glPopName = (PFNGLPOPNAMEPROC)load("glPopName"); - glad_glPushName = (PFNGLPUSHNAMEPROC)load("glPushName"); - glad_glClearAccum = (PFNGLCLEARACCUMPROC)load("glClearAccum"); - glad_glClearIndex = (PFNGLCLEARINDEXPROC)load("glClearIndex"); - glad_glIndexMask = (PFNGLINDEXMASKPROC)load("glIndexMask"); - glad_glAccum = (PFNGLACCUMPROC)load("glAccum"); - glad_glPopAttrib = (PFNGLPOPATTRIBPROC)load("glPopAttrib"); - glad_glPushAttrib = (PFNGLPUSHATTRIBPROC)load("glPushAttrib"); - glad_glMap1d = (PFNGLMAP1DPROC)load("glMap1d"); - glad_glMap1f = (PFNGLMAP1FPROC)load("glMap1f"); - glad_glMap2d = (PFNGLMAP2DPROC)load("glMap2d"); - glad_glMap2f = (PFNGLMAP2FPROC)load("glMap2f"); - glad_glMapGrid1d = (PFNGLMAPGRID1DPROC)load("glMapGrid1d"); - glad_glMapGrid1f = (PFNGLMAPGRID1FPROC)load("glMapGrid1f"); - glad_glMapGrid2d = (PFNGLMAPGRID2DPROC)load("glMapGrid2d"); - glad_glMapGrid2f = (PFNGLMAPGRID2FPROC)load("glMapGrid2f"); - glad_glEvalCoord1d = (PFNGLEVALCOORD1DPROC)load("glEvalCoord1d"); - glad_glEvalCoord1dv = (PFNGLEVALCOORD1DVPROC)load("glEvalCoord1dv"); - glad_glEvalCoord1f = (PFNGLEVALCOORD1FPROC)load("glEvalCoord1f"); - glad_glEvalCoord1fv = (PFNGLEVALCOORD1FVPROC)load("glEvalCoord1fv"); - glad_glEvalCoord2d = (PFNGLEVALCOORD2DPROC)load("glEvalCoord2d"); - glad_glEvalCoord2dv = (PFNGLEVALCOORD2DVPROC)load("glEvalCoord2dv"); - glad_glEvalCoord2f = (PFNGLEVALCOORD2FPROC)load("glEvalCoord2f"); - glad_glEvalCoord2fv = (PFNGLEVALCOORD2FVPROC)load("glEvalCoord2fv"); - glad_glEvalMesh1 = (PFNGLEVALMESH1PROC)load("glEvalMesh1"); - glad_glEvalPoint1 = (PFNGLEVALPOINT1PROC)load("glEvalPoint1"); - glad_glEvalMesh2 = (PFNGLEVALMESH2PROC)load("glEvalMesh2"); - glad_glEvalPoint2 = (PFNGLEVALPOINT2PROC)load("glEvalPoint2"); - glad_glAlphaFunc = (PFNGLALPHAFUNCPROC)load("glAlphaFunc"); - glad_glPixelZoom = (PFNGLPIXELZOOMPROC)load("glPixelZoom"); - glad_glPixelTransferf = (PFNGLPIXELTRANSFERFPROC)load("glPixelTransferf"); - glad_glPixelTransferi = (PFNGLPIXELTRANSFERIPROC)load("glPixelTransferi"); - glad_glPixelMapfv = (PFNGLPIXELMAPFVPROC)load("glPixelMapfv"); - glad_glPixelMapuiv = (PFNGLPIXELMAPUIVPROC)load("glPixelMapuiv"); - glad_glPixelMapusv = (PFNGLPIXELMAPUSVPROC)load("glPixelMapusv"); - glad_glCopyPixels = (PFNGLCOPYPIXELSPROC)load("glCopyPixels"); - glad_glDrawPixels = (PFNGLDRAWPIXELSPROC)load("glDrawPixels"); - glad_glGetClipPlane = (PFNGLGETCLIPPLANEPROC)load("glGetClipPlane"); - glad_glGetLightfv = (PFNGLGETLIGHTFVPROC)load("glGetLightfv"); - glad_glGetLightiv = (PFNGLGETLIGHTIVPROC)load("glGetLightiv"); - glad_glGetMapdv = (PFNGLGETMAPDVPROC)load("glGetMapdv"); - glad_glGetMapfv = 
(PFNGLGETMAPFVPROC)load("glGetMapfv"); - glad_glGetMapiv = (PFNGLGETMAPIVPROC)load("glGetMapiv"); - glad_glGetMaterialfv = (PFNGLGETMATERIALFVPROC)load("glGetMaterialfv"); - glad_glGetMaterialiv = (PFNGLGETMATERIALIVPROC)load("glGetMaterialiv"); - glad_glGetPixelMapfv = (PFNGLGETPIXELMAPFVPROC)load("glGetPixelMapfv"); - glad_glGetPixelMapuiv = (PFNGLGETPIXELMAPUIVPROC)load("glGetPixelMapuiv"); - glad_glGetPixelMapusv = (PFNGLGETPIXELMAPUSVPROC)load("glGetPixelMapusv"); - glad_glGetPolygonStipple = (PFNGLGETPOLYGONSTIPPLEPROC)load("glGetPolygonStipple"); - glad_glGetTexEnvfv = (PFNGLGETTEXENVFVPROC)load("glGetTexEnvfv"); - glad_glGetTexEnviv = (PFNGLGETTEXENVIVPROC)load("glGetTexEnviv"); - glad_glGetTexGendv = (PFNGLGETTEXGENDVPROC)load("glGetTexGendv"); - glad_glGetTexGenfv = (PFNGLGETTEXGENFVPROC)load("glGetTexGenfv"); - glad_glGetTexGeniv = (PFNGLGETTEXGENIVPROC)load("glGetTexGeniv"); - glad_glIsList = (PFNGLISLISTPROC)load("glIsList"); - glad_glFrustum = (PFNGLFRUSTUMPROC)load("glFrustum"); - glad_glLoadIdentity = (PFNGLLOADIDENTITYPROC)load("glLoadIdentity"); - glad_glLoadMatrixf = (PFNGLLOADMATRIXFPROC)load("glLoadMatrixf"); - glad_glLoadMatrixd = (PFNGLLOADMATRIXDPROC)load("glLoadMatrixd"); - glad_glMatrixMode = (PFNGLMATRIXMODEPROC)load("glMatrixMode"); - glad_glMultMatrixf = (PFNGLMULTMATRIXFPROC)load("glMultMatrixf"); - glad_glMultMatrixd = (PFNGLMULTMATRIXDPROC)load("glMultMatrixd"); - glad_glOrtho = (PFNGLORTHOPROC)load("glOrtho"); - glad_glPopMatrix = (PFNGLPOPMATRIXPROC)load("glPopMatrix"); - glad_glPushMatrix = (PFNGLPUSHMATRIXPROC)load("glPushMatrix"); - glad_glRotated = (PFNGLROTATEDPROC)load("glRotated"); - glad_glRotatef = (PFNGLROTATEFPROC)load("glRotatef"); - glad_glScaled = (PFNGLSCALEDPROC)load("glScaled"); - glad_glScalef = (PFNGLSCALEFPROC)load("glScalef"); - glad_glTranslated = (PFNGLTRANSLATEDPROC)load("glTranslated"); - glad_glTranslatef = (PFNGLTRANSLATEFPROC)load("glTranslatef"); -} -static void load_GL_VERSION_1_1(GLADloadproc load) { - if(!GLAD_GL_VERSION_1_1) return; - glad_glDrawArrays = (PFNGLDRAWARRAYSPROC)load("glDrawArrays"); - glad_glDrawElements = (PFNGLDRAWELEMENTSPROC)load("glDrawElements"); - glad_glGetPointerv = (PFNGLGETPOINTERVPROC)load("glGetPointerv"); - glad_glPolygonOffset = (PFNGLPOLYGONOFFSETPROC)load("glPolygonOffset"); - glad_glCopyTexImage1D = (PFNGLCOPYTEXIMAGE1DPROC)load("glCopyTexImage1D"); - glad_glCopyTexImage2D = (PFNGLCOPYTEXIMAGE2DPROC)load("glCopyTexImage2D"); - glad_glCopyTexSubImage1D = (PFNGLCOPYTEXSUBIMAGE1DPROC)load("glCopyTexSubImage1D"); - glad_glCopyTexSubImage2D = (PFNGLCOPYTEXSUBIMAGE2DPROC)load("glCopyTexSubImage2D"); - glad_glTexSubImage1D = (PFNGLTEXSUBIMAGE1DPROC)load("glTexSubImage1D"); - glad_glTexSubImage2D = (PFNGLTEXSUBIMAGE2DPROC)load("glTexSubImage2D"); - glad_glBindTexture = (PFNGLBINDTEXTUREPROC)load("glBindTexture"); - glad_glDeleteTextures = (PFNGLDELETETEXTURESPROC)load("glDeleteTextures"); - glad_glGenTextures = (PFNGLGENTEXTURESPROC)load("glGenTextures"); - glad_glIsTexture = (PFNGLISTEXTUREPROC)load("glIsTexture"); - glad_glArrayElement = (PFNGLARRAYELEMENTPROC)load("glArrayElement"); - glad_glColorPointer = (PFNGLCOLORPOINTERPROC)load("glColorPointer"); - glad_glDisableClientState = (PFNGLDISABLECLIENTSTATEPROC)load("glDisableClientState"); - glad_glEdgeFlagPointer = (PFNGLEDGEFLAGPOINTERPROC)load("glEdgeFlagPointer"); - glad_glEnableClientState = (PFNGLENABLECLIENTSTATEPROC)load("glEnableClientState"); - glad_glIndexPointer = (PFNGLINDEXPOINTERPROC)load("glIndexPointer"); - 
glad_glInterleavedArrays = (PFNGLINTERLEAVEDARRAYSPROC)load("glInterleavedArrays"); - glad_glNormalPointer = (PFNGLNORMALPOINTERPROC)load("glNormalPointer"); - glad_glTexCoordPointer = (PFNGLTEXCOORDPOINTERPROC)load("glTexCoordPointer"); - glad_glVertexPointer = (PFNGLVERTEXPOINTERPROC)load("glVertexPointer"); - glad_glAreTexturesResident = (PFNGLARETEXTURESRESIDENTPROC)load("glAreTexturesResident"); - glad_glPrioritizeTextures = (PFNGLPRIORITIZETEXTURESPROC)load("glPrioritizeTextures"); - glad_glIndexub = (PFNGLINDEXUBPROC)load("glIndexub"); - glad_glIndexubv = (PFNGLINDEXUBVPROC)load("glIndexubv"); - glad_glPopClientAttrib = (PFNGLPOPCLIENTATTRIBPROC)load("glPopClientAttrib"); - glad_glPushClientAttrib = (PFNGLPUSHCLIENTATTRIBPROC)load("glPushClientAttrib"); -} -static void load_GL_VERSION_1_2(GLADloadproc load) { - if(!GLAD_GL_VERSION_1_2) return; - glad_glDrawRangeElements = (PFNGLDRAWRANGEELEMENTSPROC)load("glDrawRangeElements"); - glad_glTexImage3D = (PFNGLTEXIMAGE3DPROC)load("glTexImage3D"); - glad_glTexSubImage3D = (PFNGLTEXSUBIMAGE3DPROC)load("glTexSubImage3D"); - glad_glCopyTexSubImage3D = (PFNGLCOPYTEXSUBIMAGE3DPROC)load("glCopyTexSubImage3D"); -} -static void load_GL_VERSION_1_3(GLADloadproc load) { - if(!GLAD_GL_VERSION_1_3) return; - glad_glActiveTexture = (PFNGLACTIVETEXTUREPROC)load("glActiveTexture"); - glad_glSampleCoverage = (PFNGLSAMPLECOVERAGEPROC)load("glSampleCoverage"); - glad_glCompressedTexImage3D = (PFNGLCOMPRESSEDTEXIMAGE3DPROC)load("glCompressedTexImage3D"); - glad_glCompressedTexImage2D = (PFNGLCOMPRESSEDTEXIMAGE2DPROC)load("glCompressedTexImage2D"); - glad_glCompressedTexImage1D = (PFNGLCOMPRESSEDTEXIMAGE1DPROC)load("glCompressedTexImage1D"); - glad_glCompressedTexSubImage3D = (PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC)load("glCompressedTexSubImage3D"); - glad_glCompressedTexSubImage2D = (PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC)load("glCompressedTexSubImage2D"); - glad_glCompressedTexSubImage1D = (PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC)load("glCompressedTexSubImage1D"); - glad_glGetCompressedTexImage = (PFNGLGETCOMPRESSEDTEXIMAGEPROC)load("glGetCompressedTexImage"); - glad_glClientActiveTexture = (PFNGLCLIENTACTIVETEXTUREPROC)load("glClientActiveTexture"); - glad_glMultiTexCoord1d = (PFNGLMULTITEXCOORD1DPROC)load("glMultiTexCoord1d"); - glad_glMultiTexCoord1dv = (PFNGLMULTITEXCOORD1DVPROC)load("glMultiTexCoord1dv"); - glad_glMultiTexCoord1f = (PFNGLMULTITEXCOORD1FPROC)load("glMultiTexCoord1f"); - glad_glMultiTexCoord1fv = (PFNGLMULTITEXCOORD1FVPROC)load("glMultiTexCoord1fv"); - glad_glMultiTexCoord1i = (PFNGLMULTITEXCOORD1IPROC)load("glMultiTexCoord1i"); - glad_glMultiTexCoord1iv = (PFNGLMULTITEXCOORD1IVPROC)load("glMultiTexCoord1iv"); - glad_glMultiTexCoord1s = (PFNGLMULTITEXCOORD1SPROC)load("glMultiTexCoord1s"); - glad_glMultiTexCoord1sv = (PFNGLMULTITEXCOORD1SVPROC)load("glMultiTexCoord1sv"); - glad_glMultiTexCoord2d = (PFNGLMULTITEXCOORD2DPROC)load("glMultiTexCoord2d"); - glad_glMultiTexCoord2dv = (PFNGLMULTITEXCOORD2DVPROC)load("glMultiTexCoord2dv"); - glad_glMultiTexCoord2f = (PFNGLMULTITEXCOORD2FPROC)load("glMultiTexCoord2f"); - glad_glMultiTexCoord2fv = (PFNGLMULTITEXCOORD2FVPROC)load("glMultiTexCoord2fv"); - glad_glMultiTexCoord2i = (PFNGLMULTITEXCOORD2IPROC)load("glMultiTexCoord2i"); - glad_glMultiTexCoord2iv = (PFNGLMULTITEXCOORD2IVPROC)load("glMultiTexCoord2iv"); - glad_glMultiTexCoord2s = (PFNGLMULTITEXCOORD2SPROC)load("glMultiTexCoord2s"); - glad_glMultiTexCoord2sv = (PFNGLMULTITEXCOORD2SVPROC)load("glMultiTexCoord2sv"); - glad_glMultiTexCoord3d = 
(PFNGLMULTITEXCOORD3DPROC)load("glMultiTexCoord3d"); - glad_glMultiTexCoord3dv = (PFNGLMULTITEXCOORD3DVPROC)load("glMultiTexCoord3dv"); - glad_glMultiTexCoord3f = (PFNGLMULTITEXCOORD3FPROC)load("glMultiTexCoord3f"); - glad_glMultiTexCoord3fv = (PFNGLMULTITEXCOORD3FVPROC)load("glMultiTexCoord3fv"); - glad_glMultiTexCoord3i = (PFNGLMULTITEXCOORD3IPROC)load("glMultiTexCoord3i"); - glad_glMultiTexCoord3iv = (PFNGLMULTITEXCOORD3IVPROC)load("glMultiTexCoord3iv"); - glad_glMultiTexCoord3s = (PFNGLMULTITEXCOORD3SPROC)load("glMultiTexCoord3s"); - glad_glMultiTexCoord3sv = (PFNGLMULTITEXCOORD3SVPROC)load("glMultiTexCoord3sv"); - glad_glMultiTexCoord4d = (PFNGLMULTITEXCOORD4DPROC)load("glMultiTexCoord4d"); - glad_glMultiTexCoord4dv = (PFNGLMULTITEXCOORD4DVPROC)load("glMultiTexCoord4dv"); - glad_glMultiTexCoord4f = (PFNGLMULTITEXCOORD4FPROC)load("glMultiTexCoord4f"); - glad_glMultiTexCoord4fv = (PFNGLMULTITEXCOORD4FVPROC)load("glMultiTexCoord4fv"); - glad_glMultiTexCoord4i = (PFNGLMULTITEXCOORD4IPROC)load("glMultiTexCoord4i"); - glad_glMultiTexCoord4iv = (PFNGLMULTITEXCOORD4IVPROC)load("glMultiTexCoord4iv"); - glad_glMultiTexCoord4s = (PFNGLMULTITEXCOORD4SPROC)load("glMultiTexCoord4s"); - glad_glMultiTexCoord4sv = (PFNGLMULTITEXCOORD4SVPROC)load("glMultiTexCoord4sv"); - glad_glLoadTransposeMatrixf = (PFNGLLOADTRANSPOSEMATRIXFPROC)load("glLoadTransposeMatrixf"); - glad_glLoadTransposeMatrixd = (PFNGLLOADTRANSPOSEMATRIXDPROC)load("glLoadTransposeMatrixd"); - glad_glMultTransposeMatrixf = (PFNGLMULTTRANSPOSEMATRIXFPROC)load("glMultTransposeMatrixf"); - glad_glMultTransposeMatrixd = (PFNGLMULTTRANSPOSEMATRIXDPROC)load("glMultTransposeMatrixd"); -} -static void load_GL_VERSION_1_4(GLADloadproc load) { - if(!GLAD_GL_VERSION_1_4) return; - glad_glBlendFuncSeparate = (PFNGLBLENDFUNCSEPARATEPROC)load("glBlendFuncSeparate"); - glad_glMultiDrawArrays = (PFNGLMULTIDRAWARRAYSPROC)load("glMultiDrawArrays"); - glad_glMultiDrawElements = (PFNGLMULTIDRAWELEMENTSPROC)load("glMultiDrawElements"); - glad_glPointParameterf = (PFNGLPOINTPARAMETERFPROC)load("glPointParameterf"); - glad_glPointParameterfv = (PFNGLPOINTPARAMETERFVPROC)load("glPointParameterfv"); - glad_glPointParameteri = (PFNGLPOINTPARAMETERIPROC)load("glPointParameteri"); - glad_glPointParameteriv = (PFNGLPOINTPARAMETERIVPROC)load("glPointParameteriv"); - glad_glFogCoordf = (PFNGLFOGCOORDFPROC)load("glFogCoordf"); - glad_glFogCoordfv = (PFNGLFOGCOORDFVPROC)load("glFogCoordfv"); - glad_glFogCoordd = (PFNGLFOGCOORDDPROC)load("glFogCoordd"); - glad_glFogCoorddv = (PFNGLFOGCOORDDVPROC)load("glFogCoorddv"); - glad_glFogCoordPointer = (PFNGLFOGCOORDPOINTERPROC)load("glFogCoordPointer"); - glad_glSecondaryColor3b = (PFNGLSECONDARYCOLOR3BPROC)load("glSecondaryColor3b"); - glad_glSecondaryColor3bv = (PFNGLSECONDARYCOLOR3BVPROC)load("glSecondaryColor3bv"); - glad_glSecondaryColor3d = (PFNGLSECONDARYCOLOR3DPROC)load("glSecondaryColor3d"); - glad_glSecondaryColor3dv = (PFNGLSECONDARYCOLOR3DVPROC)load("glSecondaryColor3dv"); - glad_glSecondaryColor3f = (PFNGLSECONDARYCOLOR3FPROC)load("glSecondaryColor3f"); - glad_glSecondaryColor3fv = (PFNGLSECONDARYCOLOR3FVPROC)load("glSecondaryColor3fv"); - glad_glSecondaryColor3i = (PFNGLSECONDARYCOLOR3IPROC)load("glSecondaryColor3i"); - glad_glSecondaryColor3iv = (PFNGLSECONDARYCOLOR3IVPROC)load("glSecondaryColor3iv"); - glad_glSecondaryColor3s = (PFNGLSECONDARYCOLOR3SPROC)load("glSecondaryColor3s"); - glad_glSecondaryColor3sv = (PFNGLSECONDARYCOLOR3SVPROC)load("glSecondaryColor3sv"); - glad_glSecondaryColor3ub = 
(PFNGLSECONDARYCOLOR3UBPROC)load("glSecondaryColor3ub"); - glad_glSecondaryColor3ubv = (PFNGLSECONDARYCOLOR3UBVPROC)load("glSecondaryColor3ubv"); - glad_glSecondaryColor3ui = (PFNGLSECONDARYCOLOR3UIPROC)load("glSecondaryColor3ui"); - glad_glSecondaryColor3uiv = (PFNGLSECONDARYCOLOR3UIVPROC)load("glSecondaryColor3uiv"); - glad_glSecondaryColor3us = (PFNGLSECONDARYCOLOR3USPROC)load("glSecondaryColor3us"); - glad_glSecondaryColor3usv = (PFNGLSECONDARYCOLOR3USVPROC)load("glSecondaryColor3usv"); - glad_glSecondaryColorPointer = (PFNGLSECONDARYCOLORPOINTERPROC)load("glSecondaryColorPointer"); - glad_glWindowPos2d = (PFNGLWINDOWPOS2DPROC)load("glWindowPos2d"); - glad_glWindowPos2dv = (PFNGLWINDOWPOS2DVPROC)load("glWindowPos2dv"); - glad_glWindowPos2f = (PFNGLWINDOWPOS2FPROC)load("glWindowPos2f"); - glad_glWindowPos2fv = (PFNGLWINDOWPOS2FVPROC)load("glWindowPos2fv"); - glad_glWindowPos2i = (PFNGLWINDOWPOS2IPROC)load("glWindowPos2i"); - glad_glWindowPos2iv = (PFNGLWINDOWPOS2IVPROC)load("glWindowPos2iv"); - glad_glWindowPos2s = (PFNGLWINDOWPOS2SPROC)load("glWindowPos2s"); - glad_glWindowPos2sv = (PFNGLWINDOWPOS2SVPROC)load("glWindowPos2sv"); - glad_glWindowPos3d = (PFNGLWINDOWPOS3DPROC)load("glWindowPos3d"); - glad_glWindowPos3dv = (PFNGLWINDOWPOS3DVPROC)load("glWindowPos3dv"); - glad_glWindowPos3f = (PFNGLWINDOWPOS3FPROC)load("glWindowPos3f"); - glad_glWindowPos3fv = (PFNGLWINDOWPOS3FVPROC)load("glWindowPos3fv"); - glad_glWindowPos3i = (PFNGLWINDOWPOS3IPROC)load("glWindowPos3i"); - glad_glWindowPos3iv = (PFNGLWINDOWPOS3IVPROC)load("glWindowPos3iv"); - glad_glWindowPos3s = (PFNGLWINDOWPOS3SPROC)load("glWindowPos3s"); - glad_glWindowPos3sv = (PFNGLWINDOWPOS3SVPROC)load("glWindowPos3sv"); - glad_glBlendColor = (PFNGLBLENDCOLORPROC)load("glBlendColor"); - glad_glBlendEquation = (PFNGLBLENDEQUATIONPROC)load("glBlendEquation"); -} -static void load_GL_VERSION_1_5(GLADloadproc load) { - if(!GLAD_GL_VERSION_1_5) return; - glad_glGenQueries = (PFNGLGENQUERIESPROC)load("glGenQueries"); - glad_glDeleteQueries = (PFNGLDELETEQUERIESPROC)load("glDeleteQueries"); - glad_glIsQuery = (PFNGLISQUERYPROC)load("glIsQuery"); - glad_glBeginQuery = (PFNGLBEGINQUERYPROC)load("glBeginQuery"); - glad_glEndQuery = (PFNGLENDQUERYPROC)load("glEndQuery"); - glad_glGetQueryiv = (PFNGLGETQUERYIVPROC)load("glGetQueryiv"); - glad_glGetQueryObjectiv = (PFNGLGETQUERYOBJECTIVPROC)load("glGetQueryObjectiv"); - glad_glGetQueryObjectuiv = (PFNGLGETQUERYOBJECTUIVPROC)load("glGetQueryObjectuiv"); - glad_glBindBuffer = (PFNGLBINDBUFFERPROC)load("glBindBuffer"); - glad_glDeleteBuffers = (PFNGLDELETEBUFFERSPROC)load("glDeleteBuffers"); - glad_glGenBuffers = (PFNGLGENBUFFERSPROC)load("glGenBuffers"); - glad_glIsBuffer = (PFNGLISBUFFERPROC)load("glIsBuffer"); - glad_glBufferData = (PFNGLBUFFERDATAPROC)load("glBufferData"); - glad_glBufferSubData = (PFNGLBUFFERSUBDATAPROC)load("glBufferSubData"); - glad_glGetBufferSubData = (PFNGLGETBUFFERSUBDATAPROC)load("glGetBufferSubData"); - glad_glMapBuffer = (PFNGLMAPBUFFERPROC)load("glMapBuffer"); - glad_glUnmapBuffer = (PFNGLUNMAPBUFFERPROC)load("glUnmapBuffer"); - glad_glGetBufferParameteriv = (PFNGLGETBUFFERPARAMETERIVPROC)load("glGetBufferParameteriv"); - glad_glGetBufferPointerv = (PFNGLGETBUFFERPOINTERVPROC)load("glGetBufferPointerv"); -} -static void load_GL_VERSION_2_0(GLADloadproc load) { - if(!GLAD_GL_VERSION_2_0) return; - glad_glBlendEquationSeparate = (PFNGLBLENDEQUATIONSEPARATEPROC)load("glBlendEquationSeparate"); - glad_glDrawBuffers = (PFNGLDRAWBUFFERSPROC)load("glDrawBuffers"); - 
glad_glStencilOpSeparate = (PFNGLSTENCILOPSEPARATEPROC)load("glStencilOpSeparate"); - glad_glStencilFuncSeparate = (PFNGLSTENCILFUNCSEPARATEPROC)load("glStencilFuncSeparate"); - glad_glStencilMaskSeparate = (PFNGLSTENCILMASKSEPARATEPROC)load("glStencilMaskSeparate"); - glad_glAttachShader = (PFNGLATTACHSHADERPROC)load("glAttachShader"); - glad_glBindAttribLocation = (PFNGLBINDATTRIBLOCATIONPROC)load("glBindAttribLocation"); - glad_glCompileShader = (PFNGLCOMPILESHADERPROC)load("glCompileShader"); - glad_glCreateProgram = (PFNGLCREATEPROGRAMPROC)load("glCreateProgram"); - glad_glCreateShader = (PFNGLCREATESHADERPROC)load("glCreateShader"); - glad_glDeleteProgram = (PFNGLDELETEPROGRAMPROC)load("glDeleteProgram"); - glad_glDeleteShader = (PFNGLDELETESHADERPROC)load("glDeleteShader"); - glad_glDetachShader = (PFNGLDETACHSHADERPROC)load("glDetachShader"); - glad_glDisableVertexAttribArray = (PFNGLDISABLEVERTEXATTRIBARRAYPROC)load("glDisableVertexAttribArray"); - glad_glEnableVertexAttribArray = (PFNGLENABLEVERTEXATTRIBARRAYPROC)load("glEnableVertexAttribArray"); - glad_glGetActiveAttrib = (PFNGLGETACTIVEATTRIBPROC)load("glGetActiveAttrib"); - glad_glGetActiveUniform = (PFNGLGETACTIVEUNIFORMPROC)load("glGetActiveUniform"); - glad_glGetAttachedShaders = (PFNGLGETATTACHEDSHADERSPROC)load("glGetAttachedShaders"); - glad_glGetAttribLocation = (PFNGLGETATTRIBLOCATIONPROC)load("glGetAttribLocation"); - glad_glGetProgramiv = (PFNGLGETPROGRAMIVPROC)load("glGetProgramiv"); - glad_glGetProgramInfoLog = (PFNGLGETPROGRAMINFOLOGPROC)load("glGetProgramInfoLog"); - glad_glGetShaderiv = (PFNGLGETSHADERIVPROC)load("glGetShaderiv"); - glad_glGetShaderInfoLog = (PFNGLGETSHADERINFOLOGPROC)load("glGetShaderInfoLog"); - glad_glGetShaderSource = (PFNGLGETSHADERSOURCEPROC)load("glGetShaderSource"); - glad_glGetUniformLocation = (PFNGLGETUNIFORMLOCATIONPROC)load("glGetUniformLocation"); - glad_glGetUniformfv = (PFNGLGETUNIFORMFVPROC)load("glGetUniformfv"); - glad_glGetUniformiv = (PFNGLGETUNIFORMIVPROC)load("glGetUniformiv"); - glad_glGetVertexAttribdv = (PFNGLGETVERTEXATTRIBDVPROC)load("glGetVertexAttribdv"); - glad_glGetVertexAttribfv = (PFNGLGETVERTEXATTRIBFVPROC)load("glGetVertexAttribfv"); - glad_glGetVertexAttribiv = (PFNGLGETVERTEXATTRIBIVPROC)load("glGetVertexAttribiv"); - glad_glGetVertexAttribPointerv = (PFNGLGETVERTEXATTRIBPOINTERVPROC)load("glGetVertexAttribPointerv"); - glad_glIsProgram = (PFNGLISPROGRAMPROC)load("glIsProgram"); - glad_glIsShader = (PFNGLISSHADERPROC)load("glIsShader"); - glad_glLinkProgram = (PFNGLLINKPROGRAMPROC)load("glLinkProgram"); - glad_glShaderSource = (PFNGLSHADERSOURCEPROC)load("glShaderSource"); - glad_glUseProgram = (PFNGLUSEPROGRAMPROC)load("glUseProgram"); - glad_glUniform1f = (PFNGLUNIFORM1FPROC)load("glUniform1f"); - glad_glUniform2f = (PFNGLUNIFORM2FPROC)load("glUniform2f"); - glad_glUniform3f = (PFNGLUNIFORM3FPROC)load("glUniform3f"); - glad_glUniform4f = (PFNGLUNIFORM4FPROC)load("glUniform4f"); - glad_glUniform1i = (PFNGLUNIFORM1IPROC)load("glUniform1i"); - glad_glUniform2i = (PFNGLUNIFORM2IPROC)load("glUniform2i"); - glad_glUniform3i = (PFNGLUNIFORM3IPROC)load("glUniform3i"); - glad_glUniform4i = (PFNGLUNIFORM4IPROC)load("glUniform4i"); - glad_glUniform1fv = (PFNGLUNIFORM1FVPROC)load("glUniform1fv"); - glad_glUniform2fv = (PFNGLUNIFORM2FVPROC)load("glUniform2fv"); - glad_glUniform3fv = (PFNGLUNIFORM3FVPROC)load("glUniform3fv"); - glad_glUniform4fv = (PFNGLUNIFORM4FVPROC)load("glUniform4fv"); - glad_glUniform1iv = (PFNGLUNIFORM1IVPROC)load("glUniform1iv"); - 
glad_glUniform2iv = (PFNGLUNIFORM2IVPROC)load("glUniform2iv"); - glad_glUniform3iv = (PFNGLUNIFORM3IVPROC)load("glUniform3iv"); - glad_glUniform4iv = (PFNGLUNIFORM4IVPROC)load("glUniform4iv"); - glad_glUniformMatrix2fv = (PFNGLUNIFORMMATRIX2FVPROC)load("glUniformMatrix2fv"); - glad_glUniformMatrix3fv = (PFNGLUNIFORMMATRIX3FVPROC)load("glUniformMatrix3fv"); - glad_glUniformMatrix4fv = (PFNGLUNIFORMMATRIX4FVPROC)load("glUniformMatrix4fv"); - glad_glValidateProgram = (PFNGLVALIDATEPROGRAMPROC)load("glValidateProgram"); - glad_glVertexAttrib1d = (PFNGLVERTEXATTRIB1DPROC)load("glVertexAttrib1d"); - glad_glVertexAttrib1dv = (PFNGLVERTEXATTRIB1DVPROC)load("glVertexAttrib1dv"); - glad_glVertexAttrib1f = (PFNGLVERTEXATTRIB1FPROC)load("glVertexAttrib1f"); - glad_glVertexAttrib1fv = (PFNGLVERTEXATTRIB1FVPROC)load("glVertexAttrib1fv"); - glad_glVertexAttrib1s = (PFNGLVERTEXATTRIB1SPROC)load("glVertexAttrib1s"); - glad_glVertexAttrib1sv = (PFNGLVERTEXATTRIB1SVPROC)load("glVertexAttrib1sv"); - glad_glVertexAttrib2d = (PFNGLVERTEXATTRIB2DPROC)load("glVertexAttrib2d"); - glad_glVertexAttrib2dv = (PFNGLVERTEXATTRIB2DVPROC)load("glVertexAttrib2dv"); - glad_glVertexAttrib2f = (PFNGLVERTEXATTRIB2FPROC)load("glVertexAttrib2f"); - glad_glVertexAttrib2fv = (PFNGLVERTEXATTRIB2FVPROC)load("glVertexAttrib2fv"); - glad_glVertexAttrib2s = (PFNGLVERTEXATTRIB2SPROC)load("glVertexAttrib2s"); - glad_glVertexAttrib2sv = (PFNGLVERTEXATTRIB2SVPROC)load("glVertexAttrib2sv"); - glad_glVertexAttrib3d = (PFNGLVERTEXATTRIB3DPROC)load("glVertexAttrib3d"); - glad_glVertexAttrib3dv = (PFNGLVERTEXATTRIB3DVPROC)load("glVertexAttrib3dv"); - glad_glVertexAttrib3f = (PFNGLVERTEXATTRIB3FPROC)load("glVertexAttrib3f"); - glad_glVertexAttrib3fv = (PFNGLVERTEXATTRIB3FVPROC)load("glVertexAttrib3fv"); - glad_glVertexAttrib3s = (PFNGLVERTEXATTRIB3SPROC)load("glVertexAttrib3s"); - glad_glVertexAttrib3sv = (PFNGLVERTEXATTRIB3SVPROC)load("glVertexAttrib3sv"); - glad_glVertexAttrib4Nbv = (PFNGLVERTEXATTRIB4NBVPROC)load("glVertexAttrib4Nbv"); - glad_glVertexAttrib4Niv = (PFNGLVERTEXATTRIB4NIVPROC)load("glVertexAttrib4Niv"); - glad_glVertexAttrib4Nsv = (PFNGLVERTEXATTRIB4NSVPROC)load("glVertexAttrib4Nsv"); - glad_glVertexAttrib4Nub = (PFNGLVERTEXATTRIB4NUBPROC)load("glVertexAttrib4Nub"); - glad_glVertexAttrib4Nubv = (PFNGLVERTEXATTRIB4NUBVPROC)load("glVertexAttrib4Nubv"); - glad_glVertexAttrib4Nuiv = (PFNGLVERTEXATTRIB4NUIVPROC)load("glVertexAttrib4Nuiv"); - glad_glVertexAttrib4Nusv = (PFNGLVERTEXATTRIB4NUSVPROC)load("glVertexAttrib4Nusv"); - glad_glVertexAttrib4bv = (PFNGLVERTEXATTRIB4BVPROC)load("glVertexAttrib4bv"); - glad_glVertexAttrib4d = (PFNGLVERTEXATTRIB4DPROC)load("glVertexAttrib4d"); - glad_glVertexAttrib4dv = (PFNGLVERTEXATTRIB4DVPROC)load("glVertexAttrib4dv"); - glad_glVertexAttrib4f = (PFNGLVERTEXATTRIB4FPROC)load("glVertexAttrib4f"); - glad_glVertexAttrib4fv = (PFNGLVERTEXATTRIB4FVPROC)load("glVertexAttrib4fv"); - glad_glVertexAttrib4iv = (PFNGLVERTEXATTRIB4IVPROC)load("glVertexAttrib4iv"); - glad_glVertexAttrib4s = (PFNGLVERTEXATTRIB4SPROC)load("glVertexAttrib4s"); - glad_glVertexAttrib4sv = (PFNGLVERTEXATTRIB4SVPROC)load("glVertexAttrib4sv"); - glad_glVertexAttrib4ubv = (PFNGLVERTEXATTRIB4UBVPROC)load("glVertexAttrib4ubv"); - glad_glVertexAttrib4uiv = (PFNGLVERTEXATTRIB4UIVPROC)load("glVertexAttrib4uiv"); - glad_glVertexAttrib4usv = (PFNGLVERTEXATTRIB4USVPROC)load("glVertexAttrib4usv"); - glad_glVertexAttribPointer = (PFNGLVERTEXATTRIBPOINTERPROC)load("glVertexAttribPointer"); -} -static void load_GL_VERSION_2_1(GLADloadproc load) { 
- if(!GLAD_GL_VERSION_2_1) return; - glad_glUniformMatrix2x3fv = (PFNGLUNIFORMMATRIX2X3FVPROC)load("glUniformMatrix2x3fv"); - glad_glUniformMatrix3x2fv = (PFNGLUNIFORMMATRIX3X2FVPROC)load("glUniformMatrix3x2fv"); - glad_glUniformMatrix2x4fv = (PFNGLUNIFORMMATRIX2X4FVPROC)load("glUniformMatrix2x4fv"); - glad_glUniformMatrix4x2fv = (PFNGLUNIFORMMATRIX4X2FVPROC)load("glUniformMatrix4x2fv"); - glad_glUniformMatrix3x4fv = (PFNGLUNIFORMMATRIX3X4FVPROC)load("glUniformMatrix3x4fv"); - glad_glUniformMatrix4x3fv = (PFNGLUNIFORMMATRIX4X3FVPROC)load("glUniformMatrix4x3fv"); -} -static void load_GL_VERSION_3_0(GLADloadproc load) { - if(!GLAD_GL_VERSION_3_0) return; - glad_glColorMaski = (PFNGLCOLORMASKIPROC)load("glColorMaski"); - glad_glGetBooleani_v = (PFNGLGETBOOLEANI_VPROC)load("glGetBooleani_v"); - glad_glGetIntegeri_v = (PFNGLGETINTEGERI_VPROC)load("glGetIntegeri_v"); - glad_glEnablei = (PFNGLENABLEIPROC)load("glEnablei"); - glad_glDisablei = (PFNGLDISABLEIPROC)load("glDisablei"); - glad_glIsEnabledi = (PFNGLISENABLEDIPROC)load("glIsEnabledi"); - glad_glBeginTransformFeedback = (PFNGLBEGINTRANSFORMFEEDBACKPROC)load("glBeginTransformFeedback"); - glad_glEndTransformFeedback = (PFNGLENDTRANSFORMFEEDBACKPROC)load("glEndTransformFeedback"); - glad_glBindBufferRange = (PFNGLBINDBUFFERRANGEPROC)load("glBindBufferRange"); - glad_glBindBufferBase = (PFNGLBINDBUFFERBASEPROC)load("glBindBufferBase"); - glad_glTransformFeedbackVaryings = (PFNGLTRANSFORMFEEDBACKVARYINGSPROC)load("glTransformFeedbackVaryings"); - glad_glGetTransformFeedbackVarying = (PFNGLGETTRANSFORMFEEDBACKVARYINGPROC)load("glGetTransformFeedbackVarying"); - glad_glClampColor = (PFNGLCLAMPCOLORPROC)load("glClampColor"); - glad_glBeginConditionalRender = (PFNGLBEGINCONDITIONALRENDERPROC)load("glBeginConditionalRender"); - glad_glEndConditionalRender = (PFNGLENDCONDITIONALRENDERPROC)load("glEndConditionalRender"); - glad_glVertexAttribIPointer = (PFNGLVERTEXATTRIBIPOINTERPROC)load("glVertexAttribIPointer"); - glad_glGetVertexAttribIiv = (PFNGLGETVERTEXATTRIBIIVPROC)load("glGetVertexAttribIiv"); - glad_glGetVertexAttribIuiv = (PFNGLGETVERTEXATTRIBIUIVPROC)load("glGetVertexAttribIuiv"); - glad_glVertexAttribI1i = (PFNGLVERTEXATTRIBI1IPROC)load("glVertexAttribI1i"); - glad_glVertexAttribI2i = (PFNGLVERTEXATTRIBI2IPROC)load("glVertexAttribI2i"); - glad_glVertexAttribI3i = (PFNGLVERTEXATTRIBI3IPROC)load("glVertexAttribI3i"); - glad_glVertexAttribI4i = (PFNGLVERTEXATTRIBI4IPROC)load("glVertexAttribI4i"); - glad_glVertexAttribI1ui = (PFNGLVERTEXATTRIBI1UIPROC)load("glVertexAttribI1ui"); - glad_glVertexAttribI2ui = (PFNGLVERTEXATTRIBI2UIPROC)load("glVertexAttribI2ui"); - glad_glVertexAttribI3ui = (PFNGLVERTEXATTRIBI3UIPROC)load("glVertexAttribI3ui"); - glad_glVertexAttribI4ui = (PFNGLVERTEXATTRIBI4UIPROC)load("glVertexAttribI4ui"); - glad_glVertexAttribI1iv = (PFNGLVERTEXATTRIBI1IVPROC)load("glVertexAttribI1iv"); - glad_glVertexAttribI2iv = (PFNGLVERTEXATTRIBI2IVPROC)load("glVertexAttribI2iv"); - glad_glVertexAttribI3iv = (PFNGLVERTEXATTRIBI3IVPROC)load("glVertexAttribI3iv"); - glad_glVertexAttribI4iv = (PFNGLVERTEXATTRIBI4IVPROC)load("glVertexAttribI4iv"); - glad_glVertexAttribI1uiv = (PFNGLVERTEXATTRIBI1UIVPROC)load("glVertexAttribI1uiv"); - glad_glVertexAttribI2uiv = (PFNGLVERTEXATTRIBI2UIVPROC)load("glVertexAttribI2uiv"); - glad_glVertexAttribI3uiv = (PFNGLVERTEXATTRIBI3UIVPROC)load("glVertexAttribI3uiv"); - glad_glVertexAttribI4uiv = (PFNGLVERTEXATTRIBI4UIVPROC)load("glVertexAttribI4uiv"); - glad_glVertexAttribI4bv = 
(PFNGLVERTEXATTRIBI4BVPROC)load("glVertexAttribI4bv"); - glad_glVertexAttribI4sv = (PFNGLVERTEXATTRIBI4SVPROC)load("glVertexAttribI4sv"); - glad_glVertexAttribI4ubv = (PFNGLVERTEXATTRIBI4UBVPROC)load("glVertexAttribI4ubv"); - glad_glVertexAttribI4usv = (PFNGLVERTEXATTRIBI4USVPROC)load("glVertexAttribI4usv"); - glad_glGetUniformuiv = (PFNGLGETUNIFORMUIVPROC)load("glGetUniformuiv"); - glad_glBindFragDataLocation = (PFNGLBINDFRAGDATALOCATIONPROC)load("glBindFragDataLocation"); - glad_glGetFragDataLocation = (PFNGLGETFRAGDATALOCATIONPROC)load("glGetFragDataLocation"); - glad_glUniform1ui = (PFNGLUNIFORM1UIPROC)load("glUniform1ui"); - glad_glUniform2ui = (PFNGLUNIFORM2UIPROC)load("glUniform2ui"); - glad_glUniform3ui = (PFNGLUNIFORM3UIPROC)load("glUniform3ui"); - glad_glUniform4ui = (PFNGLUNIFORM4UIPROC)load("glUniform4ui"); - glad_glUniform1uiv = (PFNGLUNIFORM1UIVPROC)load("glUniform1uiv"); - glad_glUniform2uiv = (PFNGLUNIFORM2UIVPROC)load("glUniform2uiv"); - glad_glUniform3uiv = (PFNGLUNIFORM3UIVPROC)load("glUniform3uiv"); - glad_glUniform4uiv = (PFNGLUNIFORM4UIVPROC)load("glUniform4uiv"); - glad_glTexParameterIiv = (PFNGLTEXPARAMETERIIVPROC)load("glTexParameterIiv"); - glad_glTexParameterIuiv = (PFNGLTEXPARAMETERIUIVPROC)load("glTexParameterIuiv"); - glad_glGetTexParameterIiv = (PFNGLGETTEXPARAMETERIIVPROC)load("glGetTexParameterIiv"); - glad_glGetTexParameterIuiv = (PFNGLGETTEXPARAMETERIUIVPROC)load("glGetTexParameterIuiv"); - glad_glClearBufferiv = (PFNGLCLEARBUFFERIVPROC)load("glClearBufferiv"); - glad_glClearBufferuiv = (PFNGLCLEARBUFFERUIVPROC)load("glClearBufferuiv"); - glad_glClearBufferfv = (PFNGLCLEARBUFFERFVPROC)load("glClearBufferfv"); - glad_glClearBufferfi = (PFNGLCLEARBUFFERFIPROC)load("glClearBufferfi"); - glad_glGetStringi = (PFNGLGETSTRINGIPROC)load("glGetStringi"); - glad_glIsRenderbuffer = (PFNGLISRENDERBUFFERPROC)load("glIsRenderbuffer"); - glad_glBindRenderbuffer = (PFNGLBINDRENDERBUFFERPROC)load("glBindRenderbuffer"); - glad_glDeleteRenderbuffers = (PFNGLDELETERENDERBUFFERSPROC)load("glDeleteRenderbuffers"); - glad_glGenRenderbuffers = (PFNGLGENRENDERBUFFERSPROC)load("glGenRenderbuffers"); - glad_glRenderbufferStorage = (PFNGLRENDERBUFFERSTORAGEPROC)load("glRenderbufferStorage"); - glad_glGetRenderbufferParameteriv = (PFNGLGETRENDERBUFFERPARAMETERIVPROC)load("glGetRenderbufferParameteriv"); - glad_glIsFramebuffer = (PFNGLISFRAMEBUFFERPROC)load("glIsFramebuffer"); - glad_glBindFramebuffer = (PFNGLBINDFRAMEBUFFERPROC)load("glBindFramebuffer"); - glad_glDeleteFramebuffers = (PFNGLDELETEFRAMEBUFFERSPROC)load("glDeleteFramebuffers"); - glad_glGenFramebuffers = (PFNGLGENFRAMEBUFFERSPROC)load("glGenFramebuffers"); - glad_glCheckFramebufferStatus = (PFNGLCHECKFRAMEBUFFERSTATUSPROC)load("glCheckFramebufferStatus"); - glad_glFramebufferTexture1D = (PFNGLFRAMEBUFFERTEXTURE1DPROC)load("glFramebufferTexture1D"); - glad_glFramebufferTexture2D = (PFNGLFRAMEBUFFERTEXTURE2DPROC)load("glFramebufferTexture2D"); - glad_glFramebufferTexture3D = (PFNGLFRAMEBUFFERTEXTURE3DPROC)load("glFramebufferTexture3D"); - glad_glFramebufferRenderbuffer = (PFNGLFRAMEBUFFERRENDERBUFFERPROC)load("glFramebufferRenderbuffer"); - glad_glGetFramebufferAttachmentParameteriv = (PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)load("glGetFramebufferAttachmentParameteriv"); - glad_glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC)load("glGenerateMipmap"); - glad_glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERPROC)load("glBlitFramebuffer"); - glad_glRenderbufferStorageMultisample = 
(PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC)load("glRenderbufferStorageMultisample"); - glad_glFramebufferTextureLayer = (PFNGLFRAMEBUFFERTEXTURELAYERPROC)load("glFramebufferTextureLayer"); - glad_glMapBufferRange = (PFNGLMAPBUFFERRANGEPROC)load("glMapBufferRange"); - glad_glFlushMappedBufferRange = (PFNGLFLUSHMAPPEDBUFFERRANGEPROC)load("glFlushMappedBufferRange"); - glad_glBindVertexArray = (PFNGLBINDVERTEXARRAYPROC)load("glBindVertexArray"); - glad_glDeleteVertexArrays = (PFNGLDELETEVERTEXARRAYSPROC)load("glDeleteVertexArrays"); - glad_glGenVertexArrays = (PFNGLGENVERTEXARRAYSPROC)load("glGenVertexArrays"); - glad_glIsVertexArray = (PFNGLISVERTEXARRAYPROC)load("glIsVertexArray"); -} -static void load_GL_VERSION_3_1(GLADloadproc load) { - if(!GLAD_GL_VERSION_3_1) return; - glad_glDrawArraysInstanced = (PFNGLDRAWARRAYSINSTANCEDPROC)load("glDrawArraysInstanced"); - glad_glDrawElementsInstanced = (PFNGLDRAWELEMENTSINSTANCEDPROC)load("glDrawElementsInstanced"); - glad_glTexBuffer = (PFNGLTEXBUFFERPROC)load("glTexBuffer"); - glad_glPrimitiveRestartIndex = (PFNGLPRIMITIVERESTARTINDEXPROC)load("glPrimitiveRestartIndex"); - glad_glCopyBufferSubData = (PFNGLCOPYBUFFERSUBDATAPROC)load("glCopyBufferSubData"); - glad_glGetUniformIndices = (PFNGLGETUNIFORMINDICESPROC)load("glGetUniformIndices"); - glad_glGetActiveUniformsiv = (PFNGLGETACTIVEUNIFORMSIVPROC)load("glGetActiveUniformsiv"); - glad_glGetActiveUniformName = (PFNGLGETACTIVEUNIFORMNAMEPROC)load("glGetActiveUniformName"); - glad_glGetUniformBlockIndex = (PFNGLGETUNIFORMBLOCKINDEXPROC)load("glGetUniformBlockIndex"); - glad_glGetActiveUniformBlockiv = (PFNGLGETACTIVEUNIFORMBLOCKIVPROC)load("glGetActiveUniformBlockiv"); - glad_glGetActiveUniformBlockName = (PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC)load("glGetActiveUniformBlockName"); - glad_glUniformBlockBinding = (PFNGLUNIFORMBLOCKBINDINGPROC)load("glUniformBlockBinding"); - glad_glBindBufferRange = (PFNGLBINDBUFFERRANGEPROC)load("glBindBufferRange"); - glad_glBindBufferBase = (PFNGLBINDBUFFERBASEPROC)load("glBindBufferBase"); - glad_glGetIntegeri_v = (PFNGLGETINTEGERI_VPROC)load("glGetIntegeri_v"); -} -static void load_GL_VERSION_3_2(GLADloadproc load) { - if(!GLAD_GL_VERSION_3_2) return; - glad_glDrawElementsBaseVertex = (PFNGLDRAWELEMENTSBASEVERTEXPROC)load("glDrawElementsBaseVertex"); - glad_glDrawRangeElementsBaseVertex = (PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC)load("glDrawRangeElementsBaseVertex"); - glad_glDrawElementsInstancedBaseVertex = (PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC)load("glDrawElementsInstancedBaseVertex"); - glad_glMultiDrawElementsBaseVertex = (PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC)load("glMultiDrawElementsBaseVertex"); - glad_glProvokingVertex = (PFNGLPROVOKINGVERTEXPROC)load("glProvokingVertex"); - glad_glFenceSync = (PFNGLFENCESYNCPROC)load("glFenceSync"); - glad_glIsSync = (PFNGLISSYNCPROC)load("glIsSync"); - glad_glDeleteSync = (PFNGLDELETESYNCPROC)load("glDeleteSync"); - glad_glClientWaitSync = (PFNGLCLIENTWAITSYNCPROC)load("glClientWaitSync"); - glad_glWaitSync = (PFNGLWAITSYNCPROC)load("glWaitSync"); - glad_glGetInteger64v = (PFNGLGETINTEGER64VPROC)load("glGetInteger64v"); - glad_glGetSynciv = (PFNGLGETSYNCIVPROC)load("glGetSynciv"); - glad_glGetInteger64i_v = (PFNGLGETINTEGER64I_VPROC)load("glGetInteger64i_v"); - glad_glGetBufferParameteri64v = (PFNGLGETBUFFERPARAMETERI64VPROC)load("glGetBufferParameteri64v"); - glad_glFramebufferTexture = (PFNGLFRAMEBUFFERTEXTUREPROC)load("glFramebufferTexture"); - glad_glTexImage2DMultisample = 
(PFNGLTEXIMAGE2DMULTISAMPLEPROC)load("glTexImage2DMultisample"); - glad_glTexImage3DMultisample = (PFNGLTEXIMAGE3DMULTISAMPLEPROC)load("glTexImage3DMultisample"); - glad_glGetMultisamplefv = (PFNGLGETMULTISAMPLEFVPROC)load("glGetMultisamplefv"); - glad_glSampleMaski = (PFNGLSAMPLEMASKIPROC)load("glSampleMaski"); -} -static void load_GL_VERSION_3_3(GLADloadproc load) { - if(!GLAD_GL_VERSION_3_3) return; - glad_glBindFragDataLocationIndexed = (PFNGLBINDFRAGDATALOCATIONINDEXEDPROC)load("glBindFragDataLocationIndexed"); - glad_glGetFragDataIndex = (PFNGLGETFRAGDATAINDEXPROC)load("glGetFragDataIndex"); - glad_glGenSamplers = (PFNGLGENSAMPLERSPROC)load("glGenSamplers"); - glad_glDeleteSamplers = (PFNGLDELETESAMPLERSPROC)load("glDeleteSamplers"); - glad_glIsSampler = (PFNGLISSAMPLERPROC)load("glIsSampler"); - glad_glBindSampler = (PFNGLBINDSAMPLERPROC)load("glBindSampler"); - glad_glSamplerParameteri = (PFNGLSAMPLERPARAMETERIPROC)load("glSamplerParameteri"); - glad_glSamplerParameteriv = (PFNGLSAMPLERPARAMETERIVPROC)load("glSamplerParameteriv"); - glad_glSamplerParameterf = (PFNGLSAMPLERPARAMETERFPROC)load("glSamplerParameterf"); - glad_glSamplerParameterfv = (PFNGLSAMPLERPARAMETERFVPROC)load("glSamplerParameterfv"); - glad_glSamplerParameterIiv = (PFNGLSAMPLERPARAMETERIIVPROC)load("glSamplerParameterIiv"); - glad_glSamplerParameterIuiv = (PFNGLSAMPLERPARAMETERIUIVPROC)load("glSamplerParameterIuiv"); - glad_glGetSamplerParameteriv = (PFNGLGETSAMPLERPARAMETERIVPROC)load("glGetSamplerParameteriv"); - glad_glGetSamplerParameterIiv = (PFNGLGETSAMPLERPARAMETERIIVPROC)load("glGetSamplerParameterIiv"); - glad_glGetSamplerParameterfv = (PFNGLGETSAMPLERPARAMETERFVPROC)load("glGetSamplerParameterfv"); - glad_glGetSamplerParameterIuiv = (PFNGLGETSAMPLERPARAMETERIUIVPROC)load("glGetSamplerParameterIuiv"); - glad_glQueryCounter = (PFNGLQUERYCOUNTERPROC)load("glQueryCounter"); - glad_glGetQueryObjecti64v = (PFNGLGETQUERYOBJECTI64VPROC)load("glGetQueryObjecti64v"); - glad_glGetQueryObjectui64v = (PFNGLGETQUERYOBJECTUI64VPROC)load("glGetQueryObjectui64v"); - glad_glVertexAttribDivisor = (PFNGLVERTEXATTRIBDIVISORPROC)load("glVertexAttribDivisor"); - glad_glVertexAttribP1ui = (PFNGLVERTEXATTRIBP1UIPROC)load("glVertexAttribP1ui"); - glad_glVertexAttribP1uiv = (PFNGLVERTEXATTRIBP1UIVPROC)load("glVertexAttribP1uiv"); - glad_glVertexAttribP2ui = (PFNGLVERTEXATTRIBP2UIPROC)load("glVertexAttribP2ui"); - glad_glVertexAttribP2uiv = (PFNGLVERTEXATTRIBP2UIVPROC)load("glVertexAttribP2uiv"); - glad_glVertexAttribP3ui = (PFNGLVERTEXATTRIBP3UIPROC)load("glVertexAttribP3ui"); - glad_glVertexAttribP3uiv = (PFNGLVERTEXATTRIBP3UIVPROC)load("glVertexAttribP3uiv"); - glad_glVertexAttribP4ui = (PFNGLVERTEXATTRIBP4UIPROC)load("glVertexAttribP4ui"); - glad_glVertexAttribP4uiv = (PFNGLVERTEXATTRIBP4UIVPROC)load("glVertexAttribP4uiv"); - glad_glVertexP2ui = (PFNGLVERTEXP2UIPROC)load("glVertexP2ui"); - glad_glVertexP2uiv = (PFNGLVERTEXP2UIVPROC)load("glVertexP2uiv"); - glad_glVertexP3ui = (PFNGLVERTEXP3UIPROC)load("glVertexP3ui"); - glad_glVertexP3uiv = (PFNGLVERTEXP3UIVPROC)load("glVertexP3uiv"); - glad_glVertexP4ui = (PFNGLVERTEXP4UIPROC)load("glVertexP4ui"); - glad_glVertexP4uiv = (PFNGLVERTEXP4UIVPROC)load("glVertexP4uiv"); - glad_glTexCoordP1ui = (PFNGLTEXCOORDP1UIPROC)load("glTexCoordP1ui"); - glad_glTexCoordP1uiv = (PFNGLTEXCOORDP1UIVPROC)load("glTexCoordP1uiv"); - glad_glTexCoordP2ui = (PFNGLTEXCOORDP2UIPROC)load("glTexCoordP2ui"); - glad_glTexCoordP2uiv = (PFNGLTEXCOORDP2UIVPROC)load("glTexCoordP2uiv"); - glad_glTexCoordP3ui = 
(PFNGLTEXCOORDP3UIPROC)load("glTexCoordP3ui"); - glad_glTexCoordP3uiv = (PFNGLTEXCOORDP3UIVPROC)load("glTexCoordP3uiv"); - glad_glTexCoordP4ui = (PFNGLTEXCOORDP4UIPROC)load("glTexCoordP4ui"); - glad_glTexCoordP4uiv = (PFNGLTEXCOORDP4UIVPROC)load("glTexCoordP4uiv"); - glad_glMultiTexCoordP1ui = (PFNGLMULTITEXCOORDP1UIPROC)load("glMultiTexCoordP1ui"); - glad_glMultiTexCoordP1uiv = (PFNGLMULTITEXCOORDP1UIVPROC)load("glMultiTexCoordP1uiv"); - glad_glMultiTexCoordP2ui = (PFNGLMULTITEXCOORDP2UIPROC)load("glMultiTexCoordP2ui"); - glad_glMultiTexCoordP2uiv = (PFNGLMULTITEXCOORDP2UIVPROC)load("glMultiTexCoordP2uiv"); - glad_glMultiTexCoordP3ui = (PFNGLMULTITEXCOORDP3UIPROC)load("glMultiTexCoordP3ui"); - glad_glMultiTexCoordP3uiv = (PFNGLMULTITEXCOORDP3UIVPROC)load("glMultiTexCoordP3uiv"); - glad_glMultiTexCoordP4ui = (PFNGLMULTITEXCOORDP4UIPROC)load("glMultiTexCoordP4ui"); - glad_glMultiTexCoordP4uiv = (PFNGLMULTITEXCOORDP4UIVPROC)load("glMultiTexCoordP4uiv"); - glad_glNormalP3ui = (PFNGLNORMALP3UIPROC)load("glNormalP3ui"); - glad_glNormalP3uiv = (PFNGLNORMALP3UIVPROC)load("glNormalP3uiv"); - glad_glColorP3ui = (PFNGLCOLORP3UIPROC)load("glColorP3ui"); - glad_glColorP3uiv = (PFNGLCOLORP3UIVPROC)load("glColorP3uiv"); - glad_glColorP4ui = (PFNGLCOLORP4UIPROC)load("glColorP4ui"); - glad_glColorP4uiv = (PFNGLCOLORP4UIVPROC)load("glColorP4uiv"); - glad_glSecondaryColorP3ui = (PFNGLSECONDARYCOLORP3UIPROC)load("glSecondaryColorP3ui"); - glad_glSecondaryColorP3uiv = (PFNGLSECONDARYCOLORP3UIVPROC)load("glSecondaryColorP3uiv"); -} -static void load_GL_ARB_debug_output(GLADloadproc load) { - if(!GLAD_GL_ARB_debug_output) return; - glad_glDebugMessageControlARB = (PFNGLDEBUGMESSAGECONTROLARBPROC)load("glDebugMessageControlARB"); - glad_glDebugMessageInsertARB = (PFNGLDEBUGMESSAGEINSERTARBPROC)load("glDebugMessageInsertARB"); - glad_glDebugMessageCallbackARB = (PFNGLDEBUGMESSAGECALLBACKARBPROC)load("glDebugMessageCallbackARB"); - glad_glGetDebugMessageLogARB = (PFNGLGETDEBUGMESSAGELOGARBPROC)load("glGetDebugMessageLogARB"); -} -static void load_GL_ARB_framebuffer_object(GLADloadproc load) { - if(!GLAD_GL_ARB_framebuffer_object) return; - glad_glIsRenderbuffer = (PFNGLISRENDERBUFFERPROC)load("glIsRenderbuffer"); - glad_glBindRenderbuffer = (PFNGLBINDRENDERBUFFERPROC)load("glBindRenderbuffer"); - glad_glDeleteRenderbuffers = (PFNGLDELETERENDERBUFFERSPROC)load("glDeleteRenderbuffers"); - glad_glGenRenderbuffers = (PFNGLGENRENDERBUFFERSPROC)load("glGenRenderbuffers"); - glad_glRenderbufferStorage = (PFNGLRENDERBUFFERSTORAGEPROC)load("glRenderbufferStorage"); - glad_glGetRenderbufferParameteriv = (PFNGLGETRENDERBUFFERPARAMETERIVPROC)load("glGetRenderbufferParameteriv"); - glad_glIsFramebuffer = (PFNGLISFRAMEBUFFERPROC)load("glIsFramebuffer"); - glad_glBindFramebuffer = (PFNGLBINDFRAMEBUFFERPROC)load("glBindFramebuffer"); - glad_glDeleteFramebuffers = (PFNGLDELETEFRAMEBUFFERSPROC)load("glDeleteFramebuffers"); - glad_glGenFramebuffers = (PFNGLGENFRAMEBUFFERSPROC)load("glGenFramebuffers"); - glad_glCheckFramebufferStatus = (PFNGLCHECKFRAMEBUFFERSTATUSPROC)load("glCheckFramebufferStatus"); - glad_glFramebufferTexture1D = (PFNGLFRAMEBUFFERTEXTURE1DPROC)load("glFramebufferTexture1D"); - glad_glFramebufferTexture2D = (PFNGLFRAMEBUFFERTEXTURE2DPROC)load("glFramebufferTexture2D"); - glad_glFramebufferTexture3D = (PFNGLFRAMEBUFFERTEXTURE3DPROC)load("glFramebufferTexture3D"); - glad_glFramebufferRenderbuffer = (PFNGLFRAMEBUFFERRENDERBUFFERPROC)load("glFramebufferRenderbuffer"); - glad_glGetFramebufferAttachmentParameteriv = 
(PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)load("glGetFramebufferAttachmentParameteriv");
-    glad_glGenerateMipmap = (PFNGLGENERATEMIPMAPPROC)load("glGenerateMipmap");
-    glad_glBlitFramebuffer = (PFNGLBLITFRAMEBUFFERPROC)load("glBlitFramebuffer");
-    glad_glRenderbufferStorageMultisample = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC)load("glRenderbufferStorageMultisample");
-    glad_glFramebufferTextureLayer = (PFNGLFRAMEBUFFERTEXTURELAYERPROC)load("glFramebufferTextureLayer");
-}
-static void load_GL_EXT_framebuffer_blit(GLADloadproc load) {
-    if(!GLAD_GL_EXT_framebuffer_blit) return;
-    glad_glBlitFramebufferEXT = (PFNGLBLITFRAMEBUFFEREXTPROC)load("glBlitFramebufferEXT");
-}
-static void load_GL_EXT_framebuffer_multisample(GLADloadproc load) {
-    if(!GLAD_GL_EXT_framebuffer_multisample) return;
-    glad_glRenderbufferStorageMultisampleEXT = (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)load("glRenderbufferStorageMultisampleEXT");
-}
-static void load_GL_EXT_framebuffer_object(GLADloadproc load) {
-    if(!GLAD_GL_EXT_framebuffer_object) return;
-    glad_glIsRenderbufferEXT = (PFNGLISRENDERBUFFEREXTPROC)load("glIsRenderbufferEXT");
-    glad_glBindRenderbufferEXT = (PFNGLBINDRENDERBUFFEREXTPROC)load("glBindRenderbufferEXT");
-    glad_glDeleteRenderbuffersEXT = (PFNGLDELETERENDERBUFFERSEXTPROC)load("glDeleteRenderbuffersEXT");
-    glad_glGenRenderbuffersEXT = (PFNGLGENRENDERBUFFERSEXTPROC)load("glGenRenderbuffersEXT");
-    glad_glRenderbufferStorageEXT = (PFNGLRENDERBUFFERSTORAGEEXTPROC)load("glRenderbufferStorageEXT");
-    glad_glGetRenderbufferParameterivEXT = (PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC)load("glGetRenderbufferParameterivEXT");
-    glad_glIsFramebufferEXT = (PFNGLISFRAMEBUFFEREXTPROC)load("glIsFramebufferEXT");
-    glad_glBindFramebufferEXT = (PFNGLBINDFRAMEBUFFEREXTPROC)load("glBindFramebufferEXT");
-    glad_glDeleteFramebuffersEXT = (PFNGLDELETEFRAMEBUFFERSEXTPROC)load("glDeleteFramebuffersEXT");
-    glad_glGenFramebuffersEXT = (PFNGLGENFRAMEBUFFERSEXTPROC)load("glGenFramebuffersEXT");
-    glad_glCheckFramebufferStatusEXT = (PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC)load("glCheckFramebufferStatusEXT");
-    glad_glFramebufferTexture1DEXT = (PFNGLFRAMEBUFFERTEXTURE1DEXTPROC)load("glFramebufferTexture1DEXT");
-    glad_glFramebufferTexture2DEXT = (PFNGLFRAMEBUFFERTEXTURE2DEXTPROC)load("glFramebufferTexture2DEXT");
-    glad_glFramebufferTexture3DEXT = (PFNGLFRAMEBUFFERTEXTURE3DEXTPROC)load("glFramebufferTexture3DEXT");
-    glad_glFramebufferRenderbufferEXT = (PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC)load("glFramebufferRenderbufferEXT");
-    glad_glGetFramebufferAttachmentParameterivEXT = (PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC)load("glGetFramebufferAttachmentParameterivEXT");
-    glad_glGenerateMipmapEXT = (PFNGLGENERATEMIPMAPEXTPROC)load("glGenerateMipmapEXT");
-}
-static void load_GL_OVR_multiview(GLADloadproc load) {
-    if(!GLAD_GL_OVR_multiview) return;
-    glad_glFramebufferTextureMultiviewOVR = (PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC)load("glFramebufferTextureMultiviewOVR");
-}
-static int find_extensionsGL(void) {
-    if (!get_exts()) return 0;
-    GLAD_GL_ARB_debug_output = has_ext("GL_ARB_debug_output");
-    GLAD_GL_ARB_framebuffer_object = has_ext("GL_ARB_framebuffer_object");
-    GLAD_GL_EXT_framebuffer_blit = has_ext("GL_EXT_framebuffer_blit");
-    GLAD_GL_EXT_framebuffer_multisample = has_ext("GL_EXT_framebuffer_multisample");
-    GLAD_GL_EXT_framebuffer_object = has_ext("GL_EXT_framebuffer_object");
-    GLAD_GL_OVR_multiview = has_ext("GL_OVR_multiview");
-    GLAD_GL_OVR_multiview2 = has_ext("GL_OVR_multiview2");
-    free_exts();
-    return 1;
-}
-
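For context: get_exts() and has_ext() used by find_extensionsGL are helpers defined earlier in the deleted gl.c, outside this hunk; they cache and search the driver's extension list. A minimal sketch of the same membership test on a 3.0+ context, using glGetStringi (which this loader resolves in load_GL_VERSION_3_0). Note that has_extension is a hypothetical name chosen for illustration, not the file's actual helper:

    #include <string.h>

    /* Sketch only: core GL 3.0+ exposes the extension list one entry at a time. */
    static int has_extension(const char *name) {
        GLint count = 0;
        GLint i;
        glGetIntegerv(GL_NUM_EXTENSIONS, &count);
        for (i = 0; i < count; i++) {
            const char *ext = (const char *)glGetStringi(GL_EXTENSIONS, (GLuint)i);
            if (ext != NULL && strcmp(ext, name) == 0)
                return 1; /* e.g. has_extension("GL_ARB_framebuffer_object") */
        }
        return 0;
    }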
diff --git a/thirdparty/glad/glad/gl.h b/thirdparty/glad/glad/gl.h
new file mode 100644
index 0000000000..9836296226
--- /dev/null
+++ b/thirdparty/glad/glad/gl.h
@@ -0,0 +1,3884 @@
+/**
+ * Loader generated by glad 2.0.2 on Mon Nov  7 12:17:15 2022
+ *
+ * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0
+ *
+ * Generator: C/C++
+ * Specification: gl
+ * Extensions: 7
+ *
+ * APIs:
+ *  - gl:compatibility=3.3
+ *
+ * Options:
+ *  - ALIAS = False
+ *  - DEBUG = False
+ *  - HEADER_ONLY = False
+ *  - LOADER = True
+ *  - MX = False
+ *  - ON_DEMAND = False
+ *
+ * Commandline:
+ *    --api='gl:compatibility=3.3' --extensions='GL_ARB_debug_output,GL_ARB_framebuffer_object,GL_EXT_framebuffer_blit,GL_EXT_framebuffer_multisample,GL_EXT_framebuffer_object,GL_OVR_multiview,GL_OVR_multiview2' c --loader
+ *
+ * Online:
+ *    http://glad.sh/#api=gl%3Acompatibility%3D3.3&extensions=GL_ARB_debug_output%2CGL_ARB_framebuffer_object%2CGL_EXT_framebuffer_blit%2CGL_EXT_framebuffer_multisample%2CGL_EXT_framebuffer_object%2CGL_OVR_multiview%2CGL_OVR_multiview2&generator=c&options=LOADER
+ *
+ */
+
+#ifndef GLAD_GL_H_
+#define GLAD_GL_H_
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-id-macro"
+#endif
+#ifdef __gl_h_
+  #error OpenGL (gl.h) header already included (API: gl), remove previous include!
+#endif
+#define __gl_h_ 1
+#ifdef __gl3_h_
+  #error OpenGL (gl3.h) header already included (API: gl), remove previous include!
+#endif
+#define __gl3_h_ 1
+#ifdef __glext_h_
+  #error OpenGL (glext.h) header already included (API: gl), remove previous include!
+#endif
+#define __glext_h_ 1
+#ifdef __gl3ext_h_
+  #error OpenGL (gl3ext.h) header already included (API: gl), remove previous include!
+#endif
+#define __gl3ext_h_ 1
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#define GLAD_GL
+#define GLAD_OPTION_GL_LOADER
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef GLAD_PLATFORM_H_
+#define GLAD_PLATFORM_H_
+
+#ifndef GLAD_PLATFORM_WIN32
+  #if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__MINGW32__)
+    #define GLAD_PLATFORM_WIN32 1
+  #else
+    #define GLAD_PLATFORM_WIN32 0
+  #endif
+#endif
+
+#ifndef GLAD_PLATFORM_APPLE
+  #ifdef __APPLE__
+    #define GLAD_PLATFORM_APPLE 1
+  #else
+    #define GLAD_PLATFORM_APPLE 0
+  #endif
+#endif
+
+#ifndef GLAD_PLATFORM_EMSCRIPTEN
+  #ifdef __EMSCRIPTEN__
+    #define GLAD_PLATFORM_EMSCRIPTEN 1
+  #else
+    #define GLAD_PLATFORM_EMSCRIPTEN 0
+  #endif
+#endif
+
+#ifndef GLAD_PLATFORM_UWP
+  #if defined(_MSC_VER) && !defined(GLAD_INTERNAL_HAVE_WINAPIFAMILY)
+    #ifdef __has_include
+      #if __has_include(<winapifamily.h>)
+        #define GLAD_INTERNAL_HAVE_WINAPIFAMILY 1
+      #endif
+    #elif _MSC_VER >= 1700 && !_USING_V110_SDK71_
+      #define GLAD_INTERNAL_HAVE_WINAPIFAMILY 1
+    #endif
+  #endif
+
+  #ifdef GLAD_INTERNAL_HAVE_WINAPIFAMILY
+    #include <winapifamily.h>
+    #if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+      #define GLAD_PLATFORM_UWP 1
+    #endif
+  #endif
+
+  #ifndef GLAD_PLATFORM_UWP
+    #define GLAD_PLATFORM_UWP 0
+  #endif
+#endif
+
+#ifdef __GNUC__
+  #define GLAD_GNUC_EXTENSION __extension__
+#else
+  #define GLAD_GNUC_EXTENSION
+#endif
+
+#define GLAD_UNUSED(x) (void)(x)
+
+#ifndef GLAD_API_CALL
+  #if defined(GLAD_API_CALL_EXPORT)
+    #if GLAD_PLATFORM_WIN32 || defined(__CYGWIN__)
+      #if defined(GLAD_API_CALL_EXPORT_BUILD)
+        #if defined(__GNUC__)
+          #define GLAD_API_CALL __attribute__ ((dllexport)) extern
+        #else
+          #define GLAD_API_CALL __declspec(dllexport) extern
+        #endif
+      #else
+        #if defined(__GNUC__)
+          #define GLAD_API_CALL __attribute__ ((dllimport)) extern
+        #else
+          #define GLAD_API_CALL __declspec(dllimport) extern
+        #endif
+      #endif
+    #elif defined(__GNUC__) && defined(GLAD_API_CALL_EXPORT_BUILD)
+      #define GLAD_API_CALL __attribute__ ((visibility ("default"))) extern
+    #else
+      #define GLAD_API_CALL extern
+    #endif
+  #else
+    #define GLAD_API_CALL extern
+  #endif
+#endif
+
+#ifdef APIENTRY
+  #define GLAD_API_PTR APIENTRY
+#elif GLAD_PLATFORM_WIN32
+  #define GLAD_API_PTR __stdcall
+#else
+  #define GLAD_API_PTR
+#endif
+
+#ifndef GLAPI
+#define GLAPI GLAD_API_CALL
+#endif
+
+#ifndef GLAPIENTRY
+#define GLAPIENTRY GLAD_API_PTR
+#endif
+
+#define GLAD_MAKE_VERSION(major, minor) (major * 10000 + minor)
+#define GLAD_VERSION_MAJOR(version) (version / 10000)
+#define GLAD_VERSION_MINOR(version) (version % 10000)
+
+#define GLAD_GENERATOR_VERSION "2.0.2"
+
+typedef void (*GLADapiproc)(void);
+
+typedef GLADapiproc (*GLADloadfunc)(const char *name);
+typedef GLADapiproc (*GLADuserptrloadfunc)(void *userptr, const char *name);
+
+typedef void (*GLADprecallback)(const char *name, GLADapiproc apiproc, int len_args, ...);
+typedef void (*GLADpostcallback)(void *ret, const char *name, GLADapiproc apiproc, int len_args, ...);
+
+#endif /* GLAD_PLATFORM_H_ */
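The version helpers just defined pack `major` and `minor` into a single `int` as `major * 10000 + minor`; `gladLoadGL()` (declared later in this header) reports the loaded version in this format. A small sketch of what the macros evaluate to, assuming a translation unit that includes this header (`check_packing` is a hypothetical name):

```c
#include <assert.h>
#include <glad/gl.h>

static void check_packing(void) {
    int v = GLAD_MAKE_VERSION(3, 3);    /* 3 * 10000 + 3 == 30003 */
    assert(GLAD_VERSION_MAJOR(v) == 3); /* 30003 / 10000 */
    assert(GLAD_VERSION_MINOR(v) == 3); /* 30003 % 10000 */
}
```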
+
+#define GL_2D 0x0600
+#define GL_2_BYTES 0x1407
+#define GL_3D 0x0601
+#define GL_3D_COLOR 0x0602
+#define GL_3D_COLOR_TEXTURE 0x0603
+#define GL_3_BYTES 0x1408
+#define GL_4D_COLOR_TEXTURE 0x0604
+#define GL_4_BYTES 0x1409
+#define GL_ACCUM 0x0100
+#define GL_ACCUM_ALPHA_BITS 0x0D5B
+#define GL_ACCUM_BLUE_BITS 0x0D5A
+#define GL_ACCUM_BUFFER_BIT 0x00000200
+#define GL_ACCUM_CLEAR_VALUE 0x0B80
+#define GL_ACCUM_GREEN_BITS 0x0D59
+#define GL_ACCUM_RED_BITS 0x0D58
+#define GL_ACTIVE_ATTRIBUTES 0x8B89
+#define GL_ACTIVE_ATTRIBUTE_MAX_LENGTH 0x8B8A
+#define GL_ACTIVE_TEXTURE 0x84E0
+#define GL_ACTIVE_UNIFORMS 0x8B86
+#define GL_ACTIVE_UNIFORM_BLOCKS 0x8A36
+#define GL_ACTIVE_UNIFORM_BLOCK_MAX_NAME_LENGTH 0x8A35
+#define GL_ACTIVE_UNIFORM_MAX_LENGTH 0x8B87
+#define GL_ADD 0x0104
+#define GL_ADD_SIGNED 0x8574
+#define GL_ALIASED_LINE_WIDTH_RANGE 0x846E
+#define GL_ALIASED_POINT_SIZE_RANGE 0x846D
+#define GL_ALL_ATTRIB_BITS 0xFFFFFFFF
+#define GL_ALPHA 0x1906
+#define GL_ALPHA12 0x803D
+#define GL_ALPHA16 0x803E
+#define GL_ALPHA4 0x803B
+#define GL_ALPHA8 0x803C
+#define GL_ALPHA_BIAS 0x0D1D
+#define GL_ALPHA_BITS 0x0D55
+#define GL_ALPHA_INTEGER 0x8D97
+#define GL_ALPHA_SCALE 0x0D1C
+#define GL_ALPHA_TEST 0x0BC0
+#define GL_ALPHA_TEST_FUNC 0x0BC1
+#define GL_ALPHA_TEST_REF 0x0BC2
+#define GL_ALREADY_SIGNALED 0x911A
+#define GL_ALWAYS 0x0207
+#define GL_AMBIENT 0x1200
+#define GL_AMBIENT_AND_DIFFUSE 0x1602
+#define GL_AND 0x1501
+#define GL_AND_INVERTED 0x1504
+#define GL_AND_REVERSE 0x1502
+#define GL_ANY_SAMPLES_PASSED 0x8C2F
+#define GL_ARRAY_BUFFER 0x8892
+#define GL_ARRAY_BUFFER_BINDING 0x8894
+#define GL_ATTACHED_SHADERS 0x8B85
+#define GL_ATTRIB_STACK_DEPTH 0x0BB0
+#define GL_AUTO_NORMAL 0x0D80
+#define GL_AUX0 0x0409
+#define GL_AUX1 0x040A
+#define GL_AUX2 0x040B
+#define GL_AUX3 0x040C
+#define GL_AUX_BUFFERS 0x0C00
+#define GL_BACK 0x0405
+#define GL_BACK_LEFT 0x0402
+#define GL_BACK_RIGHT 0x0403
+#define GL_BGR 0x80E0
+#define GL_BGRA 0x80E1
+#define GL_BGRA_INTEGER 0x8D9B
+#define GL_BGR_INTEGER 0x8D9A
+#define GL_BITMAP 0x1A00
+#define GL_BITMAP_TOKEN 0x0704
+#define GL_BLEND 0x0BE2
+#define GL_BLEND_COLOR 0x8005
+#define GL_BLEND_DST 0x0BE0
+#define GL_BLEND_DST_ALPHA 0x80CA
+#define GL_BLEND_DST_RGB 0x80C8
+#define GL_BLEND_EQUATION 0x8009
+#define GL_BLEND_EQUATION_ALPHA 0x883D
+#define GL_BLEND_EQUATION_RGB 0x8009
+#define GL_BLEND_SRC 0x0BE1
+#define GL_BLEND_SRC_ALPHA 0x80CB
+#define GL_BLEND_SRC_RGB 0x80C9
+#define GL_BLUE 0x1905
+#define GL_BLUE_BIAS 0x0D1B
+#define GL_BLUE_BITS 0x0D54
+#define GL_BLUE_INTEGER 0x8D96
+#define GL_BLUE_SCALE 0x0D1A
+#define GL_BOOL 0x8B56
+#define GL_BOOL_VEC2 0x8B57
+#define GL_BOOL_VEC3 0x8B58
+#define GL_BOOL_VEC4 0x8B59
+#define GL_BUFFER_ACCESS 0x88BB
+#define GL_BUFFER_ACCESS_FLAGS 0x911F
+#define GL_BUFFER_MAPPED 0x88BC
+#define GL_BUFFER_MAP_LENGTH 0x9120
+#define GL_BUFFER_MAP_OFFSET 0x9121
+#define GL_BUFFER_MAP_POINTER 0x88BD
+#define GL_BUFFER_SIZE 0x8764
+#define GL_BUFFER_USAGE 0x8765
+#define GL_BYTE 0x1400
+#define GL_C3F_V3F 0x2A24
+#define GL_C4F_N3F_V3F 0x2A26 +#define GL_C4UB_V2F 0x2A22 +#define GL_C4UB_V3F 0x2A23 +#define GL_CCW 0x0901 +#define GL_CLAMP 0x2900 +#define GL_CLAMP_FRAGMENT_COLOR 0x891B +#define GL_CLAMP_READ_COLOR 0x891C +#define GL_CLAMP_TO_BORDER 0x812D +#define GL_CLAMP_TO_EDGE 0x812F +#define GL_CLAMP_VERTEX_COLOR 0x891A +#define GL_CLEAR 0x1500 +#define GL_CLIENT_ACTIVE_TEXTURE 0x84E1 +#define GL_CLIENT_ALL_ATTRIB_BITS 0xFFFFFFFF +#define GL_CLIENT_ATTRIB_STACK_DEPTH 0x0BB1 +#define GL_CLIENT_PIXEL_STORE_BIT 0x00000001 +#define GL_CLIENT_VERTEX_ARRAY_BIT 0x00000002 +#define GL_CLIP_DISTANCE0 0x3000 +#define GL_CLIP_DISTANCE1 0x3001 +#define GL_CLIP_DISTANCE2 0x3002 +#define GL_CLIP_DISTANCE3 0x3003 +#define GL_CLIP_DISTANCE4 0x3004 +#define GL_CLIP_DISTANCE5 0x3005 +#define GL_CLIP_DISTANCE6 0x3006 +#define GL_CLIP_DISTANCE7 0x3007 +#define GL_CLIP_PLANE0 0x3000 +#define GL_CLIP_PLANE1 0x3001 +#define GL_CLIP_PLANE2 0x3002 +#define GL_CLIP_PLANE3 0x3003 +#define GL_CLIP_PLANE4 0x3004 +#define GL_CLIP_PLANE5 0x3005 +#define GL_COEFF 0x0A00 +#define GL_COLOR 0x1800 +#define GL_COLOR_ARRAY 0x8076 +#define GL_COLOR_ARRAY_BUFFER_BINDING 0x8898 +#define GL_COLOR_ARRAY_POINTER 0x8090 +#define GL_COLOR_ARRAY_SIZE 0x8081 +#define GL_COLOR_ARRAY_STRIDE 0x8083 +#define GL_COLOR_ARRAY_TYPE 0x8082 +#define GL_COLOR_ATTACHMENT0 0x8CE0 +#define GL_COLOR_ATTACHMENT0_EXT 0x8CE0 +#define GL_COLOR_ATTACHMENT1 0x8CE1 +#define GL_COLOR_ATTACHMENT10 0x8CEA +#define GL_COLOR_ATTACHMENT10_EXT 0x8CEA +#define GL_COLOR_ATTACHMENT11 0x8CEB +#define GL_COLOR_ATTACHMENT11_EXT 0x8CEB +#define GL_COLOR_ATTACHMENT12 0x8CEC +#define GL_COLOR_ATTACHMENT12_EXT 0x8CEC +#define GL_COLOR_ATTACHMENT13 0x8CED +#define GL_COLOR_ATTACHMENT13_EXT 0x8CED +#define GL_COLOR_ATTACHMENT14 0x8CEE +#define GL_COLOR_ATTACHMENT14_EXT 0x8CEE +#define GL_COLOR_ATTACHMENT15 0x8CEF +#define GL_COLOR_ATTACHMENT15_EXT 0x8CEF +#define GL_COLOR_ATTACHMENT16 0x8CF0 +#define GL_COLOR_ATTACHMENT17 0x8CF1 +#define GL_COLOR_ATTACHMENT18 0x8CF2 +#define GL_COLOR_ATTACHMENT19 0x8CF3 +#define GL_COLOR_ATTACHMENT1_EXT 0x8CE1 +#define GL_COLOR_ATTACHMENT2 0x8CE2 +#define GL_COLOR_ATTACHMENT20 0x8CF4 +#define GL_COLOR_ATTACHMENT21 0x8CF5 +#define GL_COLOR_ATTACHMENT22 0x8CF6 +#define GL_COLOR_ATTACHMENT23 0x8CF7 +#define GL_COLOR_ATTACHMENT24 0x8CF8 +#define GL_COLOR_ATTACHMENT25 0x8CF9 +#define GL_COLOR_ATTACHMENT26 0x8CFA +#define GL_COLOR_ATTACHMENT27 0x8CFB +#define GL_COLOR_ATTACHMENT28 0x8CFC +#define GL_COLOR_ATTACHMENT29 0x8CFD +#define GL_COLOR_ATTACHMENT2_EXT 0x8CE2 +#define GL_COLOR_ATTACHMENT3 0x8CE3 +#define GL_COLOR_ATTACHMENT30 0x8CFE +#define GL_COLOR_ATTACHMENT31 0x8CFF +#define GL_COLOR_ATTACHMENT3_EXT 0x8CE3 +#define GL_COLOR_ATTACHMENT4 0x8CE4 +#define GL_COLOR_ATTACHMENT4_EXT 0x8CE4 +#define GL_COLOR_ATTACHMENT5 0x8CE5 +#define GL_COLOR_ATTACHMENT5_EXT 0x8CE5 +#define GL_COLOR_ATTACHMENT6 0x8CE6 +#define GL_COLOR_ATTACHMENT6_EXT 0x8CE6 +#define GL_COLOR_ATTACHMENT7 0x8CE7 +#define GL_COLOR_ATTACHMENT7_EXT 0x8CE7 +#define GL_COLOR_ATTACHMENT8 0x8CE8 +#define GL_COLOR_ATTACHMENT8_EXT 0x8CE8 +#define GL_COLOR_ATTACHMENT9 0x8CE9 +#define GL_COLOR_ATTACHMENT9_EXT 0x8CE9 +#define GL_COLOR_BUFFER_BIT 0x00004000 +#define GL_COLOR_CLEAR_VALUE 0x0C22 +#define GL_COLOR_INDEX 0x1900 +#define GL_COLOR_INDEXES 0x1603 +#define GL_COLOR_LOGIC_OP 0x0BF2 +#define GL_COLOR_MATERIAL 0x0B57 +#define GL_COLOR_MATERIAL_FACE 0x0B55 +#define GL_COLOR_MATERIAL_PARAMETER 0x0B56 +#define GL_COLOR_SUM 0x8458 +#define GL_COLOR_WRITEMASK 0x0C23 +#define GL_COMBINE 0x8570 
+#define GL_COMBINE_ALPHA 0x8572 +#define GL_COMBINE_RGB 0x8571 +#define GL_COMPARE_REF_TO_TEXTURE 0x884E +#define GL_COMPARE_R_TO_TEXTURE 0x884E +#define GL_COMPILE 0x1300 +#define GL_COMPILE_AND_EXECUTE 0x1301 +#define GL_COMPILE_STATUS 0x8B81 +#define GL_COMPRESSED_ALPHA 0x84E9 +#define GL_COMPRESSED_INTENSITY 0x84EC +#define GL_COMPRESSED_LUMINANCE 0x84EA +#define GL_COMPRESSED_LUMINANCE_ALPHA 0x84EB +#define GL_COMPRESSED_RED 0x8225 +#define GL_COMPRESSED_RED_RGTC1 0x8DBB +#define GL_COMPRESSED_RG 0x8226 +#define GL_COMPRESSED_RGB 0x84ED +#define GL_COMPRESSED_RGBA 0x84EE +#define GL_COMPRESSED_RG_RGTC2 0x8DBD +#define GL_COMPRESSED_SIGNED_RED_RGTC1 0x8DBC +#define GL_COMPRESSED_SIGNED_RG_RGTC2 0x8DBE +#define GL_COMPRESSED_SLUMINANCE 0x8C4A +#define GL_COMPRESSED_SLUMINANCE_ALPHA 0x8C4B +#define GL_COMPRESSED_SRGB 0x8C48 +#define GL_COMPRESSED_SRGB_ALPHA 0x8C49 +#define GL_COMPRESSED_TEXTURE_FORMATS 0x86A3 +#define GL_CONDITION_SATISFIED 0x911C +#define GL_CONSTANT 0x8576 +#define GL_CONSTANT_ALPHA 0x8003 +#define GL_CONSTANT_ATTENUATION 0x1207 +#define GL_CONSTANT_COLOR 0x8001 +#define GL_CONTEXT_COMPATIBILITY_PROFILE_BIT 0x00000002 +#define GL_CONTEXT_CORE_PROFILE_BIT 0x00000001 +#define GL_CONTEXT_FLAGS 0x821E +#define GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT 0x00000001 +#define GL_CONTEXT_PROFILE_MASK 0x9126 +#define GL_COORD_REPLACE 0x8862 +#define GL_COPY 0x1503 +#define GL_COPY_INVERTED 0x150C +#define GL_COPY_PIXEL_TOKEN 0x0706 +#define GL_COPY_READ_BUFFER 0x8F36 +#define GL_COPY_WRITE_BUFFER 0x8F37 +#define GL_CULL_FACE 0x0B44 +#define GL_CULL_FACE_MODE 0x0B45 +#define GL_CURRENT_BIT 0x00000001 +#define GL_CURRENT_COLOR 0x0B00 +#define GL_CURRENT_FOG_COORD 0x8453 +#define GL_CURRENT_FOG_COORDINATE 0x8453 +#define GL_CURRENT_INDEX 0x0B01 +#define GL_CURRENT_NORMAL 0x0B02 +#define GL_CURRENT_PROGRAM 0x8B8D +#define GL_CURRENT_QUERY 0x8865 +#define GL_CURRENT_RASTER_COLOR 0x0B04 +#define GL_CURRENT_RASTER_DISTANCE 0x0B09 +#define GL_CURRENT_RASTER_INDEX 0x0B05 +#define GL_CURRENT_RASTER_POSITION 0x0B07 +#define GL_CURRENT_RASTER_POSITION_VALID 0x0B08 +#define GL_CURRENT_RASTER_SECONDARY_COLOR 0x845F +#define GL_CURRENT_RASTER_TEXTURE_COORDS 0x0B06 +#define GL_CURRENT_SECONDARY_COLOR 0x8459 +#define GL_CURRENT_TEXTURE_COORDS 0x0B03 +#define GL_CURRENT_VERTEX_ATTRIB 0x8626 +#define GL_CW 0x0900 +#define GL_DEBUG_CALLBACK_FUNCTION_ARB 0x8244 +#define GL_DEBUG_CALLBACK_USER_PARAM_ARB 0x8245 +#define GL_DEBUG_LOGGED_MESSAGES_ARB 0x9145 +#define GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH_ARB 0x8243 +#define GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB 0x8242 +#define GL_DEBUG_SEVERITY_HIGH_ARB 0x9146 +#define GL_DEBUG_SEVERITY_LOW_ARB 0x9148 +#define GL_DEBUG_SEVERITY_MEDIUM_ARB 0x9147 +#define GL_DEBUG_SOURCE_API_ARB 0x8246 +#define GL_DEBUG_SOURCE_APPLICATION_ARB 0x824A +#define GL_DEBUG_SOURCE_OTHER_ARB 0x824B +#define GL_DEBUG_SOURCE_SHADER_COMPILER_ARB 0x8248 +#define GL_DEBUG_SOURCE_THIRD_PARTY_ARB 0x8249 +#define GL_DEBUG_SOURCE_WINDOW_SYSTEM_ARB 0x8247 +#define GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_ARB 0x824D +#define GL_DEBUG_TYPE_ERROR_ARB 0x824C +#define GL_DEBUG_TYPE_OTHER_ARB 0x8251 +#define GL_DEBUG_TYPE_PERFORMANCE_ARB 0x8250 +#define GL_DEBUG_TYPE_PORTABILITY_ARB 0x824F +#define GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_ARB 0x824E +#define GL_DECAL 0x2101 +#define GL_DECR 0x1E03 +#define GL_DECR_WRAP 0x8508 +#define GL_DELETE_STATUS 0x8B80 +#define GL_DEPTH 0x1801 +#define GL_DEPTH24_STENCIL8 0x88F0 +#define GL_DEPTH32F_STENCIL8 0x8CAD +#define GL_DEPTH_ATTACHMENT 0x8D00 +#define 
GL_DEPTH_ATTACHMENT_EXT 0x8D00 +#define GL_DEPTH_BIAS 0x0D1F +#define GL_DEPTH_BITS 0x0D56 +#define GL_DEPTH_BUFFER_BIT 0x00000100 +#define GL_DEPTH_CLAMP 0x864F +#define GL_DEPTH_CLEAR_VALUE 0x0B73 +#define GL_DEPTH_COMPONENT 0x1902 +#define GL_DEPTH_COMPONENT16 0x81A5 +#define GL_DEPTH_COMPONENT24 0x81A6 +#define GL_DEPTH_COMPONENT32 0x81A7 +#define GL_DEPTH_COMPONENT32F 0x8CAC +#define GL_DEPTH_FUNC 0x0B74 +#define GL_DEPTH_RANGE 0x0B70 +#define GL_DEPTH_SCALE 0x0D1E +#define GL_DEPTH_STENCIL 0x84F9 +#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A +#define GL_DEPTH_TEST 0x0B71 +#define GL_DEPTH_TEXTURE_MODE 0x884B +#define GL_DEPTH_WRITEMASK 0x0B72 +#define GL_DIFFUSE 0x1201 +#define GL_DITHER 0x0BD0 +#define GL_DOMAIN 0x0A02 +#define GL_DONT_CARE 0x1100 +#define GL_DOT3_RGB 0x86AE +#define GL_DOT3_RGBA 0x86AF +#define GL_DOUBLE 0x140A +#define GL_DOUBLEBUFFER 0x0C32 +#define GL_DRAW_BUFFER 0x0C01 +#define GL_DRAW_BUFFER0 0x8825 +#define GL_DRAW_BUFFER1 0x8826 +#define GL_DRAW_BUFFER10 0x882F +#define GL_DRAW_BUFFER11 0x8830 +#define GL_DRAW_BUFFER12 0x8831 +#define GL_DRAW_BUFFER13 0x8832 +#define GL_DRAW_BUFFER14 0x8833 +#define GL_DRAW_BUFFER15 0x8834 +#define GL_DRAW_BUFFER2 0x8827 +#define GL_DRAW_BUFFER3 0x8828 +#define GL_DRAW_BUFFER4 0x8829 +#define GL_DRAW_BUFFER5 0x882A +#define GL_DRAW_BUFFER6 0x882B +#define GL_DRAW_BUFFER7 0x882C +#define GL_DRAW_BUFFER8 0x882D +#define GL_DRAW_BUFFER9 0x882E +#define GL_DRAW_FRAMEBUFFER 0x8CA9 +#define GL_DRAW_FRAMEBUFFER_BINDING 0x8CA6 +#define GL_DRAW_FRAMEBUFFER_BINDING_EXT 0x8CA6 +#define GL_DRAW_FRAMEBUFFER_EXT 0x8CA9 +#define GL_DRAW_PIXEL_TOKEN 0x0705 +#define GL_DST_ALPHA 0x0304 +#define GL_DST_COLOR 0x0306 +#define GL_DYNAMIC_COPY 0x88EA +#define GL_DYNAMIC_DRAW 0x88E8 +#define GL_DYNAMIC_READ 0x88E9 +#define GL_EDGE_FLAG 0x0B43 +#define GL_EDGE_FLAG_ARRAY 0x8079 +#define GL_EDGE_FLAG_ARRAY_BUFFER_BINDING 0x889B +#define GL_EDGE_FLAG_ARRAY_POINTER 0x8093 +#define GL_EDGE_FLAG_ARRAY_STRIDE 0x808C +#define GL_ELEMENT_ARRAY_BUFFER 0x8893 +#define GL_ELEMENT_ARRAY_BUFFER_BINDING 0x8895 +#define GL_EMISSION 0x1600 +#define GL_ENABLE_BIT 0x00002000 +#define GL_EQUAL 0x0202 +#define GL_EQUIV 0x1509 +#define GL_EVAL_BIT 0x00010000 +#define GL_EXP 0x0800 +#define GL_EXP2 0x0801 +#define GL_EXTENSIONS 0x1F03 +#define GL_EYE_LINEAR 0x2400 +#define GL_EYE_PLANE 0x2502 +#define GL_FALSE 0 +#define GL_FASTEST 0x1101 +#define GL_FEEDBACK 0x1C01 +#define GL_FEEDBACK_BUFFER_POINTER 0x0DF0 +#define GL_FEEDBACK_BUFFER_SIZE 0x0DF1 +#define GL_FEEDBACK_BUFFER_TYPE 0x0DF2 +#define GL_FILL 0x1B02 +#define GL_FIRST_VERTEX_CONVENTION 0x8E4D +#define GL_FIXED_ONLY 0x891D +#define GL_FLAT 0x1D00 +#define GL_FLOAT 0x1406 +#define GL_FLOAT_32_UNSIGNED_INT_24_8_REV 0x8DAD +#define GL_FLOAT_MAT2 0x8B5A +#define GL_FLOAT_MAT2x3 0x8B65 +#define GL_FLOAT_MAT2x4 0x8B66 +#define GL_FLOAT_MAT3 0x8B5B +#define GL_FLOAT_MAT3x2 0x8B67 +#define GL_FLOAT_MAT3x4 0x8B68 +#define GL_FLOAT_MAT4 0x8B5C +#define GL_FLOAT_MAT4x2 0x8B69 +#define GL_FLOAT_MAT4x3 0x8B6A +#define GL_FLOAT_VEC2 0x8B50 +#define GL_FLOAT_VEC3 0x8B51 +#define GL_FLOAT_VEC4 0x8B52 +#define GL_FOG 0x0B60 +#define GL_FOG_BIT 0x00000080 +#define GL_FOG_COLOR 0x0B66 +#define GL_FOG_COORD 0x8451 +#define GL_FOG_COORDINATE 0x8451 +#define GL_FOG_COORDINATE_ARRAY 0x8457 +#define GL_FOG_COORDINATE_ARRAY_BUFFER_BINDING 0x889D +#define GL_FOG_COORDINATE_ARRAY_POINTER 0x8456 +#define GL_FOG_COORDINATE_ARRAY_STRIDE 0x8455 +#define GL_FOG_COORDINATE_ARRAY_TYPE 0x8454 +#define GL_FOG_COORDINATE_SOURCE 0x8450 +#define 
GL_FOG_COORD_ARRAY 0x8457 +#define GL_FOG_COORD_ARRAY_BUFFER_BINDING 0x889D +#define GL_FOG_COORD_ARRAY_POINTER 0x8456 +#define GL_FOG_COORD_ARRAY_STRIDE 0x8455 +#define GL_FOG_COORD_ARRAY_TYPE 0x8454 +#define GL_FOG_COORD_SRC 0x8450 +#define GL_FOG_DENSITY 0x0B62 +#define GL_FOG_END 0x0B64 +#define GL_FOG_HINT 0x0C54 +#define GL_FOG_INDEX 0x0B61 +#define GL_FOG_MODE 0x0B65 +#define GL_FOG_START 0x0B63 +#define GL_FRAGMENT_DEPTH 0x8452 +#define GL_FRAGMENT_SHADER 0x8B30 +#define GL_FRAGMENT_SHADER_DERIVATIVE_HINT 0x8B8B +#define GL_FRAMEBUFFER 0x8D40 +#define GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE 0x8215 +#define GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE 0x8214 +#define GL_FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING 0x8210 +#define GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE 0x8211 +#define GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE 0x8216 +#define GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE 0x8213 +#define GL_FRAMEBUFFER_ATTACHMENT_LAYERED 0x8DA7 +#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME 0x8CD1 +#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT 0x8CD1 +#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE 0x8CD0 +#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT 0x8CD0 +#define GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE 0x8212 +#define GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE 0x8217 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_EXT 0x8CD4 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR 0x9632 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE 0x8CD3 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_EXT 0x8CD3 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LAYER 0x8CD4 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL 0x8CD2 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT 0x8CD2 +#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR 0x9630 +#define GL_FRAMEBUFFER_BINDING 0x8CA6 +#define GL_FRAMEBUFFER_BINDING_EXT 0x8CA6 +#define GL_FRAMEBUFFER_COMPLETE 0x8CD5 +#define GL_FRAMEBUFFER_COMPLETE_EXT 0x8CD5 +#define GL_FRAMEBUFFER_DEFAULT 0x8218 +#define GL_FRAMEBUFFER_EXT 0x8D40 +#define GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT 0x8CD6 +#define GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_EXT 0x8CD6 +#define GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT 0x8CD9 +#define GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER 0x8CDB +#define GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER_EXT 0x8CDB +#define GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT 0x8CDA +#define GL_FRAMEBUFFER_INCOMPLETE_LAYER_TARGETS 0x8DA8 +#define GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT 0x8CD7 +#define GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT 0x8CD7 +#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE 0x8D56 +#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_EXT 0x8D56 +#define GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER 0x8CDC +#define GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER_EXT 0x8CDC +#define GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR 0x9633 +#define GL_FRAMEBUFFER_SRGB 0x8DB9 +#define GL_FRAMEBUFFER_UNDEFINED 0x8219 +#define GL_FRAMEBUFFER_UNSUPPORTED 0x8CDD +#define GL_FRAMEBUFFER_UNSUPPORTED_EXT 0x8CDD +#define GL_FRONT 0x0404 +#define GL_FRONT_AND_BACK 0x0408 +#define GL_FRONT_FACE 0x0B46 +#define GL_FRONT_LEFT 0x0400 +#define GL_FRONT_RIGHT 0x0401 +#define GL_FUNC_ADD 0x8006 +#define GL_FUNC_REVERSE_SUBTRACT 0x800B +#define GL_FUNC_SUBTRACT 0x800A +#define GL_GENERATE_MIPMAP 0x8191 +#define GL_GENERATE_MIPMAP_HINT 0x8192 +#define GL_GEOMETRY_INPUT_TYPE 0x8917 +#define GL_GEOMETRY_OUTPUT_TYPE 0x8918 +#define GL_GEOMETRY_SHADER 0x8DD9 +#define GL_GEOMETRY_VERTICES_OUT 0x8916 +#define GL_GEQUAL 0x0206 +#define GL_GREATER 0x0204 +#define GL_GREEN 0x1904 +#define GL_GREEN_BIAS 
0x0D19 +#define GL_GREEN_BITS 0x0D53 +#define GL_GREEN_INTEGER 0x8D95 +#define GL_GREEN_SCALE 0x0D18 +#define GL_HALF_FLOAT 0x140B +#define GL_HINT_BIT 0x00008000 +#define GL_INCR 0x1E02 +#define GL_INCR_WRAP 0x8507 +#define GL_INDEX 0x8222 +#define GL_INDEX_ARRAY 0x8077 +#define GL_INDEX_ARRAY_BUFFER_BINDING 0x8899 +#define GL_INDEX_ARRAY_POINTER 0x8091 +#define GL_INDEX_ARRAY_STRIDE 0x8086 +#define GL_INDEX_ARRAY_TYPE 0x8085 +#define GL_INDEX_BITS 0x0D51 +#define GL_INDEX_CLEAR_VALUE 0x0C20 +#define GL_INDEX_LOGIC_OP 0x0BF1 +#define GL_INDEX_MODE 0x0C30 +#define GL_INDEX_OFFSET 0x0D13 +#define GL_INDEX_SHIFT 0x0D12 +#define GL_INDEX_WRITEMASK 0x0C21 +#define GL_INFO_LOG_LENGTH 0x8B84 +#define GL_INT 0x1404 +#define GL_INTENSITY 0x8049 +#define GL_INTENSITY12 0x804C +#define GL_INTENSITY16 0x804D +#define GL_INTENSITY4 0x804A +#define GL_INTENSITY8 0x804B +#define GL_INTERLEAVED_ATTRIBS 0x8C8C +#define GL_INTERPOLATE 0x8575 +#define GL_INT_2_10_10_10_REV 0x8D9F +#define GL_INT_SAMPLER_1D 0x8DC9 +#define GL_INT_SAMPLER_1D_ARRAY 0x8DCE +#define GL_INT_SAMPLER_2D 0x8DCA +#define GL_INT_SAMPLER_2D_ARRAY 0x8DCF +#define GL_INT_SAMPLER_2D_MULTISAMPLE 0x9109 +#define GL_INT_SAMPLER_2D_MULTISAMPLE_ARRAY 0x910C +#define GL_INT_SAMPLER_2D_RECT 0x8DCD +#define GL_INT_SAMPLER_3D 0x8DCB +#define GL_INT_SAMPLER_BUFFER 0x8DD0 +#define GL_INT_SAMPLER_CUBE 0x8DCC +#define GL_INT_VEC2 0x8B53 +#define GL_INT_VEC3 0x8B54 +#define GL_INT_VEC4 0x8B55 +#define GL_INVALID_ENUM 0x0500 +#define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506 +#define GL_INVALID_FRAMEBUFFER_OPERATION_EXT 0x0506 +#define GL_INVALID_INDEX 0xFFFFFFFF +#define GL_INVALID_OPERATION 0x0502 +#define GL_INVALID_VALUE 0x0501 +#define GL_INVERT 0x150A +#define GL_KEEP 0x1E00 +#define GL_LAST_VERTEX_CONVENTION 0x8E4E +#define GL_LEFT 0x0406 +#define GL_LEQUAL 0x0203 +#define GL_LESS 0x0201 +#define GL_LIGHT0 0x4000 +#define GL_LIGHT1 0x4001 +#define GL_LIGHT2 0x4002 +#define GL_LIGHT3 0x4003 +#define GL_LIGHT4 0x4004 +#define GL_LIGHT5 0x4005 +#define GL_LIGHT6 0x4006 +#define GL_LIGHT7 0x4007 +#define GL_LIGHTING 0x0B50 +#define GL_LIGHTING_BIT 0x00000040 +#define GL_LIGHT_MODEL_AMBIENT 0x0B53 +#define GL_LIGHT_MODEL_COLOR_CONTROL 0x81F8 +#define GL_LIGHT_MODEL_LOCAL_VIEWER 0x0B51 +#define GL_LIGHT_MODEL_TWO_SIDE 0x0B52 +#define GL_LINE 0x1B01 +#define GL_LINEAR 0x2601 +#define GL_LINEAR_ATTENUATION 0x1208 +#define GL_LINEAR_MIPMAP_LINEAR 0x2703 +#define GL_LINEAR_MIPMAP_NEAREST 0x2701 +#define GL_LINES 0x0001 +#define GL_LINES_ADJACENCY 0x000A +#define GL_LINE_BIT 0x00000004 +#define GL_LINE_LOOP 0x0002 +#define GL_LINE_RESET_TOKEN 0x0707 +#define GL_LINE_SMOOTH 0x0B20 +#define GL_LINE_SMOOTH_HINT 0x0C52 +#define GL_LINE_STIPPLE 0x0B24 +#define GL_LINE_STIPPLE_PATTERN 0x0B25 +#define GL_LINE_STIPPLE_REPEAT 0x0B26 +#define GL_LINE_STRIP 0x0003 +#define GL_LINE_STRIP_ADJACENCY 0x000B +#define GL_LINE_TOKEN 0x0702 +#define GL_LINE_WIDTH 0x0B21 +#define GL_LINE_WIDTH_GRANULARITY 0x0B23 +#define GL_LINE_WIDTH_RANGE 0x0B22 +#define GL_LINK_STATUS 0x8B82 +#define GL_LIST_BASE 0x0B32 +#define GL_LIST_BIT 0x00020000 +#define GL_LIST_INDEX 0x0B33 +#define GL_LIST_MODE 0x0B30 +#define GL_LOAD 0x0101 +#define GL_LOGIC_OP 0x0BF1 +#define GL_LOGIC_OP_MODE 0x0BF0 +#define GL_LOWER_LEFT 0x8CA1 +#define GL_LUMINANCE 0x1909 +#define GL_LUMINANCE12 0x8041 +#define GL_LUMINANCE12_ALPHA12 0x8047 +#define GL_LUMINANCE12_ALPHA4 0x8046 +#define GL_LUMINANCE16 0x8042 +#define GL_LUMINANCE16_ALPHA16 0x8048 +#define GL_LUMINANCE4 0x803F +#define GL_LUMINANCE4_ALPHA4 0x8043 
+#define GL_LUMINANCE6_ALPHA2 0x8044 +#define GL_LUMINANCE8 0x8040 +#define GL_LUMINANCE8_ALPHA8 0x8045 +#define GL_LUMINANCE_ALPHA 0x190A +#define GL_MAJOR_VERSION 0x821B +#define GL_MAP1_COLOR_4 0x0D90 +#define GL_MAP1_GRID_DOMAIN 0x0DD0 +#define GL_MAP1_GRID_SEGMENTS 0x0DD1 +#define GL_MAP1_INDEX 0x0D91 +#define GL_MAP1_NORMAL 0x0D92 +#define GL_MAP1_TEXTURE_COORD_1 0x0D93 +#define GL_MAP1_TEXTURE_COORD_2 0x0D94 +#define GL_MAP1_TEXTURE_COORD_3 0x0D95 +#define GL_MAP1_TEXTURE_COORD_4 0x0D96 +#define GL_MAP1_VERTEX_3 0x0D97 +#define GL_MAP1_VERTEX_4 0x0D98 +#define GL_MAP2_COLOR_4 0x0DB0 +#define GL_MAP2_GRID_DOMAIN 0x0DD2 +#define GL_MAP2_GRID_SEGMENTS 0x0DD3 +#define GL_MAP2_INDEX 0x0DB1 +#define GL_MAP2_NORMAL 0x0DB2 +#define GL_MAP2_TEXTURE_COORD_1 0x0DB3 +#define GL_MAP2_TEXTURE_COORD_2 0x0DB4 +#define GL_MAP2_TEXTURE_COORD_3 0x0DB5 +#define GL_MAP2_TEXTURE_COORD_4 0x0DB6 +#define GL_MAP2_VERTEX_3 0x0DB7 +#define GL_MAP2_VERTEX_4 0x0DB8 +#define GL_MAP_COLOR 0x0D10 +#define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010 +#define GL_MAP_INVALIDATE_BUFFER_BIT 0x0008 +#define GL_MAP_INVALIDATE_RANGE_BIT 0x0004 +#define GL_MAP_READ_BIT 0x0001 +#define GL_MAP_STENCIL 0x0D11 +#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020 +#define GL_MAP_WRITE_BIT 0x0002 +#define GL_MATRIX_MODE 0x0BA0 +#define GL_MAX 0x8008 +#define GL_MAX_3D_TEXTURE_SIZE 0x8073 +#define GL_MAX_ARRAY_TEXTURE_LAYERS 0x88FF +#define GL_MAX_ATTRIB_STACK_DEPTH 0x0D35 +#define GL_MAX_CLIENT_ATTRIB_STACK_DEPTH 0x0D3B +#define GL_MAX_CLIP_DISTANCES 0x0D32 +#define GL_MAX_CLIP_PLANES 0x0D32 +#define GL_MAX_COLOR_ATTACHMENTS 0x8CDF +#define GL_MAX_COLOR_ATTACHMENTS_EXT 0x8CDF +#define GL_MAX_COLOR_TEXTURE_SAMPLES 0x910E +#define GL_MAX_COMBINED_FRAGMENT_UNIFORM_COMPONENTS 0x8A33 +#define GL_MAX_COMBINED_GEOMETRY_UNIFORM_COMPONENTS 0x8A32 +#define GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS 0x8B4D +#define GL_MAX_COMBINED_UNIFORM_BLOCKS 0x8A2E +#define GL_MAX_COMBINED_VERTEX_UNIFORM_COMPONENTS 0x8A31 +#define GL_MAX_CUBE_MAP_TEXTURE_SIZE 0x851C +#define GL_MAX_DEBUG_LOGGED_MESSAGES_ARB 0x9144 +#define GL_MAX_DEBUG_MESSAGE_LENGTH_ARB 0x9143 +#define GL_MAX_DEPTH_TEXTURE_SAMPLES 0x910F +#define GL_MAX_DRAW_BUFFERS 0x8824 +#define GL_MAX_DUAL_SOURCE_DRAW_BUFFERS 0x88FC +#define GL_MAX_ELEMENTS_INDICES 0x80E9 +#define GL_MAX_ELEMENTS_VERTICES 0x80E8 +#define GL_MAX_EVAL_ORDER 0x0D30 +#define GL_MAX_FRAGMENT_INPUT_COMPONENTS 0x9125 +#define GL_MAX_FRAGMENT_UNIFORM_BLOCKS 0x8A2D +#define GL_MAX_FRAGMENT_UNIFORM_COMPONENTS 0x8B49 +#define GL_MAX_GEOMETRY_INPUT_COMPONENTS 0x9123 +#define GL_MAX_GEOMETRY_OUTPUT_COMPONENTS 0x9124 +#define GL_MAX_GEOMETRY_OUTPUT_VERTICES 0x8DE0 +#define GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS 0x8C29 +#define GL_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS 0x8DE1 +#define GL_MAX_GEOMETRY_UNIFORM_BLOCKS 0x8A2C +#define GL_MAX_GEOMETRY_UNIFORM_COMPONENTS 0x8DDF +#define GL_MAX_INTEGER_SAMPLES 0x9110 +#define GL_MAX_LIGHTS 0x0D31 +#define GL_MAX_LIST_NESTING 0x0B31 +#define GL_MAX_MODELVIEW_STACK_DEPTH 0x0D36 +#define GL_MAX_NAME_STACK_DEPTH 0x0D37 +#define GL_MAX_PIXEL_MAP_TABLE 0x0D34 +#define GL_MAX_PROGRAM_TEXEL_OFFSET 0x8905 +#define GL_MAX_PROJECTION_STACK_DEPTH 0x0D38 +#define GL_MAX_RECTANGLE_TEXTURE_SIZE 0x84F8 +#define GL_MAX_RENDERBUFFER_SIZE 0x84E8 +#define GL_MAX_RENDERBUFFER_SIZE_EXT 0x84E8 +#define GL_MAX_SAMPLES 0x8D57 +#define GL_MAX_SAMPLES_EXT 0x8D57 +#define GL_MAX_SAMPLE_MASK_WORDS 0x8E59 +#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111 +#define GL_MAX_TEXTURE_BUFFER_SIZE 0x8C2B +#define GL_MAX_TEXTURE_COORDS 0x8871 +#define 
GL_MAX_TEXTURE_IMAGE_UNITS 0x8872 +#define GL_MAX_TEXTURE_LOD_BIAS 0x84FD +#define GL_MAX_TEXTURE_SIZE 0x0D33 +#define GL_MAX_TEXTURE_STACK_DEPTH 0x0D39 +#define GL_MAX_TEXTURE_UNITS 0x84E2 +#define GL_MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS 0x8C8A +#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS 0x8C8B +#define GL_MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS 0x8C80 +#define GL_MAX_UNIFORM_BLOCK_SIZE 0x8A30 +#define GL_MAX_UNIFORM_BUFFER_BINDINGS 0x8A2F +#define GL_MAX_VARYING_COMPONENTS 0x8B4B +#define GL_MAX_VARYING_FLOATS 0x8B4B +#define GL_MAX_VERTEX_ATTRIBS 0x8869 +#define GL_MAX_VERTEX_OUTPUT_COMPONENTS 0x9122 +#define GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS 0x8B4C +#define GL_MAX_VERTEX_UNIFORM_BLOCKS 0x8A2B +#define GL_MAX_VERTEX_UNIFORM_COMPONENTS 0x8B4A +#define GL_MAX_VIEWPORT_DIMS 0x0D3A +#define GL_MAX_VIEWS_OVR 0x9631 +#define GL_MIN 0x8007 +#define GL_MINOR_VERSION 0x821C +#define GL_MIN_PROGRAM_TEXEL_OFFSET 0x8904 +#define GL_MIRRORED_REPEAT 0x8370 +#define GL_MODELVIEW 0x1700 +#define GL_MODELVIEW_MATRIX 0x0BA6 +#define GL_MODELVIEW_STACK_DEPTH 0x0BA3 +#define GL_MODULATE 0x2100 +#define GL_MULT 0x0103 +#define GL_MULTISAMPLE 0x809D +#define GL_MULTISAMPLE_BIT 0x20000000 +#define GL_N3F_V3F 0x2A25 +#define GL_NAME_STACK_DEPTH 0x0D70 +#define GL_NAND 0x150E +#define GL_NEAREST 0x2600 +#define GL_NEAREST_MIPMAP_LINEAR 0x2702 +#define GL_NEAREST_MIPMAP_NEAREST 0x2700 +#define GL_NEVER 0x0200 +#define GL_NICEST 0x1102 +#define GL_NONE 0 +#define GL_NOOP 0x1505 +#define GL_NOR 0x1508 +#define GL_NORMALIZE 0x0BA1 +#define GL_NORMAL_ARRAY 0x8075 +#define GL_NORMAL_ARRAY_BUFFER_BINDING 0x8897 +#define GL_NORMAL_ARRAY_POINTER 0x808F +#define GL_NORMAL_ARRAY_STRIDE 0x807F +#define GL_NORMAL_ARRAY_TYPE 0x807E +#define GL_NORMAL_MAP 0x8511 +#define GL_NOTEQUAL 0x0205 +#define GL_NO_ERROR 0 +#define GL_NUM_COMPRESSED_TEXTURE_FORMATS 0x86A2 +#define GL_NUM_EXTENSIONS 0x821D +#define GL_OBJECT_LINEAR 0x2401 +#define GL_OBJECT_PLANE 0x2501 +#define GL_OBJECT_TYPE 0x9112 +#define GL_ONE 1 +#define GL_ONE_MINUS_CONSTANT_ALPHA 0x8004 +#define GL_ONE_MINUS_CONSTANT_COLOR 0x8002 +#define GL_ONE_MINUS_DST_ALPHA 0x0305 +#define GL_ONE_MINUS_DST_COLOR 0x0307 +#define GL_ONE_MINUS_SRC1_ALPHA 0x88FB +#define GL_ONE_MINUS_SRC1_COLOR 0x88FA +#define GL_ONE_MINUS_SRC_ALPHA 0x0303 +#define GL_ONE_MINUS_SRC_COLOR 0x0301 +#define GL_OPERAND0_ALPHA 0x8598 +#define GL_OPERAND0_RGB 0x8590 +#define GL_OPERAND1_ALPHA 0x8599 +#define GL_OPERAND1_RGB 0x8591 +#define GL_OPERAND2_ALPHA 0x859A +#define GL_OPERAND2_RGB 0x8592 +#define GL_OR 0x1507 +#define GL_ORDER 0x0A01 +#define GL_OR_INVERTED 0x150D +#define GL_OR_REVERSE 0x150B +#define GL_OUT_OF_MEMORY 0x0505 +#define GL_PACK_ALIGNMENT 0x0D05 +#define GL_PACK_IMAGE_HEIGHT 0x806C +#define GL_PACK_LSB_FIRST 0x0D01 +#define GL_PACK_ROW_LENGTH 0x0D02 +#define GL_PACK_SKIP_IMAGES 0x806B +#define GL_PACK_SKIP_PIXELS 0x0D04 +#define GL_PACK_SKIP_ROWS 0x0D03 +#define GL_PACK_SWAP_BYTES 0x0D00 +#define GL_PASS_THROUGH_TOKEN 0x0700 +#define GL_PERSPECTIVE_CORRECTION_HINT 0x0C50 +#define GL_PIXEL_MAP_A_TO_A 0x0C79 +#define GL_PIXEL_MAP_A_TO_A_SIZE 0x0CB9 +#define GL_PIXEL_MAP_B_TO_B 0x0C78 +#define GL_PIXEL_MAP_B_TO_B_SIZE 0x0CB8 +#define GL_PIXEL_MAP_G_TO_G 0x0C77 +#define GL_PIXEL_MAP_G_TO_G_SIZE 0x0CB7 +#define GL_PIXEL_MAP_I_TO_A 0x0C75 +#define GL_PIXEL_MAP_I_TO_A_SIZE 0x0CB5 +#define GL_PIXEL_MAP_I_TO_B 0x0C74 +#define GL_PIXEL_MAP_I_TO_B_SIZE 0x0CB4 +#define GL_PIXEL_MAP_I_TO_G 0x0C73 +#define GL_PIXEL_MAP_I_TO_G_SIZE 0x0CB3 +#define GL_PIXEL_MAP_I_TO_I 0x0C70 
+#define GL_PIXEL_MAP_I_TO_I_SIZE 0x0CB0 +#define GL_PIXEL_MAP_I_TO_R 0x0C72 +#define GL_PIXEL_MAP_I_TO_R_SIZE 0x0CB2 +#define GL_PIXEL_MAP_R_TO_R 0x0C76 +#define GL_PIXEL_MAP_R_TO_R_SIZE 0x0CB6 +#define GL_PIXEL_MAP_S_TO_S 0x0C71 +#define GL_PIXEL_MAP_S_TO_S_SIZE 0x0CB1 +#define GL_PIXEL_MODE_BIT 0x00000020 +#define GL_PIXEL_PACK_BUFFER 0x88EB +#define GL_PIXEL_PACK_BUFFER_BINDING 0x88ED +#define GL_PIXEL_UNPACK_BUFFER 0x88EC +#define GL_PIXEL_UNPACK_BUFFER_BINDING 0x88EF +#define GL_POINT 0x1B00 +#define GL_POINTS 0x0000 +#define GL_POINT_BIT 0x00000002 +#define GL_POINT_DISTANCE_ATTENUATION 0x8129 +#define GL_POINT_FADE_THRESHOLD_SIZE 0x8128 +#define GL_POINT_SIZE 0x0B11 +#define GL_POINT_SIZE_GRANULARITY 0x0B13 +#define GL_POINT_SIZE_MAX 0x8127 +#define GL_POINT_SIZE_MIN 0x8126 +#define GL_POINT_SIZE_RANGE 0x0B12 +#define GL_POINT_SMOOTH 0x0B10 +#define GL_POINT_SMOOTH_HINT 0x0C51 +#define GL_POINT_SPRITE 0x8861 +#define GL_POINT_SPRITE_COORD_ORIGIN 0x8CA0 +#define GL_POINT_TOKEN 0x0701 +#define GL_POLYGON 0x0009 +#define GL_POLYGON_BIT 0x00000008 +#define GL_POLYGON_MODE 0x0B40 +#define GL_POLYGON_OFFSET_FACTOR 0x8038 +#define GL_POLYGON_OFFSET_FILL 0x8037 +#define GL_POLYGON_OFFSET_LINE 0x2A02 +#define GL_POLYGON_OFFSET_POINT 0x2A01 +#define GL_POLYGON_OFFSET_UNITS 0x2A00 +#define GL_POLYGON_SMOOTH 0x0B41 +#define GL_POLYGON_SMOOTH_HINT 0x0C53 +#define GL_POLYGON_STIPPLE 0x0B42 +#define GL_POLYGON_STIPPLE_BIT 0x00000010 +#define GL_POLYGON_TOKEN 0x0703 +#define GL_POSITION 0x1203 +#define GL_PREVIOUS 0x8578 +#define GL_PRIMARY_COLOR 0x8577 +#define GL_PRIMITIVES_GENERATED 0x8C87 +#define GL_PRIMITIVE_RESTART 0x8F9D +#define GL_PRIMITIVE_RESTART_INDEX 0x8F9E +#define GL_PROGRAM_POINT_SIZE 0x8642 +#define GL_PROJECTION 0x1701 +#define GL_PROJECTION_MATRIX 0x0BA7 +#define GL_PROJECTION_STACK_DEPTH 0x0BA4 +#define GL_PROVOKING_VERTEX 0x8E4F +#define GL_PROXY_TEXTURE_1D 0x8063 +#define GL_PROXY_TEXTURE_1D_ARRAY 0x8C19 +#define GL_PROXY_TEXTURE_2D 0x8064 +#define GL_PROXY_TEXTURE_2D_ARRAY 0x8C1B +#define GL_PROXY_TEXTURE_2D_MULTISAMPLE 0x9101 +#define GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY 0x9103 +#define GL_PROXY_TEXTURE_3D 0x8070 +#define GL_PROXY_TEXTURE_CUBE_MAP 0x851B +#define GL_PROXY_TEXTURE_RECTANGLE 0x84F7 +#define GL_Q 0x2003 +#define GL_QUADRATIC_ATTENUATION 0x1209 +#define GL_QUADS 0x0007 +#define GL_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION 0x8E4C +#define GL_QUAD_STRIP 0x0008 +#define GL_QUERY_BY_REGION_NO_WAIT 0x8E16 +#define GL_QUERY_BY_REGION_WAIT 0x8E15 +#define GL_QUERY_COUNTER_BITS 0x8864 +#define GL_QUERY_NO_WAIT 0x8E14 +#define GL_QUERY_RESULT 0x8866 +#define GL_QUERY_RESULT_AVAILABLE 0x8867 +#define GL_QUERY_WAIT 0x8E13 +#define GL_R 0x2002 +#define GL_R11F_G11F_B10F 0x8C3A +#define GL_R16 0x822A +#define GL_R16F 0x822D +#define GL_R16I 0x8233 +#define GL_R16UI 0x8234 +#define GL_R16_SNORM 0x8F98 +#define GL_R32F 0x822E +#define GL_R32I 0x8235 +#define GL_R32UI 0x8236 +#define GL_R3_G3_B2 0x2A10 +#define GL_R8 0x8229 +#define GL_R8I 0x8231 +#define GL_R8UI 0x8232 +#define GL_R8_SNORM 0x8F94 +#define GL_RASTERIZER_DISCARD 0x8C89 +#define GL_READ_BUFFER 0x0C02 +#define GL_READ_FRAMEBUFFER 0x8CA8 +#define GL_READ_FRAMEBUFFER_BINDING 0x8CAA +#define GL_READ_FRAMEBUFFER_BINDING_EXT 0x8CAA +#define GL_READ_FRAMEBUFFER_EXT 0x8CA8 +#define GL_READ_ONLY 0x88B8 +#define GL_READ_WRITE 0x88BA +#define GL_RED 0x1903 +#define GL_RED_BIAS 0x0D15 +#define GL_RED_BITS 0x0D52 +#define GL_RED_INTEGER 0x8D94 +#define GL_RED_SCALE 0x0D14 +#define GL_REFLECTION_MAP 0x8512 +#define GL_RENDER 
0x1C00 +#define GL_RENDERBUFFER 0x8D41 +#define GL_RENDERBUFFER_ALPHA_SIZE 0x8D53 +#define GL_RENDERBUFFER_ALPHA_SIZE_EXT 0x8D53 +#define GL_RENDERBUFFER_BINDING 0x8CA7 +#define GL_RENDERBUFFER_BINDING_EXT 0x8CA7 +#define GL_RENDERBUFFER_BLUE_SIZE 0x8D52 +#define GL_RENDERBUFFER_BLUE_SIZE_EXT 0x8D52 +#define GL_RENDERBUFFER_DEPTH_SIZE 0x8D54 +#define GL_RENDERBUFFER_DEPTH_SIZE_EXT 0x8D54 +#define GL_RENDERBUFFER_EXT 0x8D41 +#define GL_RENDERBUFFER_GREEN_SIZE 0x8D51 +#define GL_RENDERBUFFER_GREEN_SIZE_EXT 0x8D51 +#define GL_RENDERBUFFER_HEIGHT 0x8D43 +#define GL_RENDERBUFFER_HEIGHT_EXT 0x8D43 +#define GL_RENDERBUFFER_INTERNAL_FORMAT 0x8D44 +#define GL_RENDERBUFFER_INTERNAL_FORMAT_EXT 0x8D44 +#define GL_RENDERBUFFER_RED_SIZE 0x8D50 +#define GL_RENDERBUFFER_RED_SIZE_EXT 0x8D50 +#define GL_RENDERBUFFER_SAMPLES 0x8CAB +#define GL_RENDERBUFFER_SAMPLES_EXT 0x8CAB +#define GL_RENDERBUFFER_STENCIL_SIZE 0x8D55 +#define GL_RENDERBUFFER_STENCIL_SIZE_EXT 0x8D55 +#define GL_RENDERBUFFER_WIDTH 0x8D42 +#define GL_RENDERBUFFER_WIDTH_EXT 0x8D42 +#define GL_RENDERER 0x1F01 +#define GL_RENDER_MODE 0x0C40 +#define GL_REPEAT 0x2901 +#define GL_REPLACE 0x1E01 +#define GL_RESCALE_NORMAL 0x803A +#define GL_RETURN 0x0102 +#define GL_RG 0x8227 +#define GL_RG16 0x822C +#define GL_RG16F 0x822F +#define GL_RG16I 0x8239 +#define GL_RG16UI 0x823A +#define GL_RG16_SNORM 0x8F99 +#define GL_RG32F 0x8230 +#define GL_RG32I 0x823B +#define GL_RG32UI 0x823C +#define GL_RG8 0x822B +#define GL_RG8I 0x8237 +#define GL_RG8UI 0x8238 +#define GL_RG8_SNORM 0x8F95 +#define GL_RGB 0x1907 +#define GL_RGB10 0x8052 +#define GL_RGB10_A2 0x8059 +#define GL_RGB10_A2UI 0x906F +#define GL_RGB12 0x8053 +#define GL_RGB16 0x8054 +#define GL_RGB16F 0x881B +#define GL_RGB16I 0x8D89 +#define GL_RGB16UI 0x8D77 +#define GL_RGB16_SNORM 0x8F9A +#define GL_RGB32F 0x8815 +#define GL_RGB32I 0x8D83 +#define GL_RGB32UI 0x8D71 +#define GL_RGB4 0x804F +#define GL_RGB5 0x8050 +#define GL_RGB5_A1 0x8057 +#define GL_RGB8 0x8051 +#define GL_RGB8I 0x8D8F +#define GL_RGB8UI 0x8D7D +#define GL_RGB8_SNORM 0x8F96 +#define GL_RGB9_E5 0x8C3D +#define GL_RGBA 0x1908 +#define GL_RGBA12 0x805A +#define GL_RGBA16 0x805B +#define GL_RGBA16F 0x881A +#define GL_RGBA16I 0x8D88 +#define GL_RGBA16UI 0x8D76 +#define GL_RGBA16_SNORM 0x8F9B +#define GL_RGBA2 0x8055 +#define GL_RGBA32F 0x8814 +#define GL_RGBA32I 0x8D82 +#define GL_RGBA32UI 0x8D70 +#define GL_RGBA4 0x8056 +#define GL_RGBA8 0x8058 +#define GL_RGBA8I 0x8D8E +#define GL_RGBA8UI 0x8D7C +#define GL_RGBA8_SNORM 0x8F97 +#define GL_RGBA_INTEGER 0x8D99 +#define GL_RGBA_MODE 0x0C31 +#define GL_RGB_INTEGER 0x8D98 +#define GL_RGB_SCALE 0x8573 +#define GL_RG_INTEGER 0x8228 +#define GL_RIGHT 0x0407 +#define GL_S 0x2000 +#define GL_SAMPLER_1D 0x8B5D +#define GL_SAMPLER_1D_ARRAY 0x8DC0 +#define GL_SAMPLER_1D_ARRAY_SHADOW 0x8DC3 +#define GL_SAMPLER_1D_SHADOW 0x8B61 +#define GL_SAMPLER_2D 0x8B5E +#define GL_SAMPLER_2D_ARRAY 0x8DC1 +#define GL_SAMPLER_2D_ARRAY_SHADOW 0x8DC4 +#define GL_SAMPLER_2D_MULTISAMPLE 0x9108 +#define GL_SAMPLER_2D_MULTISAMPLE_ARRAY 0x910B +#define GL_SAMPLER_2D_RECT 0x8B63 +#define GL_SAMPLER_2D_RECT_SHADOW 0x8B64 +#define GL_SAMPLER_2D_SHADOW 0x8B62 +#define GL_SAMPLER_3D 0x8B5F +#define GL_SAMPLER_BINDING 0x8919 +#define GL_SAMPLER_BUFFER 0x8DC2 +#define GL_SAMPLER_CUBE 0x8B60 +#define GL_SAMPLER_CUBE_SHADOW 0x8DC5 +#define GL_SAMPLES 0x80A9 +#define GL_SAMPLES_PASSED 0x8914 +#define GL_SAMPLE_ALPHA_TO_COVERAGE 0x809E +#define GL_SAMPLE_ALPHA_TO_ONE 0x809F +#define GL_SAMPLE_BUFFERS 0x80A8 +#define 
GL_SAMPLE_COVERAGE 0x80A0 +#define GL_SAMPLE_COVERAGE_INVERT 0x80AB +#define GL_SAMPLE_COVERAGE_VALUE 0x80AA +#define GL_SAMPLE_MASK 0x8E51 +#define GL_SAMPLE_MASK_VALUE 0x8E52 +#define GL_SAMPLE_POSITION 0x8E50 +#define GL_SCISSOR_BIT 0x00080000 +#define GL_SCISSOR_BOX 0x0C10 +#define GL_SCISSOR_TEST 0x0C11 +#define GL_SECONDARY_COLOR_ARRAY 0x845E +#define GL_SECONDARY_COLOR_ARRAY_BUFFER_BINDING 0x889C +#define GL_SECONDARY_COLOR_ARRAY_POINTER 0x845D +#define GL_SECONDARY_COLOR_ARRAY_SIZE 0x845A +#define GL_SECONDARY_COLOR_ARRAY_STRIDE 0x845C +#define GL_SECONDARY_COLOR_ARRAY_TYPE 0x845B +#define GL_SELECT 0x1C02 +#define GL_SELECTION_BUFFER_POINTER 0x0DF3 +#define GL_SELECTION_BUFFER_SIZE 0x0DF4 +#define GL_SEPARATE_ATTRIBS 0x8C8D +#define GL_SEPARATE_SPECULAR_COLOR 0x81FA +#define GL_SET 0x150F +#define GL_SHADER_SOURCE_LENGTH 0x8B88 +#define GL_SHADER_TYPE 0x8B4F +#define GL_SHADE_MODEL 0x0B54 +#define GL_SHADING_LANGUAGE_VERSION 0x8B8C +#define GL_SHININESS 0x1601 +#define GL_SHORT 0x1402 +#define GL_SIGNALED 0x9119 +#define GL_SIGNED_NORMALIZED 0x8F9C +#define GL_SINGLE_COLOR 0x81F9 +#define GL_SLUMINANCE 0x8C46 +#define GL_SLUMINANCE8 0x8C47 +#define GL_SLUMINANCE8_ALPHA8 0x8C45 +#define GL_SLUMINANCE_ALPHA 0x8C44 +#define GL_SMOOTH 0x1D01 +#define GL_SMOOTH_LINE_WIDTH_GRANULARITY 0x0B23 +#define GL_SMOOTH_LINE_WIDTH_RANGE 0x0B22 +#define GL_SMOOTH_POINT_SIZE_GRANULARITY 0x0B13 +#define GL_SMOOTH_POINT_SIZE_RANGE 0x0B12 +#define GL_SOURCE0_ALPHA 0x8588 +#define GL_SOURCE0_RGB 0x8580 +#define GL_SOURCE1_ALPHA 0x8589 +#define GL_SOURCE1_RGB 0x8581 +#define GL_SOURCE2_ALPHA 0x858A +#define GL_SOURCE2_RGB 0x8582 +#define GL_SPECULAR 0x1202 +#define GL_SPHERE_MAP 0x2402 +#define GL_SPOT_CUTOFF 0x1206 +#define GL_SPOT_DIRECTION 0x1204 +#define GL_SPOT_EXPONENT 0x1205 +#define GL_SRC0_ALPHA 0x8588 +#define GL_SRC0_RGB 0x8580 +#define GL_SRC1_ALPHA 0x8589 +#define GL_SRC1_COLOR 0x88F9 +#define GL_SRC1_RGB 0x8581 +#define GL_SRC2_ALPHA 0x858A +#define GL_SRC2_RGB 0x8582 +#define GL_SRC_ALPHA 0x0302 +#define GL_SRC_ALPHA_SATURATE 0x0308 +#define GL_SRC_COLOR 0x0300 +#define GL_SRGB 0x8C40 +#define GL_SRGB8 0x8C41 +#define GL_SRGB8_ALPHA8 0x8C43 +#define GL_SRGB_ALPHA 0x8C42 +#define GL_STACK_OVERFLOW 0x0503 +#define GL_STACK_UNDERFLOW 0x0504 +#define GL_STATIC_COPY 0x88E6 +#define GL_STATIC_DRAW 0x88E4 +#define GL_STATIC_READ 0x88E5 +#define GL_STENCIL 0x1802 +#define GL_STENCIL_ATTACHMENT 0x8D20 +#define GL_STENCIL_ATTACHMENT_EXT 0x8D20 +#define GL_STENCIL_BACK_FAIL 0x8801 +#define GL_STENCIL_BACK_FUNC 0x8800 +#define GL_STENCIL_BACK_PASS_DEPTH_FAIL 0x8802 +#define GL_STENCIL_BACK_PASS_DEPTH_PASS 0x8803 +#define GL_STENCIL_BACK_REF 0x8CA3 +#define GL_STENCIL_BACK_VALUE_MASK 0x8CA4 +#define GL_STENCIL_BACK_WRITEMASK 0x8CA5 +#define GL_STENCIL_BITS 0x0D57 +#define GL_STENCIL_BUFFER_BIT 0x00000400 +#define GL_STENCIL_CLEAR_VALUE 0x0B91 +#define GL_STENCIL_FAIL 0x0B94 +#define GL_STENCIL_FUNC 0x0B92 +#define GL_STENCIL_INDEX 0x1901 +#define GL_STENCIL_INDEX1 0x8D46 +#define GL_STENCIL_INDEX16 0x8D49 +#define GL_STENCIL_INDEX16_EXT 0x8D49 +#define GL_STENCIL_INDEX1_EXT 0x8D46 +#define GL_STENCIL_INDEX4 0x8D47 +#define GL_STENCIL_INDEX4_EXT 0x8D47 +#define GL_STENCIL_INDEX8 0x8D48 +#define GL_STENCIL_INDEX8_EXT 0x8D48 +#define GL_STENCIL_PASS_DEPTH_FAIL 0x0B95 +#define GL_STENCIL_PASS_DEPTH_PASS 0x0B96 +#define GL_STENCIL_REF 0x0B97 +#define GL_STENCIL_TEST 0x0B90 +#define GL_STENCIL_VALUE_MASK 0x0B93 +#define GL_STENCIL_WRITEMASK 0x0B98 +#define GL_STEREO 0x0C33 +#define GL_STREAM_COPY 0x88E2 
+#define GL_STREAM_DRAW 0x88E0 +#define GL_STREAM_READ 0x88E1 +#define GL_SUBPIXEL_BITS 0x0D50 +#define GL_SUBTRACT 0x84E7 +#define GL_SYNC_CONDITION 0x9113 +#define GL_SYNC_FENCE 0x9116 +#define GL_SYNC_FLAGS 0x9115 +#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001 +#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117 +#define GL_SYNC_STATUS 0x9114 +#define GL_T 0x2001 +#define GL_T2F_C3F_V3F 0x2A2A +#define GL_T2F_C4F_N3F_V3F 0x2A2C +#define GL_T2F_C4UB_V3F 0x2A29 +#define GL_T2F_N3F_V3F 0x2A2B +#define GL_T2F_V3F 0x2A27 +#define GL_T4F_C4F_N3F_V4F 0x2A2D +#define GL_T4F_V4F 0x2A28 +#define GL_TEXTURE 0x1702 +#define GL_TEXTURE0 0x84C0 +#define GL_TEXTURE1 0x84C1 +#define GL_TEXTURE10 0x84CA +#define GL_TEXTURE11 0x84CB +#define GL_TEXTURE12 0x84CC +#define GL_TEXTURE13 0x84CD +#define GL_TEXTURE14 0x84CE +#define GL_TEXTURE15 0x84CF +#define GL_TEXTURE16 0x84D0 +#define GL_TEXTURE17 0x84D1 +#define GL_TEXTURE18 0x84D2 +#define GL_TEXTURE19 0x84D3 +#define GL_TEXTURE2 0x84C2 +#define GL_TEXTURE20 0x84D4 +#define GL_TEXTURE21 0x84D5 +#define GL_TEXTURE22 0x84D6 +#define GL_TEXTURE23 0x84D7 +#define GL_TEXTURE24 0x84D8 +#define GL_TEXTURE25 0x84D9 +#define GL_TEXTURE26 0x84DA +#define GL_TEXTURE27 0x84DB +#define GL_TEXTURE28 0x84DC +#define GL_TEXTURE29 0x84DD +#define GL_TEXTURE3 0x84C3 +#define GL_TEXTURE30 0x84DE +#define GL_TEXTURE31 0x84DF +#define GL_TEXTURE4 0x84C4 +#define GL_TEXTURE5 0x84C5 +#define GL_TEXTURE6 0x84C6 +#define GL_TEXTURE7 0x84C7 +#define GL_TEXTURE8 0x84C8 +#define GL_TEXTURE9 0x84C9 +#define GL_TEXTURE_1D 0x0DE0 +#define GL_TEXTURE_1D_ARRAY 0x8C18 +#define GL_TEXTURE_2D 0x0DE1 +#define GL_TEXTURE_2D_ARRAY 0x8C1A +#define GL_TEXTURE_2D_MULTISAMPLE 0x9100 +#define GL_TEXTURE_2D_MULTISAMPLE_ARRAY 0x9102 +#define GL_TEXTURE_3D 0x806F +#define GL_TEXTURE_ALPHA_SIZE 0x805F +#define GL_TEXTURE_ALPHA_TYPE 0x8C13 +#define GL_TEXTURE_BASE_LEVEL 0x813C +#define GL_TEXTURE_BINDING_1D 0x8068 +#define GL_TEXTURE_BINDING_1D_ARRAY 0x8C1C +#define GL_TEXTURE_BINDING_2D 0x8069 +#define GL_TEXTURE_BINDING_2D_ARRAY 0x8C1D +#define GL_TEXTURE_BINDING_2D_MULTISAMPLE 0x9104 +#define GL_TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY 0x9105 +#define GL_TEXTURE_BINDING_3D 0x806A +#define GL_TEXTURE_BINDING_BUFFER 0x8C2C +#define GL_TEXTURE_BINDING_CUBE_MAP 0x8514 +#define GL_TEXTURE_BINDING_RECTANGLE 0x84F6 +#define GL_TEXTURE_BIT 0x00040000 +#define GL_TEXTURE_BLUE_SIZE 0x805E +#define GL_TEXTURE_BLUE_TYPE 0x8C12 +#define GL_TEXTURE_BORDER 0x1005 +#define GL_TEXTURE_BORDER_COLOR 0x1004 +#define GL_TEXTURE_BUFFER 0x8C2A +#define GL_TEXTURE_BUFFER_DATA_STORE_BINDING 0x8C2D +#define GL_TEXTURE_COMPARE_FUNC 0x884D +#define GL_TEXTURE_COMPARE_MODE 0x884C +#define GL_TEXTURE_COMPONENTS 0x1003 +#define GL_TEXTURE_COMPRESSED 0x86A1 +#define GL_TEXTURE_COMPRESSED_IMAGE_SIZE 0x86A0 +#define GL_TEXTURE_COMPRESSION_HINT 0x84EF +#define GL_TEXTURE_COORD_ARRAY 0x8078 +#define GL_TEXTURE_COORD_ARRAY_BUFFER_BINDING 0x889A +#define GL_TEXTURE_COORD_ARRAY_POINTER 0x8092 +#define GL_TEXTURE_COORD_ARRAY_SIZE 0x8088 +#define GL_TEXTURE_COORD_ARRAY_STRIDE 0x808A +#define GL_TEXTURE_COORD_ARRAY_TYPE 0x8089 +#define GL_TEXTURE_CUBE_MAP 0x8513 +#define GL_TEXTURE_CUBE_MAP_NEGATIVE_X 0x8516 +#define GL_TEXTURE_CUBE_MAP_NEGATIVE_Y 0x8518 +#define GL_TEXTURE_CUBE_MAP_NEGATIVE_Z 0x851A +#define GL_TEXTURE_CUBE_MAP_POSITIVE_X 0x8515 +#define GL_TEXTURE_CUBE_MAP_POSITIVE_Y 0x8517 +#define GL_TEXTURE_CUBE_MAP_POSITIVE_Z 0x8519 +#define GL_TEXTURE_CUBE_MAP_SEAMLESS 0x884F +#define GL_TEXTURE_DEPTH 0x8071 +#define GL_TEXTURE_DEPTH_SIZE 
0x884A +#define GL_TEXTURE_DEPTH_TYPE 0x8C16 +#define GL_TEXTURE_ENV 0x2300 +#define GL_TEXTURE_ENV_COLOR 0x2201 +#define GL_TEXTURE_ENV_MODE 0x2200 +#define GL_TEXTURE_FILTER_CONTROL 0x8500 +#define GL_TEXTURE_FIXED_SAMPLE_LOCATIONS 0x9107 +#define GL_TEXTURE_GEN_MODE 0x2500 +#define GL_TEXTURE_GEN_Q 0x0C63 +#define GL_TEXTURE_GEN_R 0x0C62 +#define GL_TEXTURE_GEN_S 0x0C60 +#define GL_TEXTURE_GEN_T 0x0C61 +#define GL_TEXTURE_GREEN_SIZE 0x805D +#define GL_TEXTURE_GREEN_TYPE 0x8C11 +#define GL_TEXTURE_HEIGHT 0x1001 +#define GL_TEXTURE_INTENSITY_SIZE 0x8061 +#define GL_TEXTURE_INTENSITY_TYPE 0x8C15 +#define GL_TEXTURE_INTERNAL_FORMAT 0x1003 +#define GL_TEXTURE_LOD_BIAS 0x8501 +#define GL_TEXTURE_LUMINANCE_SIZE 0x8060 +#define GL_TEXTURE_LUMINANCE_TYPE 0x8C14 +#define GL_TEXTURE_MAG_FILTER 0x2800 +#define GL_TEXTURE_MATRIX 0x0BA8 +#define GL_TEXTURE_MAX_LEVEL 0x813D +#define GL_TEXTURE_MAX_LOD 0x813B +#define GL_TEXTURE_MIN_FILTER 0x2801 +#define GL_TEXTURE_MIN_LOD 0x813A +#define GL_TEXTURE_PRIORITY 0x8066 +#define GL_TEXTURE_RECTANGLE 0x84F5 +#define GL_TEXTURE_RED_SIZE 0x805C +#define GL_TEXTURE_RED_TYPE 0x8C10 +#define GL_TEXTURE_RESIDENT 0x8067 +#define GL_TEXTURE_SAMPLES 0x9106 +#define GL_TEXTURE_SHARED_SIZE 0x8C3F +#define GL_TEXTURE_STACK_DEPTH 0x0BA5 +#define GL_TEXTURE_STENCIL_SIZE 0x88F1 +#define GL_TEXTURE_SWIZZLE_A 0x8E45 +#define GL_TEXTURE_SWIZZLE_B 0x8E44 +#define GL_TEXTURE_SWIZZLE_G 0x8E43 +#define GL_TEXTURE_SWIZZLE_R 0x8E42 +#define GL_TEXTURE_SWIZZLE_RGBA 0x8E46 +#define GL_TEXTURE_WIDTH 0x1000 +#define GL_TEXTURE_WRAP_R 0x8072 +#define GL_TEXTURE_WRAP_S 0x2802 +#define GL_TEXTURE_WRAP_T 0x2803 +#define GL_TIMEOUT_EXPIRED 0x911B +#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFF +#define GL_TIMESTAMP 0x8E28 +#define GL_TIME_ELAPSED 0x88BF +#define GL_TRANSFORM_BIT 0x00001000 +#define GL_TRANSFORM_FEEDBACK_BUFFER 0x8C8E +#define GL_TRANSFORM_FEEDBACK_BUFFER_BINDING 0x8C8F +#define GL_TRANSFORM_FEEDBACK_BUFFER_MODE 0x8C7F +#define GL_TRANSFORM_FEEDBACK_BUFFER_SIZE 0x8C85 +#define GL_TRANSFORM_FEEDBACK_BUFFER_START 0x8C84 +#define GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN 0x8C88 +#define GL_TRANSFORM_FEEDBACK_VARYINGS 0x8C83 +#define GL_TRANSFORM_FEEDBACK_VARYING_MAX_LENGTH 0x8C76 +#define GL_TRANSPOSE_COLOR_MATRIX 0x84E6 +#define GL_TRANSPOSE_MODELVIEW_MATRIX 0x84E3 +#define GL_TRANSPOSE_PROJECTION_MATRIX 0x84E4 +#define GL_TRANSPOSE_TEXTURE_MATRIX 0x84E5 +#define GL_TRIANGLES 0x0004 +#define GL_TRIANGLES_ADJACENCY 0x000C +#define GL_TRIANGLE_FAN 0x0006 +#define GL_TRIANGLE_STRIP 0x0005 +#define GL_TRIANGLE_STRIP_ADJACENCY 0x000D +#define GL_TRUE 1 +#define GL_UNIFORM_ARRAY_STRIDE 0x8A3C +#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORMS 0x8A42 +#define GL_UNIFORM_BLOCK_ACTIVE_UNIFORM_INDICES 0x8A43 +#define GL_UNIFORM_BLOCK_BINDING 0x8A3F +#define GL_UNIFORM_BLOCK_DATA_SIZE 0x8A40 +#define GL_UNIFORM_BLOCK_INDEX 0x8A3A +#define GL_UNIFORM_BLOCK_NAME_LENGTH 0x8A41 +#define GL_UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER 0x8A46 +#define GL_UNIFORM_BLOCK_REFERENCED_BY_GEOMETRY_SHADER 0x8A45 +#define GL_UNIFORM_BLOCK_REFERENCED_BY_VERTEX_SHADER 0x8A44 +#define GL_UNIFORM_BUFFER 0x8A11 +#define GL_UNIFORM_BUFFER_BINDING 0x8A28 +#define GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT 0x8A34 +#define GL_UNIFORM_BUFFER_SIZE 0x8A2A +#define GL_UNIFORM_BUFFER_START 0x8A29 +#define GL_UNIFORM_IS_ROW_MAJOR 0x8A3E +#define GL_UNIFORM_MATRIX_STRIDE 0x8A3D +#define GL_UNIFORM_NAME_LENGTH 0x8A39 +#define GL_UNIFORM_OFFSET 0x8A3B +#define GL_UNIFORM_SIZE 0x8A38 +#define GL_UNIFORM_TYPE 0x8A37 +#define 
GL_UNPACK_ALIGNMENT 0x0CF5 +#define GL_UNPACK_IMAGE_HEIGHT 0x806E +#define GL_UNPACK_LSB_FIRST 0x0CF1 +#define GL_UNPACK_ROW_LENGTH 0x0CF2 +#define GL_UNPACK_SKIP_IMAGES 0x806D +#define GL_UNPACK_SKIP_PIXELS 0x0CF4 +#define GL_UNPACK_SKIP_ROWS 0x0CF3 +#define GL_UNPACK_SWAP_BYTES 0x0CF0 +#define GL_UNSIGNALED 0x9118 +#define GL_UNSIGNED_BYTE 0x1401 +#define GL_UNSIGNED_BYTE_2_3_3_REV 0x8362 +#define GL_UNSIGNED_BYTE_3_3_2 0x8032 +#define GL_UNSIGNED_INT 0x1405 +#define GL_UNSIGNED_INT_10F_11F_11F_REV 0x8C3B +#define GL_UNSIGNED_INT_10_10_10_2 0x8036 +#define GL_UNSIGNED_INT_24_8 0x84FA +#define GL_UNSIGNED_INT_2_10_10_10_REV 0x8368 +#define GL_UNSIGNED_INT_5_9_9_9_REV 0x8C3E +#define GL_UNSIGNED_INT_8_8_8_8 0x8035 +#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367 +#define GL_UNSIGNED_INT_SAMPLER_1D 0x8DD1 +#define GL_UNSIGNED_INT_SAMPLER_1D_ARRAY 0x8DD6 +#define GL_UNSIGNED_INT_SAMPLER_2D 0x8DD2 +#define GL_UNSIGNED_INT_SAMPLER_2D_ARRAY 0x8DD7 +#define GL_UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE 0x910A +#define GL_UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE_ARRAY 0x910D +#define GL_UNSIGNED_INT_SAMPLER_2D_RECT 0x8DD5 +#define GL_UNSIGNED_INT_SAMPLER_3D 0x8DD3 +#define GL_UNSIGNED_INT_SAMPLER_BUFFER 0x8DD8 +#define GL_UNSIGNED_INT_SAMPLER_CUBE 0x8DD4 +#define GL_UNSIGNED_INT_VEC2 0x8DC6 +#define GL_UNSIGNED_INT_VEC3 0x8DC7 +#define GL_UNSIGNED_INT_VEC4 0x8DC8 +#define GL_UNSIGNED_NORMALIZED 0x8C17 +#define GL_UNSIGNED_SHORT 0x1403 +#define GL_UNSIGNED_SHORT_1_5_5_5_REV 0x8366 +#define GL_UNSIGNED_SHORT_4_4_4_4 0x8033 +#define GL_UNSIGNED_SHORT_4_4_4_4_REV 0x8365 +#define GL_UNSIGNED_SHORT_5_5_5_1 0x8034 +#define GL_UNSIGNED_SHORT_5_6_5 0x8363 +#define GL_UNSIGNED_SHORT_5_6_5_REV 0x8364 +#define GL_UPPER_LEFT 0x8CA2 +#define GL_V2F 0x2A20 +#define GL_V3F 0x2A21 +#define GL_VALIDATE_STATUS 0x8B83 +#define GL_VENDOR 0x1F00 +#define GL_VERSION 0x1F02 +#define GL_VERTEX_ARRAY 0x8074 +#define GL_VERTEX_ARRAY_BINDING 0x85B5 +#define GL_VERTEX_ARRAY_BUFFER_BINDING 0x8896 +#define GL_VERTEX_ARRAY_POINTER 0x808E +#define GL_VERTEX_ARRAY_SIZE 0x807A +#define GL_VERTEX_ARRAY_STRIDE 0x807C +#define GL_VERTEX_ARRAY_TYPE 0x807B +#define GL_VERTEX_ATTRIB_ARRAY_BUFFER_BINDING 0x889F +#define GL_VERTEX_ATTRIB_ARRAY_DIVISOR 0x88FE +#define GL_VERTEX_ATTRIB_ARRAY_ENABLED 0x8622 +#define GL_VERTEX_ATTRIB_ARRAY_INTEGER 0x88FD +#define GL_VERTEX_ATTRIB_ARRAY_NORMALIZED 0x886A +#define GL_VERTEX_ATTRIB_ARRAY_POINTER 0x8645 +#define GL_VERTEX_ATTRIB_ARRAY_SIZE 0x8623 +#define GL_VERTEX_ATTRIB_ARRAY_STRIDE 0x8624 +#define GL_VERTEX_ATTRIB_ARRAY_TYPE 0x8625 +#define GL_VERTEX_PROGRAM_POINT_SIZE 0x8642 +#define GL_VERTEX_PROGRAM_TWO_SIDE 0x8643 +#define GL_VERTEX_SHADER 0x8B31 +#define GL_VIEWPORT 0x0BA2 +#define GL_VIEWPORT_BIT 0x00000800 +#define GL_WAIT_FAILED 0x911D +#define GL_WEIGHT_ARRAY_BUFFER_BINDING 0x889E +#define GL_WRITE_ONLY 0x88B9 +#define GL_XOR 0x1506 +#define GL_ZERO 0 +#define GL_ZOOM_X 0x0D16 +#define GL_ZOOM_Y 0x0D17 + + +#include <KHR/khrplatform.h> +typedef unsigned int GLenum; +typedef unsigned char GLboolean; +typedef unsigned int GLbitfield; +typedef void GLvoid; +typedef khronos_int8_t GLbyte; +typedef khronos_uint8_t GLubyte; +typedef khronos_int16_t GLshort; +typedef khronos_uint16_t GLushort; +typedef int GLint; +typedef unsigned int GLuint; +typedef khronos_int32_t GLclampx; +typedef int GLsizei; +typedef khronos_float_t GLfloat; +typedef khronos_float_t GLclampf; +typedef double GLdouble; +typedef double GLclampd; +typedef void *GLeglClientBufferEXT; +typedef void *GLeglImageOES; +typedef char GLchar; 
+typedef char GLcharARB;
+#ifdef __APPLE__
+typedef void *GLhandleARB;
+#else
+typedef unsigned int GLhandleARB;
+#endif
+typedef khronos_uint16_t GLhalf;
+typedef khronos_uint16_t GLhalfARB;
+typedef khronos_int32_t GLfixed;
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+typedef khronos_intptr_t GLintptr;
+#else
+typedef khronos_intptr_t GLintptr;
+#endif
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+typedef khronos_intptr_t GLintptrARB;
+#else
+typedef khronos_intptr_t GLintptrARB;
+#endif
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+typedef khronos_ssize_t GLsizeiptr;
+#else
+typedef khronos_ssize_t GLsizeiptr;
+#endif
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+typedef khronos_ssize_t GLsizeiptrARB;
+#else
+typedef khronos_ssize_t GLsizeiptrARB;
+#endif
+typedef khronos_int64_t GLint64;
+typedef khronos_int64_t GLint64EXT;
+typedef khronos_uint64_t GLuint64;
+typedef khronos_uint64_t GLuint64EXT;
+typedef struct __GLsync *GLsync;
+struct _cl_context;
+struct _cl_event;
+typedef void (GLAD_API_PTR *GLDEBUGPROC)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam);
+typedef void (GLAD_API_PTR *GLDEBUGPROCARB)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam);
+typedef void (GLAD_API_PTR *GLDEBUGPROCKHR)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam);
+typedef void (GLAD_API_PTR *GLDEBUGPROCAMD)(GLuint id,GLenum category,GLenum severity,GLsizei length,const GLchar *message,void *userParam);
+typedef unsigned short GLhalfNV;
+typedef GLintptr GLvdpauSurfaceNV;
+typedef void (GLAD_API_PTR *GLVULKANPROCNV)(void);
+
+
+#define GL_VERSION_1_0 1
+GLAD_API_CALL int GLAD_GL_VERSION_1_0;
+#define GL_VERSION_1_1 1
+GLAD_API_CALL int GLAD_GL_VERSION_1_1;
+#define GL_VERSION_1_2 1
+GLAD_API_CALL int GLAD_GL_VERSION_1_2;
+#define GL_VERSION_1_3 1
+GLAD_API_CALL int GLAD_GL_VERSION_1_3;
+#define GL_VERSION_1_4 1
+GLAD_API_CALL int GLAD_GL_VERSION_1_4;
+#define GL_VERSION_1_5 1
+GLAD_API_CALL int GLAD_GL_VERSION_1_5;
+#define GL_VERSION_2_0 1
+GLAD_API_CALL int GLAD_GL_VERSION_2_0;
+#define GL_VERSION_2_1 1
+GLAD_API_CALL int GLAD_GL_VERSION_2_1;
+#define GL_VERSION_3_0 1
+GLAD_API_CALL int GLAD_GL_VERSION_3_0;
+#define GL_VERSION_3_1 1
+GLAD_API_CALL int GLAD_GL_VERSION_3_1;
+#define GL_VERSION_3_2 1
+GLAD_API_CALL int GLAD_GL_VERSION_3_2;
+#define GL_VERSION_3_3 1
+GLAD_API_CALL int GLAD_GL_VERSION_3_3;
+#define GL_ARB_debug_output 1
+GLAD_API_CALL int GLAD_GL_ARB_debug_output;
+#define GL_ARB_framebuffer_object 1
+GLAD_API_CALL int GLAD_GL_ARB_framebuffer_object;
+#define GL_EXT_framebuffer_blit 1
+GLAD_API_CALL int GLAD_GL_EXT_framebuffer_blit;
+#define GL_EXT_framebuffer_multisample 1
+GLAD_API_CALL int GLAD_GL_EXT_framebuffer_multisample;
+#define GL_EXT_framebuffer_object 1
+GLAD_API_CALL int GLAD_GL_EXT_framebuffer_object;
+#define GL_OVR_multiview 1
+GLAD_API_CALL int GLAD_GL_OVR_multiview;
+#define GL_OVR_multiview2 1
+GLAD_API_CALL int GLAD_GL_OVR_multiview2;
+
+
+typedef void (GLAD_API_PTR *PFNGLACCUMPROC)(GLenum op, GLfloat value);
+typedef void (GLAD_API_PTR *PFNGLACTIVETEXTUREPROC)(GLenum texture);
+typedef void 
(GLAD_API_PTR *PFNGLALPHAFUNCPROC)(GLenum func, GLfloat ref); +typedef GLboolean (GLAD_API_PTR *PFNGLARETEXTURESRESIDENTPROC)(GLsizei n, const GLuint * textures, GLboolean * residences); +typedef void (GLAD_API_PTR *PFNGLARRAYELEMENTPROC)(GLint i); +typedef void (GLAD_API_PTR *PFNGLATTACHSHADERPROC)(GLuint program, GLuint shader); +typedef void (GLAD_API_PTR *PFNGLBEGINPROC)(GLenum mode); +typedef void (GLAD_API_PTR *PFNGLBEGINCONDITIONALRENDERPROC)(GLuint id, GLenum mode); +typedef void (GLAD_API_PTR *PFNGLBEGINQUERYPROC)(GLenum target, GLuint id); +typedef void (GLAD_API_PTR *PFNGLBEGINTRANSFORMFEEDBACKPROC)(GLenum primitiveMode); +typedef void (GLAD_API_PTR *PFNGLBINDATTRIBLOCATIONPROC)(GLuint program, GLuint index, const GLchar * name); +typedef void (GLAD_API_PTR *PFNGLBINDBUFFERPROC)(GLenum target, GLuint buffer); +typedef void (GLAD_API_PTR *PFNGLBINDBUFFERBASEPROC)(GLenum target, GLuint index, GLuint buffer); +typedef void (GLAD_API_PTR *PFNGLBINDBUFFERRANGEPROC)(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size); +typedef void (GLAD_API_PTR *PFNGLBINDFRAGDATALOCATIONPROC)(GLuint program, GLuint color, const GLchar * name); +typedef void (GLAD_API_PTR *PFNGLBINDFRAGDATALOCATIONINDEXEDPROC)(GLuint program, GLuint colorNumber, GLuint index, const GLchar * name); +typedef void (GLAD_API_PTR *PFNGLBINDFRAMEBUFFERPROC)(GLenum target, GLuint framebuffer); +typedef void (GLAD_API_PTR *PFNGLBINDFRAMEBUFFEREXTPROC)(GLenum target, GLuint framebuffer); +typedef void (GLAD_API_PTR *PFNGLBINDRENDERBUFFERPROC)(GLenum target, GLuint renderbuffer); +typedef void (GLAD_API_PTR *PFNGLBINDRENDERBUFFEREXTPROC)(GLenum target, GLuint renderbuffer); +typedef void (GLAD_API_PTR *PFNGLBINDSAMPLERPROC)(GLuint unit, GLuint sampler); +typedef void (GLAD_API_PTR *PFNGLBINDTEXTUREPROC)(GLenum target, GLuint texture); +typedef void (GLAD_API_PTR *PFNGLBINDVERTEXARRAYPROC)(GLuint array); +typedef void (GLAD_API_PTR *PFNGLBITMAPPROC)(GLsizei width, GLsizei height, GLfloat xorig, GLfloat yorig, GLfloat xmove, GLfloat ymove, const GLubyte * bitmap); +typedef void (GLAD_API_PTR *PFNGLBLENDCOLORPROC)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); +typedef void (GLAD_API_PTR *PFNGLBLENDEQUATIONPROC)(GLenum mode); +typedef void (GLAD_API_PTR *PFNGLBLENDEQUATIONSEPARATEPROC)(GLenum modeRGB, GLenum modeAlpha); +typedef void (GLAD_API_PTR *PFNGLBLENDFUNCPROC)(GLenum sfactor, GLenum dfactor); +typedef void (GLAD_API_PTR *PFNGLBLENDFUNCSEPARATEPROC)(GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha); +typedef void (GLAD_API_PTR *PFNGLBLITFRAMEBUFFERPROC)(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter); +typedef void (GLAD_API_PTR *PFNGLBLITFRAMEBUFFEREXTPROC)(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter); +typedef void (GLAD_API_PTR *PFNGLBUFFERDATAPROC)(GLenum target, GLsizeiptr size, const void * data, GLenum usage); +typedef void (GLAD_API_PTR *PFNGLBUFFERSUBDATAPROC)(GLenum target, GLintptr offset, GLsizeiptr size, const void * data); +typedef void (GLAD_API_PTR *PFNGLCALLLISTPROC)(GLuint list); +typedef void (GLAD_API_PTR *PFNGLCALLLISTSPROC)(GLsizei n, GLenum type, const void * lists); +typedef GLenum (GLAD_API_PTR *PFNGLCHECKFRAMEBUFFERSTATUSPROC)(GLenum target); +typedef GLenum (GLAD_API_PTR *PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC)(GLenum target); +typedef void (GLAD_API_PTR 
*PFNGLCLAMPCOLORPROC)(GLenum target, GLenum clamp); +typedef void (GLAD_API_PTR *PFNGLCLEARPROC)(GLbitfield mask); +typedef void (GLAD_API_PTR *PFNGLCLEARACCUMPROC)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); +typedef void (GLAD_API_PTR *PFNGLCLEARBUFFERFIPROC)(GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil); +typedef void (GLAD_API_PTR *PFNGLCLEARBUFFERFVPROC)(GLenum buffer, GLint drawbuffer, const GLfloat * value); +typedef void (GLAD_API_PTR *PFNGLCLEARBUFFERIVPROC)(GLenum buffer, GLint drawbuffer, const GLint * value); +typedef void (GLAD_API_PTR *PFNGLCLEARBUFFERUIVPROC)(GLenum buffer, GLint drawbuffer, const GLuint * value); +typedef void (GLAD_API_PTR *PFNGLCLEARCOLORPROC)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); +typedef void (GLAD_API_PTR *PFNGLCLEARDEPTHPROC)(GLdouble depth); +typedef void (GLAD_API_PTR *PFNGLCLEARINDEXPROC)(GLfloat c); +typedef void (GLAD_API_PTR *PFNGLCLEARSTENCILPROC)(GLint s); +typedef void (GLAD_API_PTR *PFNGLCLIENTACTIVETEXTUREPROC)(GLenum texture); +typedef GLenum (GLAD_API_PTR *PFNGLCLIENTWAITSYNCPROC)(GLsync sync, GLbitfield flags, GLuint64 timeout); +typedef void (GLAD_API_PTR *PFNGLCLIPPLANEPROC)(GLenum plane, const GLdouble * equation); +typedef void (GLAD_API_PTR *PFNGLCOLOR3BPROC)(GLbyte red, GLbyte green, GLbyte blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3BVPROC)(const GLbyte * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR3DPROC)(GLdouble red, GLdouble green, GLdouble blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3DVPROC)(const GLdouble * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR3FPROC)(GLfloat red, GLfloat green, GLfloat blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3FVPROC)(const GLfloat * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR3IPROC)(GLint red, GLint green, GLint blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3IVPROC)(const GLint * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR3SPROC)(GLshort red, GLshort green, GLshort blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3SVPROC)(const GLshort * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR3UBPROC)(GLubyte red, GLubyte green, GLubyte blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3UBVPROC)(const GLubyte * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR3UIPROC)(GLuint red, GLuint green, GLuint blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3UIVPROC)(const GLuint * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR3USPROC)(GLushort red, GLushort green, GLushort blue); +typedef void (GLAD_API_PTR *PFNGLCOLOR3USVPROC)(const GLushort * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR4BPROC)(GLbyte red, GLbyte green, GLbyte blue, GLbyte alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4BVPROC)(const GLbyte * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR4DPROC)(GLdouble red, GLdouble green, GLdouble blue, GLdouble alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4DVPROC)(const GLdouble * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR4FPROC)(GLfloat red, GLfloat green, GLfloat blue, GLfloat alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4FVPROC)(const GLfloat * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR4IPROC)(GLint red, GLint green, GLint blue, GLint alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4IVPROC)(const GLint * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR4SPROC)(GLshort red, GLshort green, GLshort blue, GLshort alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4SVPROC)(const GLshort * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR4UBPROC)(GLubyte red, GLubyte green, GLubyte blue, GLubyte alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4UBVPROC)(const GLubyte * v); +typedef void 
(GLAD_API_PTR *PFNGLCOLOR4UIPROC)(GLuint red, GLuint green, GLuint blue, GLuint alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4UIVPROC)(const GLuint * v); +typedef void (GLAD_API_PTR *PFNGLCOLOR4USPROC)(GLushort red, GLushort green, GLushort blue, GLushort alpha); +typedef void (GLAD_API_PTR *PFNGLCOLOR4USVPROC)(const GLushort * v); +typedef void (GLAD_API_PTR *PFNGLCOLORMASKPROC)(GLboolean red, GLboolean green, GLboolean blue, GLboolean alpha); +typedef void (GLAD_API_PTR *PFNGLCOLORMASKIPROC)(GLuint index, GLboolean r, GLboolean g, GLboolean b, GLboolean a); +typedef void (GLAD_API_PTR *PFNGLCOLORMATERIALPROC)(GLenum face, GLenum mode); +typedef void (GLAD_API_PTR *PFNGLCOLORP3UIPROC)(GLenum type, GLuint color); +typedef void (GLAD_API_PTR *PFNGLCOLORP3UIVPROC)(GLenum type, const GLuint * color); +typedef void (GLAD_API_PTR *PFNGLCOLORP4UIPROC)(GLenum type, GLuint color); +typedef void (GLAD_API_PTR *PFNGLCOLORP4UIVPROC)(GLenum type, const GLuint * color); +typedef void (GLAD_API_PTR *PFNGLCOLORPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void * pointer); +typedef void (GLAD_API_PTR *PFNGLCOMPILESHADERPROC)(GLuint shader); +typedef void (GLAD_API_PTR *PFNGLCOMPRESSEDTEXIMAGE1DPROC)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const void * data); +typedef void (GLAD_API_PTR *PFNGLCOMPRESSEDTEXIMAGE2DPROC)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const void * data); +typedef void (GLAD_API_PTR *PFNGLCOMPRESSEDTEXIMAGE3DPROC)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const void * data); +typedef void (GLAD_API_PTR *PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const void * data); +typedef void (GLAD_API_PTR *PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void * data); +typedef void (GLAD_API_PTR *PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const void * data); +typedef void (GLAD_API_PTR *PFNGLCOPYBUFFERSUBDATAPROC)(GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size); +typedef void (GLAD_API_PTR *PFNGLCOPYPIXELSPROC)(GLint x, GLint y, GLsizei width, GLsizei height, GLenum type); +typedef void (GLAD_API_PTR *PFNGLCOPYTEXIMAGE1DPROC)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border); +typedef void (GLAD_API_PTR *PFNGLCOPYTEXIMAGE2DPROC)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border); +typedef void (GLAD_API_PTR *PFNGLCOPYTEXSUBIMAGE1DPROC)(GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width); +typedef void (GLAD_API_PTR *PFNGLCOPYTEXSUBIMAGE2DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height); +typedef void (GLAD_API_PTR *PFNGLCOPYTEXSUBIMAGE3DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height); +typedef GLuint (GLAD_API_PTR *PFNGLCREATEPROGRAMPROC)(void); +typedef GLuint (GLAD_API_PTR *PFNGLCREATESHADERPROC)(GLenum 
type); +typedef void (GLAD_API_PTR *PFNGLCULLFACEPROC)(GLenum mode); +typedef void (GLAD_API_PTR *PFNGLDEBUGMESSAGECALLBACKARBPROC)(GLDEBUGPROCARB callback, const void * userParam); +typedef void (GLAD_API_PTR *PFNGLDEBUGMESSAGECONTROLARBPROC)(GLenum source, GLenum type, GLenum severity, GLsizei count, const GLuint * ids, GLboolean enabled); +typedef void (GLAD_API_PTR *PFNGLDEBUGMESSAGEINSERTARBPROC)(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar * buf); +typedef void (GLAD_API_PTR *PFNGLDELETEBUFFERSPROC)(GLsizei n, const GLuint * buffers); +typedef void (GLAD_API_PTR *PFNGLDELETEFRAMEBUFFERSPROC)(GLsizei n, const GLuint * framebuffers); +typedef void (GLAD_API_PTR *PFNGLDELETEFRAMEBUFFERSEXTPROC)(GLsizei n, const GLuint * framebuffers); +typedef void (GLAD_API_PTR *PFNGLDELETELISTSPROC)(GLuint list, GLsizei range); +typedef void (GLAD_API_PTR *PFNGLDELETEPROGRAMPROC)(GLuint program); +typedef void (GLAD_API_PTR *PFNGLDELETEQUERIESPROC)(GLsizei n, const GLuint * ids); +typedef void (GLAD_API_PTR *PFNGLDELETERENDERBUFFERSPROC)(GLsizei n, const GLuint * renderbuffers); +typedef void (GLAD_API_PTR *PFNGLDELETERENDERBUFFERSEXTPROC)(GLsizei n, const GLuint * renderbuffers); +typedef void (GLAD_API_PTR *PFNGLDELETESAMPLERSPROC)(GLsizei count, const GLuint * samplers); +typedef void (GLAD_API_PTR *PFNGLDELETESHADERPROC)(GLuint shader); +typedef void (GLAD_API_PTR *PFNGLDELETESYNCPROC)(GLsync sync); +typedef void (GLAD_API_PTR *PFNGLDELETETEXTURESPROC)(GLsizei n, const GLuint * textures); +typedef void (GLAD_API_PTR *PFNGLDELETEVERTEXARRAYSPROC)(GLsizei n, const GLuint * arrays); +typedef void (GLAD_API_PTR *PFNGLDEPTHFUNCPROC)(GLenum func); +typedef void (GLAD_API_PTR *PFNGLDEPTHMASKPROC)(GLboolean flag); +typedef void (GLAD_API_PTR *PFNGLDEPTHRANGEPROC)(GLdouble n, GLdouble f); +typedef void (GLAD_API_PTR *PFNGLDETACHSHADERPROC)(GLuint program, GLuint shader); +typedef void (GLAD_API_PTR *PFNGLDISABLEPROC)(GLenum cap); +typedef void (GLAD_API_PTR *PFNGLDISABLECLIENTSTATEPROC)(GLenum array); +typedef void (GLAD_API_PTR *PFNGLDISABLEVERTEXATTRIBARRAYPROC)(GLuint index); +typedef void (GLAD_API_PTR *PFNGLDISABLEIPROC)(GLenum target, GLuint index); +typedef void (GLAD_API_PTR *PFNGLDRAWARRAYSPROC)(GLenum mode, GLint first, GLsizei count); +typedef void (GLAD_API_PTR *PFNGLDRAWARRAYSINSTANCEDPROC)(GLenum mode, GLint first, GLsizei count, GLsizei instancecount); +typedef void (GLAD_API_PTR *PFNGLDRAWBUFFERPROC)(GLenum buf); +typedef void (GLAD_API_PTR *PFNGLDRAWBUFFERSPROC)(GLsizei n, const GLenum * bufs); +typedef void (GLAD_API_PTR *PFNGLDRAWELEMENTSPROC)(GLenum mode, GLsizei count, GLenum type, const void * indices); +typedef void (GLAD_API_PTR *PFNGLDRAWELEMENTSBASEVERTEXPROC)(GLenum mode, GLsizei count, GLenum type, const void * indices, GLint basevertex); +typedef void (GLAD_API_PTR *PFNGLDRAWELEMENTSINSTANCEDPROC)(GLenum mode, GLsizei count, GLenum type, const void * indices, GLsizei instancecount); +typedef void (GLAD_API_PTR *PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC)(GLenum mode, GLsizei count, GLenum type, const void * indices, GLsizei instancecount, GLint basevertex); +typedef void (GLAD_API_PTR *PFNGLDRAWPIXELSPROC)(GLsizei width, GLsizei height, GLenum format, GLenum type, const void * pixels); +typedef void (GLAD_API_PTR *PFNGLDRAWRANGEELEMENTSPROC)(GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const void * indices); +typedef void (GLAD_API_PTR *PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC)(GLenum mode, GLuint start, GLuint end, 
GLsizei count, GLenum type, const void * indices, GLint basevertex); +typedef void (GLAD_API_PTR *PFNGLEDGEFLAGPROC)(GLboolean flag); +typedef void (GLAD_API_PTR *PFNGLEDGEFLAGPOINTERPROC)(GLsizei stride, const void * pointer); +typedef void (GLAD_API_PTR *PFNGLEDGEFLAGVPROC)(const GLboolean * flag); +typedef void (GLAD_API_PTR *PFNGLENABLEPROC)(GLenum cap); +typedef void (GLAD_API_PTR *PFNGLENABLECLIENTSTATEPROC)(GLenum array); +typedef void (GLAD_API_PTR *PFNGLENABLEVERTEXATTRIBARRAYPROC)(GLuint index); +typedef void (GLAD_API_PTR *PFNGLENABLEIPROC)(GLenum target, GLuint index); +typedef void (GLAD_API_PTR *PFNGLENDPROC)(void); +typedef void (GLAD_API_PTR *PFNGLENDCONDITIONALRENDERPROC)(void); +typedef void (GLAD_API_PTR *PFNGLENDLISTPROC)(void); +typedef void (GLAD_API_PTR *PFNGLENDQUERYPROC)(GLenum target); +typedef void (GLAD_API_PTR *PFNGLENDTRANSFORMFEEDBACKPROC)(void); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD1DPROC)(GLdouble u); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD1DVPROC)(const GLdouble * u); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD1FPROC)(GLfloat u); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD1FVPROC)(const GLfloat * u); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD2DPROC)(GLdouble u, GLdouble v); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD2DVPROC)(const GLdouble * u); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD2FPROC)(GLfloat u, GLfloat v); +typedef void (GLAD_API_PTR *PFNGLEVALCOORD2FVPROC)(const GLfloat * u); +typedef void (GLAD_API_PTR *PFNGLEVALMESH1PROC)(GLenum mode, GLint i1, GLint i2); +typedef void (GLAD_API_PTR *PFNGLEVALMESH2PROC)(GLenum mode, GLint i1, GLint i2, GLint j1, GLint j2); +typedef void (GLAD_API_PTR *PFNGLEVALPOINT1PROC)(GLint i); +typedef void (GLAD_API_PTR *PFNGLEVALPOINT2PROC)(GLint i, GLint j); +typedef void (GLAD_API_PTR *PFNGLFEEDBACKBUFFERPROC)(GLsizei size, GLenum type, GLfloat * buffer); +typedef GLsync (GLAD_API_PTR *PFNGLFENCESYNCPROC)(GLenum condition, GLbitfield flags); +typedef void (GLAD_API_PTR *PFNGLFINISHPROC)(void); +typedef void (GLAD_API_PTR *PFNGLFLUSHPROC)(void); +typedef void (GLAD_API_PTR *PFNGLFLUSHMAPPEDBUFFERRANGEPROC)(GLenum target, GLintptr offset, GLsizeiptr length); +typedef void (GLAD_API_PTR *PFNGLFOGCOORDPOINTERPROC)(GLenum type, GLsizei stride, const void * pointer); +typedef void (GLAD_API_PTR *PFNGLFOGCOORDDPROC)(GLdouble coord); +typedef void (GLAD_API_PTR *PFNGLFOGCOORDDVPROC)(const GLdouble * coord); +typedef void (GLAD_API_PTR *PFNGLFOGCOORDFPROC)(GLfloat coord); +typedef void (GLAD_API_PTR *PFNGLFOGCOORDFVPROC)(const GLfloat * coord); +typedef void (GLAD_API_PTR *PFNGLFOGFPROC)(GLenum pname, GLfloat param); +typedef void (GLAD_API_PTR *PFNGLFOGFVPROC)(GLenum pname, const GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLFOGIPROC)(GLenum pname, GLint param); +typedef void (GLAD_API_PTR *PFNGLFOGIVPROC)(GLenum pname, const GLint * params); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERRENDERBUFFERPROC)(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC)(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTUREPROC)(GLenum target, GLenum attachment, GLuint texture, GLint level); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTURE1DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTURE1DEXTPROC)(GLenum target, GLenum attachment, GLenum textarget, 
GLuint texture, GLint level); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTURE2DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTURE2DEXTPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTURE3DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTURE3DEXTPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTURELAYERPROC)(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer); +typedef void (GLAD_API_PTR *PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC)(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews); +typedef void (GLAD_API_PTR *PFNGLFRONTFACEPROC)(GLenum mode); +typedef void (GLAD_API_PTR *PFNGLFRUSTUMPROC)(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar); +typedef void (GLAD_API_PTR *PFNGLGENBUFFERSPROC)(GLsizei n, GLuint * buffers); +typedef void (GLAD_API_PTR *PFNGLGENFRAMEBUFFERSPROC)(GLsizei n, GLuint * framebuffers); +typedef void (GLAD_API_PTR *PFNGLGENFRAMEBUFFERSEXTPROC)(GLsizei n, GLuint * framebuffers); +typedef GLuint (GLAD_API_PTR *PFNGLGENLISTSPROC)(GLsizei range); +typedef void (GLAD_API_PTR *PFNGLGENQUERIESPROC)(GLsizei n, GLuint * ids); +typedef void (GLAD_API_PTR *PFNGLGENRENDERBUFFERSPROC)(GLsizei n, GLuint * renderbuffers); +typedef void (GLAD_API_PTR *PFNGLGENRENDERBUFFERSEXTPROC)(GLsizei n, GLuint * renderbuffers); +typedef void (GLAD_API_PTR *PFNGLGENSAMPLERSPROC)(GLsizei count, GLuint * samplers); +typedef void (GLAD_API_PTR *PFNGLGENTEXTURESPROC)(GLsizei n, GLuint * textures); +typedef void (GLAD_API_PTR *PFNGLGENVERTEXARRAYSPROC)(GLsizei n, GLuint * arrays); +typedef void (GLAD_API_PTR *PFNGLGENERATEMIPMAPPROC)(GLenum target); +typedef void (GLAD_API_PTR *PFNGLGENERATEMIPMAPEXTPROC)(GLenum target); +typedef void (GLAD_API_PTR *PFNGLGETACTIVEATTRIBPROC)(GLuint program, GLuint index, GLsizei bufSize, GLsizei * length, GLint * size, GLenum * type, GLchar * name); +typedef void (GLAD_API_PTR *PFNGLGETACTIVEUNIFORMPROC)(GLuint program, GLuint index, GLsizei bufSize, GLsizei * length, GLint * size, GLenum * type, GLchar * name); +typedef void (GLAD_API_PTR *PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC)(GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei * length, GLchar * uniformBlockName); +typedef void (GLAD_API_PTR *PFNGLGETACTIVEUNIFORMBLOCKIVPROC)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETACTIVEUNIFORMNAMEPROC)(GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei * length, GLchar * uniformName); +typedef void (GLAD_API_PTR *PFNGLGETACTIVEUNIFORMSIVPROC)(GLuint program, GLsizei uniformCount, const GLuint * uniformIndices, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETATTACHEDSHADERSPROC)(GLuint program, GLsizei maxCount, GLsizei * count, GLuint * shaders); +typedef GLint (GLAD_API_PTR *PFNGLGETATTRIBLOCATIONPROC)(GLuint program, const GLchar * name); +typedef void (GLAD_API_PTR *PFNGLGETBOOLEANI_VPROC)(GLenum target, GLuint index, GLboolean * data); +typedef void (GLAD_API_PTR *PFNGLGETBOOLEANVPROC)(GLenum pname, GLboolean * data); +typedef void (GLAD_API_PTR 
*PFNGLGETBUFFERPARAMETERI64VPROC)(GLenum target, GLenum pname, GLint64 * params); +typedef void (GLAD_API_PTR *PFNGLGETBUFFERPARAMETERIVPROC)(GLenum target, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETBUFFERPOINTERVPROC)(GLenum target, GLenum pname, void ** params); +typedef void (GLAD_API_PTR *PFNGLGETBUFFERSUBDATAPROC)(GLenum target, GLintptr offset, GLsizeiptr size, void * data); +typedef void (GLAD_API_PTR *PFNGLGETCLIPPLANEPROC)(GLenum plane, GLdouble * equation); +typedef void (GLAD_API_PTR *PFNGLGETCOMPRESSEDTEXIMAGEPROC)(GLenum target, GLint level, void * img); +typedef GLuint (GLAD_API_PTR *PFNGLGETDEBUGMESSAGELOGARBPROC)(GLuint count, GLsizei bufSize, GLenum * sources, GLenum * types, GLuint * ids, GLenum * severities, GLsizei * lengths, GLchar * messageLog); +typedef void (GLAD_API_PTR *PFNGLGETDOUBLEVPROC)(GLenum pname, GLdouble * data); +typedef GLenum (GLAD_API_PTR *PFNGLGETERRORPROC)(void); +typedef void (GLAD_API_PTR *PFNGLGETFLOATVPROC)(GLenum pname, GLfloat * data); +typedef GLint (GLAD_API_PTR *PFNGLGETFRAGDATAINDEXPROC)(GLuint program, const GLchar * name); +typedef GLint (GLAD_API_PTR *PFNGLGETFRAGDATALOCATIONPROC)(GLuint program, const GLchar * name); +typedef void (GLAD_API_PTR *PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)(GLenum target, GLenum attachment, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC)(GLenum target, GLenum attachment, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETINTEGER64I_VPROC)(GLenum target, GLuint index, GLint64 * data); +typedef void (GLAD_API_PTR *PFNGLGETINTEGER64VPROC)(GLenum pname, GLint64 * data); +typedef void (GLAD_API_PTR *PFNGLGETINTEGERI_VPROC)(GLenum target, GLuint index, GLint * data); +typedef void (GLAD_API_PTR *PFNGLGETINTEGERVPROC)(GLenum pname, GLint * data); +typedef void (GLAD_API_PTR *PFNGLGETLIGHTFVPROC)(GLenum light, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETLIGHTIVPROC)(GLenum light, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETMAPDVPROC)(GLenum target, GLenum query, GLdouble * v); +typedef void (GLAD_API_PTR *PFNGLGETMAPFVPROC)(GLenum target, GLenum query, GLfloat * v); +typedef void (GLAD_API_PTR *PFNGLGETMAPIVPROC)(GLenum target, GLenum query, GLint * v); +typedef void (GLAD_API_PTR *PFNGLGETMATERIALFVPROC)(GLenum face, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETMATERIALIVPROC)(GLenum face, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETMULTISAMPLEFVPROC)(GLenum pname, GLuint index, GLfloat * val); +typedef void (GLAD_API_PTR *PFNGLGETPIXELMAPFVPROC)(GLenum map, GLfloat * values); +typedef void (GLAD_API_PTR *PFNGLGETPIXELMAPUIVPROC)(GLenum map, GLuint * values); +typedef void (GLAD_API_PTR *PFNGLGETPIXELMAPUSVPROC)(GLenum map, GLushort * values); +typedef void (GLAD_API_PTR *PFNGLGETPOINTERVPROC)(GLenum pname, void ** params); +typedef void (GLAD_API_PTR *PFNGLGETPOLYGONSTIPPLEPROC)(GLubyte * mask); +typedef void (GLAD_API_PTR *PFNGLGETPROGRAMINFOLOGPROC)(GLuint program, GLsizei bufSize, GLsizei * length, GLchar * infoLog); +typedef void (GLAD_API_PTR *PFNGLGETPROGRAMIVPROC)(GLuint program, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETQUERYOBJECTI64VPROC)(GLuint id, GLenum pname, GLint64 * params); +typedef void (GLAD_API_PTR *PFNGLGETQUERYOBJECTIVPROC)(GLuint id, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETQUERYOBJECTUI64VPROC)(GLuint id, GLenum pname, GLuint64 * 
params); +typedef void (GLAD_API_PTR *PFNGLGETQUERYOBJECTUIVPROC)(GLuint id, GLenum pname, GLuint * params); +typedef void (GLAD_API_PTR *PFNGLGETQUERYIVPROC)(GLenum target, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETRENDERBUFFERPARAMETERIVPROC)(GLenum target, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC)(GLenum target, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETSAMPLERPARAMETERIIVPROC)(GLuint sampler, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETSAMPLERPARAMETERIUIVPROC)(GLuint sampler, GLenum pname, GLuint * params); +typedef void (GLAD_API_PTR *PFNGLGETSAMPLERPARAMETERFVPROC)(GLuint sampler, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETSAMPLERPARAMETERIVPROC)(GLuint sampler, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETSHADERINFOLOGPROC)(GLuint shader, GLsizei bufSize, GLsizei * length, GLchar * infoLog); +typedef void (GLAD_API_PTR *PFNGLGETSHADERSOURCEPROC)(GLuint shader, GLsizei bufSize, GLsizei * length, GLchar * source); +typedef void (GLAD_API_PTR *PFNGLGETSHADERIVPROC)(GLuint shader, GLenum pname, GLint * params); +typedef const GLubyte * (GLAD_API_PTR *PFNGLGETSTRINGPROC)(GLenum name); +typedef const GLubyte * (GLAD_API_PTR *PFNGLGETSTRINGIPROC)(GLenum name, GLuint index); +typedef void (GLAD_API_PTR *PFNGLGETSYNCIVPROC)(GLsync sync, GLenum pname, GLsizei count, GLsizei * length, GLint * values); +typedef void (GLAD_API_PTR *PFNGLGETTEXENVFVPROC)(GLenum target, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXENVIVPROC)(GLenum target, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXGENDVPROC)(GLenum coord, GLenum pname, GLdouble * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXGENFVPROC)(GLenum coord, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXGENIVPROC)(GLenum coord, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXIMAGEPROC)(GLenum target, GLint level, GLenum format, GLenum type, void * pixels); +typedef void (GLAD_API_PTR *PFNGLGETTEXLEVELPARAMETERFVPROC)(GLenum target, GLint level, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXLEVELPARAMETERIVPROC)(GLenum target, GLint level, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXPARAMETERIIVPROC)(GLenum target, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXPARAMETERIUIVPROC)(GLenum target, GLenum pname, GLuint * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXPARAMETERFVPROC)(GLenum target, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETTEXPARAMETERIVPROC)(GLenum target, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETTRANSFORMFEEDBACKVARYINGPROC)(GLuint program, GLuint index, GLsizei bufSize, GLsizei * length, GLsizei * size, GLenum * type, GLchar * name); +typedef GLuint (GLAD_API_PTR *PFNGLGETUNIFORMBLOCKINDEXPROC)(GLuint program, const GLchar * uniformBlockName); +typedef void (GLAD_API_PTR *PFNGLGETUNIFORMINDICESPROC)(GLuint program, GLsizei uniformCount, const GLchar *const* uniformNames, GLuint * uniformIndices); +typedef GLint (GLAD_API_PTR *PFNGLGETUNIFORMLOCATIONPROC)(GLuint program, const GLchar * name); +typedef void (GLAD_API_PTR *PFNGLGETUNIFORMFVPROC)(GLuint program, GLint location, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETUNIFORMIVPROC)(GLuint program, GLint location, GLint * params); +typedef void (GLAD_API_PTR 
*PFNGLGETUNIFORMUIVPROC)(GLuint program, GLint location, GLuint * params); +typedef void (GLAD_API_PTR *PFNGLGETVERTEXATTRIBIIVPROC)(GLuint index, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLGETVERTEXATTRIBIUIVPROC)(GLuint index, GLenum pname, GLuint * params); +typedef void (GLAD_API_PTR *PFNGLGETVERTEXATTRIBPOINTERVPROC)(GLuint index, GLenum pname, void ** pointer); +typedef void (GLAD_API_PTR *PFNGLGETVERTEXATTRIBDVPROC)(GLuint index, GLenum pname, GLdouble * params); +typedef void (GLAD_API_PTR *PFNGLGETVERTEXATTRIBFVPROC)(GLuint index, GLenum pname, GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLGETVERTEXATTRIBIVPROC)(GLuint index, GLenum pname, GLint * params); +typedef void (GLAD_API_PTR *PFNGLHINTPROC)(GLenum target, GLenum mode); +typedef void (GLAD_API_PTR *PFNGLINDEXMASKPROC)(GLuint mask); +typedef void (GLAD_API_PTR *PFNGLINDEXPOINTERPROC)(GLenum type, GLsizei stride, const void * pointer); +typedef void (GLAD_API_PTR *PFNGLINDEXDPROC)(GLdouble c); +typedef void (GLAD_API_PTR *PFNGLINDEXDVPROC)(const GLdouble * c); +typedef void (GLAD_API_PTR *PFNGLINDEXFPROC)(GLfloat c); +typedef void (GLAD_API_PTR *PFNGLINDEXFVPROC)(const GLfloat * c); +typedef void (GLAD_API_PTR *PFNGLINDEXIPROC)(GLint c); +typedef void (GLAD_API_PTR *PFNGLINDEXIVPROC)(const GLint * c); +typedef void (GLAD_API_PTR *PFNGLINDEXSPROC)(GLshort c); +typedef void (GLAD_API_PTR *PFNGLINDEXSVPROC)(const GLshort * c); +typedef void (GLAD_API_PTR *PFNGLINDEXUBPROC)(GLubyte c); +typedef void (GLAD_API_PTR *PFNGLINDEXUBVPROC)(const GLubyte * c); +typedef void (GLAD_API_PTR *PFNGLINITNAMESPROC)(void); +typedef void (GLAD_API_PTR *PFNGLINTERLEAVEDARRAYSPROC)(GLenum format, GLsizei stride, const void * pointer); +typedef GLboolean (GLAD_API_PTR *PFNGLISBUFFERPROC)(GLuint buffer); +typedef GLboolean (GLAD_API_PTR *PFNGLISENABLEDPROC)(GLenum cap); +typedef GLboolean (GLAD_API_PTR *PFNGLISENABLEDIPROC)(GLenum target, GLuint index); +typedef GLboolean (GLAD_API_PTR *PFNGLISFRAMEBUFFERPROC)(GLuint framebuffer); +typedef GLboolean (GLAD_API_PTR *PFNGLISFRAMEBUFFEREXTPROC)(GLuint framebuffer); +typedef GLboolean (GLAD_API_PTR *PFNGLISLISTPROC)(GLuint list); +typedef GLboolean (GLAD_API_PTR *PFNGLISPROGRAMPROC)(GLuint program); +typedef GLboolean (GLAD_API_PTR *PFNGLISQUERYPROC)(GLuint id); +typedef GLboolean (GLAD_API_PTR *PFNGLISRENDERBUFFERPROC)(GLuint renderbuffer); +typedef GLboolean (GLAD_API_PTR *PFNGLISRENDERBUFFEREXTPROC)(GLuint renderbuffer); +typedef GLboolean (GLAD_API_PTR *PFNGLISSAMPLERPROC)(GLuint sampler); +typedef GLboolean (GLAD_API_PTR *PFNGLISSHADERPROC)(GLuint shader); +typedef GLboolean (GLAD_API_PTR *PFNGLISSYNCPROC)(GLsync sync); +typedef GLboolean (GLAD_API_PTR *PFNGLISTEXTUREPROC)(GLuint texture); +typedef GLboolean (GLAD_API_PTR *PFNGLISVERTEXARRAYPROC)(GLuint array); +typedef void (GLAD_API_PTR *PFNGLLIGHTMODELFPROC)(GLenum pname, GLfloat param); +typedef void (GLAD_API_PTR *PFNGLLIGHTMODELFVPROC)(GLenum pname, const GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLLIGHTMODELIPROC)(GLenum pname, GLint param); +typedef void (GLAD_API_PTR *PFNGLLIGHTMODELIVPROC)(GLenum pname, const GLint * params); +typedef void (GLAD_API_PTR *PFNGLLIGHTFPROC)(GLenum light, GLenum pname, GLfloat param); +typedef void (GLAD_API_PTR *PFNGLLIGHTFVPROC)(GLenum light, GLenum pname, const GLfloat * params); +typedef void (GLAD_API_PTR *PFNGLLIGHTIPROC)(GLenum light, GLenum pname, GLint param); +typedef void (GLAD_API_PTR *PFNGLLIGHTIVPROC)(GLenum light, GLenum pname, const GLint * params); 
+typedef void (GLAD_API_PTR *PFNGLLINESTIPPLEPROC)(GLint factor, GLushort pattern);
+typedef void (GLAD_API_PTR *PFNGLLINEWIDTHPROC)(GLfloat width);
+typedef void (GLAD_API_PTR *PFNGLLINKPROGRAMPROC)(GLuint program);
+typedef void (GLAD_API_PTR *PFNGLLISTBASEPROC)(GLuint base);
+typedef void (GLAD_API_PTR *PFNGLLOADIDENTITYPROC)(void);
+typedef void (GLAD_API_PTR *PFNGLLOADMATRIXDPROC)(const GLdouble * m);
+typedef void (GLAD_API_PTR *PFNGLLOADMATRIXFPROC)(const GLfloat * m);
+typedef void (GLAD_API_PTR *PFNGLLOADNAMEPROC)(GLuint name);
+typedef void (GLAD_API_PTR *PFNGLLOADTRANSPOSEMATRIXDPROC)(const GLdouble * m);
+typedef void (GLAD_API_PTR *PFNGLLOADTRANSPOSEMATRIXFPROC)(const GLfloat * m);
+typedef void (GLAD_API_PTR *PFNGLLOGICOPPROC)(GLenum opcode);
+typedef void (GLAD_API_PTR *PFNGLMAP1DPROC)(GLenum target, GLdouble u1, GLdouble u2, GLint stride, GLint order, const GLdouble * points);
+typedef void (GLAD_API_PTR *PFNGLMAP1FPROC)(GLenum target, GLfloat u1, GLfloat u2, GLint stride, GLint order, const GLfloat * points);
+typedef void (GLAD_API_PTR *PFNGLMAP2DPROC)(GLenum target, GLdouble u1, GLdouble u2, GLint ustride, GLint uorder, GLdouble v1, GLdouble v2, GLint vstride, GLint vorder, const GLdouble * points);
+typedef void (GLAD_API_PTR *PFNGLMAP2FPROC)(GLenum target, GLfloat u1, GLfloat u2, GLint ustride, GLint uorder, GLfloat v1, GLfloat v2, GLint vstride, GLint vorder, const GLfloat * points);
+typedef void * (GLAD_API_PTR *PFNGLMAPBUFFERPROC)(GLenum target, GLenum access);
+typedef void * (GLAD_API_PTR *PFNGLMAPBUFFERRANGEPROC)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
+typedef void (GLAD_API_PTR *PFNGLMAPGRID1DPROC)(GLint un, GLdouble u1, GLdouble u2);
+typedef void (GLAD_API_PTR *PFNGLMAPGRID1FPROC)(GLint un, GLfloat u1, GLfloat u2);
+typedef void (GLAD_API_PTR *PFNGLMAPGRID2DPROC)(GLint un, GLdouble u1, GLdouble u2, GLint vn, GLdouble v1, GLdouble v2);
+typedef void (GLAD_API_PTR *PFNGLMAPGRID2FPROC)(GLint un, GLfloat u1, GLfloat u2, GLint vn, GLfloat v1, GLfloat v2);
+typedef void (GLAD_API_PTR *PFNGLMATERIALFPROC)(GLenum face, GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLMATERIALFVPROC)(GLenum face, GLenum pname, const GLfloat * params);
+typedef void (GLAD_API_PTR *PFNGLMATERIALIPROC)(GLenum face, GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLMATERIALIVPROC)(GLenum face, GLenum pname, const GLint * params);
+typedef void (GLAD_API_PTR *PFNGLMATRIXMODEPROC)(GLenum mode);
+typedef void (GLAD_API_PTR *PFNGLMULTMATRIXDPROC)(const GLdouble * m);
+typedef void (GLAD_API_PTR *PFNGLMULTMATRIXFPROC)(const GLfloat * m);
+typedef void (GLAD_API_PTR *PFNGLMULTTRANSPOSEMATRIXDPROC)(const GLdouble * m);
+typedef void (GLAD_API_PTR *PFNGLMULTTRANSPOSEMATRIXFPROC)(const GLfloat * m);
+typedef void (GLAD_API_PTR *PFNGLMULTIDRAWARRAYSPROC)(GLenum mode, const GLint * first, const GLsizei * count, GLsizei drawcount);
+typedef void (GLAD_API_PTR *PFNGLMULTIDRAWELEMENTSPROC)(GLenum mode, const GLsizei * count, GLenum type, const void *const* indices, GLsizei drawcount);
+typedef void (GLAD_API_PTR *PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC)(GLenum mode, const GLsizei * count, GLenum type, const void *const* indices, GLsizei drawcount, const GLint * basevertex);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1DPROC)(GLenum target, GLdouble s);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1DVPROC)(GLenum target, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1FPROC)(GLenum target, GLfloat s);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1FVPROC)(GLenum target, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1IPROC)(GLenum target, GLint s);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1IVPROC)(GLenum target, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1SPROC)(GLenum target, GLshort s);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD1SVPROC)(GLenum target, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2DPROC)(GLenum target, GLdouble s, GLdouble t);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2DVPROC)(GLenum target, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2FPROC)(GLenum target, GLfloat s, GLfloat t);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2FVPROC)(GLenum target, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2IPROC)(GLenum target, GLint s, GLint t);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2IVPROC)(GLenum target, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2SPROC)(GLenum target, GLshort s, GLshort t);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD2SVPROC)(GLenum target, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3DPROC)(GLenum target, GLdouble s, GLdouble t, GLdouble r);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3DVPROC)(GLenum target, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3FPROC)(GLenum target, GLfloat s, GLfloat t, GLfloat r);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3FVPROC)(GLenum target, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3IPROC)(GLenum target, GLint s, GLint t, GLint r);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3IVPROC)(GLenum target, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3SPROC)(GLenum target, GLshort s, GLshort t, GLshort r);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD3SVPROC)(GLenum target, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4DPROC)(GLenum target, GLdouble s, GLdouble t, GLdouble r, GLdouble q);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4DVPROC)(GLenum target, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4FPROC)(GLenum target, GLfloat s, GLfloat t, GLfloat r, GLfloat q);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4FVPROC)(GLenum target, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4IPROC)(GLenum target, GLint s, GLint t, GLint r, GLint q);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4IVPROC)(GLenum target, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4SPROC)(GLenum target, GLshort s, GLshort t, GLshort r, GLshort q);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORD4SVPROC)(GLenum target, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP1UIPROC)(GLenum texture, GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP1UIVPROC)(GLenum texture, GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP2UIPROC)(GLenum texture, GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP2UIVPROC)(GLenum texture, GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP3UIPROC)(GLenum texture, GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP3UIVPROC)(GLenum texture, GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP4UIPROC)(GLenum texture, GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLMULTITEXCOORDP4UIVPROC)(GLenum texture, GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLNEWLISTPROC)(GLuint list, GLenum mode);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3BPROC)(GLbyte nx, GLbyte ny, GLbyte nz);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3BVPROC)(const GLbyte * v);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3DPROC)(GLdouble nx, GLdouble ny, GLdouble nz);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3FPROC)(GLfloat nx, GLfloat ny, GLfloat nz);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3IPROC)(GLint nx, GLint ny, GLint nz);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3SPROC)(GLshort nx, GLshort ny, GLshort nz);
+typedef void (GLAD_API_PTR *PFNGLNORMAL3SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLNORMALP3UIPROC)(GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLNORMALP3UIVPROC)(GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLNORMALPOINTERPROC)(GLenum type, GLsizei stride, const void * pointer);
+typedef void (GLAD_API_PTR *PFNGLORTHOPROC)(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+typedef void (GLAD_API_PTR *PFNGLPASSTHROUGHPROC)(GLfloat token);
+typedef void (GLAD_API_PTR *PFNGLPIXELMAPFVPROC)(GLenum map, GLsizei mapsize, const GLfloat * values);
+typedef void (GLAD_API_PTR *PFNGLPIXELMAPUIVPROC)(GLenum map, GLsizei mapsize, const GLuint * values);
+typedef void (GLAD_API_PTR *PFNGLPIXELMAPUSVPROC)(GLenum map, GLsizei mapsize, const GLushort * values);
+typedef void (GLAD_API_PTR *PFNGLPIXELSTOREFPROC)(GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLPIXELSTOREIPROC)(GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLPIXELTRANSFERFPROC)(GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLPIXELTRANSFERIPROC)(GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLPIXELZOOMPROC)(GLfloat xfactor, GLfloat yfactor);
+typedef void (GLAD_API_PTR *PFNGLPOINTPARAMETERFPROC)(GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLPOINTPARAMETERFVPROC)(GLenum pname, const GLfloat * params);
+typedef void (GLAD_API_PTR *PFNGLPOINTPARAMETERIPROC)(GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLPOINTPARAMETERIVPROC)(GLenum pname, const GLint * params);
+typedef void (GLAD_API_PTR *PFNGLPOINTSIZEPROC)(GLfloat size);
+typedef void (GLAD_API_PTR *PFNGLPOLYGONMODEPROC)(GLenum face, GLenum mode);
+typedef void (GLAD_API_PTR *PFNGLPOLYGONOFFSETPROC)(GLfloat factor, GLfloat units);
+typedef void (GLAD_API_PTR *PFNGLPOLYGONSTIPPLEPROC)(const GLubyte * mask);
+typedef void (GLAD_API_PTR *PFNGLPOPATTRIBPROC)(void);
+typedef void (GLAD_API_PTR *PFNGLPOPCLIENTATTRIBPROC)(void);
+typedef void (GLAD_API_PTR *PFNGLPOPMATRIXPROC)(void);
+typedef void (GLAD_API_PTR *PFNGLPOPNAMEPROC)(void);
+typedef void (GLAD_API_PTR *PFNGLPRIMITIVERESTARTINDEXPROC)(GLuint index);
+typedef void (GLAD_API_PTR *PFNGLPRIORITIZETEXTURESPROC)(GLsizei n, const GLuint * textures, const GLfloat * priorities);
+typedef void (GLAD_API_PTR *PFNGLPROVOKINGVERTEXPROC)(GLenum mode);
+typedef void (GLAD_API_PTR *PFNGLPUSHATTRIBPROC)(GLbitfield mask);
+typedef void (GLAD_API_PTR *PFNGLPUSHCLIENTATTRIBPROC)(GLbitfield mask);
+typedef void (GLAD_API_PTR *PFNGLPUSHMATRIXPROC)(void);
+typedef void (GLAD_API_PTR *PFNGLPUSHNAMEPROC)(GLuint name);
+typedef void (GLAD_API_PTR *PFNGLQUERYCOUNTERPROC)(GLuint id, GLenum target);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2DPROC)(GLdouble x, GLdouble y);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2FPROC)(GLfloat x, GLfloat y);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2IPROC)(GLint x, GLint y);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2SPROC)(GLshort x, GLshort y);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS2SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3DPROC)(GLdouble x, GLdouble y, GLdouble z);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3FPROC)(GLfloat x, GLfloat y, GLfloat z);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3IPROC)(GLint x, GLint y, GLint z);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3SPROC)(GLshort x, GLshort y, GLshort z);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS3SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4DPROC)(GLdouble x, GLdouble y, GLdouble z, GLdouble w);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4FPROC)(GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4IPROC)(GLint x, GLint y, GLint z, GLint w);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4SPROC)(GLshort x, GLshort y, GLshort z, GLshort w);
+typedef void (GLAD_API_PTR *PFNGLRASTERPOS4SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLREADBUFFERPROC)(GLenum src);
+typedef void (GLAD_API_PTR *PFNGLREADPIXELSPROC)(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, void * pixels);
+typedef void (GLAD_API_PTR *PFNGLRECTDPROC)(GLdouble x1, GLdouble y1, GLdouble x2, GLdouble y2);
+typedef void (GLAD_API_PTR *PFNGLRECTDVPROC)(const GLdouble * v1, const GLdouble * v2);
+typedef void (GLAD_API_PTR *PFNGLRECTFPROC)(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2);
+typedef void (GLAD_API_PTR *PFNGLRECTFVPROC)(const GLfloat * v1, const GLfloat * v2);
+typedef void (GLAD_API_PTR *PFNGLRECTIPROC)(GLint x1, GLint y1, GLint x2, GLint y2);
+typedef void (GLAD_API_PTR *PFNGLRECTIVPROC)(const GLint * v1, const GLint * v2);
+typedef void (GLAD_API_PTR *PFNGLRECTSPROC)(GLshort x1, GLshort y1, GLshort x2, GLshort y2);
+typedef void (GLAD_API_PTR *PFNGLRECTSVPROC)(const GLshort * v1, const GLshort * v2);
+typedef GLint (GLAD_API_PTR *PFNGLRENDERMODEPROC)(GLenum mode);
+typedef void (GLAD_API_PTR *PFNGLRENDERBUFFERSTORAGEPROC)(GLenum target, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GLAD_API_PTR *PFNGLRENDERBUFFERSTORAGEEXTPROC)(GLenum target, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GLAD_API_PTR *PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GLAD_API_PTR *PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GLAD_API_PTR *PFNGLROTATEDPROC)(GLdouble angle, GLdouble x, GLdouble y, GLdouble z);
+typedef void (GLAD_API_PTR *PFNGLROTATEFPROC)(GLfloat angle, GLfloat x, GLfloat y, GLfloat z);
+typedef void (GLAD_API_PTR *PFNGLSAMPLECOVERAGEPROC)(GLfloat value, GLboolean invert);
+typedef void (GLAD_API_PTR *PFNGLSAMPLEMASKIPROC)(GLuint maskNumber, GLbitfield mask);
+typedef void (GLAD_API_PTR *PFNGLSAMPLERPARAMETERIIVPROC)(GLuint sampler, GLenum pname, const GLint * param);
+typedef void (GLAD_API_PTR *PFNGLSAMPLERPARAMETERIUIVPROC)(GLuint sampler, GLenum pname, const GLuint * param);
+typedef void (GLAD_API_PTR *PFNGLSAMPLERPARAMETERFPROC)(GLuint sampler, GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLSAMPLERPARAMETERFVPROC)(GLuint sampler, GLenum pname, const GLfloat * param);
+typedef void (GLAD_API_PTR *PFNGLSAMPLERPARAMETERIPROC)(GLuint sampler, GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLSAMPLERPARAMETERIVPROC)(GLuint sampler, GLenum pname, const GLint * param);
+typedef void (GLAD_API_PTR *PFNGLSCALEDPROC)(GLdouble x, GLdouble y, GLdouble z);
+typedef void (GLAD_API_PTR *PFNGLSCALEFPROC)(GLfloat x, GLfloat y, GLfloat z);
+typedef void (GLAD_API_PTR *PFNGLSCISSORPROC)(GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3BPROC)(GLbyte red, GLbyte green, GLbyte blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3BVPROC)(const GLbyte * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3DPROC)(GLdouble red, GLdouble green, GLdouble blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3FPROC)(GLfloat red, GLfloat green, GLfloat blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3IPROC)(GLint red, GLint green, GLint blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3SPROC)(GLshort red, GLshort green, GLshort blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3UBPROC)(GLubyte red, GLubyte green, GLubyte blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3UBVPROC)(const GLubyte * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3UIPROC)(GLuint red, GLuint green, GLuint blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3UIVPROC)(const GLuint * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3USPROC)(GLushort red, GLushort green, GLushort blue);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLOR3USVPROC)(const GLushort * v);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLORP3UIPROC)(GLenum type, GLuint color);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLORP3UIVPROC)(GLenum type, const GLuint * color);
+typedef void (GLAD_API_PTR *PFNGLSECONDARYCOLORPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void * pointer);
+typedef void (GLAD_API_PTR *PFNGLSELECTBUFFERPROC)(GLsizei size, GLuint * buffer);
+typedef void (GLAD_API_PTR *PFNGLSHADEMODELPROC)(GLenum mode);
+typedef void (GLAD_API_PTR *PFNGLSHADERSOURCEPROC)(GLuint shader, GLsizei count, const GLchar *const* string, const GLint * length);
+typedef void (GLAD_API_PTR *PFNGLSTENCILFUNCPROC)(GLenum func, GLint ref, GLuint mask);
+typedef void (GLAD_API_PTR *PFNGLSTENCILFUNCSEPARATEPROC)(GLenum face, GLenum func, GLint ref, GLuint mask);
+typedef void (GLAD_API_PTR *PFNGLSTENCILMASKPROC)(GLuint mask);
+typedef void (GLAD_API_PTR *PFNGLSTENCILMASKSEPARATEPROC)(GLenum face, GLuint mask);
+typedef void (GLAD_API_PTR *PFNGLSTENCILOPPROC)(GLenum fail, GLenum zfail, GLenum zpass);
+typedef void (GLAD_API_PTR *PFNGLSTENCILOPSEPARATEPROC)(GLenum face, GLenum sfail, GLenum dpfail, GLenum dppass);
+typedef void (GLAD_API_PTR *PFNGLTEXBUFFERPROC)(GLenum target, GLenum internalformat, GLuint buffer);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1DPROC)(GLdouble s);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1FPROC)(GLfloat s);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1IPROC)(GLint s);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1SPROC)(GLshort s);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD1SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2DPROC)(GLdouble s, GLdouble t);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2FPROC)(GLfloat s, GLfloat t);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2IPROC)(GLint s, GLint t);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2SPROC)(GLshort s, GLshort t);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD2SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3DPROC)(GLdouble s, GLdouble t, GLdouble r);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3FPROC)(GLfloat s, GLfloat t, GLfloat r);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3IPROC)(GLint s, GLint t, GLint r);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3SPROC)(GLshort s, GLshort t, GLshort r);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD3SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4DPROC)(GLdouble s, GLdouble t, GLdouble r, GLdouble q);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4FPROC)(GLfloat s, GLfloat t, GLfloat r, GLfloat q);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4IPROC)(GLint s, GLint t, GLint r, GLint q);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4SPROC)(GLshort s, GLshort t, GLshort r, GLshort q);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORD4SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP1UIPROC)(GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP1UIVPROC)(GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP2UIPROC)(GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP2UIVPROC)(GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP3UIPROC)(GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP3UIVPROC)(GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP4UIPROC)(GLenum type, GLuint coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDP4UIVPROC)(GLenum type, const GLuint * coords);
+typedef void (GLAD_API_PTR *PFNGLTEXCOORDPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void * pointer);
+typedef void (GLAD_API_PTR *PFNGLTEXENVFPROC)(GLenum target, GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLTEXENVFVPROC)(GLenum target, GLenum pname, const GLfloat * params);
+typedef void (GLAD_API_PTR *PFNGLTEXENVIPROC)(GLenum target, GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLTEXENVIVPROC)(GLenum target, GLenum pname, const GLint * params);
+typedef void (GLAD_API_PTR *PFNGLTEXGENDPROC)(GLenum coord, GLenum pname, GLdouble param);
+typedef void (GLAD_API_PTR *PFNGLTEXGENDVPROC)(GLenum coord, GLenum pname, const GLdouble * params);
+typedef void (GLAD_API_PTR *PFNGLTEXGENFPROC)(GLenum coord, GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLTEXGENFVPROC)(GLenum coord, GLenum pname, const GLfloat * params);
+typedef void (GLAD_API_PTR *PFNGLTEXGENIPROC)(GLenum coord, GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLTEXGENIVPROC)(GLenum coord, GLenum pname, const GLint * params);
+typedef void (GLAD_API_PTR *PFNGLTEXIMAGE1DPROC)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLint border, GLenum format, GLenum type, const void * pixels);
+typedef void (GLAD_API_PTR *PFNGLTEXIMAGE2DPROC)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void * pixels);
+typedef void (GLAD_API_PTR *PFNGLTEXIMAGE2DMULTISAMPLEPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLboolean fixedsamplelocations);
+typedef void (GLAD_API_PTR *PFNGLTEXIMAGE3DPROC)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void * pixels);
+typedef void (GLAD_API_PTR *PFNGLTEXIMAGE3DMULTISAMPLEPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedsamplelocations);
+typedef void (GLAD_API_PTR *PFNGLTEXPARAMETERIIVPROC)(GLenum target, GLenum pname, const GLint * params);
+typedef void (GLAD_API_PTR *PFNGLTEXPARAMETERIUIVPROC)(GLenum target, GLenum pname, const GLuint * params);
+typedef void (GLAD_API_PTR *PFNGLTEXPARAMETERFPROC)(GLenum target, GLenum pname, GLfloat param);
+typedef void (GLAD_API_PTR *PFNGLTEXPARAMETERFVPROC)(GLenum target, GLenum pname, const GLfloat * params);
+typedef void (GLAD_API_PTR *PFNGLTEXPARAMETERIPROC)(GLenum target, GLenum pname, GLint param);
+typedef void (GLAD_API_PTR *PFNGLTEXPARAMETERIVPROC)(GLenum target, GLenum pname, const GLint * params);
+typedef void (GLAD_API_PTR *PFNGLTEXSUBIMAGE1DPROC)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const void * pixels);
+typedef void (GLAD_API_PTR *PFNGLTEXSUBIMAGE2DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void * pixels);
+typedef void (GLAD_API_PTR *PFNGLTEXSUBIMAGE3DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void * pixels);
+typedef void (GLAD_API_PTR *PFNGLTRANSFORMFEEDBACKVARYINGSPROC)(GLuint program, GLsizei count, const GLchar *const* varyings, GLenum bufferMode);
+typedef void (GLAD_API_PTR *PFNGLTRANSLATEDPROC)(GLdouble x, GLdouble y, GLdouble z);
+typedef void (GLAD_API_PTR *PFNGLTRANSLATEFPROC)(GLfloat x, GLfloat y, GLfloat z);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM1FPROC)(GLint location, GLfloat v0);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM1FVPROC)(GLint location, GLsizei count, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM1IPROC)(GLint location, GLint v0);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM1IVPROC)(GLint location, GLsizei count, const GLint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM1UIPROC)(GLint location, GLuint v0);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM1UIVPROC)(GLint location, GLsizei count, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM2FPROC)(GLint location, GLfloat v0, GLfloat v1);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM2FVPROC)(GLint location, GLsizei count, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM2IPROC)(GLint location, GLint v0, GLint v1);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM2IVPROC)(GLint location, GLsizei count, const GLint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM2UIPROC)(GLint location, GLuint v0, GLuint v1);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM2UIVPROC)(GLint location, GLsizei count, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM3FPROC)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM3FVPROC)(GLint location, GLsizei count, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM3IPROC)(GLint location, GLint v0, GLint v1, GLint v2);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM3IVPROC)(GLint location, GLsizei count, const GLint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM3UIPROC)(GLint location, GLuint v0, GLuint v1, GLuint v2);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM3UIVPROC)(GLint location, GLsizei count, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM4FPROC)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM4FVPROC)(GLint location, GLsizei count, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM4IPROC)(GLint location, GLint v0, GLint v1, GLint v2, GLint v3);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM4IVPROC)(GLint location, GLsizei count, const GLint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM4UIPROC)(GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
+typedef void (GLAD_API_PTR *PFNGLUNIFORM4UIVPROC)(GLint location, GLsizei count, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMBLOCKBINDINGPROC)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX2FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX2X3FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX2X4FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX3FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX3X2FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX3X4FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX4FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX4X2FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef void (GLAD_API_PTR *PFNGLUNIFORMMATRIX4X3FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat * value);
+typedef GLboolean (GLAD_API_PTR *PFNGLUNMAPBUFFERPROC)(GLenum target);
+typedef void (GLAD_API_PTR *PFNGLUSEPROGRAMPROC)(GLuint program);
+typedef void (GLAD_API_PTR *PFNGLVALIDATEPROGRAMPROC)(GLuint program);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2DPROC)(GLdouble x, GLdouble y);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2FPROC)(GLfloat x, GLfloat y);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2IPROC)(GLint x, GLint y);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2SPROC)(GLshort x, GLshort y);
+typedef void (GLAD_API_PTR *PFNGLVERTEX2SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3DPROC)(GLdouble x, GLdouble y, GLdouble z);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3FPROC)(GLfloat x, GLfloat y, GLfloat z);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3IPROC)(GLint x, GLint y, GLint z);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3SPROC)(GLshort x, GLshort y, GLshort z);
+typedef void (GLAD_API_PTR *PFNGLVERTEX3SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4DPROC)(GLdouble x, GLdouble y, GLdouble z, GLdouble w);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4FPROC)(GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4IPROC)(GLint x, GLint y, GLint z, GLint w);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4SPROC)(GLshort x, GLshort y, GLshort z, GLshort w);
+typedef void (GLAD_API_PTR *PFNGLVERTEX4SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB1DPROC)(GLuint index, GLdouble x);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB1DVPROC)(GLuint index, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB1FPROC)(GLuint index, GLfloat x);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB1FVPROC)(GLuint index, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB1SPROC)(GLuint index, GLshort x);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB1SVPROC)(GLuint index, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB2DPROC)(GLuint index, GLdouble x, GLdouble y);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB2DVPROC)(GLuint index, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB2FPROC)(GLuint index, GLfloat x, GLfloat y);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB2FVPROC)(GLuint index, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB2SPROC)(GLuint index, GLshort x, GLshort y);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB2SVPROC)(GLuint index, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB3DPROC)(GLuint index, GLdouble x, GLdouble y, GLdouble z);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB3DVPROC)(GLuint index, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB3FPROC)(GLuint index, GLfloat x, GLfloat y, GLfloat z);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB3FVPROC)(GLuint index, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB3SPROC)(GLuint index, GLshort x, GLshort y, GLshort z);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB3SVPROC)(GLuint index, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4NBVPROC)(GLuint index, const GLbyte * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4NIVPROC)(GLuint index, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4NSVPROC)(GLuint index, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4NUBPROC)(GLuint index, GLubyte x, GLubyte y, GLubyte z, GLubyte w);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4NUBVPROC)(GLuint index, const GLubyte * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4NUIVPROC)(GLuint index, const GLuint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4NUSVPROC)(GLuint index, const GLushort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4BVPROC)(GLuint index, const GLbyte * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4DPROC)(GLuint index, GLdouble x, GLdouble y, GLdouble z, GLdouble w);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4DVPROC)(GLuint index, const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4FPROC)(GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4FVPROC)(GLuint index, const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4IVPROC)(GLuint index, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4SPROC)(GLuint index, GLshort x, GLshort y, GLshort z, GLshort w);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4SVPROC)(GLuint index, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4UBVPROC)(GLuint index, const GLubyte * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4UIVPROC)(GLuint index, const GLuint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIB4USVPROC)(GLuint index, const GLushort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBDIVISORPROC)(GLuint index, GLuint divisor);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI1IPROC)(GLuint index, GLint x);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI1IVPROC)(GLuint index, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI1UIPROC)(GLuint index, GLuint x);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI1UIVPROC)(GLuint index, const GLuint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI2IPROC)(GLuint index, GLint x, GLint y);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI2IVPROC)(GLuint index, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI2UIPROC)(GLuint index, GLuint x, GLuint y);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI2UIVPROC)(GLuint index, const GLuint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI3IPROC)(GLuint index, GLint x, GLint y, GLint z);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI3IVPROC)(GLuint index, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI3UIPROC)(GLuint index, GLuint x, GLuint y, GLuint z);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI3UIVPROC)(GLuint index, const GLuint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4BVPROC)(GLuint index, const GLbyte * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4IPROC)(GLuint index, GLint x, GLint y, GLint z, GLint w);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4IVPROC)(GLuint index, const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4SVPROC)(GLuint index, const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4UBVPROC)(GLuint index, const GLubyte * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4UIPROC)(GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4UIVPROC)(GLuint index, const GLuint * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBI4USVPROC)(GLuint index, const GLushort * v);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBIPOINTERPROC)(GLuint index, GLint size, GLenum type, GLsizei stride, const void * pointer);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP1UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP1UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP2UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP2UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP3UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP3UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP4UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBP4UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXATTRIBPOINTERPROC)(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void * pointer);
+typedef void (GLAD_API_PTR *PFNGLVERTEXP2UIPROC)(GLenum type, GLuint value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXP2UIVPROC)(GLenum type, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXP3UIPROC)(GLenum type, GLuint value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXP3UIVPROC)(GLenum type, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXP4UIPROC)(GLenum type, GLuint value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXP4UIVPROC)(GLenum type, const GLuint * value);
+typedef void (GLAD_API_PTR *PFNGLVERTEXPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void * pointer);
+typedef void (GLAD_API_PTR *PFNGLVIEWPORTPROC)(GLint x, GLint y, GLsizei width, GLsizei height);
+typedef void (GLAD_API_PTR *PFNGLWAITSYNCPROC)(GLsync sync, GLbitfield flags, GLuint64 timeout);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2DPROC)(GLdouble x, GLdouble y);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2FPROC)(GLfloat x, GLfloat y);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2IPROC)(GLint x, GLint y);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2SPROC)(GLshort x, GLshort y);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS2SVPROC)(const GLshort * v);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3DPROC)(GLdouble x, GLdouble y, GLdouble z);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3DVPROC)(const GLdouble * v);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3FPROC)(GLfloat x, GLfloat y, GLfloat z);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3FVPROC)(const GLfloat * v);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3IPROC)(GLint x, GLint y, GLint z);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3IVPROC)(const GLint * v);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3SPROC)(GLshort x, GLshort y, GLshort z);
+typedef void (GLAD_API_PTR *PFNGLWINDOWPOS3SVPROC)(const GLshort * v);
+
+GLAD_API_CALL PFNGLACCUMPROC glad_glAccum;
+#define glAccum glad_glAccum
+GLAD_API_CALL PFNGLACTIVETEXTUREPROC glad_glActiveTexture;
+#define glActiveTexture glad_glActiveTexture
+GLAD_API_CALL PFNGLALPHAFUNCPROC glad_glAlphaFunc;
+#define glAlphaFunc glad_glAlphaFunc
+GLAD_API_CALL PFNGLARETEXTURESRESIDENTPROC glad_glAreTexturesResident;
+#define glAreTexturesResident glad_glAreTexturesResident
+GLAD_API_CALL PFNGLARRAYELEMENTPROC glad_glArrayElement;
+#define glArrayElement glad_glArrayElement
+GLAD_API_CALL PFNGLATTACHSHADERPROC glad_glAttachShader;
+#define glAttachShader glad_glAttachShader
+GLAD_API_CALL PFNGLBEGINPROC glad_glBegin;
+#define glBegin glad_glBegin
+GLAD_API_CALL PFNGLBEGINCONDITIONALRENDERPROC glad_glBeginConditionalRender;
+#define glBeginConditionalRender glad_glBeginConditionalRender
+GLAD_API_CALL PFNGLBEGINQUERYPROC glad_glBeginQuery;
+#define glBeginQuery glad_glBeginQuery
+GLAD_API_CALL PFNGLBEGINTRANSFORMFEEDBACKPROC glad_glBeginTransformFeedback;
+#define glBeginTransformFeedback glad_glBeginTransformFeedback
+GLAD_API_CALL PFNGLBINDATTRIBLOCATIONPROC glad_glBindAttribLocation;
+#define glBindAttribLocation glad_glBindAttribLocation
+GLAD_API_CALL PFNGLBINDBUFFERPROC glad_glBindBuffer;
+#define glBindBuffer glad_glBindBuffer
+GLAD_API_CALL PFNGLBINDBUFFERBASEPROC glad_glBindBufferBase;
+#define glBindBufferBase glad_glBindBufferBase
+GLAD_API_CALL PFNGLBINDBUFFERRANGEPROC glad_glBindBufferRange;
+#define glBindBufferRange glad_glBindBufferRange
+GLAD_API_CALL PFNGLBINDFRAGDATALOCATIONPROC glad_glBindFragDataLocation;
+#define glBindFragDataLocation glad_glBindFragDataLocation
+GLAD_API_CALL PFNGLBINDFRAGDATALOCATIONINDEXEDPROC glad_glBindFragDataLocationIndexed;
+#define glBindFragDataLocationIndexed glad_glBindFragDataLocationIndexed
+GLAD_API_CALL PFNGLBINDFRAMEBUFFERPROC glad_glBindFramebuffer;
+#define glBindFramebuffer glad_glBindFramebuffer
+GLAD_API_CALL PFNGLBINDFRAMEBUFFEREXTPROC glad_glBindFramebufferEXT;
+#define glBindFramebufferEXT glad_glBindFramebufferEXT
+GLAD_API_CALL PFNGLBINDRENDERBUFFERPROC glad_glBindRenderbuffer;
+#define glBindRenderbuffer glad_glBindRenderbuffer
+GLAD_API_CALL PFNGLBINDRENDERBUFFEREXTPROC glad_glBindRenderbufferEXT;
+#define glBindRenderbufferEXT glad_glBindRenderbufferEXT
+GLAD_API_CALL PFNGLBINDSAMPLERPROC glad_glBindSampler;
+#define glBindSampler glad_glBindSampler
+GLAD_API_CALL PFNGLBINDTEXTUREPROC glad_glBindTexture;
+#define glBindTexture glad_glBindTexture
+GLAD_API_CALL PFNGLBINDVERTEXARRAYPROC glad_glBindVertexArray;
+#define glBindVertexArray glad_glBindVertexArray
+GLAD_API_CALL PFNGLBITMAPPROC glad_glBitmap;
+#define glBitmap glad_glBitmap
+GLAD_API_CALL PFNGLBLENDCOLORPROC glad_glBlendColor;
+#define glBlendColor glad_glBlendColor
+GLAD_API_CALL PFNGLBLENDEQUATIONPROC glad_glBlendEquation;
+#define glBlendEquation glad_glBlendEquation
+GLAD_API_CALL PFNGLBLENDEQUATIONSEPARATEPROC glad_glBlendEquationSeparate;
+#define glBlendEquationSeparate glad_glBlendEquationSeparate
+GLAD_API_CALL PFNGLBLENDFUNCPROC glad_glBlendFunc;
+#define glBlendFunc glad_glBlendFunc
+GLAD_API_CALL PFNGLBLENDFUNCSEPARATEPROC glad_glBlendFuncSeparate;
+#define glBlendFuncSeparate glad_glBlendFuncSeparate
+GLAD_API_CALL PFNGLBLITFRAMEBUFFERPROC glad_glBlitFramebuffer;
+#define glBlitFramebuffer glad_glBlitFramebuffer
+GLAD_API_CALL PFNGLBLITFRAMEBUFFEREXTPROC glad_glBlitFramebufferEXT;
+#define glBlitFramebufferEXT glad_glBlitFramebufferEXT
+GLAD_API_CALL PFNGLBUFFERDATAPROC glad_glBufferData;
+#define glBufferData glad_glBufferData
+GLAD_API_CALL PFNGLBUFFERSUBDATAPROC glad_glBufferSubData;
+#define glBufferSubData glad_glBufferSubData
+GLAD_API_CALL PFNGLCALLLISTPROC glad_glCallList;
+#define glCallList glad_glCallList
+GLAD_API_CALL PFNGLCALLLISTSPROC glad_glCallLists;
+#define glCallLists glad_glCallLists
+GLAD_API_CALL PFNGLCHECKFRAMEBUFFERSTATUSPROC glad_glCheckFramebufferStatus;
+#define glCheckFramebufferStatus glad_glCheckFramebufferStatus
+GLAD_API_CALL PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC glad_glCheckFramebufferStatusEXT;
+#define glCheckFramebufferStatusEXT glad_glCheckFramebufferStatusEXT
+GLAD_API_CALL PFNGLCLAMPCOLORPROC glad_glClampColor;
+#define glClampColor glad_glClampColor
+GLAD_API_CALL PFNGLCLEARPROC glad_glClear;
+#define glClear glad_glClear
+GLAD_API_CALL PFNGLCLEARACCUMPROC glad_glClearAccum;
+#define glClearAccum glad_glClearAccum
+GLAD_API_CALL PFNGLCLEARBUFFERFIPROC glad_glClearBufferfi;
+#define glClearBufferfi glad_glClearBufferfi
+GLAD_API_CALL PFNGLCLEARBUFFERFVPROC glad_glClearBufferfv;
+#define glClearBufferfv glad_glClearBufferfv
+GLAD_API_CALL PFNGLCLEARBUFFERIVPROC glad_glClearBufferiv;
+#define glClearBufferiv glad_glClearBufferiv
+GLAD_API_CALL PFNGLCLEARBUFFERUIVPROC glad_glClearBufferuiv;
+#define glClearBufferuiv glad_glClearBufferuiv
+GLAD_API_CALL PFNGLCLEARCOLORPROC glad_glClearColor;
+#define glClearColor glad_glClearColor
+GLAD_API_CALL PFNGLCLEARDEPTHPROC glad_glClearDepth;
+#define glClearDepth glad_glClearDepth
+GLAD_API_CALL PFNGLCLEARINDEXPROC glad_glClearIndex;
+#define glClearIndex glad_glClearIndex
+GLAD_API_CALL PFNGLCLEARSTENCILPROC glad_glClearStencil;
+#define glClearStencil glad_glClearStencil
+GLAD_API_CALL PFNGLCLIENTACTIVETEXTUREPROC glad_glClientActiveTexture;
+#define glClientActiveTexture glad_glClientActiveTexture
+GLAD_API_CALL PFNGLCLIENTWAITSYNCPROC glad_glClientWaitSync;
+#define glClientWaitSync glad_glClientWaitSync
+GLAD_API_CALL PFNGLCLIPPLANEPROC glad_glClipPlane;
+#define glClipPlane glad_glClipPlane
+GLAD_API_CALL PFNGLCOLOR3BPROC glad_glColor3b;
+#define glColor3b glad_glColor3b
+GLAD_API_CALL PFNGLCOLOR3BVPROC glad_glColor3bv;
+#define glColor3bv glad_glColor3bv
+GLAD_API_CALL PFNGLCOLOR3DPROC glad_glColor3d;
+#define glColor3d glad_glColor3d
+GLAD_API_CALL PFNGLCOLOR3DVPROC glad_glColor3dv;
+#define glColor3dv glad_glColor3dv
+GLAD_API_CALL PFNGLCOLOR3FPROC glad_glColor3f;
+#define glColor3f glad_glColor3f
+GLAD_API_CALL PFNGLCOLOR3FVPROC glad_glColor3fv;
+#define glColor3fv glad_glColor3fv
+GLAD_API_CALL PFNGLCOLOR3IPROC glad_glColor3i;
+#define glColor3i glad_glColor3i
+GLAD_API_CALL PFNGLCOLOR3IVPROC glad_glColor3iv;
+#define glColor3iv glad_glColor3iv
+GLAD_API_CALL PFNGLCOLOR3SPROC glad_glColor3s;
+#define glColor3s glad_glColor3s
+GLAD_API_CALL PFNGLCOLOR3SVPROC glad_glColor3sv;
+#define glColor3sv glad_glColor3sv
+GLAD_API_CALL PFNGLCOLOR3UBPROC glad_glColor3ub;
+#define glColor3ub glad_glColor3ub
+GLAD_API_CALL PFNGLCOLOR3UBVPROC glad_glColor3ubv;
+#define glColor3ubv glad_glColor3ubv
+GLAD_API_CALL PFNGLCOLOR3UIPROC glad_glColor3ui;
+#define glColor3ui glad_glColor3ui
+GLAD_API_CALL PFNGLCOLOR3UIVPROC glad_glColor3uiv;
+#define glColor3uiv glad_glColor3uiv
+GLAD_API_CALL PFNGLCOLOR3USPROC glad_glColor3us;
+#define glColor3us glad_glColor3us
+GLAD_API_CALL PFNGLCOLOR3USVPROC glad_glColor3usv;
+#define glColor3usv glad_glColor3usv
+GLAD_API_CALL PFNGLCOLOR4BPROC glad_glColor4b;
+#define glColor4b glad_glColor4b
+GLAD_API_CALL PFNGLCOLOR4BVPROC glad_glColor4bv;
+#define glColor4bv glad_glColor4bv
+GLAD_API_CALL PFNGLCOLOR4DPROC glad_glColor4d;
+#define glColor4d glad_glColor4d
+GLAD_API_CALL PFNGLCOLOR4DVPROC glad_glColor4dv;
+#define glColor4dv glad_glColor4dv
+GLAD_API_CALL PFNGLCOLOR4FPROC glad_glColor4f;
+#define glColor4f glad_glColor4f
+GLAD_API_CALL PFNGLCOLOR4FVPROC glad_glColor4fv;
+#define glColor4fv glad_glColor4fv
+GLAD_API_CALL PFNGLCOLOR4IPROC glad_glColor4i;
+#define glColor4i glad_glColor4i
+GLAD_API_CALL PFNGLCOLOR4IVPROC glad_glColor4iv;
+#define glColor4iv glad_glColor4iv
+GLAD_API_CALL PFNGLCOLOR4SPROC glad_glColor4s;
+#define glColor4s glad_glColor4s
+GLAD_API_CALL PFNGLCOLOR4SVPROC glad_glColor4sv;
+#define glColor4sv glad_glColor4sv
+GLAD_API_CALL PFNGLCOLOR4UBPROC glad_glColor4ub;
+#define glColor4ub glad_glColor4ub
+GLAD_API_CALL PFNGLCOLOR4UBVPROC glad_glColor4ubv;
+#define glColor4ubv glad_glColor4ubv
+GLAD_API_CALL PFNGLCOLOR4UIPROC glad_glColor4ui;
+#define glColor4ui glad_glColor4ui
+GLAD_API_CALL PFNGLCOLOR4UIVPROC glad_glColor4uiv;
+#define glColor4uiv glad_glColor4uiv
+GLAD_API_CALL PFNGLCOLOR4USPROC glad_glColor4us;
+#define glColor4us glad_glColor4us
+GLAD_API_CALL PFNGLCOLOR4USVPROC glad_glColor4usv;
+#define glColor4usv glad_glColor4usv
+GLAD_API_CALL PFNGLCOLORMASKPROC glad_glColorMask;
+#define glColorMask glad_glColorMask
+GLAD_API_CALL PFNGLCOLORMASKIPROC glad_glColorMaski;
+#define glColorMaski glad_glColorMaski
+GLAD_API_CALL PFNGLCOLORMATERIALPROC glad_glColorMaterial;
+#define glColorMaterial glad_glColorMaterial
+GLAD_API_CALL PFNGLCOLORP3UIPROC glad_glColorP3ui;
+#define glColorP3ui glad_glColorP3ui
+GLAD_API_CALL PFNGLCOLORP3UIVPROC glad_glColorP3uiv;
+#define glColorP3uiv glad_glColorP3uiv
+GLAD_API_CALL PFNGLCOLORP4UIPROC glad_glColorP4ui;
+#define glColorP4ui glad_glColorP4ui
+GLAD_API_CALL PFNGLCOLORP4UIVPROC glad_glColorP4uiv;
+#define glColorP4uiv glad_glColorP4uiv
+GLAD_API_CALL PFNGLCOLORPOINTERPROC glad_glColorPointer;
+#define glColorPointer glad_glColorPointer
+GLAD_API_CALL PFNGLCOMPILESHADERPROC glad_glCompileShader;
+#define glCompileShader glad_glCompileShader
+GLAD_API_CALL PFNGLCOMPRESSEDTEXIMAGE1DPROC glad_glCompressedTexImage1D;
+#define glCompressedTexImage1D glad_glCompressedTexImage1D
+GLAD_API_CALL PFNGLCOMPRESSEDTEXIMAGE2DPROC glad_glCompressedTexImage2D;
+#define glCompressedTexImage2D glad_glCompressedTexImage2D
+GLAD_API_CALL PFNGLCOMPRESSEDTEXIMAGE3DPROC glad_glCompressedTexImage3D;
+#define glCompressedTexImage3D glad_glCompressedTexImage3D
+GLAD_API_CALL PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC glad_glCompressedTexSubImage1D;
+#define glCompressedTexSubImage1D glad_glCompressedTexSubImage1D
+GLAD_API_CALL PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC glad_glCompressedTexSubImage2D;
+#define glCompressedTexSubImage2D glad_glCompressedTexSubImage2D
+GLAD_API_CALL PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC glad_glCompressedTexSubImage3D;
+#define glCompressedTexSubImage3D glad_glCompressedTexSubImage3D
+GLAD_API_CALL PFNGLCOPYBUFFERSUBDATAPROC glad_glCopyBufferSubData;
+#define glCopyBufferSubData glad_glCopyBufferSubData
+GLAD_API_CALL PFNGLCOPYPIXELSPROC glad_glCopyPixels;
+#define glCopyPixels glad_glCopyPixels
+GLAD_API_CALL PFNGLCOPYTEXIMAGE1DPROC glad_glCopyTexImage1D;
+#define glCopyTexImage1D glad_glCopyTexImage1D
+GLAD_API_CALL PFNGLCOPYTEXIMAGE2DPROC glad_glCopyTexImage2D;
+#define glCopyTexImage2D glad_glCopyTexImage2D
+GLAD_API_CALL PFNGLCOPYTEXSUBIMAGE1DPROC glad_glCopyTexSubImage1D;
+#define glCopyTexSubImage1D glad_glCopyTexSubImage1D
+GLAD_API_CALL PFNGLCOPYTEXSUBIMAGE2DPROC glad_glCopyTexSubImage2D;
+#define glCopyTexSubImage2D glad_glCopyTexSubImage2D
+GLAD_API_CALL PFNGLCOPYTEXSUBIMAGE3DPROC glad_glCopyTexSubImage3D;
+#define glCopyTexSubImage3D glad_glCopyTexSubImage3D
+GLAD_API_CALL PFNGLCREATEPROGRAMPROC glad_glCreateProgram;
+#define glCreateProgram glad_glCreateProgram
+GLAD_API_CALL PFNGLCREATESHADERPROC glad_glCreateShader;
+#define glCreateShader glad_glCreateShader
+GLAD_API_CALL PFNGLCULLFACEPROC glad_glCullFace;
+#define glCullFace glad_glCullFace
+GLAD_API_CALL PFNGLDEBUGMESSAGECALLBACKARBPROC glad_glDebugMessageCallbackARB;
+#define glDebugMessageCallbackARB glad_glDebugMessageCallbackARB
+GLAD_API_CALL PFNGLDEBUGMESSAGECONTROLARBPROC glad_glDebugMessageControlARB;
+#define glDebugMessageControlARB glad_glDebugMessageControlARB
+GLAD_API_CALL PFNGLDEBUGMESSAGEINSERTARBPROC glad_glDebugMessageInsertARB;
+#define glDebugMessageInsertARB glad_glDebugMessageInsertARB
+GLAD_API_CALL PFNGLDELETEBUFFERSPROC glad_glDeleteBuffers;
+#define glDeleteBuffers glad_glDeleteBuffers
+GLAD_API_CALL PFNGLDELETEFRAMEBUFFERSPROC glad_glDeleteFramebuffers;
+#define glDeleteFramebuffers glad_glDeleteFramebuffers
+GLAD_API_CALL PFNGLDELETEFRAMEBUFFERSEXTPROC glad_glDeleteFramebuffersEXT;
+#define glDeleteFramebuffersEXT glad_glDeleteFramebuffersEXT
+GLAD_API_CALL PFNGLDELETELISTSPROC glad_glDeleteLists;
+#define glDeleteLists glad_glDeleteLists
+GLAD_API_CALL PFNGLDELETEPROGRAMPROC glad_glDeleteProgram;
+#define glDeleteProgram glad_glDeleteProgram
+GLAD_API_CALL PFNGLDELETEQUERIESPROC glad_glDeleteQueries;
+#define glDeleteQueries glad_glDeleteQueries
+GLAD_API_CALL PFNGLDELETERENDERBUFFERSPROC glad_glDeleteRenderbuffers;
+#define glDeleteRenderbuffers glad_glDeleteRenderbuffers
+GLAD_API_CALL PFNGLDELETERENDERBUFFERSEXTPROC glad_glDeleteRenderbuffersEXT;
+#define glDeleteRenderbuffersEXT glad_glDeleteRenderbuffersEXT
+GLAD_API_CALL PFNGLDELETESAMPLERSPROC glad_glDeleteSamplers;
+#define glDeleteSamplers glad_glDeleteSamplers
+GLAD_API_CALL PFNGLDELETESHADERPROC glad_glDeleteShader;
+#define glDeleteShader glad_glDeleteShader
+GLAD_API_CALL PFNGLDELETESYNCPROC glad_glDeleteSync;
+#define glDeleteSync glad_glDeleteSync
+GLAD_API_CALL PFNGLDELETETEXTURESPROC glad_glDeleteTextures;
+#define glDeleteTextures glad_glDeleteTextures
+GLAD_API_CALL PFNGLDELETEVERTEXARRAYSPROC glad_glDeleteVertexArrays;
+#define glDeleteVertexArrays glad_glDeleteVertexArrays
+GLAD_API_CALL PFNGLDEPTHFUNCPROC glad_glDepthFunc;
+#define glDepthFunc glad_glDepthFunc
+GLAD_API_CALL PFNGLDEPTHMASKPROC glad_glDepthMask;
+#define glDepthMask glad_glDepthMask
+GLAD_API_CALL PFNGLDEPTHRANGEPROC glad_glDepthRange;
+#define glDepthRange glad_glDepthRange
+GLAD_API_CALL PFNGLDETACHSHADERPROC glad_glDetachShader;
+#define glDetachShader glad_glDetachShader
+GLAD_API_CALL PFNGLDISABLEPROC glad_glDisable;
+#define glDisable glad_glDisable
+GLAD_API_CALL PFNGLDISABLECLIENTSTATEPROC glad_glDisableClientState;
+#define glDisableClientState glad_glDisableClientState
+GLAD_API_CALL PFNGLDISABLEVERTEXATTRIBARRAYPROC glad_glDisableVertexAttribArray;
+#define glDisableVertexAttribArray glad_glDisableVertexAttribArray
+GLAD_API_CALL PFNGLDISABLEIPROC glad_glDisablei;
+#define glDisablei glad_glDisablei
+GLAD_API_CALL PFNGLDRAWARRAYSPROC glad_glDrawArrays;
+#define glDrawArrays glad_glDrawArrays
+GLAD_API_CALL PFNGLDRAWARRAYSINSTANCEDPROC glad_glDrawArraysInstanced;
+#define glDrawArraysInstanced glad_glDrawArraysInstanced
+GLAD_API_CALL PFNGLDRAWBUFFERPROC glad_glDrawBuffer;
+#define glDrawBuffer glad_glDrawBuffer
+GLAD_API_CALL PFNGLDRAWBUFFERSPROC glad_glDrawBuffers;
+#define glDrawBuffers glad_glDrawBuffers
+GLAD_API_CALL PFNGLDRAWELEMENTSPROC glad_glDrawElements;
+#define glDrawElements glad_glDrawElements
+GLAD_API_CALL PFNGLDRAWELEMENTSBASEVERTEXPROC glad_glDrawElementsBaseVertex;
+#define glDrawElementsBaseVertex glad_glDrawElementsBaseVertex
+GLAD_API_CALL PFNGLDRAWELEMENTSINSTANCEDPROC glad_glDrawElementsInstanced;
+#define glDrawElementsInstanced glad_glDrawElementsInstanced
+GLAD_API_CALL PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC glad_glDrawElementsInstancedBaseVertex;
+#define glDrawElementsInstancedBaseVertex glad_glDrawElementsInstancedBaseVertex
+GLAD_API_CALL PFNGLDRAWPIXELSPROC glad_glDrawPixels;
+#define glDrawPixels glad_glDrawPixels
+GLAD_API_CALL PFNGLDRAWRANGEELEMENTSPROC glad_glDrawRangeElements;
+#define glDrawRangeElements glad_glDrawRangeElements
+GLAD_API_CALL PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC glad_glDrawRangeElementsBaseVertex;
+#define glDrawRangeElementsBaseVertex glad_glDrawRangeElementsBaseVertex
+GLAD_API_CALL PFNGLEDGEFLAGPROC glad_glEdgeFlag;
+#define glEdgeFlag glad_glEdgeFlag
+GLAD_API_CALL PFNGLEDGEFLAGPOINTERPROC glad_glEdgeFlagPointer;
+#define glEdgeFlagPointer glad_glEdgeFlagPointer
+GLAD_API_CALL PFNGLEDGEFLAGVPROC glad_glEdgeFlagv;
+#define glEdgeFlagv glad_glEdgeFlagv
+GLAD_API_CALL PFNGLENABLEPROC glad_glEnable;
+#define glEnable glad_glEnable
+GLAD_API_CALL PFNGLENABLECLIENTSTATEPROC glad_glEnableClientState;
+#define glEnableClientState glad_glEnableClientState
+GLAD_API_CALL PFNGLENABLEVERTEXATTRIBARRAYPROC glad_glEnableVertexAttribArray;
+#define glEnableVertexAttribArray glad_glEnableVertexAttribArray
+GLAD_API_CALL PFNGLENABLEIPROC glad_glEnablei;
+#define glEnablei glad_glEnablei
+GLAD_API_CALL PFNGLENDPROC glad_glEnd;
+#define glEnd glad_glEnd
+GLAD_API_CALL PFNGLENDCONDITIONALRENDERPROC glad_glEndConditionalRender;
+#define glEndConditionalRender glad_glEndConditionalRender
+GLAD_API_CALL PFNGLENDLISTPROC glad_glEndList;
+#define glEndList glad_glEndList
+GLAD_API_CALL PFNGLENDQUERYPROC glad_glEndQuery;
+#define glEndQuery glad_glEndQuery
+GLAD_API_CALL PFNGLENDTRANSFORMFEEDBACKPROC glad_glEndTransformFeedback;
+#define glEndTransformFeedback glad_glEndTransformFeedback
+GLAD_API_CALL PFNGLEVALCOORD1DPROC glad_glEvalCoord1d;
+#define glEvalCoord1d glad_glEvalCoord1d
+GLAD_API_CALL PFNGLEVALCOORD1DVPROC glad_glEvalCoord1dv;
+#define glEvalCoord1dv glad_glEvalCoord1dv
+GLAD_API_CALL PFNGLEVALCOORD1FPROC glad_glEvalCoord1f;
+#define glEvalCoord1f glad_glEvalCoord1f
+GLAD_API_CALL PFNGLEVALCOORD1FVPROC glad_glEvalCoord1fv;
+#define glEvalCoord1fv glad_glEvalCoord1fv
+GLAD_API_CALL PFNGLEVALCOORD2DPROC glad_glEvalCoord2d;
+#define glEvalCoord2d glad_glEvalCoord2d
+GLAD_API_CALL PFNGLEVALCOORD2DVPROC glad_glEvalCoord2dv;
+#define glEvalCoord2dv glad_glEvalCoord2dv
+GLAD_API_CALL PFNGLEVALCOORD2FPROC glad_glEvalCoord2f;
+#define glEvalCoord2f glad_glEvalCoord2f
+GLAD_API_CALL PFNGLEVALCOORD2FVPROC glad_glEvalCoord2fv;
+#define glEvalCoord2fv glad_glEvalCoord2fv
+GLAD_API_CALL PFNGLEVALMESH1PROC glad_glEvalMesh1;
+#define glEvalMesh1 glad_glEvalMesh1
+GLAD_API_CALL PFNGLEVALMESH2PROC glad_glEvalMesh2;
+#define glEvalMesh2 glad_glEvalMesh2
+GLAD_API_CALL PFNGLEVALPOINT1PROC glad_glEvalPoint1;
+#define glEvalPoint1 glad_glEvalPoint1
+GLAD_API_CALL PFNGLEVALPOINT2PROC glad_glEvalPoint2;
+#define glEvalPoint2 glad_glEvalPoint2
+GLAD_API_CALL PFNGLFEEDBACKBUFFERPROC glad_glFeedbackBuffer;
+#define glFeedbackBuffer glad_glFeedbackBuffer
+GLAD_API_CALL PFNGLFENCESYNCPROC glad_glFenceSync;
+#define glFenceSync glad_glFenceSync
+GLAD_API_CALL PFNGLFINISHPROC glad_glFinish;
+#define glFinish glad_glFinish
+GLAD_API_CALL PFNGLFLUSHPROC glad_glFlush;
+#define glFlush glad_glFlush
+GLAD_API_CALL PFNGLFLUSHMAPPEDBUFFERRANGEPROC glad_glFlushMappedBufferRange;
+#define glFlushMappedBufferRange glad_glFlushMappedBufferRange
+GLAD_API_CALL PFNGLFOGCOORDPOINTERPROC glad_glFogCoordPointer;
+#define glFogCoordPointer glad_glFogCoordPointer
+GLAD_API_CALL PFNGLFOGCOORDDPROC glad_glFogCoordd;
+#define glFogCoordd glad_glFogCoordd
+GLAD_API_CALL PFNGLFOGCOORDDVPROC glad_glFogCoorddv;
+#define glFogCoorddv glad_glFogCoorddv
+GLAD_API_CALL PFNGLFOGCOORDFPROC glad_glFogCoordf;
+#define glFogCoordf glad_glFogCoordf
+GLAD_API_CALL PFNGLFOGCOORDFVPROC glad_glFogCoordfv;
+#define glFogCoordfv glad_glFogCoordfv
+GLAD_API_CALL PFNGLFOGFPROC glad_glFogf;
+#define glFogf glad_glFogf
+GLAD_API_CALL PFNGLFOGFVPROC glad_glFogfv;
+#define glFogfv glad_glFogfv
+GLAD_API_CALL PFNGLFOGIPROC glad_glFogi;
+#define glFogi glad_glFogi
+GLAD_API_CALL PFNGLFOGIVPROC glad_glFogiv;
+#define glFogiv glad_glFogiv
+GLAD_API_CALL PFNGLFRAMEBUFFERRENDERBUFFERPROC glad_glFramebufferRenderbuffer;
+#define glFramebufferRenderbuffer glad_glFramebufferRenderbuffer
+GLAD_API_CALL PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC glad_glFramebufferRenderbufferEXT;
+#define glFramebufferRenderbufferEXT glad_glFramebufferRenderbufferEXT
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTUREPROC glad_glFramebufferTexture;
+#define glFramebufferTexture glad_glFramebufferTexture
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTURE1DPROC glad_glFramebufferTexture1D;
+#define glFramebufferTexture1D glad_glFramebufferTexture1D
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTURE1DEXTPROC glad_glFramebufferTexture1DEXT;
+#define glFramebufferTexture1DEXT glad_glFramebufferTexture1DEXT
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTURE2DPROC glad_glFramebufferTexture2D;
+#define glFramebufferTexture2D glad_glFramebufferTexture2D
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTURE2DEXTPROC glad_glFramebufferTexture2DEXT;
+#define glFramebufferTexture2DEXT glad_glFramebufferTexture2DEXT
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTURE3DPROC glad_glFramebufferTexture3D;
+#define glFramebufferTexture3D glad_glFramebufferTexture3D
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTURE3DEXTPROC glad_glFramebufferTexture3DEXT;
+#define glFramebufferTexture3DEXT glad_glFramebufferTexture3DEXT
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTURELAYERPROC glad_glFramebufferTextureLayer;
+#define glFramebufferTextureLayer glad_glFramebufferTextureLayer
+GLAD_API_CALL PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC glad_glFramebufferTextureMultiviewOVR;
+#define glFramebufferTextureMultiviewOVR glad_glFramebufferTextureMultiviewOVR
+GLAD_API_CALL PFNGLFRONTFACEPROC glad_glFrontFace;
+#define glFrontFace glad_glFrontFace
+GLAD_API_CALL PFNGLFRUSTUMPROC glad_glFrustum;
+#define glFrustum glad_glFrustum
+GLAD_API_CALL PFNGLGENBUFFERSPROC glad_glGenBuffers;
+#define glGenBuffers glad_glGenBuffers
+GLAD_API_CALL PFNGLGENFRAMEBUFFERSPROC glad_glGenFramebuffers;
+#define glGenFramebuffers glad_glGenFramebuffers
+GLAD_API_CALL PFNGLGENFRAMEBUFFERSEXTPROC glad_glGenFramebuffersEXT;
+#define glGenFramebuffersEXT glad_glGenFramebuffersEXT
+GLAD_API_CALL PFNGLGENLISTSPROC glad_glGenLists;
+#define glGenLists glad_glGenLists
+GLAD_API_CALL PFNGLGENQUERIESPROC glad_glGenQueries;
+#define glGenQueries glad_glGenQueries
+GLAD_API_CALL PFNGLGENRENDERBUFFERSPROC glad_glGenRenderbuffers;
+#define glGenRenderbuffers glad_glGenRenderbuffers
+GLAD_API_CALL PFNGLGENRENDERBUFFERSEXTPROC glad_glGenRenderbuffersEXT;
+#define glGenRenderbuffersEXT glad_glGenRenderbuffersEXT
+GLAD_API_CALL PFNGLGENSAMPLERSPROC glad_glGenSamplers;
+#define glGenSamplers glad_glGenSamplers
+GLAD_API_CALL PFNGLGENTEXTURESPROC glad_glGenTextures;
+#define glGenTextures glad_glGenTextures
+GLAD_API_CALL PFNGLGENVERTEXARRAYSPROC glad_glGenVertexArrays;
+#define glGenVertexArrays glad_glGenVertexArrays
+GLAD_API_CALL PFNGLGENERATEMIPMAPPROC glad_glGenerateMipmap;
+#define glGenerateMipmap glad_glGenerateMipmap
+GLAD_API_CALL PFNGLGENERATEMIPMAPEXTPROC glad_glGenerateMipmapEXT;
+#define glGenerateMipmapEXT glad_glGenerateMipmapEXT
+GLAD_API_CALL PFNGLGETACTIVEATTRIBPROC glad_glGetActiveAttrib;
+#define glGetActiveAttrib glad_glGetActiveAttrib
+GLAD_API_CALL PFNGLGETACTIVEUNIFORMPROC glad_glGetActiveUniform;
+#define glGetActiveUniform glad_glGetActiveUniform
+GLAD_API_CALL PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC glad_glGetActiveUniformBlockName;
+#define glGetActiveUniformBlockName glad_glGetActiveUniformBlockName
+GLAD_API_CALL PFNGLGETACTIVEUNIFORMBLOCKIVPROC glad_glGetActiveUniformBlockiv;
+#define glGetActiveUniformBlockiv glad_glGetActiveUniformBlockiv
+GLAD_API_CALL PFNGLGETACTIVEUNIFORMNAMEPROC glad_glGetActiveUniformName;
+#define glGetActiveUniformName glad_glGetActiveUniformName
+GLAD_API_CALL PFNGLGETACTIVEUNIFORMSIVPROC glad_glGetActiveUniformsiv;
+#define glGetActiveUniformsiv glad_glGetActiveUniformsiv
+GLAD_API_CALL PFNGLGETATTACHEDSHADERSPROC glad_glGetAttachedShaders;
+#define glGetAttachedShaders glad_glGetAttachedShaders
+GLAD_API_CALL PFNGLGETATTRIBLOCATIONPROC glad_glGetAttribLocation;
+#define glGetAttribLocation glad_glGetAttribLocation
+GLAD_API_CALL PFNGLGETBOOLEANI_VPROC glad_glGetBooleani_v;
+#define glGetBooleani_v glad_glGetBooleani_v
+GLAD_API_CALL PFNGLGETBOOLEANVPROC glad_glGetBooleanv;
+#define glGetBooleanv glad_glGetBooleanv
+GLAD_API_CALL PFNGLGETBUFFERPARAMETERI64VPROC glad_glGetBufferParameteri64v;
+#define glGetBufferParameteri64v glad_glGetBufferParameteri64v
+GLAD_API_CALL PFNGLGETBUFFERPARAMETERIVPROC glad_glGetBufferParameteriv;
+#define glGetBufferParameteriv glad_glGetBufferParameteriv
+GLAD_API_CALL PFNGLGETBUFFERPOINTERVPROC glad_glGetBufferPointerv;
+#define glGetBufferPointerv glad_glGetBufferPointerv
+GLAD_API_CALL PFNGLGETBUFFERSUBDATAPROC glad_glGetBufferSubData;
+#define glGetBufferSubData glad_glGetBufferSubData
+GLAD_API_CALL PFNGLGETCLIPPLANEPROC glad_glGetClipPlane;
+#define glGetClipPlane glad_glGetClipPlane
+GLAD_API_CALL PFNGLGETCOMPRESSEDTEXIMAGEPROC glad_glGetCompressedTexImage;
+#define glGetCompressedTexImage glad_glGetCompressedTexImage
+GLAD_API_CALL PFNGLGETDEBUGMESSAGELOGARBPROC glad_glGetDebugMessageLogARB;
+#define glGetDebugMessageLogARB glad_glGetDebugMessageLogARB
+GLAD_API_CALL PFNGLGETDOUBLEVPROC glad_glGetDoublev;
+#define glGetDoublev glad_glGetDoublev
+GLAD_API_CALL PFNGLGETERRORPROC glad_glGetError;
+#define glGetError glad_glGetError
+GLAD_API_CALL PFNGLGETFLOATVPROC glad_glGetFloatv;
+#define glGetFloatv glad_glGetFloatv
+GLAD_API_CALL PFNGLGETFRAGDATAINDEXPROC glad_glGetFragDataIndex;
+#define glGetFragDataIndex glad_glGetFragDataIndex
+GLAD_API_CALL PFNGLGETFRAGDATALOCATIONPROC glad_glGetFragDataLocation;
+#define glGetFragDataLocation glad_glGetFragDataLocation
+GLAD_API_CALL PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC glad_glGetFramebufferAttachmentParameteriv;
+#define glGetFramebufferAttachmentParameteriv glad_glGetFramebufferAttachmentParameteriv
+GLAD_API_CALL PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC glad_glGetFramebufferAttachmentParameterivEXT;
+#define glGetFramebufferAttachmentParameterivEXT glad_glGetFramebufferAttachmentParameterivEXT
+GLAD_API_CALL PFNGLGETINTEGER64I_VPROC glad_glGetInteger64i_v;
+#define glGetInteger64i_v glad_glGetInteger64i_v
+GLAD_API_CALL PFNGLGETINTEGER64VPROC glad_glGetInteger64v;
+#define glGetInteger64v glad_glGetInteger64v
+GLAD_API_CALL PFNGLGETINTEGERI_VPROC glad_glGetIntegeri_v;
+#define glGetIntegeri_v glad_glGetIntegeri_v
+GLAD_API_CALL PFNGLGETINTEGERVPROC glad_glGetIntegerv;
+#define glGetIntegerv glad_glGetIntegerv
+GLAD_API_CALL PFNGLGETLIGHTFVPROC glad_glGetLightfv;
+#define glGetLightfv glad_glGetLightfv
+GLAD_API_CALL PFNGLGETLIGHTIVPROC glad_glGetLightiv;
+#define glGetLightiv glad_glGetLightiv
+GLAD_API_CALL PFNGLGETMAPDVPROC glad_glGetMapdv;
+#define glGetMapdv glad_glGetMapdv
+GLAD_API_CALL PFNGLGETMAPFVPROC glad_glGetMapfv;
+#define glGetMapfv glad_glGetMapfv
+GLAD_API_CALL PFNGLGETMAPIVPROC glad_glGetMapiv;
+#define glGetMapiv glad_glGetMapiv
+GLAD_API_CALL PFNGLGETMATERIALFVPROC glad_glGetMaterialfv;
+#define glGetMaterialfv glad_glGetMaterialfv
+GLAD_API_CALL PFNGLGETMATERIALIVPROC glad_glGetMaterialiv;
+#define glGetMaterialiv glad_glGetMaterialiv
+GLAD_API_CALL PFNGLGETMULTISAMPLEFVPROC glad_glGetMultisamplefv;
+#define glGetMultisamplefv glad_glGetMultisamplefv
+GLAD_API_CALL PFNGLGETPIXELMAPFVPROC glad_glGetPixelMapfv;
+#define glGetPixelMapfv glad_glGetPixelMapfv
+GLAD_API_CALL PFNGLGETPIXELMAPUIVPROC glad_glGetPixelMapuiv;
+#define glGetPixelMapuiv glad_glGetPixelMapuiv
+GLAD_API_CALL PFNGLGETPIXELMAPUSVPROC glad_glGetPixelMapusv;
+#define glGetPixelMapusv glad_glGetPixelMapusv
+GLAD_API_CALL PFNGLGETPOINTERVPROC glad_glGetPointerv;
+#define glGetPointerv glad_glGetPointerv
+GLAD_API_CALL PFNGLGETPOLYGONSTIPPLEPROC glad_glGetPolygonStipple;
+#define glGetPolygonStipple glad_glGetPolygonStipple
+GLAD_API_CALL PFNGLGETPROGRAMINFOLOGPROC glad_glGetProgramInfoLog;
+#define glGetProgramInfoLog glad_glGetProgramInfoLog
+GLAD_API_CALL PFNGLGETPROGRAMIVPROC glad_glGetProgramiv;
+#define glGetProgramiv glad_glGetProgramiv
+GLAD_API_CALL PFNGLGETQUERYOBJECTI64VPROC glad_glGetQueryObjecti64v;
+#define glGetQueryObjecti64v glad_glGetQueryObjecti64v
+GLAD_API_CALL PFNGLGETQUERYOBJECTIVPROC glad_glGetQueryObjectiv;
+#define glGetQueryObjectiv glad_glGetQueryObjectiv
+GLAD_API_CALL PFNGLGETQUERYOBJECTUI64VPROC glad_glGetQueryObjectui64v;
+#define glGetQueryObjectui64v glad_glGetQueryObjectui64v
+GLAD_API_CALL PFNGLGETQUERYOBJECTUIVPROC glad_glGetQueryObjectuiv;
+#define glGetQueryObjectuiv glad_glGetQueryObjectuiv
+GLAD_API_CALL PFNGLGETQUERYIVPROC glad_glGetQueryiv;
+#define glGetQueryiv glad_glGetQueryiv
+GLAD_API_CALL PFNGLGETRENDERBUFFERPARAMETERIVPROC glad_glGetRenderbufferParameteriv;
+#define glGetRenderbufferParameteriv glad_glGetRenderbufferParameteriv
+GLAD_API_CALL PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC glad_glGetRenderbufferParameterivEXT;
+#define glGetRenderbufferParameterivEXT glad_glGetRenderbufferParameterivEXT
+GLAD_API_CALL PFNGLGETSAMPLERPARAMETERIIVPROC glad_glGetSamplerParameterIiv;
+#define glGetSamplerParameterIiv glad_glGetSamplerParameterIiv
+GLAD_API_CALL PFNGLGETSAMPLERPARAMETERIUIVPROC glad_glGetSamplerParameterIuiv;
+#define glGetSamplerParameterIuiv glad_glGetSamplerParameterIuiv
+GLAD_API_CALL PFNGLGETSAMPLERPARAMETERFVPROC glad_glGetSamplerParameterfv;
+#define glGetSamplerParameterfv glad_glGetSamplerParameterfv
+GLAD_API_CALL PFNGLGETSAMPLERPARAMETERIVPROC glad_glGetSamplerParameteriv;
+#define glGetSamplerParameteriv glad_glGetSamplerParameteriv
+GLAD_API_CALL PFNGLGETSHADERINFOLOGPROC glad_glGetShaderInfoLog;
+#define glGetShaderInfoLog glad_glGetShaderInfoLog
+GLAD_API_CALL PFNGLGETSHADERSOURCEPROC glad_glGetShaderSource;
+#define glGetShaderSource glad_glGetShaderSource
+GLAD_API_CALL PFNGLGETSHADERIVPROC glad_glGetShaderiv;
+#define glGetShaderiv glad_glGetShaderiv
+GLAD_API_CALL PFNGLGETSTRINGPROC glad_glGetString;
+#define glGetString glad_glGetString
+GLAD_API_CALL PFNGLGETSTRINGIPROC glad_glGetStringi;
+#define glGetStringi glad_glGetStringi
+GLAD_API_CALL PFNGLGETSYNCIVPROC glad_glGetSynciv;
+#define glGetSynciv glad_glGetSynciv
+GLAD_API_CALL PFNGLGETTEXENVFVPROC glad_glGetTexEnvfv;
+#define glGetTexEnvfv glad_glGetTexEnvfv
+GLAD_API_CALL PFNGLGETTEXENVIVPROC glad_glGetTexEnviv;
+#define glGetTexEnviv glad_glGetTexEnviv
+GLAD_API_CALL PFNGLGETTEXGENDVPROC glad_glGetTexGendv;
+#define glGetTexGendv glad_glGetTexGendv
+GLAD_API_CALL PFNGLGETTEXGENFVPROC glad_glGetTexGenfv;
+#define glGetTexGenfv glad_glGetTexGenfv
+GLAD_API_CALL PFNGLGETTEXGENIVPROC glad_glGetTexGeniv;
+#define glGetTexGeniv glad_glGetTexGeniv
+GLAD_API_CALL PFNGLGETTEXIMAGEPROC glad_glGetTexImage;
+#define glGetTexImage glad_glGetTexImage
+GLAD_API_CALL PFNGLGETTEXLEVELPARAMETERFVPROC glad_glGetTexLevelParameterfv;
+#define glGetTexLevelParameterfv glad_glGetTexLevelParameterfv
+GLAD_API_CALL PFNGLGETTEXLEVELPARAMETERIVPROC glad_glGetTexLevelParameteriv;
+#define glGetTexLevelParameteriv glad_glGetTexLevelParameteriv
+GLAD_API_CALL PFNGLGETTEXPARAMETERIIVPROC glad_glGetTexParameterIiv;
+#define glGetTexParameterIiv glad_glGetTexParameterIiv
+GLAD_API_CALL PFNGLGETTEXPARAMETERIUIVPROC glad_glGetTexParameterIuiv;
+#define glGetTexParameterIuiv glad_glGetTexParameterIuiv
+GLAD_API_CALL PFNGLGETTEXPARAMETERFVPROC glad_glGetTexParameterfv;
+#define glGetTexParameterfv glad_glGetTexParameterfv
+GLAD_API_CALL PFNGLGETTEXPARAMETERIVPROC glad_glGetTexParameteriv;
+#define glGetTexParameteriv glad_glGetTexParameteriv
+GLAD_API_CALL PFNGLGETTRANSFORMFEEDBACKVARYINGPROC glad_glGetTransformFeedbackVarying;
+#define glGetTransformFeedbackVarying glad_glGetTransformFeedbackVarying
+GLAD_API_CALL PFNGLGETUNIFORMBLOCKINDEXPROC glad_glGetUniformBlockIndex;
+#define glGetUniformBlockIndex glad_glGetUniformBlockIndex
+GLAD_API_CALL PFNGLGETUNIFORMINDICESPROC glad_glGetUniformIndices;
+#define glGetUniformIndices glad_glGetUniformIndices
+GLAD_API_CALL PFNGLGETUNIFORMLOCATIONPROC glad_glGetUniformLocation;
+#define glGetUniformLocation glad_glGetUniformLocation
+GLAD_API_CALL PFNGLGETUNIFORMFVPROC glad_glGetUniformfv;
+#define glGetUniformfv glad_glGetUniformfv
+GLAD_API_CALL PFNGLGETUNIFORMIVPROC glad_glGetUniformiv;
+#define glGetUniformiv glad_glGetUniformiv
+GLAD_API_CALL PFNGLGETUNIFORMUIVPROC glad_glGetUniformuiv;
+#define glGetUniformuiv glad_glGetUniformuiv
+GLAD_API_CALL PFNGLGETVERTEXATTRIBIIVPROC glad_glGetVertexAttribIiv;
+#define glGetVertexAttribIiv glad_glGetVertexAttribIiv
+GLAD_API_CALL PFNGLGETVERTEXATTRIBIUIVPROC glad_glGetVertexAttribIuiv;
+#define glGetVertexAttribIuiv glad_glGetVertexAttribIuiv
+GLAD_API_CALL PFNGLGETVERTEXATTRIBPOINTERVPROC glad_glGetVertexAttribPointerv;
+#define glGetVertexAttribPointerv glad_glGetVertexAttribPointerv
+GLAD_API_CALL PFNGLGETVERTEXATTRIBDVPROC glad_glGetVertexAttribdv;
+#define glGetVertexAttribdv glad_glGetVertexAttribdv
+GLAD_API_CALL PFNGLGETVERTEXATTRIBFVPROC glad_glGetVertexAttribfv;
+#define glGetVertexAttribfv glad_glGetVertexAttribfv
+GLAD_API_CALL PFNGLGETVERTEXATTRIBIVPROC glad_glGetVertexAttribiv;
+#define glGetVertexAttribiv glad_glGetVertexAttribiv
+GLAD_API_CALL PFNGLHINTPROC glad_glHint;
+#define glHint glad_glHint
+GLAD_API_CALL PFNGLINDEXMASKPROC glad_glIndexMask;
+#define glIndexMask glad_glIndexMask
+GLAD_API_CALL PFNGLINDEXPOINTERPROC glad_glIndexPointer;
+#define glIndexPointer glad_glIndexPointer
+GLAD_API_CALL PFNGLINDEXDPROC glad_glIndexd;
+#define glIndexd glad_glIndexd
+GLAD_API_CALL PFNGLINDEXDVPROC glad_glIndexdv;
+#define glIndexdv glad_glIndexdv
+GLAD_API_CALL PFNGLINDEXFPROC glad_glIndexf;
+#define glIndexf glad_glIndexf
+GLAD_API_CALL PFNGLINDEXFVPROC glad_glIndexfv;
+#define glIndexfv glad_glIndexfv
+GLAD_API_CALL PFNGLINDEXIPROC glad_glIndexi;
+#define glIndexi glad_glIndexi
+GLAD_API_CALL PFNGLINDEXIVPROC glad_glIndexiv;
+#define glIndexiv glad_glIndexiv
+GLAD_API_CALL PFNGLINDEXSPROC glad_glIndexs;
+#define glIndexs glad_glIndexs
+GLAD_API_CALL PFNGLINDEXSVPROC glad_glIndexsv;
+#define glIndexsv glad_glIndexsv
+GLAD_API_CALL PFNGLINDEXUBPROC glad_glIndexub;
+#define glIndexub glad_glIndexub
+GLAD_API_CALL PFNGLINDEXUBVPROC glad_glIndexubv;
+#define glIndexubv glad_glIndexubv
+GLAD_API_CALL PFNGLINITNAMESPROC glad_glInitNames;
+#define glInitNames glad_glInitNames
+GLAD_API_CALL PFNGLINTERLEAVEDARRAYSPROC glad_glInterleavedArrays;
+#define glInterleavedArrays glad_glInterleavedArrays
+GLAD_API_CALL PFNGLISBUFFERPROC glad_glIsBuffer;
+#define glIsBuffer glad_glIsBuffer
+GLAD_API_CALL PFNGLISENABLEDPROC glad_glIsEnabled;
+#define glIsEnabled glad_glIsEnabled
+GLAD_API_CALL PFNGLISENABLEDIPROC glad_glIsEnabledi;
+#define glIsEnabledi glad_glIsEnabledi
+GLAD_API_CALL PFNGLISFRAMEBUFFERPROC glad_glIsFramebuffer;
+#define glIsFramebuffer glad_glIsFramebuffer
+GLAD_API_CALL PFNGLISFRAMEBUFFEREXTPROC glad_glIsFramebufferEXT;
+#define glIsFramebufferEXT glad_glIsFramebufferEXT
+GLAD_API_CALL PFNGLISLISTPROC glad_glIsList;
+#define glIsList glad_glIsList
+GLAD_API_CALL PFNGLISPROGRAMPROC glad_glIsProgram;
+#define glIsProgram glad_glIsProgram
+GLAD_API_CALL PFNGLISQUERYPROC glad_glIsQuery;
+#define glIsQuery glad_glIsQuery
+GLAD_API_CALL PFNGLISRENDERBUFFERPROC glad_glIsRenderbuffer;
+#define glIsRenderbuffer glad_glIsRenderbuffer
+GLAD_API_CALL PFNGLISRENDERBUFFEREXTPROC glad_glIsRenderbufferEXT;
+#define glIsRenderbufferEXT glad_glIsRenderbufferEXT
+GLAD_API_CALL PFNGLISSAMPLERPROC glad_glIsSampler;
+#define glIsSampler glad_glIsSampler
+GLAD_API_CALL PFNGLISSHADERPROC glad_glIsShader;
+#define glIsShader glad_glIsShader
+GLAD_API_CALL PFNGLISSYNCPROC glad_glIsSync;
+#define glIsSync glad_glIsSync
+GLAD_API_CALL PFNGLISTEXTUREPROC glad_glIsTexture;
+#define glIsTexture glad_glIsTexture
+GLAD_API_CALL PFNGLISVERTEXARRAYPROC glad_glIsVertexArray;
+#define glIsVertexArray glad_glIsVertexArray
+GLAD_API_CALL PFNGLLIGHTMODELFPROC glad_glLightModelf;
+#define glLightModelf glad_glLightModelf
+GLAD_API_CALL PFNGLLIGHTMODELFVPROC glad_glLightModelfv;
+#define glLightModelfv glad_glLightModelfv
+GLAD_API_CALL PFNGLLIGHTMODELIPROC glad_glLightModeli;
+#define glLightModeli glad_glLightModeli
+GLAD_API_CALL PFNGLLIGHTMODELIVPROC glad_glLightModeliv;
+#define glLightModeliv glad_glLightModeliv
+GLAD_API_CALL PFNGLLIGHTFPROC glad_glLightf;
+#define glLightf glad_glLightf
+GLAD_API_CALL PFNGLLIGHTFVPROC glad_glLightfv;
+#define glLightfv glad_glLightfv
+GLAD_API_CALL PFNGLLIGHTIPROC glad_glLighti;
+#define glLighti glad_glLighti
+GLAD_API_CALL PFNGLLIGHTIVPROC glad_glLightiv;
+#define glLightiv glad_glLightiv
+GLAD_API_CALL PFNGLLINESTIPPLEPROC glad_glLineStipple;
+#define glLineStipple glad_glLineStipple
+GLAD_API_CALL PFNGLLINEWIDTHPROC glad_glLineWidth;
+#define glLineWidth glad_glLineWidth
+GLAD_API_CALL PFNGLLINKPROGRAMPROC glad_glLinkProgram;
+#define glLinkProgram glad_glLinkProgram
+GLAD_API_CALL PFNGLLISTBASEPROC glad_glListBase;
+#define glListBase glad_glListBase
+GLAD_API_CALL PFNGLLOADIDENTITYPROC glad_glLoadIdentity;
+#define glLoadIdentity glad_glLoadIdentity
+GLAD_API_CALL PFNGLLOADMATRIXDPROC glad_glLoadMatrixd;
+#define glLoadMatrixd glad_glLoadMatrixd
+GLAD_API_CALL PFNGLLOADMATRIXFPROC glad_glLoadMatrixf;
+#define glLoadMatrixf glad_glLoadMatrixf
+GLAD_API_CALL PFNGLLOADNAMEPROC glad_glLoadName;
+#define glLoadName glad_glLoadName
+GLAD_API_CALL PFNGLLOADTRANSPOSEMATRIXDPROC glad_glLoadTransposeMatrixd;
+#define glLoadTransposeMatrixd glad_glLoadTransposeMatrixd
+GLAD_API_CALL PFNGLLOADTRANSPOSEMATRIXFPROC glad_glLoadTransposeMatrixf;
+#define glLoadTransposeMatrixf glad_glLoadTransposeMatrixf
+GLAD_API_CALL PFNGLLOGICOPPROC glad_glLogicOp;
+#define glLogicOp glad_glLogicOp
+GLAD_API_CALL PFNGLMAP1DPROC glad_glMap1d;
+#define glMap1d glad_glMap1d
+GLAD_API_CALL PFNGLMAP1FPROC glad_glMap1f;
+#define glMap1f glad_glMap1f
+GLAD_API_CALL PFNGLMAP2DPROC glad_glMap2d;
+#define glMap2d glad_glMap2d
+GLAD_API_CALL PFNGLMAP2FPROC glad_glMap2f;
+#define glMap2f glad_glMap2f
+GLAD_API_CALL PFNGLMAPBUFFERPROC glad_glMapBuffer;
+#define glMapBuffer glad_glMapBuffer
+GLAD_API_CALL PFNGLMAPBUFFERRANGEPROC glad_glMapBufferRange;
+#define glMapBufferRange glad_glMapBufferRange
+GLAD_API_CALL PFNGLMAPGRID1DPROC glad_glMapGrid1d;
+#define glMapGrid1d glad_glMapGrid1d
+GLAD_API_CALL PFNGLMAPGRID1FPROC glad_glMapGrid1f;
+#define glMapGrid1f glad_glMapGrid1f
+GLAD_API_CALL PFNGLMAPGRID2DPROC glad_glMapGrid2d;
+#define glMapGrid2d glad_glMapGrid2d
+GLAD_API_CALL PFNGLMAPGRID2FPROC glad_glMapGrid2f;
+#define glMapGrid2f glad_glMapGrid2f
+GLAD_API_CALL PFNGLMATERIALFPROC glad_glMaterialf;
+#define glMaterialf glad_glMaterialf
+GLAD_API_CALL PFNGLMATERIALFVPROC glad_glMaterialfv;
+#define glMaterialfv glad_glMaterialfv
+GLAD_API_CALL PFNGLMATERIALIPROC glad_glMateriali;
+#define glMateriali glad_glMateriali
+GLAD_API_CALL PFNGLMATERIALIVPROC glad_glMaterialiv;
+#define glMaterialiv glad_glMaterialiv
+GLAD_API_CALL PFNGLMATRIXMODEPROC glad_glMatrixMode;
+#define glMatrixMode glad_glMatrixMode
+GLAD_API_CALL PFNGLMULTMATRIXDPROC glad_glMultMatrixd;
+#define glMultMatrixd glad_glMultMatrixd
+GLAD_API_CALL PFNGLMULTMATRIXFPROC glad_glMultMatrixf;
+#define glMultMatrixf glad_glMultMatrixf
+GLAD_API_CALL PFNGLMULTTRANSPOSEMATRIXDPROC glad_glMultTransposeMatrixd;
+#define glMultTransposeMatrixd glad_glMultTransposeMatrixd
+GLAD_API_CALL PFNGLMULTTRANSPOSEMATRIXFPROC glad_glMultTransposeMatrixf;
+#define glMultTransposeMatrixf glad_glMultTransposeMatrixf
+GLAD_API_CALL PFNGLMULTIDRAWARRAYSPROC glad_glMultiDrawArrays;
+#define glMultiDrawArrays glad_glMultiDrawArrays
+GLAD_API_CALL PFNGLMULTIDRAWELEMENTSPROC glad_glMultiDrawElements;
+#define glMultiDrawElements glad_glMultiDrawElements
+GLAD_API_CALL PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC glad_glMultiDrawElementsBaseVertex;
+#define glMultiDrawElementsBaseVertex glad_glMultiDrawElementsBaseVertex
+GLAD_API_CALL PFNGLMULTITEXCOORD1DPROC glad_glMultiTexCoord1d;
+#define glMultiTexCoord1d glad_glMultiTexCoord1d
+GLAD_API_CALL PFNGLMULTITEXCOORD1DVPROC glad_glMultiTexCoord1dv;
+#define glMultiTexCoord1dv glad_glMultiTexCoord1dv
+GLAD_API_CALL PFNGLMULTITEXCOORD1FPROC glad_glMultiTexCoord1f;
+#define glMultiTexCoord1f glad_glMultiTexCoord1f
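
[Editorial aside, not part of the patch: the hunk above is glad's standard indirection pattern. Each GL entry point gets a PFN*PROC function-pointer typedef, one global glad_gl* pointer declared with GLAD_API_CALL, and a #define that routes the canonical gl* name through that pointer. A minimal sketch of how these pointers are typically populated at runtime; it assumes a GLFW-created context, and gladLoadGL/glfwGetProcAddress are the usual glad 2 and GLFW entry points rather than anything taken from this diff:

    /* Illustrative sketch only: driving the glad_gl* pointers above. */
    #include <glad/gl.h>
    #include <GLFW/glfw3.h>

    int init_gl(GLFWwindow *window)
    {
        glfwMakeContextCurrent(window);
        /* gladLoadGL resolves every glad_gl* pointer through the
         * user-supplied loader callback (here GLFW's). */
        if (!gladLoadGL((GLADloadfunc)glfwGetProcAddress))
            return 0; /* no usable GL context or version */
        /* Via the #define alias, this call goes through glad_glClear. */
        glClear(GL_COLOR_BUFFER_BIT);
        return 1;
    }

Calling any gl* alias before the loader has run would dereference a null glad_gl* pointer, which is why these declarations and defines always ship together with the generated gl.c loader.]
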
+GLAD_API_CALL PFNGLMULTITEXCOORD1FVPROC glad_glMultiTexCoord1fv;
+#define glMultiTexCoord1fv glad_glMultiTexCoord1fv
+GLAD_API_CALL PFNGLMULTITEXCOORD1IPROC glad_glMultiTexCoord1i;
+#define glMultiTexCoord1i glad_glMultiTexCoord1i
+GLAD_API_CALL PFNGLMULTITEXCOORD1IVPROC glad_glMultiTexCoord1iv;
+#define glMultiTexCoord1iv glad_glMultiTexCoord1iv
+GLAD_API_CALL PFNGLMULTITEXCOORD1SPROC glad_glMultiTexCoord1s;
+#define glMultiTexCoord1s glad_glMultiTexCoord1s
+GLAD_API_CALL PFNGLMULTITEXCOORD1SVPROC glad_glMultiTexCoord1sv;
+#define glMultiTexCoord1sv glad_glMultiTexCoord1sv
+GLAD_API_CALL PFNGLMULTITEXCOORD2DPROC glad_glMultiTexCoord2d;
+#define glMultiTexCoord2d glad_glMultiTexCoord2d
+GLAD_API_CALL PFNGLMULTITEXCOORD2DVPROC glad_glMultiTexCoord2dv;
+#define glMultiTexCoord2dv glad_glMultiTexCoord2dv
+GLAD_API_CALL PFNGLMULTITEXCOORD2FPROC glad_glMultiTexCoord2f;
+#define glMultiTexCoord2f glad_glMultiTexCoord2f
+GLAD_API_CALL PFNGLMULTITEXCOORD2FVPROC glad_glMultiTexCoord2fv;
+#define glMultiTexCoord2fv glad_glMultiTexCoord2fv
+GLAD_API_CALL PFNGLMULTITEXCOORD2IPROC glad_glMultiTexCoord2i;
+#define glMultiTexCoord2i glad_glMultiTexCoord2i
+GLAD_API_CALL PFNGLMULTITEXCOORD2IVPROC glad_glMultiTexCoord2iv;
+#define glMultiTexCoord2iv glad_glMultiTexCoord2iv
+GLAD_API_CALL PFNGLMULTITEXCOORD2SPROC glad_glMultiTexCoord2s;
+#define glMultiTexCoord2s glad_glMultiTexCoord2s
+GLAD_API_CALL PFNGLMULTITEXCOORD2SVPROC glad_glMultiTexCoord2sv;
+#define glMultiTexCoord2sv glad_glMultiTexCoord2sv
+GLAD_API_CALL PFNGLMULTITEXCOORD3DPROC glad_glMultiTexCoord3d;
+#define glMultiTexCoord3d glad_glMultiTexCoord3d
+GLAD_API_CALL PFNGLMULTITEXCOORD3DVPROC glad_glMultiTexCoord3dv;
+#define glMultiTexCoord3dv glad_glMultiTexCoord3dv
+GLAD_API_CALL PFNGLMULTITEXCOORD3FPROC glad_glMultiTexCoord3f;
+#define glMultiTexCoord3f glad_glMultiTexCoord3f
+GLAD_API_CALL PFNGLMULTITEXCOORD3FVPROC glad_glMultiTexCoord3fv;
+#define glMultiTexCoord3fv glad_glMultiTexCoord3fv
+GLAD_API_CALL PFNGLMULTITEXCOORD3IPROC glad_glMultiTexCoord3i;
+#define glMultiTexCoord3i glad_glMultiTexCoord3i
+GLAD_API_CALL PFNGLMULTITEXCOORD3IVPROC glad_glMultiTexCoord3iv;
+#define glMultiTexCoord3iv glad_glMultiTexCoord3iv
+GLAD_API_CALL PFNGLMULTITEXCOORD3SPROC glad_glMultiTexCoord3s;
+#define glMultiTexCoord3s glad_glMultiTexCoord3s
+GLAD_API_CALL PFNGLMULTITEXCOORD3SVPROC glad_glMultiTexCoord3sv;
+#define glMultiTexCoord3sv glad_glMultiTexCoord3sv
+GLAD_API_CALL PFNGLMULTITEXCOORD4DPROC glad_glMultiTexCoord4d;
+#define glMultiTexCoord4d glad_glMultiTexCoord4d
+GLAD_API_CALL PFNGLMULTITEXCOORD4DVPROC glad_glMultiTexCoord4dv;
+#define glMultiTexCoord4dv glad_glMultiTexCoord4dv
+GLAD_API_CALL PFNGLMULTITEXCOORD4FPROC glad_glMultiTexCoord4f;
+#define glMultiTexCoord4f glad_glMultiTexCoord4f
+GLAD_API_CALL PFNGLMULTITEXCOORD4FVPROC glad_glMultiTexCoord4fv;
+#define glMultiTexCoord4fv glad_glMultiTexCoord4fv
+GLAD_API_CALL PFNGLMULTITEXCOORD4IPROC glad_glMultiTexCoord4i;
+#define glMultiTexCoord4i glad_glMultiTexCoord4i
+GLAD_API_CALL PFNGLMULTITEXCOORD4IVPROC glad_glMultiTexCoord4iv;
+#define glMultiTexCoord4iv glad_glMultiTexCoord4iv
+GLAD_API_CALL PFNGLMULTITEXCOORD4SPROC glad_glMultiTexCoord4s;
+#define glMultiTexCoord4s glad_glMultiTexCoord4s
+GLAD_API_CALL PFNGLMULTITEXCOORD4SVPROC glad_glMultiTexCoord4sv;
+#define glMultiTexCoord4sv glad_glMultiTexCoord4sv
+GLAD_API_CALL PFNGLMULTITEXCOORDP1UIPROC glad_glMultiTexCoordP1ui;
+#define glMultiTexCoordP1ui glad_glMultiTexCoordP1ui
+GLAD_API_CALL PFNGLMULTITEXCOORDP1UIVPROC glad_glMultiTexCoordP1uiv;
+#define glMultiTexCoordP1uiv glad_glMultiTexCoordP1uiv
+GLAD_API_CALL PFNGLMULTITEXCOORDP2UIPROC glad_glMultiTexCoordP2ui;
+#define glMultiTexCoordP2ui glad_glMultiTexCoordP2ui
+GLAD_API_CALL PFNGLMULTITEXCOORDP2UIVPROC glad_glMultiTexCoordP2uiv;
+#define glMultiTexCoordP2uiv glad_glMultiTexCoordP2uiv
+GLAD_API_CALL PFNGLMULTITEXCOORDP3UIPROC glad_glMultiTexCoordP3ui;
+#define glMultiTexCoordP3ui glad_glMultiTexCoordP3ui
+GLAD_API_CALL PFNGLMULTITEXCOORDP3UIVPROC glad_glMultiTexCoordP3uiv;
+#define glMultiTexCoordP3uiv glad_glMultiTexCoordP3uiv
+GLAD_API_CALL PFNGLMULTITEXCOORDP4UIPROC glad_glMultiTexCoordP4ui;
+#define glMultiTexCoordP4ui glad_glMultiTexCoordP4ui
+GLAD_API_CALL PFNGLMULTITEXCOORDP4UIVPROC glad_glMultiTexCoordP4uiv;
+#define glMultiTexCoordP4uiv glad_glMultiTexCoordP4uiv
+GLAD_API_CALL PFNGLNEWLISTPROC glad_glNewList;
+#define glNewList glad_glNewList
+GLAD_API_CALL PFNGLNORMAL3BPROC glad_glNormal3b;
+#define glNormal3b glad_glNormal3b
+GLAD_API_CALL PFNGLNORMAL3BVPROC glad_glNormal3bv;
+#define glNormal3bv glad_glNormal3bv
+GLAD_API_CALL PFNGLNORMAL3DPROC glad_glNormal3d;
+#define glNormal3d glad_glNormal3d
+GLAD_API_CALL PFNGLNORMAL3DVPROC glad_glNormal3dv;
+#define glNormal3dv glad_glNormal3dv
+GLAD_API_CALL PFNGLNORMAL3FPROC glad_glNormal3f;
+#define glNormal3f glad_glNormal3f
+GLAD_API_CALL PFNGLNORMAL3FVPROC glad_glNormal3fv;
+#define glNormal3fv glad_glNormal3fv
+GLAD_API_CALL PFNGLNORMAL3IPROC glad_glNormal3i;
+#define glNormal3i glad_glNormal3i
+GLAD_API_CALL PFNGLNORMAL3IVPROC glad_glNormal3iv;
+#define glNormal3iv glad_glNormal3iv
+GLAD_API_CALL PFNGLNORMAL3SPROC glad_glNormal3s;
+#define glNormal3s glad_glNormal3s
+GLAD_API_CALL PFNGLNORMAL3SVPROC glad_glNormal3sv;
+#define glNormal3sv glad_glNormal3sv
+GLAD_API_CALL PFNGLNORMALP3UIPROC glad_glNormalP3ui;
+#define glNormalP3ui glad_glNormalP3ui
+GLAD_API_CALL PFNGLNORMALP3UIVPROC glad_glNormalP3uiv;
+#define glNormalP3uiv glad_glNormalP3uiv
+GLAD_API_CALL PFNGLNORMALPOINTERPROC glad_glNormalPointer;
+#define glNormalPointer glad_glNormalPointer
+GLAD_API_CALL PFNGLORTHOPROC glad_glOrtho;
+#define glOrtho glad_glOrtho
+GLAD_API_CALL PFNGLPASSTHROUGHPROC glad_glPassThrough;
+#define glPassThrough glad_glPassThrough
+GLAD_API_CALL PFNGLPIXELMAPFVPROC glad_glPixelMapfv;
+#define glPixelMapfv glad_glPixelMapfv
+GLAD_API_CALL PFNGLPIXELMAPUIVPROC glad_glPixelMapuiv;
+#define glPixelMapuiv glad_glPixelMapuiv
+GLAD_API_CALL PFNGLPIXELMAPUSVPROC glad_glPixelMapusv;
+#define glPixelMapusv glad_glPixelMapusv
+GLAD_API_CALL PFNGLPIXELSTOREFPROC glad_glPixelStoref;
+#define glPixelStoref glad_glPixelStoref
+GLAD_API_CALL PFNGLPIXELSTOREIPROC glad_glPixelStorei;
+#define glPixelStorei glad_glPixelStorei
+GLAD_API_CALL PFNGLPIXELTRANSFERFPROC glad_glPixelTransferf;
+#define glPixelTransferf glad_glPixelTransferf
+GLAD_API_CALL PFNGLPIXELTRANSFERIPROC glad_glPixelTransferi;
+#define glPixelTransferi glad_glPixelTransferi
+GLAD_API_CALL PFNGLPIXELZOOMPROC glad_glPixelZoom;
+#define glPixelZoom glad_glPixelZoom
+GLAD_API_CALL PFNGLPOINTPARAMETERFPROC glad_glPointParameterf;
+#define glPointParameterf glad_glPointParameterf
+GLAD_API_CALL PFNGLPOINTPARAMETERFVPROC glad_glPointParameterfv;
+#define glPointParameterfv glad_glPointParameterfv
+GLAD_API_CALL PFNGLPOINTPARAMETERIPROC glad_glPointParameteri;
+#define glPointParameteri glad_glPointParameteri
+GLAD_API_CALL PFNGLPOINTPARAMETERIVPROC glad_glPointParameteriv;
+#define glPointParameteriv glad_glPointParameteriv
+GLAD_API_CALL PFNGLPOINTSIZEPROC glad_glPointSize;
+#define glPointSize glad_glPointSize
+GLAD_API_CALL PFNGLPOLYGONMODEPROC glad_glPolygonMode;
+#define glPolygonMode glad_glPolygonMode
+GLAD_API_CALL PFNGLPOLYGONOFFSETPROC glad_glPolygonOffset;
+#define glPolygonOffset glad_glPolygonOffset
+GLAD_API_CALL PFNGLPOLYGONSTIPPLEPROC glad_glPolygonStipple;
+#define glPolygonStipple glad_glPolygonStipple
+GLAD_API_CALL PFNGLPOPATTRIBPROC glad_glPopAttrib;
+#define glPopAttrib glad_glPopAttrib
+GLAD_API_CALL PFNGLPOPCLIENTATTRIBPROC glad_glPopClientAttrib;
+#define glPopClientAttrib glad_glPopClientAttrib
+GLAD_API_CALL PFNGLPOPMATRIXPROC glad_glPopMatrix;
+#define glPopMatrix glad_glPopMatrix
+GLAD_API_CALL PFNGLPOPNAMEPROC glad_glPopName;
+#define glPopName glad_glPopName
+GLAD_API_CALL PFNGLPRIMITIVERESTARTINDEXPROC glad_glPrimitiveRestartIndex;
+#define glPrimitiveRestartIndex glad_glPrimitiveRestartIndex
+GLAD_API_CALL PFNGLPRIORITIZETEXTURESPROC glad_glPrioritizeTextures;
+#define glPrioritizeTextures glad_glPrioritizeTextures
+GLAD_API_CALL PFNGLPROVOKINGVERTEXPROC glad_glProvokingVertex;
+#define glProvokingVertex glad_glProvokingVertex
+GLAD_API_CALL PFNGLPUSHATTRIBPROC glad_glPushAttrib;
+#define glPushAttrib glad_glPushAttrib
+GLAD_API_CALL PFNGLPUSHCLIENTATTRIBPROC glad_glPushClientAttrib;
+#define glPushClientAttrib glad_glPushClientAttrib
+GLAD_API_CALL PFNGLPUSHMATRIXPROC glad_glPushMatrix;
+#define glPushMatrix glad_glPushMatrix
+GLAD_API_CALL PFNGLPUSHNAMEPROC glad_glPushName;
+#define glPushName glad_glPushName
+GLAD_API_CALL PFNGLQUERYCOUNTERPROC glad_glQueryCounter;
+#define glQueryCounter glad_glQueryCounter
+GLAD_API_CALL PFNGLRASTERPOS2DPROC glad_glRasterPos2d;
+#define glRasterPos2d glad_glRasterPos2d
+GLAD_API_CALL PFNGLRASTERPOS2DVPROC glad_glRasterPos2dv;
+#define glRasterPos2dv glad_glRasterPos2dv
+GLAD_API_CALL PFNGLRASTERPOS2FPROC glad_glRasterPos2f;
+#define glRasterPos2f glad_glRasterPos2f
+GLAD_API_CALL PFNGLRASTERPOS2FVPROC glad_glRasterPos2fv;
+#define glRasterPos2fv glad_glRasterPos2fv
+GLAD_API_CALL PFNGLRASTERPOS2IPROC glad_glRasterPos2i;
+#define glRasterPos2i glad_glRasterPos2i
+GLAD_API_CALL PFNGLRASTERPOS2IVPROC glad_glRasterPos2iv;
+#define glRasterPos2iv glad_glRasterPos2iv
+GLAD_API_CALL PFNGLRASTERPOS2SPROC glad_glRasterPos2s;
+#define glRasterPos2s glad_glRasterPos2s
+GLAD_API_CALL PFNGLRASTERPOS2SVPROC glad_glRasterPos2sv;
+#define glRasterPos2sv glad_glRasterPos2sv
+GLAD_API_CALL PFNGLRASTERPOS3DPROC glad_glRasterPos3d;
+#define glRasterPos3d glad_glRasterPos3d
+GLAD_API_CALL PFNGLRASTERPOS3DVPROC glad_glRasterPos3dv;
+#define glRasterPos3dv glad_glRasterPos3dv
+GLAD_API_CALL PFNGLRASTERPOS3FPROC glad_glRasterPos3f;
+#define glRasterPos3f glad_glRasterPos3f
+GLAD_API_CALL PFNGLRASTERPOS3FVPROC glad_glRasterPos3fv;
+#define glRasterPos3fv glad_glRasterPos3fv
+GLAD_API_CALL PFNGLRASTERPOS3IPROC glad_glRasterPos3i;
+#define glRasterPos3i glad_glRasterPos3i
+GLAD_API_CALL PFNGLRASTERPOS3IVPROC glad_glRasterPos3iv;
+#define glRasterPos3iv glad_glRasterPos3iv
+GLAD_API_CALL PFNGLRASTERPOS3SPROC glad_glRasterPos3s;
+#define glRasterPos3s glad_glRasterPos3s
+GLAD_API_CALL PFNGLRASTERPOS3SVPROC glad_glRasterPos3sv;
+#define glRasterPos3sv glad_glRasterPos3sv
+GLAD_API_CALL PFNGLRASTERPOS4DPROC glad_glRasterPos4d;
+#define glRasterPos4d glad_glRasterPos4d
+GLAD_API_CALL PFNGLRASTERPOS4DVPROC glad_glRasterPos4dv;
+#define glRasterPos4dv glad_glRasterPos4dv
+GLAD_API_CALL PFNGLRASTERPOS4FPROC glad_glRasterPos4f;
+#define glRasterPos4f glad_glRasterPos4f
+GLAD_API_CALL PFNGLRASTERPOS4FVPROC glad_glRasterPos4fv;
+#define glRasterPos4fv glad_glRasterPos4fv
+GLAD_API_CALL PFNGLRASTERPOS4IPROC glad_glRasterPos4i;
+#define glRasterPos4i glad_glRasterPos4i
+GLAD_API_CALL PFNGLRASTERPOS4IVPROC glad_glRasterPos4iv;
+#define glRasterPos4iv glad_glRasterPos4iv
+GLAD_API_CALL PFNGLRASTERPOS4SPROC glad_glRasterPos4s;
+#define glRasterPos4s glad_glRasterPos4s
+GLAD_API_CALL PFNGLRASTERPOS4SVPROC glad_glRasterPos4sv;
+#define glRasterPos4sv glad_glRasterPos4sv
+GLAD_API_CALL PFNGLREADBUFFERPROC glad_glReadBuffer;
+#define glReadBuffer glad_glReadBuffer
+GLAD_API_CALL PFNGLREADPIXELSPROC glad_glReadPixels;
+#define glReadPixels glad_glReadPixels
+GLAD_API_CALL PFNGLRECTDPROC glad_glRectd;
+#define glRectd glad_glRectd
+GLAD_API_CALL PFNGLRECTDVPROC glad_glRectdv;
+#define glRectdv glad_glRectdv
+GLAD_API_CALL PFNGLRECTFPROC glad_glRectf;
+#define glRectf glad_glRectf
+GLAD_API_CALL PFNGLRECTFVPROC glad_glRectfv;
+#define glRectfv glad_glRectfv
+GLAD_API_CALL PFNGLRECTIPROC glad_glRecti;
+#define glRecti glad_glRecti
+GLAD_API_CALL PFNGLRECTIVPROC glad_glRectiv;
+#define glRectiv glad_glRectiv
+GLAD_API_CALL PFNGLRECTSPROC glad_glRects;
+#define glRects glad_glRects
+GLAD_API_CALL PFNGLRECTSVPROC glad_glRectsv;
+#define glRectsv glad_glRectsv
+GLAD_API_CALL PFNGLRENDERMODEPROC glad_glRenderMode;
+#define glRenderMode glad_glRenderMode
+GLAD_API_CALL PFNGLRENDERBUFFERSTORAGEPROC glad_glRenderbufferStorage;
+#define glRenderbufferStorage glad_glRenderbufferStorage
+GLAD_API_CALL PFNGLRENDERBUFFERSTORAGEEXTPROC glad_glRenderbufferStorageEXT;
+#define glRenderbufferStorageEXT glad_glRenderbufferStorageEXT
+GLAD_API_CALL PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC glad_glRenderbufferStorageMultisample;
+#define glRenderbufferStorageMultisample glad_glRenderbufferStorageMultisample
+GLAD_API_CALL PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC glad_glRenderbufferStorageMultisampleEXT;
+#define glRenderbufferStorageMultisampleEXT glad_glRenderbufferStorageMultisampleEXT
+GLAD_API_CALL PFNGLROTATEDPROC glad_glRotated;
+#define glRotated glad_glRotated
+GLAD_API_CALL PFNGLROTATEFPROC glad_glRotatef;
+#define glRotatef glad_glRotatef
+GLAD_API_CALL PFNGLSAMPLECOVERAGEPROC glad_glSampleCoverage;
+#define glSampleCoverage glad_glSampleCoverage
+GLAD_API_CALL PFNGLSAMPLEMASKIPROC glad_glSampleMaski;
+#define glSampleMaski glad_glSampleMaski
+GLAD_API_CALL PFNGLSAMPLERPARAMETERIIVPROC glad_glSamplerParameterIiv;
+#define glSamplerParameterIiv glad_glSamplerParameterIiv
+GLAD_API_CALL PFNGLSAMPLERPARAMETERIUIVPROC glad_glSamplerParameterIuiv;
+#define glSamplerParameterIuiv glad_glSamplerParameterIuiv
+GLAD_API_CALL PFNGLSAMPLERPARAMETERFPROC glad_glSamplerParameterf;
+#define glSamplerParameterf glad_glSamplerParameterf
+GLAD_API_CALL PFNGLSAMPLERPARAMETERFVPROC glad_glSamplerParameterfv;
+#define glSamplerParameterfv glad_glSamplerParameterfv
+GLAD_API_CALL PFNGLSAMPLERPARAMETERIPROC glad_glSamplerParameteri;
+#define glSamplerParameteri glad_glSamplerParameteri
+GLAD_API_CALL PFNGLSAMPLERPARAMETERIVPROC glad_glSamplerParameteriv;
+#define glSamplerParameteriv glad_glSamplerParameteriv
+GLAD_API_CALL PFNGLSCALEDPROC glad_glScaled;
+#define glScaled glad_glScaled
+GLAD_API_CALL PFNGLSCALEFPROC glad_glScalef;
+#define glScalef glad_glScalef
+GLAD_API_CALL PFNGLSCISSORPROC glad_glScissor;
+#define glScissor glad_glScissor
+GLAD_API_CALL PFNGLSECONDARYCOLOR3BPROC glad_glSecondaryColor3b;
+#define glSecondaryColor3b glad_glSecondaryColor3b
+GLAD_API_CALL PFNGLSECONDARYCOLOR3BVPROC glad_glSecondaryColor3bv;
+#define glSecondaryColor3bv glad_glSecondaryColor3bv
+GLAD_API_CALL PFNGLSECONDARYCOLOR3DPROC glad_glSecondaryColor3d;
+#define glSecondaryColor3d glad_glSecondaryColor3d
+GLAD_API_CALL PFNGLSECONDARYCOLOR3DVPROC glad_glSecondaryColor3dv;
+#define glSecondaryColor3dv glad_glSecondaryColor3dv
+GLAD_API_CALL PFNGLSECONDARYCOLOR3FPROC glad_glSecondaryColor3f;
+#define glSecondaryColor3f glad_glSecondaryColor3f
+GLAD_API_CALL PFNGLSECONDARYCOLOR3FVPROC glad_glSecondaryColor3fv;
+#define glSecondaryColor3fv glad_glSecondaryColor3fv
+GLAD_API_CALL PFNGLSECONDARYCOLOR3IPROC glad_glSecondaryColor3i;
+#define glSecondaryColor3i glad_glSecondaryColor3i
+GLAD_API_CALL PFNGLSECONDARYCOLOR3IVPROC glad_glSecondaryColor3iv;
+#define glSecondaryColor3iv glad_glSecondaryColor3iv
+GLAD_API_CALL PFNGLSECONDARYCOLOR3SPROC glad_glSecondaryColor3s;
+#define glSecondaryColor3s glad_glSecondaryColor3s
+GLAD_API_CALL PFNGLSECONDARYCOLOR3SVPROC glad_glSecondaryColor3sv;
+#define glSecondaryColor3sv glad_glSecondaryColor3sv
+GLAD_API_CALL PFNGLSECONDARYCOLOR3UBPROC glad_glSecondaryColor3ub;
+#define glSecondaryColor3ub glad_glSecondaryColor3ub
+GLAD_API_CALL PFNGLSECONDARYCOLOR3UBVPROC glad_glSecondaryColor3ubv;
+#define glSecondaryColor3ubv glad_glSecondaryColor3ubv
+GLAD_API_CALL PFNGLSECONDARYCOLOR3UIPROC glad_glSecondaryColor3ui;
+#define glSecondaryColor3ui glad_glSecondaryColor3ui
+GLAD_API_CALL PFNGLSECONDARYCOLOR3UIVPROC glad_glSecondaryColor3uiv;
+#define glSecondaryColor3uiv glad_glSecondaryColor3uiv
+GLAD_API_CALL PFNGLSECONDARYCOLOR3USPROC glad_glSecondaryColor3us;
+#define glSecondaryColor3us glad_glSecondaryColor3us
+GLAD_API_CALL PFNGLSECONDARYCOLOR3USVPROC glad_glSecondaryColor3usv;
+#define glSecondaryColor3usv glad_glSecondaryColor3usv
+GLAD_API_CALL PFNGLSECONDARYCOLORP3UIPROC glad_glSecondaryColorP3ui;
+#define glSecondaryColorP3ui glad_glSecondaryColorP3ui
+GLAD_API_CALL PFNGLSECONDARYCOLORP3UIVPROC glad_glSecondaryColorP3uiv;
+#define glSecondaryColorP3uiv glad_glSecondaryColorP3uiv
+GLAD_API_CALL PFNGLSECONDARYCOLORPOINTERPROC glad_glSecondaryColorPointer;
+#define glSecondaryColorPointer glad_glSecondaryColorPointer
+GLAD_API_CALL PFNGLSELECTBUFFERPROC glad_glSelectBuffer;
+#define glSelectBuffer glad_glSelectBuffer
+GLAD_API_CALL PFNGLSHADEMODELPROC glad_glShadeModel;
+#define glShadeModel glad_glShadeModel
+GLAD_API_CALL PFNGLSHADERSOURCEPROC glad_glShaderSource;
+#define glShaderSource glad_glShaderSource
+GLAD_API_CALL PFNGLSTENCILFUNCPROC glad_glStencilFunc;
+#define glStencilFunc glad_glStencilFunc
+GLAD_API_CALL PFNGLSTENCILFUNCSEPARATEPROC glad_glStencilFuncSeparate;
+#define glStencilFuncSeparate glad_glStencilFuncSeparate
+GLAD_API_CALL PFNGLSTENCILMASKPROC glad_glStencilMask;
+#define glStencilMask glad_glStencilMask
+GLAD_API_CALL PFNGLSTENCILMASKSEPARATEPROC glad_glStencilMaskSeparate;
+#define glStencilMaskSeparate glad_glStencilMaskSeparate
+GLAD_API_CALL PFNGLSTENCILOPPROC glad_glStencilOp;
+#define glStencilOp glad_glStencilOp
+GLAD_API_CALL PFNGLSTENCILOPSEPARATEPROC glad_glStencilOpSeparate;
+#define glStencilOpSeparate glad_glStencilOpSeparate
+GLAD_API_CALL PFNGLTEXBUFFERPROC glad_glTexBuffer;
+#define glTexBuffer glad_glTexBuffer
+GLAD_API_CALL PFNGLTEXCOORD1DPROC glad_glTexCoord1d;
+#define glTexCoord1d glad_glTexCoord1d
+GLAD_API_CALL PFNGLTEXCOORD1DVPROC glad_glTexCoord1dv;
+#define glTexCoord1dv glad_glTexCoord1dv
+GLAD_API_CALL PFNGLTEXCOORD1FPROC glad_glTexCoord1f;
+#define glTexCoord1f glad_glTexCoord1f
+GLAD_API_CALL PFNGLTEXCOORD1FVPROC glad_glTexCoord1fv;
+#define
glTexCoord1fv glad_glTexCoord1fv +GLAD_API_CALL PFNGLTEXCOORD1IPROC glad_glTexCoord1i; +#define glTexCoord1i glad_glTexCoord1i +GLAD_API_CALL PFNGLTEXCOORD1IVPROC glad_glTexCoord1iv; +#define glTexCoord1iv glad_glTexCoord1iv +GLAD_API_CALL PFNGLTEXCOORD1SPROC glad_glTexCoord1s; +#define glTexCoord1s glad_glTexCoord1s +GLAD_API_CALL PFNGLTEXCOORD1SVPROC glad_glTexCoord1sv; +#define glTexCoord1sv glad_glTexCoord1sv +GLAD_API_CALL PFNGLTEXCOORD2DPROC glad_glTexCoord2d; +#define glTexCoord2d glad_glTexCoord2d +GLAD_API_CALL PFNGLTEXCOORD2DVPROC glad_glTexCoord2dv; +#define glTexCoord2dv glad_glTexCoord2dv +GLAD_API_CALL PFNGLTEXCOORD2FPROC glad_glTexCoord2f; +#define glTexCoord2f glad_glTexCoord2f +GLAD_API_CALL PFNGLTEXCOORD2FVPROC glad_glTexCoord2fv; +#define glTexCoord2fv glad_glTexCoord2fv +GLAD_API_CALL PFNGLTEXCOORD2IPROC glad_glTexCoord2i; +#define glTexCoord2i glad_glTexCoord2i +GLAD_API_CALL PFNGLTEXCOORD2IVPROC glad_glTexCoord2iv; +#define glTexCoord2iv glad_glTexCoord2iv +GLAD_API_CALL PFNGLTEXCOORD2SPROC glad_glTexCoord2s; +#define glTexCoord2s glad_glTexCoord2s +GLAD_API_CALL PFNGLTEXCOORD2SVPROC glad_glTexCoord2sv; +#define glTexCoord2sv glad_glTexCoord2sv +GLAD_API_CALL PFNGLTEXCOORD3DPROC glad_glTexCoord3d; +#define glTexCoord3d glad_glTexCoord3d +GLAD_API_CALL PFNGLTEXCOORD3DVPROC glad_glTexCoord3dv; +#define glTexCoord3dv glad_glTexCoord3dv +GLAD_API_CALL PFNGLTEXCOORD3FPROC glad_glTexCoord3f; +#define glTexCoord3f glad_glTexCoord3f +GLAD_API_CALL PFNGLTEXCOORD3FVPROC glad_glTexCoord3fv; +#define glTexCoord3fv glad_glTexCoord3fv +GLAD_API_CALL PFNGLTEXCOORD3IPROC glad_glTexCoord3i; +#define glTexCoord3i glad_glTexCoord3i +GLAD_API_CALL PFNGLTEXCOORD3IVPROC glad_glTexCoord3iv; +#define glTexCoord3iv glad_glTexCoord3iv +GLAD_API_CALL PFNGLTEXCOORD3SPROC glad_glTexCoord3s; +#define glTexCoord3s glad_glTexCoord3s +GLAD_API_CALL PFNGLTEXCOORD3SVPROC glad_glTexCoord3sv; +#define glTexCoord3sv glad_glTexCoord3sv +GLAD_API_CALL PFNGLTEXCOORD4DPROC glad_glTexCoord4d; +#define glTexCoord4d glad_glTexCoord4d +GLAD_API_CALL PFNGLTEXCOORD4DVPROC glad_glTexCoord4dv; +#define glTexCoord4dv glad_glTexCoord4dv +GLAD_API_CALL PFNGLTEXCOORD4FPROC glad_glTexCoord4f; +#define glTexCoord4f glad_glTexCoord4f +GLAD_API_CALL PFNGLTEXCOORD4FVPROC glad_glTexCoord4fv; +#define glTexCoord4fv glad_glTexCoord4fv +GLAD_API_CALL PFNGLTEXCOORD4IPROC glad_glTexCoord4i; +#define glTexCoord4i glad_glTexCoord4i +GLAD_API_CALL PFNGLTEXCOORD4IVPROC glad_glTexCoord4iv; +#define glTexCoord4iv glad_glTexCoord4iv +GLAD_API_CALL PFNGLTEXCOORD4SPROC glad_glTexCoord4s; +#define glTexCoord4s glad_glTexCoord4s +GLAD_API_CALL PFNGLTEXCOORD4SVPROC glad_glTexCoord4sv; +#define glTexCoord4sv glad_glTexCoord4sv +GLAD_API_CALL PFNGLTEXCOORDP1UIPROC glad_glTexCoordP1ui; +#define glTexCoordP1ui glad_glTexCoordP1ui +GLAD_API_CALL PFNGLTEXCOORDP1UIVPROC glad_glTexCoordP1uiv; +#define glTexCoordP1uiv glad_glTexCoordP1uiv +GLAD_API_CALL PFNGLTEXCOORDP2UIPROC glad_glTexCoordP2ui; +#define glTexCoordP2ui glad_glTexCoordP2ui +GLAD_API_CALL PFNGLTEXCOORDP2UIVPROC glad_glTexCoordP2uiv; +#define glTexCoordP2uiv glad_glTexCoordP2uiv +GLAD_API_CALL PFNGLTEXCOORDP3UIPROC glad_glTexCoordP3ui; +#define glTexCoordP3ui glad_glTexCoordP3ui +GLAD_API_CALL PFNGLTEXCOORDP3UIVPROC glad_glTexCoordP3uiv; +#define glTexCoordP3uiv glad_glTexCoordP3uiv +GLAD_API_CALL PFNGLTEXCOORDP4UIPROC glad_glTexCoordP4ui; +#define glTexCoordP4ui glad_glTexCoordP4ui +GLAD_API_CALL PFNGLTEXCOORDP4UIVPROC glad_glTexCoordP4uiv; +#define glTexCoordP4uiv glad_glTexCoordP4uiv 
+GLAD_API_CALL PFNGLTEXCOORDPOINTERPROC glad_glTexCoordPointer; +#define glTexCoordPointer glad_glTexCoordPointer +GLAD_API_CALL PFNGLTEXENVFPROC glad_glTexEnvf; +#define glTexEnvf glad_glTexEnvf +GLAD_API_CALL PFNGLTEXENVFVPROC glad_glTexEnvfv; +#define glTexEnvfv glad_glTexEnvfv +GLAD_API_CALL PFNGLTEXENVIPROC glad_glTexEnvi; +#define glTexEnvi glad_glTexEnvi +GLAD_API_CALL PFNGLTEXENVIVPROC glad_glTexEnviv; +#define glTexEnviv glad_glTexEnviv +GLAD_API_CALL PFNGLTEXGENDPROC glad_glTexGend; +#define glTexGend glad_glTexGend +GLAD_API_CALL PFNGLTEXGENDVPROC glad_glTexGendv; +#define glTexGendv glad_glTexGendv +GLAD_API_CALL PFNGLTEXGENFPROC glad_glTexGenf; +#define glTexGenf glad_glTexGenf +GLAD_API_CALL PFNGLTEXGENFVPROC glad_glTexGenfv; +#define glTexGenfv glad_glTexGenfv +GLAD_API_CALL PFNGLTEXGENIPROC glad_glTexGeni; +#define glTexGeni glad_glTexGeni +GLAD_API_CALL PFNGLTEXGENIVPROC glad_glTexGeniv; +#define glTexGeniv glad_glTexGeniv +GLAD_API_CALL PFNGLTEXIMAGE1DPROC glad_glTexImage1D; +#define glTexImage1D glad_glTexImage1D +GLAD_API_CALL PFNGLTEXIMAGE2DPROC glad_glTexImage2D; +#define glTexImage2D glad_glTexImage2D +GLAD_API_CALL PFNGLTEXIMAGE2DMULTISAMPLEPROC glad_glTexImage2DMultisample; +#define glTexImage2DMultisample glad_glTexImage2DMultisample +GLAD_API_CALL PFNGLTEXIMAGE3DPROC glad_glTexImage3D; +#define glTexImage3D glad_glTexImage3D +GLAD_API_CALL PFNGLTEXIMAGE3DMULTISAMPLEPROC glad_glTexImage3DMultisample; +#define glTexImage3DMultisample glad_glTexImage3DMultisample +GLAD_API_CALL PFNGLTEXPARAMETERIIVPROC glad_glTexParameterIiv; +#define glTexParameterIiv glad_glTexParameterIiv +GLAD_API_CALL PFNGLTEXPARAMETERIUIVPROC glad_glTexParameterIuiv; +#define glTexParameterIuiv glad_glTexParameterIuiv +GLAD_API_CALL PFNGLTEXPARAMETERFPROC glad_glTexParameterf; +#define glTexParameterf glad_glTexParameterf +GLAD_API_CALL PFNGLTEXPARAMETERFVPROC glad_glTexParameterfv; +#define glTexParameterfv glad_glTexParameterfv +GLAD_API_CALL PFNGLTEXPARAMETERIPROC glad_glTexParameteri; +#define glTexParameteri glad_glTexParameteri +GLAD_API_CALL PFNGLTEXPARAMETERIVPROC glad_glTexParameteriv; +#define glTexParameteriv glad_glTexParameteriv +GLAD_API_CALL PFNGLTEXSUBIMAGE1DPROC glad_glTexSubImage1D; +#define glTexSubImage1D glad_glTexSubImage1D +GLAD_API_CALL PFNGLTEXSUBIMAGE2DPROC glad_glTexSubImage2D; +#define glTexSubImage2D glad_glTexSubImage2D +GLAD_API_CALL PFNGLTEXSUBIMAGE3DPROC glad_glTexSubImage3D; +#define glTexSubImage3D glad_glTexSubImage3D +GLAD_API_CALL PFNGLTRANSFORMFEEDBACKVARYINGSPROC glad_glTransformFeedbackVaryings; +#define glTransformFeedbackVaryings glad_glTransformFeedbackVaryings +GLAD_API_CALL PFNGLTRANSLATEDPROC glad_glTranslated; +#define glTranslated glad_glTranslated +GLAD_API_CALL PFNGLTRANSLATEFPROC glad_glTranslatef; +#define glTranslatef glad_glTranslatef +GLAD_API_CALL PFNGLUNIFORM1FPROC glad_glUniform1f; +#define glUniform1f glad_glUniform1f +GLAD_API_CALL PFNGLUNIFORM1FVPROC glad_glUniform1fv; +#define glUniform1fv glad_glUniform1fv +GLAD_API_CALL PFNGLUNIFORM1IPROC glad_glUniform1i; +#define glUniform1i glad_glUniform1i +GLAD_API_CALL PFNGLUNIFORM1IVPROC glad_glUniform1iv; +#define glUniform1iv glad_glUniform1iv +GLAD_API_CALL PFNGLUNIFORM1UIPROC glad_glUniform1ui; +#define glUniform1ui glad_glUniform1ui +GLAD_API_CALL PFNGLUNIFORM1UIVPROC glad_glUniform1uiv; +#define glUniform1uiv glad_glUniform1uiv +GLAD_API_CALL PFNGLUNIFORM2FPROC glad_glUniform2f; +#define glUniform2f glad_glUniform2f +GLAD_API_CALL PFNGLUNIFORM2FVPROC glad_glUniform2fv; +#define 
glUniform2fv glad_glUniform2fv +GLAD_API_CALL PFNGLUNIFORM2IPROC glad_glUniform2i; +#define glUniform2i glad_glUniform2i +GLAD_API_CALL PFNGLUNIFORM2IVPROC glad_glUniform2iv; +#define glUniform2iv glad_glUniform2iv +GLAD_API_CALL PFNGLUNIFORM2UIPROC glad_glUniform2ui; +#define glUniform2ui glad_glUniform2ui +GLAD_API_CALL PFNGLUNIFORM2UIVPROC glad_glUniform2uiv; +#define glUniform2uiv glad_glUniform2uiv +GLAD_API_CALL PFNGLUNIFORM3FPROC glad_glUniform3f; +#define glUniform3f glad_glUniform3f +GLAD_API_CALL PFNGLUNIFORM3FVPROC glad_glUniform3fv; +#define glUniform3fv glad_glUniform3fv +GLAD_API_CALL PFNGLUNIFORM3IPROC glad_glUniform3i; +#define glUniform3i glad_glUniform3i +GLAD_API_CALL PFNGLUNIFORM3IVPROC glad_glUniform3iv; +#define glUniform3iv glad_glUniform3iv +GLAD_API_CALL PFNGLUNIFORM3UIPROC glad_glUniform3ui; +#define glUniform3ui glad_glUniform3ui +GLAD_API_CALL PFNGLUNIFORM3UIVPROC glad_glUniform3uiv; +#define glUniform3uiv glad_glUniform3uiv +GLAD_API_CALL PFNGLUNIFORM4FPROC glad_glUniform4f; +#define glUniform4f glad_glUniform4f +GLAD_API_CALL PFNGLUNIFORM4FVPROC glad_glUniform4fv; +#define glUniform4fv glad_glUniform4fv +GLAD_API_CALL PFNGLUNIFORM4IPROC glad_glUniform4i; +#define glUniform4i glad_glUniform4i +GLAD_API_CALL PFNGLUNIFORM4IVPROC glad_glUniform4iv; +#define glUniform4iv glad_glUniform4iv +GLAD_API_CALL PFNGLUNIFORM4UIPROC glad_glUniform4ui; +#define glUniform4ui glad_glUniform4ui +GLAD_API_CALL PFNGLUNIFORM4UIVPROC glad_glUniform4uiv; +#define glUniform4uiv glad_glUniform4uiv +GLAD_API_CALL PFNGLUNIFORMBLOCKBINDINGPROC glad_glUniformBlockBinding; +#define glUniformBlockBinding glad_glUniformBlockBinding +GLAD_API_CALL PFNGLUNIFORMMATRIX2FVPROC glad_glUniformMatrix2fv; +#define glUniformMatrix2fv glad_glUniformMatrix2fv +GLAD_API_CALL PFNGLUNIFORMMATRIX2X3FVPROC glad_glUniformMatrix2x3fv; +#define glUniformMatrix2x3fv glad_glUniformMatrix2x3fv +GLAD_API_CALL PFNGLUNIFORMMATRIX2X4FVPROC glad_glUniformMatrix2x4fv; +#define glUniformMatrix2x4fv glad_glUniformMatrix2x4fv +GLAD_API_CALL PFNGLUNIFORMMATRIX3FVPROC glad_glUniformMatrix3fv; +#define glUniformMatrix3fv glad_glUniformMatrix3fv +GLAD_API_CALL PFNGLUNIFORMMATRIX3X2FVPROC glad_glUniformMatrix3x2fv; +#define glUniformMatrix3x2fv glad_glUniformMatrix3x2fv +GLAD_API_CALL PFNGLUNIFORMMATRIX3X4FVPROC glad_glUniformMatrix3x4fv; +#define glUniformMatrix3x4fv glad_glUniformMatrix3x4fv +GLAD_API_CALL PFNGLUNIFORMMATRIX4FVPROC glad_glUniformMatrix4fv; +#define glUniformMatrix4fv glad_glUniformMatrix4fv +GLAD_API_CALL PFNGLUNIFORMMATRIX4X2FVPROC glad_glUniformMatrix4x2fv; +#define glUniformMatrix4x2fv glad_glUniformMatrix4x2fv +GLAD_API_CALL PFNGLUNIFORMMATRIX4X3FVPROC glad_glUniformMatrix4x3fv; +#define glUniformMatrix4x3fv glad_glUniformMatrix4x3fv +GLAD_API_CALL PFNGLUNMAPBUFFERPROC glad_glUnmapBuffer; +#define glUnmapBuffer glad_glUnmapBuffer +GLAD_API_CALL PFNGLUSEPROGRAMPROC glad_glUseProgram; +#define glUseProgram glad_glUseProgram +GLAD_API_CALL PFNGLVALIDATEPROGRAMPROC glad_glValidateProgram; +#define glValidateProgram glad_glValidateProgram +GLAD_API_CALL PFNGLVERTEX2DPROC glad_glVertex2d; +#define glVertex2d glad_glVertex2d +GLAD_API_CALL PFNGLVERTEX2DVPROC glad_glVertex2dv; +#define glVertex2dv glad_glVertex2dv +GLAD_API_CALL PFNGLVERTEX2FPROC glad_glVertex2f; +#define glVertex2f glad_glVertex2f +GLAD_API_CALL PFNGLVERTEX2FVPROC glad_glVertex2fv; +#define glVertex2fv glad_glVertex2fv +GLAD_API_CALL PFNGLVERTEX2IPROC glad_glVertex2i; +#define glVertex2i glad_glVertex2i +GLAD_API_CALL PFNGLVERTEX2IVPROC 
glad_glVertex2iv; +#define glVertex2iv glad_glVertex2iv +GLAD_API_CALL PFNGLVERTEX2SPROC glad_glVertex2s; +#define glVertex2s glad_glVertex2s +GLAD_API_CALL PFNGLVERTEX2SVPROC glad_glVertex2sv; +#define glVertex2sv glad_glVertex2sv +GLAD_API_CALL PFNGLVERTEX3DPROC glad_glVertex3d; +#define glVertex3d glad_glVertex3d +GLAD_API_CALL PFNGLVERTEX3DVPROC glad_glVertex3dv; +#define glVertex3dv glad_glVertex3dv +GLAD_API_CALL PFNGLVERTEX3FPROC glad_glVertex3f; +#define glVertex3f glad_glVertex3f +GLAD_API_CALL PFNGLVERTEX3FVPROC glad_glVertex3fv; +#define glVertex3fv glad_glVertex3fv +GLAD_API_CALL PFNGLVERTEX3IPROC glad_glVertex3i; +#define glVertex3i glad_glVertex3i +GLAD_API_CALL PFNGLVERTEX3IVPROC glad_glVertex3iv; +#define glVertex3iv glad_glVertex3iv +GLAD_API_CALL PFNGLVERTEX3SPROC glad_glVertex3s; +#define glVertex3s glad_glVertex3s +GLAD_API_CALL PFNGLVERTEX3SVPROC glad_glVertex3sv; +#define glVertex3sv glad_glVertex3sv +GLAD_API_CALL PFNGLVERTEX4DPROC glad_glVertex4d; +#define glVertex4d glad_glVertex4d +GLAD_API_CALL PFNGLVERTEX4DVPROC glad_glVertex4dv; +#define glVertex4dv glad_glVertex4dv +GLAD_API_CALL PFNGLVERTEX4FPROC glad_glVertex4f; +#define glVertex4f glad_glVertex4f +GLAD_API_CALL PFNGLVERTEX4FVPROC glad_glVertex4fv; +#define glVertex4fv glad_glVertex4fv +GLAD_API_CALL PFNGLVERTEX4IPROC glad_glVertex4i; +#define glVertex4i glad_glVertex4i +GLAD_API_CALL PFNGLVERTEX4IVPROC glad_glVertex4iv; +#define glVertex4iv glad_glVertex4iv +GLAD_API_CALL PFNGLVERTEX4SPROC glad_glVertex4s; +#define glVertex4s glad_glVertex4s +GLAD_API_CALL PFNGLVERTEX4SVPROC glad_glVertex4sv; +#define glVertex4sv glad_glVertex4sv +GLAD_API_CALL PFNGLVERTEXATTRIB1DPROC glad_glVertexAttrib1d; +#define glVertexAttrib1d glad_glVertexAttrib1d +GLAD_API_CALL PFNGLVERTEXATTRIB1DVPROC glad_glVertexAttrib1dv; +#define glVertexAttrib1dv glad_glVertexAttrib1dv +GLAD_API_CALL PFNGLVERTEXATTRIB1FPROC glad_glVertexAttrib1f; +#define glVertexAttrib1f glad_glVertexAttrib1f +GLAD_API_CALL PFNGLVERTEXATTRIB1FVPROC glad_glVertexAttrib1fv; +#define glVertexAttrib1fv glad_glVertexAttrib1fv +GLAD_API_CALL PFNGLVERTEXATTRIB1SPROC glad_glVertexAttrib1s; +#define glVertexAttrib1s glad_glVertexAttrib1s +GLAD_API_CALL PFNGLVERTEXATTRIB1SVPROC glad_glVertexAttrib1sv; +#define glVertexAttrib1sv glad_glVertexAttrib1sv +GLAD_API_CALL PFNGLVERTEXATTRIB2DPROC glad_glVertexAttrib2d; +#define glVertexAttrib2d glad_glVertexAttrib2d +GLAD_API_CALL PFNGLVERTEXATTRIB2DVPROC glad_glVertexAttrib2dv; +#define glVertexAttrib2dv glad_glVertexAttrib2dv +GLAD_API_CALL PFNGLVERTEXATTRIB2FPROC glad_glVertexAttrib2f; +#define glVertexAttrib2f glad_glVertexAttrib2f +GLAD_API_CALL PFNGLVERTEXATTRIB2FVPROC glad_glVertexAttrib2fv; +#define glVertexAttrib2fv glad_glVertexAttrib2fv +GLAD_API_CALL PFNGLVERTEXATTRIB2SPROC glad_glVertexAttrib2s; +#define glVertexAttrib2s glad_glVertexAttrib2s +GLAD_API_CALL PFNGLVERTEXATTRIB2SVPROC glad_glVertexAttrib2sv; +#define glVertexAttrib2sv glad_glVertexAttrib2sv +GLAD_API_CALL PFNGLVERTEXATTRIB3DPROC glad_glVertexAttrib3d; +#define glVertexAttrib3d glad_glVertexAttrib3d +GLAD_API_CALL PFNGLVERTEXATTRIB3DVPROC glad_glVertexAttrib3dv; +#define glVertexAttrib3dv glad_glVertexAttrib3dv +GLAD_API_CALL PFNGLVERTEXATTRIB3FPROC glad_glVertexAttrib3f; +#define glVertexAttrib3f glad_glVertexAttrib3f +GLAD_API_CALL PFNGLVERTEXATTRIB3FVPROC glad_glVertexAttrib3fv; +#define glVertexAttrib3fv glad_glVertexAttrib3fv +GLAD_API_CALL PFNGLVERTEXATTRIB3SPROC glad_glVertexAttrib3s; +#define glVertexAttrib3s glad_glVertexAttrib3s 
+GLAD_API_CALL PFNGLVERTEXATTRIB3SVPROC glad_glVertexAttrib3sv; +#define glVertexAttrib3sv glad_glVertexAttrib3sv +GLAD_API_CALL PFNGLVERTEXATTRIB4NBVPROC glad_glVertexAttrib4Nbv; +#define glVertexAttrib4Nbv glad_glVertexAttrib4Nbv +GLAD_API_CALL PFNGLVERTEXATTRIB4NIVPROC glad_glVertexAttrib4Niv; +#define glVertexAttrib4Niv glad_glVertexAttrib4Niv +GLAD_API_CALL PFNGLVERTEXATTRIB4NSVPROC glad_glVertexAttrib4Nsv; +#define glVertexAttrib4Nsv glad_glVertexAttrib4Nsv +GLAD_API_CALL PFNGLVERTEXATTRIB4NUBPROC glad_glVertexAttrib4Nub; +#define glVertexAttrib4Nub glad_glVertexAttrib4Nub +GLAD_API_CALL PFNGLVERTEXATTRIB4NUBVPROC glad_glVertexAttrib4Nubv; +#define glVertexAttrib4Nubv glad_glVertexAttrib4Nubv +GLAD_API_CALL PFNGLVERTEXATTRIB4NUIVPROC glad_glVertexAttrib4Nuiv; +#define glVertexAttrib4Nuiv glad_glVertexAttrib4Nuiv +GLAD_API_CALL PFNGLVERTEXATTRIB4NUSVPROC glad_glVertexAttrib4Nusv; +#define glVertexAttrib4Nusv glad_glVertexAttrib4Nusv +GLAD_API_CALL PFNGLVERTEXATTRIB4BVPROC glad_glVertexAttrib4bv; +#define glVertexAttrib4bv glad_glVertexAttrib4bv +GLAD_API_CALL PFNGLVERTEXATTRIB4DPROC glad_glVertexAttrib4d; +#define glVertexAttrib4d glad_glVertexAttrib4d +GLAD_API_CALL PFNGLVERTEXATTRIB4DVPROC glad_glVertexAttrib4dv; +#define glVertexAttrib4dv glad_glVertexAttrib4dv +GLAD_API_CALL PFNGLVERTEXATTRIB4FPROC glad_glVertexAttrib4f; +#define glVertexAttrib4f glad_glVertexAttrib4f +GLAD_API_CALL PFNGLVERTEXATTRIB4FVPROC glad_glVertexAttrib4fv; +#define glVertexAttrib4fv glad_glVertexAttrib4fv +GLAD_API_CALL PFNGLVERTEXATTRIB4IVPROC glad_glVertexAttrib4iv; +#define glVertexAttrib4iv glad_glVertexAttrib4iv +GLAD_API_CALL PFNGLVERTEXATTRIB4SPROC glad_glVertexAttrib4s; +#define glVertexAttrib4s glad_glVertexAttrib4s +GLAD_API_CALL PFNGLVERTEXATTRIB4SVPROC glad_glVertexAttrib4sv; +#define glVertexAttrib4sv glad_glVertexAttrib4sv +GLAD_API_CALL PFNGLVERTEXATTRIB4UBVPROC glad_glVertexAttrib4ubv; +#define glVertexAttrib4ubv glad_glVertexAttrib4ubv +GLAD_API_CALL PFNGLVERTEXATTRIB4UIVPROC glad_glVertexAttrib4uiv; +#define glVertexAttrib4uiv glad_glVertexAttrib4uiv +GLAD_API_CALL PFNGLVERTEXATTRIB4USVPROC glad_glVertexAttrib4usv; +#define glVertexAttrib4usv glad_glVertexAttrib4usv +GLAD_API_CALL PFNGLVERTEXATTRIBDIVISORPROC glad_glVertexAttribDivisor; +#define glVertexAttribDivisor glad_glVertexAttribDivisor +GLAD_API_CALL PFNGLVERTEXATTRIBI1IPROC glad_glVertexAttribI1i; +#define glVertexAttribI1i glad_glVertexAttribI1i +GLAD_API_CALL PFNGLVERTEXATTRIBI1IVPROC glad_glVertexAttribI1iv; +#define glVertexAttribI1iv glad_glVertexAttribI1iv +GLAD_API_CALL PFNGLVERTEXATTRIBI1UIPROC glad_glVertexAttribI1ui; +#define glVertexAttribI1ui glad_glVertexAttribI1ui +GLAD_API_CALL PFNGLVERTEXATTRIBI1UIVPROC glad_glVertexAttribI1uiv; +#define glVertexAttribI1uiv glad_glVertexAttribI1uiv +GLAD_API_CALL PFNGLVERTEXATTRIBI2IPROC glad_glVertexAttribI2i; +#define glVertexAttribI2i glad_glVertexAttribI2i +GLAD_API_CALL PFNGLVERTEXATTRIBI2IVPROC glad_glVertexAttribI2iv; +#define glVertexAttribI2iv glad_glVertexAttribI2iv +GLAD_API_CALL PFNGLVERTEXATTRIBI2UIPROC glad_glVertexAttribI2ui; +#define glVertexAttribI2ui glad_glVertexAttribI2ui +GLAD_API_CALL PFNGLVERTEXATTRIBI2UIVPROC glad_glVertexAttribI2uiv; +#define glVertexAttribI2uiv glad_glVertexAttribI2uiv +GLAD_API_CALL PFNGLVERTEXATTRIBI3IPROC glad_glVertexAttribI3i; +#define glVertexAttribI3i glad_glVertexAttribI3i +GLAD_API_CALL PFNGLVERTEXATTRIBI3IVPROC glad_glVertexAttribI3iv; +#define glVertexAttribI3iv glad_glVertexAttribI3iv +GLAD_API_CALL PFNGLVERTEXATTRIBI3UIPROC 
glad_glVertexAttribI3ui; +#define glVertexAttribI3ui glad_glVertexAttribI3ui +GLAD_API_CALL PFNGLVERTEXATTRIBI3UIVPROC glad_glVertexAttribI3uiv; +#define glVertexAttribI3uiv glad_glVertexAttribI3uiv +GLAD_API_CALL PFNGLVERTEXATTRIBI4BVPROC glad_glVertexAttribI4bv; +#define glVertexAttribI4bv glad_glVertexAttribI4bv +GLAD_API_CALL PFNGLVERTEXATTRIBI4IPROC glad_glVertexAttribI4i; +#define glVertexAttribI4i glad_glVertexAttribI4i +GLAD_API_CALL PFNGLVERTEXATTRIBI4IVPROC glad_glVertexAttribI4iv; +#define glVertexAttribI4iv glad_glVertexAttribI4iv +GLAD_API_CALL PFNGLVERTEXATTRIBI4SVPROC glad_glVertexAttribI4sv; +#define glVertexAttribI4sv glad_glVertexAttribI4sv +GLAD_API_CALL PFNGLVERTEXATTRIBI4UBVPROC glad_glVertexAttribI4ubv; +#define glVertexAttribI4ubv glad_glVertexAttribI4ubv +GLAD_API_CALL PFNGLVERTEXATTRIBI4UIPROC glad_glVertexAttribI4ui; +#define glVertexAttribI4ui glad_glVertexAttribI4ui +GLAD_API_CALL PFNGLVERTEXATTRIBI4UIVPROC glad_glVertexAttribI4uiv; +#define glVertexAttribI4uiv glad_glVertexAttribI4uiv +GLAD_API_CALL PFNGLVERTEXATTRIBI4USVPROC glad_glVertexAttribI4usv; +#define glVertexAttribI4usv glad_glVertexAttribI4usv +GLAD_API_CALL PFNGLVERTEXATTRIBIPOINTERPROC glad_glVertexAttribIPointer; +#define glVertexAttribIPointer glad_glVertexAttribIPointer +GLAD_API_CALL PFNGLVERTEXATTRIBP1UIPROC glad_glVertexAttribP1ui; +#define glVertexAttribP1ui glad_glVertexAttribP1ui +GLAD_API_CALL PFNGLVERTEXATTRIBP1UIVPROC glad_glVertexAttribP1uiv; +#define glVertexAttribP1uiv glad_glVertexAttribP1uiv +GLAD_API_CALL PFNGLVERTEXATTRIBP2UIPROC glad_glVertexAttribP2ui; +#define glVertexAttribP2ui glad_glVertexAttribP2ui +GLAD_API_CALL PFNGLVERTEXATTRIBP2UIVPROC glad_glVertexAttribP2uiv; +#define glVertexAttribP2uiv glad_glVertexAttribP2uiv +GLAD_API_CALL PFNGLVERTEXATTRIBP3UIPROC glad_glVertexAttribP3ui; +#define glVertexAttribP3ui glad_glVertexAttribP3ui +GLAD_API_CALL PFNGLVERTEXATTRIBP3UIVPROC glad_glVertexAttribP3uiv; +#define glVertexAttribP3uiv glad_glVertexAttribP3uiv +GLAD_API_CALL PFNGLVERTEXATTRIBP4UIPROC glad_glVertexAttribP4ui; +#define glVertexAttribP4ui glad_glVertexAttribP4ui +GLAD_API_CALL PFNGLVERTEXATTRIBP4UIVPROC glad_glVertexAttribP4uiv; +#define glVertexAttribP4uiv glad_glVertexAttribP4uiv +GLAD_API_CALL PFNGLVERTEXATTRIBPOINTERPROC glad_glVertexAttribPointer; +#define glVertexAttribPointer glad_glVertexAttribPointer +GLAD_API_CALL PFNGLVERTEXP2UIPROC glad_glVertexP2ui; +#define glVertexP2ui glad_glVertexP2ui +GLAD_API_CALL PFNGLVERTEXP2UIVPROC glad_glVertexP2uiv; +#define glVertexP2uiv glad_glVertexP2uiv +GLAD_API_CALL PFNGLVERTEXP3UIPROC glad_glVertexP3ui; +#define glVertexP3ui glad_glVertexP3ui +GLAD_API_CALL PFNGLVERTEXP3UIVPROC glad_glVertexP3uiv; +#define glVertexP3uiv glad_glVertexP3uiv +GLAD_API_CALL PFNGLVERTEXP4UIPROC glad_glVertexP4ui; +#define glVertexP4ui glad_glVertexP4ui +GLAD_API_CALL PFNGLVERTEXP4UIVPROC glad_glVertexP4uiv; +#define glVertexP4uiv glad_glVertexP4uiv +GLAD_API_CALL PFNGLVERTEXPOINTERPROC glad_glVertexPointer; +#define glVertexPointer glad_glVertexPointer +GLAD_API_CALL PFNGLVIEWPORTPROC glad_glViewport; +#define glViewport glad_glViewport +GLAD_API_CALL PFNGLWAITSYNCPROC glad_glWaitSync; +#define glWaitSync glad_glWaitSync +GLAD_API_CALL PFNGLWINDOWPOS2DPROC glad_glWindowPos2d; +#define glWindowPos2d glad_glWindowPos2d +GLAD_API_CALL PFNGLWINDOWPOS2DVPROC glad_glWindowPos2dv; +#define glWindowPos2dv glad_glWindowPos2dv +GLAD_API_CALL PFNGLWINDOWPOS2FPROC glad_glWindowPos2f; +#define glWindowPos2f glad_glWindowPos2f +GLAD_API_CALL 
PFNGLWINDOWPOS2FVPROC glad_glWindowPos2fv;
+#define glWindowPos2fv glad_glWindowPos2fv
+GLAD_API_CALL PFNGLWINDOWPOS2IPROC glad_glWindowPos2i;
+#define glWindowPos2i glad_glWindowPos2i
+GLAD_API_CALL PFNGLWINDOWPOS2IVPROC glad_glWindowPos2iv;
+#define glWindowPos2iv glad_glWindowPos2iv
+GLAD_API_CALL PFNGLWINDOWPOS2SPROC glad_glWindowPos2s;
+#define glWindowPos2s glad_glWindowPos2s
+GLAD_API_CALL PFNGLWINDOWPOS2SVPROC glad_glWindowPos2sv;
+#define glWindowPos2sv glad_glWindowPos2sv
+GLAD_API_CALL PFNGLWINDOWPOS3DPROC glad_glWindowPos3d;
+#define glWindowPos3d glad_glWindowPos3d
+GLAD_API_CALL PFNGLWINDOWPOS3DVPROC glad_glWindowPos3dv;
+#define glWindowPos3dv glad_glWindowPos3dv
+GLAD_API_CALL PFNGLWINDOWPOS3FPROC glad_glWindowPos3f;
+#define glWindowPos3f glad_glWindowPos3f
+GLAD_API_CALL PFNGLWINDOWPOS3FVPROC glad_glWindowPos3fv;
+#define glWindowPos3fv glad_glWindowPos3fv
+GLAD_API_CALL PFNGLWINDOWPOS3IPROC glad_glWindowPos3i;
+#define glWindowPos3i glad_glWindowPos3i
+GLAD_API_CALL PFNGLWINDOWPOS3IVPROC glad_glWindowPos3iv;
+#define glWindowPos3iv glad_glWindowPos3iv
+GLAD_API_CALL PFNGLWINDOWPOS3SPROC glad_glWindowPos3s;
+#define glWindowPos3s glad_glWindowPos3s
+GLAD_API_CALL PFNGLWINDOWPOS3SVPROC glad_glWindowPos3sv;
+#define glWindowPos3sv glad_glWindowPos3sv
+
+
+
+
+
+GLAD_API_CALL int gladLoadGLUserPtr( GLADuserptrloadfunc load, void *userptr);
+GLAD_API_CALL int gladLoadGL( GLADloadfunc load);
+
+
+#ifdef GLAD_GL
+
+GLAD_API_CALL int gladLoaderLoadGL(void);
+GLAD_API_CALL void gladLoaderUnloadGL(void);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
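The hunk above closes the generated glad/gl.h with glad 2's loader entry points, which replace the glad 0.1.36 gladLoadGL(void)/gladLoadGLLoader(GLADloadproc) pair removed below. A minimal sketch of how a client might bring up GL through the new API, assuming a GL context is already current on the calling thread; the GLAD_VERSION_MAJOR/GLAD_VERSION_MINOR helper macros are assumed to come from the generated glad/gl.h and are not part of this hunk:

#include <stdio.h>
#include <glad/gl.h>

int init_gl(void)
{
    /* gladLoaderLoadGL() uses glad 2's built-in platform loader (GLAD_GL)
       and returns the packed GL version, or 0 on failure. */
    int version = gladLoaderLoadGL();
    if (version == 0) {
        fprintf(stderr, "Failed to load GL entry points\n");
        return 0;
    }
    /* GLAD_VERSION_MAJOR/MINOR: assumed helpers from glad 2's generated header. */
    printf("Loaded GL %d.%d\n", GLAD_VERSION_MAJOR(version), GLAD_VERSION_MINOR(version));
    return 1;
}

When the built-in loader is used, gladLoaderUnloadGL() should be called at shutdown; with a windowing toolkit, its proc-address getter would instead be passed to gladLoadGL() as the GLADloadfunc.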
diff --git a/thirdparty/glad/glad/glad.h b/thirdparty/glad/glad/glad.h
deleted file mode 100644
index 37d12e4ee2..0000000000
--- a/thirdparty/glad/glad/glad.h
+++ /dev/null
@@ -1,3801 +0,0 @@
-/*
-
-    OpenGL loader generated by glad 0.1.36 on Sun Sep 4 15:50:32 2022.
-
-    Language/Generator: C/C++
-    Specification: gl
-    APIs: gl=3.3
-    Profile: compatibility
-    Extensions:
-        GL_ARB_debug_output,
-        GL_ARB_framebuffer_object,
-        GL_EXT_framebuffer_blit,
-        GL_EXT_framebuffer_multisample,
-        GL_EXT_framebuffer_object,
-        GL_OVR_multiview,
-        GL_OVR_multiview2
-    Loader: True
-    Local files: False
-    Omit khrplatform: False
-    Reproducible: False
-
-    Commandline:
-        --profile="compatibility" --api="gl=3.3" --generator="c" --spec="gl" --extensions="GL_ARB_debug_output,GL_ARB_framebuffer_object,GL_EXT_framebuffer_blit,GL_EXT_framebuffer_multisample,GL_EXT_framebuffer_object,GL_OVR_multiview,GL_OVR_multiview2"
-    Online:
-        https://glad.dav1d.de/#profile=compatibility&language=c&specification=gl&loader=on&api=gl%3D3.3&extensions=GL_ARB_debug_output&extensions=GL_ARB_framebuffer_object&extensions=GL_EXT_framebuffer_blit&extensions=GL_EXT_framebuffer_multisample&extensions=GL_EXT_framebuffer_object&extensions=GL_OVR_multiview&extensions=GL_OVR_multiview2
-*/
-
-
-#ifndef __glad_h_
-#define __glad_h_
-
-#ifdef __gl_h_
-#error OpenGL header already included, remove this include, glad already provides it
-#endif
-#define __gl_h_
-
-#if defined(_WIN32) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
-#define APIENTRY __stdcall
-#endif
-
-#ifndef APIENTRY
-#define APIENTRY
-#endif
-#ifndef APIENTRYP
-#define APIENTRYP APIENTRY *
-#endif
-
-#ifndef GLAPIENTRY
-#define GLAPIENTRY APIENTRY
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct gladGLversionStruct {
-    int major;
-    int minor;
-};
-
-typedef void* (* GLADloadproc)(const char *name);
-
-#ifndef GLAPI
-# if defined(GLAD_GLAPI_EXPORT)
-#  if defined(_WIN32) || defined(__CYGWIN__)
-#   if defined(GLAD_GLAPI_EXPORT_BUILD)
-#    if defined(__GNUC__)
-#     define GLAPI __attribute__ ((dllexport)) extern
-#    else
-#     define GLAPI __declspec(dllexport) extern
-#    endif
-#   else
-#    if defined(__GNUC__)
-#     define GLAPI __attribute__ ((dllimport)) extern
-#    else
-#     define GLAPI __declspec(dllimport) extern
-#    endif
-#   endif
-#  elif defined(__GNUC__) && defined(GLAD_GLAPI_EXPORT_BUILD)
-#   define GLAPI __attribute__ ((visibility ("default"))) extern
-#  else
-#   define GLAPI extern
-#  endif
-# else
-#  define GLAPI extern
-# endif
-#endif
-
-GLAPI struct gladGLversionStruct GLVersion;
-
-GLAPI int gladLoadGL(void);
-
-GLAPI int gladLoadGLLoader(GLADloadproc);
-
-#include <KHR/khrplatform.h>
-typedef unsigned int GLenum;
-typedef unsigned char GLboolean;
-typedef unsigned int GLbitfield;
-typedef void GLvoid;
-typedef khronos_int8_t GLbyte;
-typedef khronos_uint8_t GLubyte;
-typedef khronos_int16_t GLshort;
-typedef khronos_uint16_t GLushort;
-typedef int GLint;
-typedef unsigned int GLuint;
-typedef khronos_int32_t GLclampx;
-typedef int GLsizei;
-typedef khronos_float_t GLfloat;
-typedef khronos_float_t GLclampf;
-typedef double GLdouble;
-typedef double GLclampd;
-typedef void *GLeglClientBufferEXT;
-typedef void *GLeglImageOES;
-typedef char GLchar;
-typedef char GLcharARB;
-#ifdef __APPLE__
-typedef void *GLhandleARB;
-#else
-typedef unsigned int GLhandleARB;
-#endif
-typedef khronos_uint16_t GLhalf;
-typedef khronos_uint16_t GLhalfARB;
-typedef khronos_int32_t GLfixed;
-typedef khronos_intptr_t GLintptr;
-typedef khronos_intptr_t GLintptrARB;
-typedef khronos_ssize_t GLsizeiptr;
-typedef khronos_ssize_t GLsizeiptrARB;
-typedef khronos_int64_t GLint64;
-typedef khronos_int64_t GLint64EXT;
-typedef khronos_uint64_t GLuint64;
-typedef khronos_uint64_t GLuint64EXT;
-typedef struct __GLsync *GLsync;
-struct
_cl_context; -struct _cl_event; -typedef void (APIENTRY *GLDEBUGPROC)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam); -typedef void (APIENTRY *GLDEBUGPROCARB)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam); -typedef void (APIENTRY *GLDEBUGPROCKHR)(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam); -typedef void (APIENTRY *GLDEBUGPROCAMD)(GLuint id,GLenum category,GLenum severity,GLsizei length,const GLchar *message,void *userParam); -typedef unsigned short GLhalfNV; -typedef GLintptr GLvdpauSurfaceNV; -typedef void (APIENTRY *GLVULKANPROCNV)(void); -#define GL_DEPTH_BUFFER_BIT 0x00000100 -#define GL_STENCIL_BUFFER_BIT 0x00000400 -#define GL_COLOR_BUFFER_BIT 0x00004000 -#define GL_FALSE 0 -#define GL_TRUE 1 -#define GL_POINTS 0x0000 -#define GL_LINES 0x0001 -#define GL_LINE_LOOP 0x0002 -#define GL_LINE_STRIP 0x0003 -#define GL_TRIANGLES 0x0004 -#define GL_TRIANGLE_STRIP 0x0005 -#define GL_TRIANGLE_FAN 0x0006 -#define GL_QUADS 0x0007 -#define GL_NEVER 0x0200 -#define GL_LESS 0x0201 -#define GL_EQUAL 0x0202 -#define GL_LEQUAL 0x0203 -#define GL_GREATER 0x0204 -#define GL_NOTEQUAL 0x0205 -#define GL_GEQUAL 0x0206 -#define GL_ALWAYS 0x0207 -#define GL_ZERO 0 -#define GL_ONE 1 -#define GL_SRC_COLOR 0x0300 -#define GL_ONE_MINUS_SRC_COLOR 0x0301 -#define GL_SRC_ALPHA 0x0302 -#define GL_ONE_MINUS_SRC_ALPHA 0x0303 -#define GL_DST_ALPHA 0x0304 -#define GL_ONE_MINUS_DST_ALPHA 0x0305 -#define GL_DST_COLOR 0x0306 -#define GL_ONE_MINUS_DST_COLOR 0x0307 -#define GL_SRC_ALPHA_SATURATE 0x0308 -#define GL_NONE 0 -#define GL_FRONT_LEFT 0x0400 -#define GL_FRONT_RIGHT 0x0401 -#define GL_BACK_LEFT 0x0402 -#define GL_BACK_RIGHT 0x0403 -#define GL_FRONT 0x0404 -#define GL_BACK 0x0405 -#define GL_LEFT 0x0406 -#define GL_RIGHT 0x0407 -#define GL_FRONT_AND_BACK 0x0408 -#define GL_NO_ERROR 0 -#define GL_INVALID_ENUM 0x0500 -#define GL_INVALID_VALUE 0x0501 -#define GL_INVALID_OPERATION 0x0502 -#define GL_OUT_OF_MEMORY 0x0505 -#define GL_CW 0x0900 -#define GL_CCW 0x0901 -#define GL_POINT_SIZE 0x0B11 -#define GL_POINT_SIZE_RANGE 0x0B12 -#define GL_POINT_SIZE_GRANULARITY 0x0B13 -#define GL_LINE_SMOOTH 0x0B20 -#define GL_LINE_WIDTH 0x0B21 -#define GL_LINE_WIDTH_RANGE 0x0B22 -#define GL_LINE_WIDTH_GRANULARITY 0x0B23 -#define GL_POLYGON_MODE 0x0B40 -#define GL_POLYGON_SMOOTH 0x0B41 -#define GL_CULL_FACE 0x0B44 -#define GL_CULL_FACE_MODE 0x0B45 -#define GL_FRONT_FACE 0x0B46 -#define GL_DEPTH_RANGE 0x0B70 -#define GL_DEPTH_TEST 0x0B71 -#define GL_DEPTH_WRITEMASK 0x0B72 -#define GL_DEPTH_CLEAR_VALUE 0x0B73 -#define GL_DEPTH_FUNC 0x0B74 -#define GL_STENCIL_TEST 0x0B90 -#define GL_STENCIL_CLEAR_VALUE 0x0B91 -#define GL_STENCIL_FUNC 0x0B92 -#define GL_STENCIL_VALUE_MASK 0x0B93 -#define GL_STENCIL_FAIL 0x0B94 -#define GL_STENCIL_PASS_DEPTH_FAIL 0x0B95 -#define GL_STENCIL_PASS_DEPTH_PASS 0x0B96 -#define GL_STENCIL_REF 0x0B97 -#define GL_STENCIL_WRITEMASK 0x0B98 -#define GL_VIEWPORT 0x0BA2 -#define GL_DITHER 0x0BD0 -#define GL_BLEND_DST 0x0BE0 -#define GL_BLEND_SRC 0x0BE1 -#define GL_BLEND 0x0BE2 -#define GL_LOGIC_OP_MODE 0x0BF0 -#define GL_DRAW_BUFFER 0x0C01 -#define GL_READ_BUFFER 0x0C02 -#define GL_SCISSOR_BOX 0x0C10 -#define GL_SCISSOR_TEST 0x0C11 -#define GL_COLOR_CLEAR_VALUE 0x0C22 -#define GL_COLOR_WRITEMASK 0x0C23 -#define GL_DOUBLEBUFFER 0x0C32 -#define GL_STEREO 0x0C33 -#define GL_LINE_SMOOTH_HINT 0x0C52 -#define GL_POLYGON_SMOOTH_HINT 
0x0C53 -#define GL_UNPACK_SWAP_BYTES 0x0CF0 -#define GL_UNPACK_LSB_FIRST 0x0CF1 -#define GL_UNPACK_ROW_LENGTH 0x0CF2 -#define GL_UNPACK_SKIP_ROWS 0x0CF3 -#define GL_UNPACK_SKIP_PIXELS 0x0CF4 -#define GL_UNPACK_ALIGNMENT 0x0CF5 -#define GL_PACK_SWAP_BYTES 0x0D00 -#define GL_PACK_LSB_FIRST 0x0D01 -#define GL_PACK_ROW_LENGTH 0x0D02 -#define GL_PACK_SKIP_ROWS 0x0D03 -#define GL_PACK_SKIP_PIXELS 0x0D04 -#define GL_PACK_ALIGNMENT 0x0D05 -#define GL_MAX_TEXTURE_SIZE 0x0D33 -#define GL_MAX_VIEWPORT_DIMS 0x0D3A -#define GL_SUBPIXEL_BITS 0x0D50 -#define GL_TEXTURE_1D 0x0DE0 -#define GL_TEXTURE_2D 0x0DE1 -#define GL_TEXTURE_WIDTH 0x1000 -#define GL_TEXTURE_HEIGHT 0x1001 -#define GL_TEXTURE_BORDER_COLOR 0x1004 -#define GL_DONT_CARE 0x1100 -#define GL_FASTEST 0x1101 -#define GL_NICEST 0x1102 -#define GL_BYTE 0x1400 -#define GL_UNSIGNED_BYTE 0x1401 -#define GL_SHORT 0x1402 -#define GL_UNSIGNED_SHORT 0x1403 -#define GL_INT 0x1404 -#define GL_UNSIGNED_INT 0x1405 -#define GL_FLOAT 0x1406 -#define GL_STACK_OVERFLOW 0x0503 -#define GL_STACK_UNDERFLOW 0x0504 -#define GL_CLEAR 0x1500 -#define GL_AND 0x1501 -#define GL_AND_REVERSE 0x1502 -#define GL_COPY 0x1503 -#define GL_AND_INVERTED 0x1504 -#define GL_NOOP 0x1505 -#define GL_XOR 0x1506 -#define GL_OR 0x1507 -#define GL_NOR 0x1508 -#define GL_EQUIV 0x1509 -#define GL_INVERT 0x150A -#define GL_OR_REVERSE 0x150B -#define GL_COPY_INVERTED 0x150C -#define GL_OR_INVERTED 0x150D -#define GL_NAND 0x150E -#define GL_SET 0x150F -#define GL_TEXTURE 0x1702 -#define GL_COLOR 0x1800 -#define GL_DEPTH 0x1801 -#define GL_STENCIL 0x1802 -#define GL_STENCIL_INDEX 0x1901 -#define GL_DEPTH_COMPONENT 0x1902 -#define GL_RED 0x1903 -#define GL_GREEN 0x1904 -#define GL_BLUE 0x1905 -#define GL_ALPHA 0x1906 -#define GL_RGB 0x1907 -#define GL_RGBA 0x1908 -#define GL_POINT 0x1B00 -#define GL_LINE 0x1B01 -#define GL_FILL 0x1B02 -#define GL_KEEP 0x1E00 -#define GL_REPLACE 0x1E01 -#define GL_INCR 0x1E02 -#define GL_DECR 0x1E03 -#define GL_VENDOR 0x1F00 -#define GL_RENDERER 0x1F01 -#define GL_VERSION 0x1F02 -#define GL_EXTENSIONS 0x1F03 -#define GL_NEAREST 0x2600 -#define GL_LINEAR 0x2601 -#define GL_NEAREST_MIPMAP_NEAREST 0x2700 -#define GL_LINEAR_MIPMAP_NEAREST 0x2701 -#define GL_NEAREST_MIPMAP_LINEAR 0x2702 -#define GL_LINEAR_MIPMAP_LINEAR 0x2703 -#define GL_TEXTURE_MAG_FILTER 0x2800 -#define GL_TEXTURE_MIN_FILTER 0x2801 -#define GL_TEXTURE_WRAP_S 0x2802 -#define GL_TEXTURE_WRAP_T 0x2803 -#define GL_REPEAT 0x2901 -#define GL_CURRENT_BIT 0x00000001 -#define GL_POINT_BIT 0x00000002 -#define GL_LINE_BIT 0x00000004 -#define GL_POLYGON_BIT 0x00000008 -#define GL_POLYGON_STIPPLE_BIT 0x00000010 -#define GL_PIXEL_MODE_BIT 0x00000020 -#define GL_LIGHTING_BIT 0x00000040 -#define GL_FOG_BIT 0x00000080 -#define GL_ACCUM_BUFFER_BIT 0x00000200 -#define GL_VIEWPORT_BIT 0x00000800 -#define GL_TRANSFORM_BIT 0x00001000 -#define GL_ENABLE_BIT 0x00002000 -#define GL_HINT_BIT 0x00008000 -#define GL_EVAL_BIT 0x00010000 -#define GL_LIST_BIT 0x00020000 -#define GL_TEXTURE_BIT 0x00040000 -#define GL_SCISSOR_BIT 0x00080000 -#define GL_ALL_ATTRIB_BITS 0xFFFFFFFF -#define GL_QUAD_STRIP 0x0008 -#define GL_POLYGON 0x0009 -#define GL_ACCUM 0x0100 -#define GL_LOAD 0x0101 -#define GL_RETURN 0x0102 -#define GL_MULT 0x0103 -#define GL_ADD 0x0104 -#define GL_AUX0 0x0409 -#define GL_AUX1 0x040A -#define GL_AUX2 0x040B -#define GL_AUX3 0x040C -#define GL_2D 0x0600 -#define GL_3D 0x0601 -#define GL_3D_COLOR 0x0602 -#define GL_3D_COLOR_TEXTURE 0x0603 -#define GL_4D_COLOR_TEXTURE 0x0604 -#define GL_PASS_THROUGH_TOKEN 0x0700 
-#define GL_POINT_TOKEN 0x0701 -#define GL_LINE_TOKEN 0x0702 -#define GL_POLYGON_TOKEN 0x0703 -#define GL_BITMAP_TOKEN 0x0704 -#define GL_DRAW_PIXEL_TOKEN 0x0705 -#define GL_COPY_PIXEL_TOKEN 0x0706 -#define GL_LINE_RESET_TOKEN 0x0707 -#define GL_EXP 0x0800 -#define GL_EXP2 0x0801 -#define GL_COEFF 0x0A00 -#define GL_ORDER 0x0A01 -#define GL_DOMAIN 0x0A02 -#define GL_PIXEL_MAP_I_TO_I 0x0C70 -#define GL_PIXEL_MAP_S_TO_S 0x0C71 -#define GL_PIXEL_MAP_I_TO_R 0x0C72 -#define GL_PIXEL_MAP_I_TO_G 0x0C73 -#define GL_PIXEL_MAP_I_TO_B 0x0C74 -#define GL_PIXEL_MAP_I_TO_A 0x0C75 -#define GL_PIXEL_MAP_R_TO_R 0x0C76 -#define GL_PIXEL_MAP_G_TO_G 0x0C77 -#define GL_PIXEL_MAP_B_TO_B 0x0C78 -#define GL_PIXEL_MAP_A_TO_A 0x0C79 -#define GL_CURRENT_COLOR 0x0B00 -#define GL_CURRENT_INDEX 0x0B01 -#define GL_CURRENT_NORMAL 0x0B02 -#define GL_CURRENT_TEXTURE_COORDS 0x0B03 -#define GL_CURRENT_RASTER_COLOR 0x0B04 -#define GL_CURRENT_RASTER_INDEX 0x0B05 -#define GL_CURRENT_RASTER_TEXTURE_COORDS 0x0B06 -#define GL_CURRENT_RASTER_POSITION 0x0B07 -#define GL_CURRENT_RASTER_POSITION_VALID 0x0B08 -#define GL_CURRENT_RASTER_DISTANCE 0x0B09 -#define GL_POINT_SMOOTH 0x0B10 -#define GL_LINE_STIPPLE 0x0B24 -#define GL_LINE_STIPPLE_PATTERN 0x0B25 -#define GL_LINE_STIPPLE_REPEAT 0x0B26 -#define GL_LIST_MODE 0x0B30 -#define GL_MAX_LIST_NESTING 0x0B31 -#define GL_LIST_BASE 0x0B32 -#define GL_LIST_INDEX 0x0B33 -#define GL_POLYGON_STIPPLE 0x0B42 -#define GL_EDGE_FLAG 0x0B43 -#define GL_LIGHTING 0x0B50 -#define GL_LIGHT_MODEL_LOCAL_VIEWER 0x0B51 -#define GL_LIGHT_MODEL_TWO_SIDE 0x0B52 -#define GL_LIGHT_MODEL_AMBIENT 0x0B53 -#define GL_SHADE_MODEL 0x0B54 -#define GL_COLOR_MATERIAL_FACE 0x0B55 -#define GL_COLOR_MATERIAL_PARAMETER 0x0B56 -#define GL_COLOR_MATERIAL 0x0B57 -#define GL_FOG 0x0B60 -#define GL_FOG_INDEX 0x0B61 -#define GL_FOG_DENSITY 0x0B62 -#define GL_FOG_START 0x0B63 -#define GL_FOG_END 0x0B64 -#define GL_FOG_MODE 0x0B65 -#define GL_FOG_COLOR 0x0B66 -#define GL_ACCUM_CLEAR_VALUE 0x0B80 -#define GL_MATRIX_MODE 0x0BA0 -#define GL_NORMALIZE 0x0BA1 -#define GL_MODELVIEW_STACK_DEPTH 0x0BA3 -#define GL_PROJECTION_STACK_DEPTH 0x0BA4 -#define GL_TEXTURE_STACK_DEPTH 0x0BA5 -#define GL_MODELVIEW_MATRIX 0x0BA6 -#define GL_PROJECTION_MATRIX 0x0BA7 -#define GL_TEXTURE_MATRIX 0x0BA8 -#define GL_ATTRIB_STACK_DEPTH 0x0BB0 -#define GL_ALPHA_TEST 0x0BC0 -#define GL_ALPHA_TEST_FUNC 0x0BC1 -#define GL_ALPHA_TEST_REF 0x0BC2 -#define GL_LOGIC_OP 0x0BF1 -#define GL_AUX_BUFFERS 0x0C00 -#define GL_INDEX_CLEAR_VALUE 0x0C20 -#define GL_INDEX_WRITEMASK 0x0C21 -#define GL_INDEX_MODE 0x0C30 -#define GL_RGBA_MODE 0x0C31 -#define GL_RENDER_MODE 0x0C40 -#define GL_PERSPECTIVE_CORRECTION_HINT 0x0C50 -#define GL_POINT_SMOOTH_HINT 0x0C51 -#define GL_FOG_HINT 0x0C54 -#define GL_TEXTURE_GEN_S 0x0C60 -#define GL_TEXTURE_GEN_T 0x0C61 -#define GL_TEXTURE_GEN_R 0x0C62 -#define GL_TEXTURE_GEN_Q 0x0C63 -#define GL_PIXEL_MAP_I_TO_I_SIZE 0x0CB0 -#define GL_PIXEL_MAP_S_TO_S_SIZE 0x0CB1 -#define GL_PIXEL_MAP_I_TO_R_SIZE 0x0CB2 -#define GL_PIXEL_MAP_I_TO_G_SIZE 0x0CB3 -#define GL_PIXEL_MAP_I_TO_B_SIZE 0x0CB4 -#define GL_PIXEL_MAP_I_TO_A_SIZE 0x0CB5 -#define GL_PIXEL_MAP_R_TO_R_SIZE 0x0CB6 -#define GL_PIXEL_MAP_G_TO_G_SIZE 0x0CB7 -#define GL_PIXEL_MAP_B_TO_B_SIZE 0x0CB8 -#define GL_PIXEL_MAP_A_TO_A_SIZE 0x0CB9 -#define GL_MAP_COLOR 0x0D10 -#define GL_MAP_STENCIL 0x0D11 -#define GL_INDEX_SHIFT 0x0D12 -#define GL_INDEX_OFFSET 0x0D13 -#define GL_RED_SCALE 0x0D14 -#define GL_RED_BIAS 0x0D15 -#define GL_ZOOM_X 0x0D16 -#define GL_ZOOM_Y 0x0D17 -#define GL_GREEN_SCALE 0x0D18 
-#define GL_GREEN_BIAS 0x0D19 -#define GL_BLUE_SCALE 0x0D1A -#define GL_BLUE_BIAS 0x0D1B -#define GL_ALPHA_SCALE 0x0D1C -#define GL_ALPHA_BIAS 0x0D1D -#define GL_DEPTH_SCALE 0x0D1E -#define GL_DEPTH_BIAS 0x0D1F -#define GL_MAX_EVAL_ORDER 0x0D30 -#define GL_MAX_LIGHTS 0x0D31 -#define GL_MAX_CLIP_PLANES 0x0D32 -#define GL_MAX_PIXEL_MAP_TABLE 0x0D34 -#define GL_MAX_ATTRIB_STACK_DEPTH 0x0D35 -#define GL_MAX_MODELVIEW_STACK_DEPTH 0x0D36 -#define GL_MAX_NAME_STACK_DEPTH 0x0D37 -#define GL_MAX_PROJECTION_STACK_DEPTH 0x0D38 -#define GL_MAX_TEXTURE_STACK_DEPTH 0x0D39 -#define GL_INDEX_BITS 0x0D51 -#define GL_RED_BITS 0x0D52 -#define GL_GREEN_BITS 0x0D53 -#define GL_BLUE_BITS 0x0D54 -#define GL_ALPHA_BITS 0x0D55 -#define GL_DEPTH_BITS 0x0D56 -#define GL_STENCIL_BITS 0x0D57 -#define GL_ACCUM_RED_BITS 0x0D58 -#define GL_ACCUM_GREEN_BITS 0x0D59 -#define GL_ACCUM_BLUE_BITS 0x0D5A -#define GL_ACCUM_ALPHA_BITS 0x0D5B -#define GL_NAME_STACK_DEPTH 0x0D70 -#define GL_AUTO_NORMAL 0x0D80 -#define GL_MAP1_COLOR_4 0x0D90 -#define GL_MAP1_INDEX 0x0D91 -#define GL_MAP1_NORMAL 0x0D92 -#define GL_MAP1_TEXTURE_COORD_1 0x0D93 -#define GL_MAP1_TEXTURE_COORD_2 0x0D94 -#define GL_MAP1_TEXTURE_COORD_3 0x0D95 -#define GL_MAP1_TEXTURE_COORD_4 0x0D96 -#define GL_MAP1_VERTEX_3 0x0D97 -#define GL_MAP1_VERTEX_4 0x0D98 -#define GL_MAP2_COLOR_4 0x0DB0 -#define GL_MAP2_INDEX 0x0DB1 -#define GL_MAP2_NORMAL 0x0DB2 -#define GL_MAP2_TEXTURE_COORD_1 0x0DB3 -#define GL_MAP2_TEXTURE_COORD_2 0x0DB4 -#define GL_MAP2_TEXTURE_COORD_3 0x0DB5 -#define GL_MAP2_TEXTURE_COORD_4 0x0DB6 -#define GL_MAP2_VERTEX_3 0x0DB7 -#define GL_MAP2_VERTEX_4 0x0DB8 -#define GL_MAP1_GRID_DOMAIN 0x0DD0 -#define GL_MAP1_GRID_SEGMENTS 0x0DD1 -#define GL_MAP2_GRID_DOMAIN 0x0DD2 -#define GL_MAP2_GRID_SEGMENTS 0x0DD3 -#define GL_TEXTURE_COMPONENTS 0x1003 -#define GL_TEXTURE_BORDER 0x1005 -#define GL_AMBIENT 0x1200 -#define GL_DIFFUSE 0x1201 -#define GL_SPECULAR 0x1202 -#define GL_POSITION 0x1203 -#define GL_SPOT_DIRECTION 0x1204 -#define GL_SPOT_EXPONENT 0x1205 -#define GL_SPOT_CUTOFF 0x1206 -#define GL_CONSTANT_ATTENUATION 0x1207 -#define GL_LINEAR_ATTENUATION 0x1208 -#define GL_QUADRATIC_ATTENUATION 0x1209 -#define GL_COMPILE 0x1300 -#define GL_COMPILE_AND_EXECUTE 0x1301 -#define GL_2_BYTES 0x1407 -#define GL_3_BYTES 0x1408 -#define GL_4_BYTES 0x1409 -#define GL_EMISSION 0x1600 -#define GL_SHININESS 0x1601 -#define GL_AMBIENT_AND_DIFFUSE 0x1602 -#define GL_COLOR_INDEXES 0x1603 -#define GL_MODELVIEW 0x1700 -#define GL_PROJECTION 0x1701 -#define GL_COLOR_INDEX 0x1900 -#define GL_LUMINANCE 0x1909 -#define GL_LUMINANCE_ALPHA 0x190A -#define GL_BITMAP 0x1A00 -#define GL_RENDER 0x1C00 -#define GL_FEEDBACK 0x1C01 -#define GL_SELECT 0x1C02 -#define GL_FLAT 0x1D00 -#define GL_SMOOTH 0x1D01 -#define GL_S 0x2000 -#define GL_T 0x2001 -#define GL_R 0x2002 -#define GL_Q 0x2003 -#define GL_MODULATE 0x2100 -#define GL_DECAL 0x2101 -#define GL_TEXTURE_ENV_MODE 0x2200 -#define GL_TEXTURE_ENV_COLOR 0x2201 -#define GL_TEXTURE_ENV 0x2300 -#define GL_EYE_LINEAR 0x2400 -#define GL_OBJECT_LINEAR 0x2401 -#define GL_SPHERE_MAP 0x2402 -#define GL_TEXTURE_GEN_MODE 0x2500 -#define GL_OBJECT_PLANE 0x2501 -#define GL_EYE_PLANE 0x2502 -#define GL_CLAMP 0x2900 -#define GL_CLIP_PLANE0 0x3000 -#define GL_CLIP_PLANE1 0x3001 -#define GL_CLIP_PLANE2 0x3002 -#define GL_CLIP_PLANE3 0x3003 -#define GL_CLIP_PLANE4 0x3004 -#define GL_CLIP_PLANE5 0x3005 -#define GL_LIGHT0 0x4000 -#define GL_LIGHT1 0x4001 -#define GL_LIGHT2 0x4002 -#define GL_LIGHT3 0x4003 -#define GL_LIGHT4 0x4004 -#define GL_LIGHT5 0x4005 
-#define GL_LIGHT6 0x4006 -#define GL_LIGHT7 0x4007 -#define GL_COLOR_LOGIC_OP 0x0BF2 -#define GL_POLYGON_OFFSET_UNITS 0x2A00 -#define GL_POLYGON_OFFSET_POINT 0x2A01 -#define GL_POLYGON_OFFSET_LINE 0x2A02 -#define GL_POLYGON_OFFSET_FILL 0x8037 -#define GL_POLYGON_OFFSET_FACTOR 0x8038 -#define GL_TEXTURE_BINDING_1D 0x8068 -#define GL_TEXTURE_BINDING_2D 0x8069 -#define GL_TEXTURE_INTERNAL_FORMAT 0x1003 -#define GL_TEXTURE_RED_SIZE 0x805C -#define GL_TEXTURE_GREEN_SIZE 0x805D -#define GL_TEXTURE_BLUE_SIZE 0x805E -#define GL_TEXTURE_ALPHA_SIZE 0x805F -#define GL_DOUBLE 0x140A -#define GL_PROXY_TEXTURE_1D 0x8063 -#define GL_PROXY_TEXTURE_2D 0x8064 -#define GL_R3_G3_B2 0x2A10 -#define GL_RGB4 0x804F -#define GL_RGB5 0x8050 -#define GL_RGB8 0x8051 -#define GL_RGB10 0x8052 -#define GL_RGB12 0x8053 -#define GL_RGB16 0x8054 -#define GL_RGBA2 0x8055 -#define GL_RGBA4 0x8056 -#define GL_RGB5_A1 0x8057 -#define GL_RGBA8 0x8058 -#define GL_RGB10_A2 0x8059 -#define GL_RGBA12 0x805A -#define GL_RGBA16 0x805B -#define GL_CLIENT_PIXEL_STORE_BIT 0x00000001 -#define GL_CLIENT_VERTEX_ARRAY_BIT 0x00000002 -#define GL_CLIENT_ALL_ATTRIB_BITS 0xFFFFFFFF -#define GL_VERTEX_ARRAY_POINTER 0x808E -#define GL_NORMAL_ARRAY_POINTER 0x808F -#define GL_COLOR_ARRAY_POINTER 0x8090 -#define GL_INDEX_ARRAY_POINTER 0x8091 -#define GL_TEXTURE_COORD_ARRAY_POINTER 0x8092 -#define GL_EDGE_FLAG_ARRAY_POINTER 0x8093 -#define GL_FEEDBACK_BUFFER_POINTER 0x0DF0 -#define GL_SELECTION_BUFFER_POINTER 0x0DF3 -#define GL_CLIENT_ATTRIB_STACK_DEPTH 0x0BB1 -#define GL_INDEX_LOGIC_OP 0x0BF1 -#define GL_MAX_CLIENT_ATTRIB_STACK_DEPTH 0x0D3B -#define GL_FEEDBACK_BUFFER_SIZE 0x0DF1 -#define GL_FEEDBACK_BUFFER_TYPE 0x0DF2 -#define GL_SELECTION_BUFFER_SIZE 0x0DF4 -#define GL_VERTEX_ARRAY 0x8074 -#define GL_NORMAL_ARRAY 0x8075 -#define GL_COLOR_ARRAY 0x8076 -#define GL_INDEX_ARRAY 0x8077 -#define GL_TEXTURE_COORD_ARRAY 0x8078 -#define GL_EDGE_FLAG_ARRAY 0x8079 -#define GL_VERTEX_ARRAY_SIZE 0x807A -#define GL_VERTEX_ARRAY_TYPE 0x807B -#define GL_VERTEX_ARRAY_STRIDE 0x807C -#define GL_NORMAL_ARRAY_TYPE 0x807E -#define GL_NORMAL_ARRAY_STRIDE 0x807F -#define GL_COLOR_ARRAY_SIZE 0x8081 -#define GL_COLOR_ARRAY_TYPE 0x8082 -#define GL_COLOR_ARRAY_STRIDE 0x8083 -#define GL_INDEX_ARRAY_TYPE 0x8085 -#define GL_INDEX_ARRAY_STRIDE 0x8086 -#define GL_TEXTURE_COORD_ARRAY_SIZE 0x8088 -#define GL_TEXTURE_COORD_ARRAY_TYPE 0x8089 -#define GL_TEXTURE_COORD_ARRAY_STRIDE 0x808A -#define GL_EDGE_FLAG_ARRAY_STRIDE 0x808C -#define GL_TEXTURE_LUMINANCE_SIZE 0x8060 -#define GL_TEXTURE_INTENSITY_SIZE 0x8061 -#define GL_TEXTURE_PRIORITY 0x8066 -#define GL_TEXTURE_RESIDENT 0x8067 -#define GL_ALPHA4 0x803B -#define GL_ALPHA8 0x803C -#define GL_ALPHA12 0x803D -#define GL_ALPHA16 0x803E -#define GL_LUMINANCE4 0x803F -#define GL_LUMINANCE8 0x8040 -#define GL_LUMINANCE12 0x8041 -#define GL_LUMINANCE16 0x8042 -#define GL_LUMINANCE4_ALPHA4 0x8043 -#define GL_LUMINANCE6_ALPHA2 0x8044 -#define GL_LUMINANCE8_ALPHA8 0x8045 -#define GL_LUMINANCE12_ALPHA4 0x8046 -#define GL_LUMINANCE12_ALPHA12 0x8047 -#define GL_LUMINANCE16_ALPHA16 0x8048 -#define GL_INTENSITY 0x8049 -#define GL_INTENSITY4 0x804A -#define GL_INTENSITY8 0x804B -#define GL_INTENSITY12 0x804C -#define GL_INTENSITY16 0x804D -#define GL_V2F 0x2A20 -#define GL_V3F 0x2A21 -#define GL_C4UB_V2F 0x2A22 -#define GL_C4UB_V3F 0x2A23 -#define GL_C3F_V3F 0x2A24 -#define GL_N3F_V3F 0x2A25 -#define GL_C4F_N3F_V3F 0x2A26 -#define GL_T2F_V3F 0x2A27 -#define GL_T4F_V4F 0x2A28 -#define GL_T2F_C4UB_V3F 0x2A29 -#define GL_T2F_C3F_V3F 0x2A2A -#define 
[Removed in this hunk: the remainder of the legacy glad 0.x header `thirdparty/glad/glad/glad.h`, superseded by the generated glad 2 headers (`glad/gl.h`, `glad/glx.h`) listed in the README above. The deleted lines comprise the GL 1.1 through 3.3 token `#define`s (packed pixel types, 3D/cube-map/rectangle/multisample texture targets, texture units GL_TEXTURE0–31, combiner and fog-coord legacy enums, buffer object usage and mapping flags, shader/program/uniform queries, transform feedback, framebuffer/renderbuffer and color attachment tokens, sRGB and RGTC formats, uniform blocks, sync objects, sampler and swizzle state), followed by the GL_VERSION_1_0 block of function-pointer plumbing: for each command a `PFNGL*PROC` typedef, a `GLAPI` `glad_gl*` pointer declaration, and a `#define` aliasing the GL name onto that pointer, covering glCullFace through glIsList in this span.]
-typedef void (APIENTRYP PFNGLFRUSTUMPROC)(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar); -GLAPI PFNGLFRUSTUMPROC glad_glFrustum; -#define glFrustum glad_glFrustum -typedef void (APIENTRYP PFNGLLOADIDENTITYPROC)(void); -GLAPI PFNGLLOADIDENTITYPROC glad_glLoadIdentity; -#define glLoadIdentity glad_glLoadIdentity -typedef void (APIENTRYP PFNGLLOADMATRIXFPROC)(const GLfloat *m); -GLAPI PFNGLLOADMATRIXFPROC glad_glLoadMatrixf; -#define glLoadMatrixf glad_glLoadMatrixf -typedef void (APIENTRYP PFNGLLOADMATRIXDPROC)(const GLdouble *m); -GLAPI PFNGLLOADMATRIXDPROC glad_glLoadMatrixd; -#define glLoadMatrixd glad_glLoadMatrixd -typedef void (APIENTRYP PFNGLMATRIXMODEPROC)(GLenum mode); -GLAPI PFNGLMATRIXMODEPROC glad_glMatrixMode; -#define glMatrixMode glad_glMatrixMode -typedef void (APIENTRYP PFNGLMULTMATRIXFPROC)(const GLfloat *m); -GLAPI PFNGLMULTMATRIXFPROC glad_glMultMatrixf; -#define glMultMatrixf glad_glMultMatrixf -typedef void (APIENTRYP PFNGLMULTMATRIXDPROC)(const GLdouble *m); -GLAPI PFNGLMULTMATRIXDPROC glad_glMultMatrixd; -#define glMultMatrixd glad_glMultMatrixd -typedef void (APIENTRYP PFNGLORTHOPROC)(GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar); -GLAPI PFNGLORTHOPROC glad_glOrtho; -#define glOrtho glad_glOrtho -typedef void (APIENTRYP PFNGLPOPMATRIXPROC)(void); -GLAPI PFNGLPOPMATRIXPROC glad_glPopMatrix; -#define glPopMatrix glad_glPopMatrix -typedef void (APIENTRYP PFNGLPUSHMATRIXPROC)(void); -GLAPI PFNGLPUSHMATRIXPROC glad_glPushMatrix; -#define glPushMatrix glad_glPushMatrix -typedef void (APIENTRYP PFNGLROTATEDPROC)(GLdouble angle, GLdouble x, GLdouble y, GLdouble z); -GLAPI PFNGLROTATEDPROC glad_glRotated; -#define glRotated glad_glRotated -typedef void (APIENTRYP PFNGLROTATEFPROC)(GLfloat angle, GLfloat x, GLfloat y, GLfloat z); -GLAPI PFNGLROTATEFPROC glad_glRotatef; -#define glRotatef glad_glRotatef -typedef void (APIENTRYP PFNGLSCALEDPROC)(GLdouble x, GLdouble y, GLdouble z); -GLAPI PFNGLSCALEDPROC glad_glScaled; -#define glScaled glad_glScaled -typedef void (APIENTRYP PFNGLSCALEFPROC)(GLfloat x, GLfloat y, GLfloat z); -GLAPI PFNGLSCALEFPROC glad_glScalef; -#define glScalef glad_glScalef -typedef void (APIENTRYP PFNGLTRANSLATEDPROC)(GLdouble x, GLdouble y, GLdouble z); -GLAPI PFNGLTRANSLATEDPROC glad_glTranslated; -#define glTranslated glad_glTranslated -typedef void (APIENTRYP PFNGLTRANSLATEFPROC)(GLfloat x, GLfloat y, GLfloat z); -GLAPI PFNGLTRANSLATEFPROC glad_glTranslatef; -#define glTranslatef glad_glTranslatef -#endif -#ifndef GL_VERSION_1_1 -#define GL_VERSION_1_1 1 -GLAPI int GLAD_GL_VERSION_1_1; -typedef void (APIENTRYP PFNGLDRAWARRAYSPROC)(GLenum mode, GLint first, GLsizei count); -GLAPI PFNGLDRAWARRAYSPROC glad_glDrawArrays; -#define glDrawArrays glad_glDrawArrays -typedef void (APIENTRYP PFNGLDRAWELEMENTSPROC)(GLenum mode, GLsizei count, GLenum type, const void *indices); -GLAPI PFNGLDRAWELEMENTSPROC glad_glDrawElements; -#define glDrawElements glad_glDrawElements -typedef void (APIENTRYP PFNGLGETPOINTERVPROC)(GLenum pname, void **params); -GLAPI PFNGLGETPOINTERVPROC glad_glGetPointerv; -#define glGetPointerv glad_glGetPointerv -typedef void (APIENTRYP PFNGLPOLYGONOFFSETPROC)(GLfloat factor, GLfloat units); -GLAPI PFNGLPOLYGONOFFSETPROC glad_glPolygonOffset; -#define glPolygonOffset glad_glPolygonOffset -typedef void (APIENTRYP PFNGLCOPYTEXIMAGE1DPROC)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLint border); -GLAPI 
PFNGLCOPYTEXIMAGE1DPROC glad_glCopyTexImage1D; -#define glCopyTexImage1D glad_glCopyTexImage1D -typedef void (APIENTRYP PFNGLCOPYTEXIMAGE2DPROC)(GLenum target, GLint level, GLenum internalformat, GLint x, GLint y, GLsizei width, GLsizei height, GLint border); -GLAPI PFNGLCOPYTEXIMAGE2DPROC glad_glCopyTexImage2D; -#define glCopyTexImage2D glad_glCopyTexImage2D -typedef void (APIENTRYP PFNGLCOPYTEXSUBIMAGE1DPROC)(GLenum target, GLint level, GLint xoffset, GLint x, GLint y, GLsizei width); -GLAPI PFNGLCOPYTEXSUBIMAGE1DPROC glad_glCopyTexSubImage1D; -#define glCopyTexSubImage1D glad_glCopyTexSubImage1D -typedef void (APIENTRYP PFNGLCOPYTEXSUBIMAGE2DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint x, GLint y, GLsizei width, GLsizei height); -GLAPI PFNGLCOPYTEXSUBIMAGE2DPROC glad_glCopyTexSubImage2D; -#define glCopyTexSubImage2D glad_glCopyTexSubImage2D -typedef void (APIENTRYP PFNGLTEXSUBIMAGE1DPROC)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLenum type, const void *pixels); -GLAPI PFNGLTEXSUBIMAGE1DPROC glad_glTexSubImage1D; -#define glTexSubImage1D glad_glTexSubImage1D -typedef void (APIENTRYP PFNGLTEXSUBIMAGE2DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pixels); -GLAPI PFNGLTEXSUBIMAGE2DPROC glad_glTexSubImage2D; -#define glTexSubImage2D glad_glTexSubImage2D -typedef void (APIENTRYP PFNGLBINDTEXTUREPROC)(GLenum target, GLuint texture); -GLAPI PFNGLBINDTEXTUREPROC glad_glBindTexture; -#define glBindTexture glad_glBindTexture -typedef void (APIENTRYP PFNGLDELETETEXTURESPROC)(GLsizei n, const GLuint *textures); -GLAPI PFNGLDELETETEXTURESPROC glad_glDeleteTextures; -#define glDeleteTextures glad_glDeleteTextures -typedef void (APIENTRYP PFNGLGENTEXTURESPROC)(GLsizei n, GLuint *textures); -GLAPI PFNGLGENTEXTURESPROC glad_glGenTextures; -#define glGenTextures glad_glGenTextures -typedef GLboolean (APIENTRYP PFNGLISTEXTUREPROC)(GLuint texture); -GLAPI PFNGLISTEXTUREPROC glad_glIsTexture; -#define glIsTexture glad_glIsTexture -typedef void (APIENTRYP PFNGLARRAYELEMENTPROC)(GLint i); -GLAPI PFNGLARRAYELEMENTPROC glad_glArrayElement; -#define glArrayElement glad_glArrayElement -typedef void (APIENTRYP PFNGLCOLORPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void *pointer); -GLAPI PFNGLCOLORPOINTERPROC glad_glColorPointer; -#define glColorPointer glad_glColorPointer -typedef void (APIENTRYP PFNGLDISABLECLIENTSTATEPROC)(GLenum array); -GLAPI PFNGLDISABLECLIENTSTATEPROC glad_glDisableClientState; -#define glDisableClientState glad_glDisableClientState -typedef void (APIENTRYP PFNGLEDGEFLAGPOINTERPROC)(GLsizei stride, const void *pointer); -GLAPI PFNGLEDGEFLAGPOINTERPROC glad_glEdgeFlagPointer; -#define glEdgeFlagPointer glad_glEdgeFlagPointer -typedef void (APIENTRYP PFNGLENABLECLIENTSTATEPROC)(GLenum array); -GLAPI PFNGLENABLECLIENTSTATEPROC glad_glEnableClientState; -#define glEnableClientState glad_glEnableClientState -typedef void (APIENTRYP PFNGLINDEXPOINTERPROC)(GLenum type, GLsizei stride, const void *pointer); -GLAPI PFNGLINDEXPOINTERPROC glad_glIndexPointer; -#define glIndexPointer glad_glIndexPointer -typedef void (APIENTRYP PFNGLINTERLEAVEDARRAYSPROC)(GLenum format, GLsizei stride, const void *pointer); -GLAPI PFNGLINTERLEAVEDARRAYSPROC glad_glInterleavedArrays; -#define glInterleavedArrays glad_glInterleavedArrays -typedef void (APIENTRYP PFNGLNORMALPOINTERPROC)(GLenum type, GLsizei stride, const void *pointer); -GLAPI 
PFNGLNORMALPOINTERPROC glad_glNormalPointer; -#define glNormalPointer glad_glNormalPointer -typedef void (APIENTRYP PFNGLTEXCOORDPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void *pointer); -GLAPI PFNGLTEXCOORDPOINTERPROC glad_glTexCoordPointer; -#define glTexCoordPointer glad_glTexCoordPointer -typedef void (APIENTRYP PFNGLVERTEXPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void *pointer); -GLAPI PFNGLVERTEXPOINTERPROC glad_glVertexPointer; -#define glVertexPointer glad_glVertexPointer -typedef GLboolean (APIENTRYP PFNGLARETEXTURESRESIDENTPROC)(GLsizei n, const GLuint *textures, GLboolean *residences); -GLAPI PFNGLARETEXTURESRESIDENTPROC glad_glAreTexturesResident; -#define glAreTexturesResident glad_glAreTexturesResident -typedef void (APIENTRYP PFNGLPRIORITIZETEXTURESPROC)(GLsizei n, const GLuint *textures, const GLfloat *priorities); -GLAPI PFNGLPRIORITIZETEXTURESPROC glad_glPrioritizeTextures; -#define glPrioritizeTextures glad_glPrioritizeTextures -typedef void (APIENTRYP PFNGLINDEXUBPROC)(GLubyte c); -GLAPI PFNGLINDEXUBPROC glad_glIndexub; -#define glIndexub glad_glIndexub -typedef void (APIENTRYP PFNGLINDEXUBVPROC)(const GLubyte *c); -GLAPI PFNGLINDEXUBVPROC glad_glIndexubv; -#define glIndexubv glad_glIndexubv -typedef void (APIENTRYP PFNGLPOPCLIENTATTRIBPROC)(void); -GLAPI PFNGLPOPCLIENTATTRIBPROC glad_glPopClientAttrib; -#define glPopClientAttrib glad_glPopClientAttrib -typedef void (APIENTRYP PFNGLPUSHCLIENTATTRIBPROC)(GLbitfield mask); -GLAPI PFNGLPUSHCLIENTATTRIBPROC glad_glPushClientAttrib; -#define glPushClientAttrib glad_glPushClientAttrib -#endif -#ifndef GL_VERSION_1_2 -#define GL_VERSION_1_2 1 -GLAPI int GLAD_GL_VERSION_1_2; -typedef void (APIENTRYP PFNGLDRAWRANGEELEMENTSPROC)(GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const void *indices); -GLAPI PFNGLDRAWRANGEELEMENTSPROC glad_glDrawRangeElements; -#define glDrawRangeElements glad_glDrawRangeElements -typedef void (APIENTRYP PFNGLTEXIMAGE3DPROC)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels); -GLAPI PFNGLTEXIMAGE3DPROC glad_glTexImage3D; -#define glTexImage3D glad_glTexImage3D -typedef void (APIENTRYP PFNGLTEXSUBIMAGE3DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pixels); -GLAPI PFNGLTEXSUBIMAGE3DPROC glad_glTexSubImage3D; -#define glTexSubImage3D glad_glTexSubImage3D -typedef void (APIENTRYP PFNGLCOPYTEXSUBIMAGE3DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height); -GLAPI PFNGLCOPYTEXSUBIMAGE3DPROC glad_glCopyTexSubImage3D; -#define glCopyTexSubImage3D glad_glCopyTexSubImage3D -#endif -#ifndef GL_VERSION_1_3 -#define GL_VERSION_1_3 1 -GLAPI int GLAD_GL_VERSION_1_3; -typedef void (APIENTRYP PFNGLACTIVETEXTUREPROC)(GLenum texture); -GLAPI PFNGLACTIVETEXTUREPROC glad_glActiveTexture; -#define glActiveTexture glad_glActiveTexture -typedef void (APIENTRYP PFNGLSAMPLECOVERAGEPROC)(GLfloat value, GLboolean invert); -GLAPI PFNGLSAMPLECOVERAGEPROC glad_glSampleCoverage; -#define glSampleCoverage glad_glSampleCoverage -typedef void (APIENTRYP PFNGLCOMPRESSEDTEXIMAGE3DPROC)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLsizei imageSize, const void *data); -GLAPI PFNGLCOMPRESSEDTEXIMAGE3DPROC 
glad_glCompressedTexImage3D; -#define glCompressedTexImage3D glad_glCompressedTexImage3D -typedef void (APIENTRYP PFNGLCOMPRESSEDTEXIMAGE2DPROC)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLsizei height, GLint border, GLsizei imageSize, const void *data); -GLAPI PFNGLCOMPRESSEDTEXIMAGE2DPROC glad_glCompressedTexImage2D; -#define glCompressedTexImage2D glad_glCompressedTexImage2D -typedef void (APIENTRYP PFNGLCOMPRESSEDTEXIMAGE1DPROC)(GLenum target, GLint level, GLenum internalformat, GLsizei width, GLint border, GLsizei imageSize, const void *data); -GLAPI PFNGLCOMPRESSEDTEXIMAGE1DPROC glad_glCompressedTexImage1D; -#define glCompressedTexImage1D glad_glCompressedTexImage1D -typedef void (APIENTRYP PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLsizei imageSize, const void *data); -GLAPI PFNGLCOMPRESSEDTEXSUBIMAGE3DPROC glad_glCompressedTexSubImage3D; -#define glCompressedTexSubImage3D glad_glCompressedTexSubImage3D -typedef void (APIENTRYP PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC)(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLsizei imageSize, const void *data); -GLAPI PFNGLCOMPRESSEDTEXSUBIMAGE2DPROC glad_glCompressedTexSubImage2D; -#define glCompressedTexSubImage2D glad_glCompressedTexSubImage2D -typedef void (APIENTRYP PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC)(GLenum target, GLint level, GLint xoffset, GLsizei width, GLenum format, GLsizei imageSize, const void *data); -GLAPI PFNGLCOMPRESSEDTEXSUBIMAGE1DPROC glad_glCompressedTexSubImage1D; -#define glCompressedTexSubImage1D glad_glCompressedTexSubImage1D -typedef void (APIENTRYP PFNGLGETCOMPRESSEDTEXIMAGEPROC)(GLenum target, GLint level, void *img); -GLAPI PFNGLGETCOMPRESSEDTEXIMAGEPROC glad_glGetCompressedTexImage; -#define glGetCompressedTexImage glad_glGetCompressedTexImage -typedef void (APIENTRYP PFNGLCLIENTACTIVETEXTUREPROC)(GLenum texture); -GLAPI PFNGLCLIENTACTIVETEXTUREPROC glad_glClientActiveTexture; -#define glClientActiveTexture glad_glClientActiveTexture -typedef void (APIENTRYP PFNGLMULTITEXCOORD1DPROC)(GLenum target, GLdouble s); -GLAPI PFNGLMULTITEXCOORD1DPROC glad_glMultiTexCoord1d; -#define glMultiTexCoord1d glad_glMultiTexCoord1d -typedef void (APIENTRYP PFNGLMULTITEXCOORD1DVPROC)(GLenum target, const GLdouble *v); -GLAPI PFNGLMULTITEXCOORD1DVPROC glad_glMultiTexCoord1dv; -#define glMultiTexCoord1dv glad_glMultiTexCoord1dv -typedef void (APIENTRYP PFNGLMULTITEXCOORD1FPROC)(GLenum target, GLfloat s); -GLAPI PFNGLMULTITEXCOORD1FPROC glad_glMultiTexCoord1f; -#define glMultiTexCoord1f glad_glMultiTexCoord1f -typedef void (APIENTRYP PFNGLMULTITEXCOORD1FVPROC)(GLenum target, const GLfloat *v); -GLAPI PFNGLMULTITEXCOORD1FVPROC glad_glMultiTexCoord1fv; -#define glMultiTexCoord1fv glad_glMultiTexCoord1fv -typedef void (APIENTRYP PFNGLMULTITEXCOORD1IPROC)(GLenum target, GLint s); -GLAPI PFNGLMULTITEXCOORD1IPROC glad_glMultiTexCoord1i; -#define glMultiTexCoord1i glad_glMultiTexCoord1i -typedef void (APIENTRYP PFNGLMULTITEXCOORD1IVPROC)(GLenum target, const GLint *v); -GLAPI PFNGLMULTITEXCOORD1IVPROC glad_glMultiTexCoord1iv; -#define glMultiTexCoord1iv glad_glMultiTexCoord1iv -typedef void (APIENTRYP PFNGLMULTITEXCOORD1SPROC)(GLenum target, GLshort s); -GLAPI PFNGLMULTITEXCOORD1SPROC glad_glMultiTexCoord1s; -#define glMultiTexCoord1s glad_glMultiTexCoord1s -typedef void (APIENTRYP PFNGLMULTITEXCOORD1SVPROC)(GLenum target, const GLshort *v); 
-GLAPI PFNGLMULTITEXCOORD1SVPROC glad_glMultiTexCoord1sv; -#define glMultiTexCoord1sv glad_glMultiTexCoord1sv -typedef void (APIENTRYP PFNGLMULTITEXCOORD2DPROC)(GLenum target, GLdouble s, GLdouble t); -GLAPI PFNGLMULTITEXCOORD2DPROC glad_glMultiTexCoord2d; -#define glMultiTexCoord2d glad_glMultiTexCoord2d -typedef void (APIENTRYP PFNGLMULTITEXCOORD2DVPROC)(GLenum target, const GLdouble *v); -GLAPI PFNGLMULTITEXCOORD2DVPROC glad_glMultiTexCoord2dv; -#define glMultiTexCoord2dv glad_glMultiTexCoord2dv -typedef void (APIENTRYP PFNGLMULTITEXCOORD2FPROC)(GLenum target, GLfloat s, GLfloat t); -GLAPI PFNGLMULTITEXCOORD2FPROC glad_glMultiTexCoord2f; -#define glMultiTexCoord2f glad_glMultiTexCoord2f -typedef void (APIENTRYP PFNGLMULTITEXCOORD2FVPROC)(GLenum target, const GLfloat *v); -GLAPI PFNGLMULTITEXCOORD2FVPROC glad_glMultiTexCoord2fv; -#define glMultiTexCoord2fv glad_glMultiTexCoord2fv -typedef void (APIENTRYP PFNGLMULTITEXCOORD2IPROC)(GLenum target, GLint s, GLint t); -GLAPI PFNGLMULTITEXCOORD2IPROC glad_glMultiTexCoord2i; -#define glMultiTexCoord2i glad_glMultiTexCoord2i -typedef void (APIENTRYP PFNGLMULTITEXCOORD2IVPROC)(GLenum target, const GLint *v); -GLAPI PFNGLMULTITEXCOORD2IVPROC glad_glMultiTexCoord2iv; -#define glMultiTexCoord2iv glad_glMultiTexCoord2iv -typedef void (APIENTRYP PFNGLMULTITEXCOORD2SPROC)(GLenum target, GLshort s, GLshort t); -GLAPI PFNGLMULTITEXCOORD2SPROC glad_glMultiTexCoord2s; -#define glMultiTexCoord2s glad_glMultiTexCoord2s -typedef void (APIENTRYP PFNGLMULTITEXCOORD2SVPROC)(GLenum target, const GLshort *v); -GLAPI PFNGLMULTITEXCOORD2SVPROC glad_glMultiTexCoord2sv; -#define glMultiTexCoord2sv glad_glMultiTexCoord2sv -typedef void (APIENTRYP PFNGLMULTITEXCOORD3DPROC)(GLenum target, GLdouble s, GLdouble t, GLdouble r); -GLAPI PFNGLMULTITEXCOORD3DPROC glad_glMultiTexCoord3d; -#define glMultiTexCoord3d glad_glMultiTexCoord3d -typedef void (APIENTRYP PFNGLMULTITEXCOORD3DVPROC)(GLenum target, const GLdouble *v); -GLAPI PFNGLMULTITEXCOORD3DVPROC glad_glMultiTexCoord3dv; -#define glMultiTexCoord3dv glad_glMultiTexCoord3dv -typedef void (APIENTRYP PFNGLMULTITEXCOORD3FPROC)(GLenum target, GLfloat s, GLfloat t, GLfloat r); -GLAPI PFNGLMULTITEXCOORD3FPROC glad_glMultiTexCoord3f; -#define glMultiTexCoord3f glad_glMultiTexCoord3f -typedef void (APIENTRYP PFNGLMULTITEXCOORD3FVPROC)(GLenum target, const GLfloat *v); -GLAPI PFNGLMULTITEXCOORD3FVPROC glad_glMultiTexCoord3fv; -#define glMultiTexCoord3fv glad_glMultiTexCoord3fv -typedef void (APIENTRYP PFNGLMULTITEXCOORD3IPROC)(GLenum target, GLint s, GLint t, GLint r); -GLAPI PFNGLMULTITEXCOORD3IPROC glad_glMultiTexCoord3i; -#define glMultiTexCoord3i glad_glMultiTexCoord3i -typedef void (APIENTRYP PFNGLMULTITEXCOORD3IVPROC)(GLenum target, const GLint *v); -GLAPI PFNGLMULTITEXCOORD3IVPROC glad_glMultiTexCoord3iv; -#define glMultiTexCoord3iv glad_glMultiTexCoord3iv -typedef void (APIENTRYP PFNGLMULTITEXCOORD3SPROC)(GLenum target, GLshort s, GLshort t, GLshort r); -GLAPI PFNGLMULTITEXCOORD3SPROC glad_glMultiTexCoord3s; -#define glMultiTexCoord3s glad_glMultiTexCoord3s -typedef void (APIENTRYP PFNGLMULTITEXCOORD3SVPROC)(GLenum target, const GLshort *v); -GLAPI PFNGLMULTITEXCOORD3SVPROC glad_glMultiTexCoord3sv; -#define glMultiTexCoord3sv glad_glMultiTexCoord3sv -typedef void (APIENTRYP PFNGLMULTITEXCOORD4DPROC)(GLenum target, GLdouble s, GLdouble t, GLdouble r, GLdouble q); -GLAPI PFNGLMULTITEXCOORD4DPROC glad_glMultiTexCoord4d; -#define glMultiTexCoord4d glad_glMultiTexCoord4d -typedef void (APIENTRYP PFNGLMULTITEXCOORD4DVPROC)(GLenum 
target, const GLdouble *v); -GLAPI PFNGLMULTITEXCOORD4DVPROC glad_glMultiTexCoord4dv; -#define glMultiTexCoord4dv glad_glMultiTexCoord4dv -typedef void (APIENTRYP PFNGLMULTITEXCOORD4FPROC)(GLenum target, GLfloat s, GLfloat t, GLfloat r, GLfloat q); -GLAPI PFNGLMULTITEXCOORD4FPROC glad_glMultiTexCoord4f; -#define glMultiTexCoord4f glad_glMultiTexCoord4f -typedef void (APIENTRYP PFNGLMULTITEXCOORD4FVPROC)(GLenum target, const GLfloat *v); -GLAPI PFNGLMULTITEXCOORD4FVPROC glad_glMultiTexCoord4fv; -#define glMultiTexCoord4fv glad_glMultiTexCoord4fv -typedef void (APIENTRYP PFNGLMULTITEXCOORD4IPROC)(GLenum target, GLint s, GLint t, GLint r, GLint q); -GLAPI PFNGLMULTITEXCOORD4IPROC glad_glMultiTexCoord4i; -#define glMultiTexCoord4i glad_glMultiTexCoord4i -typedef void (APIENTRYP PFNGLMULTITEXCOORD4IVPROC)(GLenum target, const GLint *v); -GLAPI PFNGLMULTITEXCOORD4IVPROC glad_glMultiTexCoord4iv; -#define glMultiTexCoord4iv glad_glMultiTexCoord4iv -typedef void (APIENTRYP PFNGLMULTITEXCOORD4SPROC)(GLenum target, GLshort s, GLshort t, GLshort r, GLshort q); -GLAPI PFNGLMULTITEXCOORD4SPROC glad_glMultiTexCoord4s; -#define glMultiTexCoord4s glad_glMultiTexCoord4s -typedef void (APIENTRYP PFNGLMULTITEXCOORD4SVPROC)(GLenum target, const GLshort *v); -GLAPI PFNGLMULTITEXCOORD4SVPROC glad_glMultiTexCoord4sv; -#define glMultiTexCoord4sv glad_glMultiTexCoord4sv -typedef void (APIENTRYP PFNGLLOADTRANSPOSEMATRIXFPROC)(const GLfloat *m); -GLAPI PFNGLLOADTRANSPOSEMATRIXFPROC glad_glLoadTransposeMatrixf; -#define glLoadTransposeMatrixf glad_glLoadTransposeMatrixf -typedef void (APIENTRYP PFNGLLOADTRANSPOSEMATRIXDPROC)(const GLdouble *m); -GLAPI PFNGLLOADTRANSPOSEMATRIXDPROC glad_glLoadTransposeMatrixd; -#define glLoadTransposeMatrixd glad_glLoadTransposeMatrixd -typedef void (APIENTRYP PFNGLMULTTRANSPOSEMATRIXFPROC)(const GLfloat *m); -GLAPI PFNGLMULTTRANSPOSEMATRIXFPROC glad_glMultTransposeMatrixf; -#define glMultTransposeMatrixf glad_glMultTransposeMatrixf -typedef void (APIENTRYP PFNGLMULTTRANSPOSEMATRIXDPROC)(const GLdouble *m); -GLAPI PFNGLMULTTRANSPOSEMATRIXDPROC glad_glMultTransposeMatrixd; -#define glMultTransposeMatrixd glad_glMultTransposeMatrixd -#endif -#ifndef GL_VERSION_1_4 -#define GL_VERSION_1_4 1 -GLAPI int GLAD_GL_VERSION_1_4; -typedef void (APIENTRYP PFNGLBLENDFUNCSEPARATEPROC)(GLenum sfactorRGB, GLenum dfactorRGB, GLenum sfactorAlpha, GLenum dfactorAlpha); -GLAPI PFNGLBLENDFUNCSEPARATEPROC glad_glBlendFuncSeparate; -#define glBlendFuncSeparate glad_glBlendFuncSeparate -typedef void (APIENTRYP PFNGLMULTIDRAWARRAYSPROC)(GLenum mode, const GLint *first, const GLsizei *count, GLsizei drawcount); -GLAPI PFNGLMULTIDRAWARRAYSPROC glad_glMultiDrawArrays; -#define glMultiDrawArrays glad_glMultiDrawArrays -typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSPROC)(GLenum mode, const GLsizei *count, GLenum type, const void *const*indices, GLsizei drawcount); -GLAPI PFNGLMULTIDRAWELEMENTSPROC glad_glMultiDrawElements; -#define glMultiDrawElements glad_glMultiDrawElements -typedef void (APIENTRYP PFNGLPOINTPARAMETERFPROC)(GLenum pname, GLfloat param); -GLAPI PFNGLPOINTPARAMETERFPROC glad_glPointParameterf; -#define glPointParameterf glad_glPointParameterf -typedef void (APIENTRYP PFNGLPOINTPARAMETERFVPROC)(GLenum pname, const GLfloat *params); -GLAPI PFNGLPOINTPARAMETERFVPROC glad_glPointParameterfv; -#define glPointParameterfv glad_glPointParameterfv -typedef void (APIENTRYP PFNGLPOINTPARAMETERIPROC)(GLenum pname, GLint param); -GLAPI PFNGLPOINTPARAMETERIPROC glad_glPointParameteri; -#define glPointParameteri 
glad_glPointParameteri -typedef void (APIENTRYP PFNGLPOINTPARAMETERIVPROC)(GLenum pname, const GLint *params); -GLAPI PFNGLPOINTPARAMETERIVPROC glad_glPointParameteriv; -#define glPointParameteriv glad_glPointParameteriv -typedef void (APIENTRYP PFNGLFOGCOORDFPROC)(GLfloat coord); -GLAPI PFNGLFOGCOORDFPROC glad_glFogCoordf; -#define glFogCoordf glad_glFogCoordf -typedef void (APIENTRYP PFNGLFOGCOORDFVPROC)(const GLfloat *coord); -GLAPI PFNGLFOGCOORDFVPROC glad_glFogCoordfv; -#define glFogCoordfv glad_glFogCoordfv -typedef void (APIENTRYP PFNGLFOGCOORDDPROC)(GLdouble coord); -GLAPI PFNGLFOGCOORDDPROC glad_glFogCoordd; -#define glFogCoordd glad_glFogCoordd -typedef void (APIENTRYP PFNGLFOGCOORDDVPROC)(const GLdouble *coord); -GLAPI PFNGLFOGCOORDDVPROC glad_glFogCoorddv; -#define glFogCoorddv glad_glFogCoorddv -typedef void (APIENTRYP PFNGLFOGCOORDPOINTERPROC)(GLenum type, GLsizei stride, const void *pointer); -GLAPI PFNGLFOGCOORDPOINTERPROC glad_glFogCoordPointer; -#define glFogCoordPointer glad_glFogCoordPointer -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3BPROC)(GLbyte red, GLbyte green, GLbyte blue); -GLAPI PFNGLSECONDARYCOLOR3BPROC glad_glSecondaryColor3b; -#define glSecondaryColor3b glad_glSecondaryColor3b -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3BVPROC)(const GLbyte *v); -GLAPI PFNGLSECONDARYCOLOR3BVPROC glad_glSecondaryColor3bv; -#define glSecondaryColor3bv glad_glSecondaryColor3bv -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3DPROC)(GLdouble red, GLdouble green, GLdouble blue); -GLAPI PFNGLSECONDARYCOLOR3DPROC glad_glSecondaryColor3d; -#define glSecondaryColor3d glad_glSecondaryColor3d -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3DVPROC)(const GLdouble *v); -GLAPI PFNGLSECONDARYCOLOR3DVPROC glad_glSecondaryColor3dv; -#define glSecondaryColor3dv glad_glSecondaryColor3dv -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3FPROC)(GLfloat red, GLfloat green, GLfloat blue); -GLAPI PFNGLSECONDARYCOLOR3FPROC glad_glSecondaryColor3f; -#define glSecondaryColor3f glad_glSecondaryColor3f -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3FVPROC)(const GLfloat *v); -GLAPI PFNGLSECONDARYCOLOR3FVPROC glad_glSecondaryColor3fv; -#define glSecondaryColor3fv glad_glSecondaryColor3fv -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3IPROC)(GLint red, GLint green, GLint blue); -GLAPI PFNGLSECONDARYCOLOR3IPROC glad_glSecondaryColor3i; -#define glSecondaryColor3i glad_glSecondaryColor3i -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3IVPROC)(const GLint *v); -GLAPI PFNGLSECONDARYCOLOR3IVPROC glad_glSecondaryColor3iv; -#define glSecondaryColor3iv glad_glSecondaryColor3iv -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3SPROC)(GLshort red, GLshort green, GLshort blue); -GLAPI PFNGLSECONDARYCOLOR3SPROC glad_glSecondaryColor3s; -#define glSecondaryColor3s glad_glSecondaryColor3s -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3SVPROC)(const GLshort *v); -GLAPI PFNGLSECONDARYCOLOR3SVPROC glad_glSecondaryColor3sv; -#define glSecondaryColor3sv glad_glSecondaryColor3sv -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3UBPROC)(GLubyte red, GLubyte green, GLubyte blue); -GLAPI PFNGLSECONDARYCOLOR3UBPROC glad_glSecondaryColor3ub; -#define glSecondaryColor3ub glad_glSecondaryColor3ub -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3UBVPROC)(const GLubyte *v); -GLAPI PFNGLSECONDARYCOLOR3UBVPROC glad_glSecondaryColor3ubv; -#define glSecondaryColor3ubv glad_glSecondaryColor3ubv -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3UIPROC)(GLuint red, GLuint green, GLuint blue); -GLAPI PFNGLSECONDARYCOLOR3UIPROC glad_glSecondaryColor3ui; -#define 
glSecondaryColor3ui glad_glSecondaryColor3ui -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3UIVPROC)(const GLuint *v); -GLAPI PFNGLSECONDARYCOLOR3UIVPROC glad_glSecondaryColor3uiv; -#define glSecondaryColor3uiv glad_glSecondaryColor3uiv -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3USPROC)(GLushort red, GLushort green, GLushort blue); -GLAPI PFNGLSECONDARYCOLOR3USPROC glad_glSecondaryColor3us; -#define glSecondaryColor3us glad_glSecondaryColor3us -typedef void (APIENTRYP PFNGLSECONDARYCOLOR3USVPROC)(const GLushort *v); -GLAPI PFNGLSECONDARYCOLOR3USVPROC glad_glSecondaryColor3usv; -#define glSecondaryColor3usv glad_glSecondaryColor3usv -typedef void (APIENTRYP PFNGLSECONDARYCOLORPOINTERPROC)(GLint size, GLenum type, GLsizei stride, const void *pointer); -GLAPI PFNGLSECONDARYCOLORPOINTERPROC glad_glSecondaryColorPointer; -#define glSecondaryColorPointer glad_glSecondaryColorPointer -typedef void (APIENTRYP PFNGLWINDOWPOS2DPROC)(GLdouble x, GLdouble y); -GLAPI PFNGLWINDOWPOS2DPROC glad_glWindowPos2d; -#define glWindowPos2d glad_glWindowPos2d -typedef void (APIENTRYP PFNGLWINDOWPOS2DVPROC)(const GLdouble *v); -GLAPI PFNGLWINDOWPOS2DVPROC glad_glWindowPos2dv; -#define glWindowPos2dv glad_glWindowPos2dv -typedef void (APIENTRYP PFNGLWINDOWPOS2FPROC)(GLfloat x, GLfloat y); -GLAPI PFNGLWINDOWPOS2FPROC glad_glWindowPos2f; -#define glWindowPos2f glad_glWindowPos2f -typedef void (APIENTRYP PFNGLWINDOWPOS2FVPROC)(const GLfloat *v); -GLAPI PFNGLWINDOWPOS2FVPROC glad_glWindowPos2fv; -#define glWindowPos2fv glad_glWindowPos2fv -typedef void (APIENTRYP PFNGLWINDOWPOS2IPROC)(GLint x, GLint y); -GLAPI PFNGLWINDOWPOS2IPROC glad_glWindowPos2i; -#define glWindowPos2i glad_glWindowPos2i -typedef void (APIENTRYP PFNGLWINDOWPOS2IVPROC)(const GLint *v); -GLAPI PFNGLWINDOWPOS2IVPROC glad_glWindowPos2iv; -#define glWindowPos2iv glad_glWindowPos2iv -typedef void (APIENTRYP PFNGLWINDOWPOS2SPROC)(GLshort x, GLshort y); -GLAPI PFNGLWINDOWPOS2SPROC glad_glWindowPos2s; -#define glWindowPos2s glad_glWindowPos2s -typedef void (APIENTRYP PFNGLWINDOWPOS2SVPROC)(const GLshort *v); -GLAPI PFNGLWINDOWPOS2SVPROC glad_glWindowPos2sv; -#define glWindowPos2sv glad_glWindowPos2sv -typedef void (APIENTRYP PFNGLWINDOWPOS3DPROC)(GLdouble x, GLdouble y, GLdouble z); -GLAPI PFNGLWINDOWPOS3DPROC glad_glWindowPos3d; -#define glWindowPos3d glad_glWindowPos3d -typedef void (APIENTRYP PFNGLWINDOWPOS3DVPROC)(const GLdouble *v); -GLAPI PFNGLWINDOWPOS3DVPROC glad_glWindowPos3dv; -#define glWindowPos3dv glad_glWindowPos3dv -typedef void (APIENTRYP PFNGLWINDOWPOS3FPROC)(GLfloat x, GLfloat y, GLfloat z); -GLAPI PFNGLWINDOWPOS3FPROC glad_glWindowPos3f; -#define glWindowPos3f glad_glWindowPos3f -typedef void (APIENTRYP PFNGLWINDOWPOS3FVPROC)(const GLfloat *v); -GLAPI PFNGLWINDOWPOS3FVPROC glad_glWindowPos3fv; -#define glWindowPos3fv glad_glWindowPos3fv -typedef void (APIENTRYP PFNGLWINDOWPOS3IPROC)(GLint x, GLint y, GLint z); -GLAPI PFNGLWINDOWPOS3IPROC glad_glWindowPos3i; -#define glWindowPos3i glad_glWindowPos3i -typedef void (APIENTRYP PFNGLWINDOWPOS3IVPROC)(const GLint *v); -GLAPI PFNGLWINDOWPOS3IVPROC glad_glWindowPos3iv; -#define glWindowPos3iv glad_glWindowPos3iv -typedef void (APIENTRYP PFNGLWINDOWPOS3SPROC)(GLshort x, GLshort y, GLshort z); -GLAPI PFNGLWINDOWPOS3SPROC glad_glWindowPos3s; -#define glWindowPos3s glad_glWindowPos3s -typedef void (APIENTRYP PFNGLWINDOWPOS3SVPROC)(const GLshort *v); -GLAPI PFNGLWINDOWPOS3SVPROC glad_glWindowPos3sv; -#define glWindowPos3sv glad_glWindowPos3sv -typedef void (APIENTRYP PFNGLBLENDCOLORPROC)(GLfloat red, 
GLfloat green, GLfloat blue, GLfloat alpha); -GLAPI PFNGLBLENDCOLORPROC glad_glBlendColor; -#define glBlendColor glad_glBlendColor -typedef void (APIENTRYP PFNGLBLENDEQUATIONPROC)(GLenum mode); -GLAPI PFNGLBLENDEQUATIONPROC glad_glBlendEquation; -#define glBlendEquation glad_glBlendEquation -#endif -#ifndef GL_VERSION_1_5 -#define GL_VERSION_1_5 1 -GLAPI int GLAD_GL_VERSION_1_5; -typedef void (APIENTRYP PFNGLGENQUERIESPROC)(GLsizei n, GLuint *ids); -GLAPI PFNGLGENQUERIESPROC glad_glGenQueries; -#define glGenQueries glad_glGenQueries -typedef void (APIENTRYP PFNGLDELETEQUERIESPROC)(GLsizei n, const GLuint *ids); -GLAPI PFNGLDELETEQUERIESPROC glad_glDeleteQueries; -#define glDeleteQueries glad_glDeleteQueries -typedef GLboolean (APIENTRYP PFNGLISQUERYPROC)(GLuint id); -GLAPI PFNGLISQUERYPROC glad_glIsQuery; -#define glIsQuery glad_glIsQuery -typedef void (APIENTRYP PFNGLBEGINQUERYPROC)(GLenum target, GLuint id); -GLAPI PFNGLBEGINQUERYPROC glad_glBeginQuery; -#define glBeginQuery glad_glBeginQuery -typedef void (APIENTRYP PFNGLENDQUERYPROC)(GLenum target); -GLAPI PFNGLENDQUERYPROC glad_glEndQuery; -#define glEndQuery glad_glEndQuery -typedef void (APIENTRYP PFNGLGETQUERYIVPROC)(GLenum target, GLenum pname, GLint *params); -GLAPI PFNGLGETQUERYIVPROC glad_glGetQueryiv; -#define glGetQueryiv glad_glGetQueryiv -typedef void (APIENTRYP PFNGLGETQUERYOBJECTIVPROC)(GLuint id, GLenum pname, GLint *params); -GLAPI PFNGLGETQUERYOBJECTIVPROC glad_glGetQueryObjectiv; -#define glGetQueryObjectiv glad_glGetQueryObjectiv -typedef void (APIENTRYP PFNGLGETQUERYOBJECTUIVPROC)(GLuint id, GLenum pname, GLuint *params); -GLAPI PFNGLGETQUERYOBJECTUIVPROC glad_glGetQueryObjectuiv; -#define glGetQueryObjectuiv glad_glGetQueryObjectuiv -typedef void (APIENTRYP PFNGLBINDBUFFERPROC)(GLenum target, GLuint buffer); -GLAPI PFNGLBINDBUFFERPROC glad_glBindBuffer; -#define glBindBuffer glad_glBindBuffer -typedef void (APIENTRYP PFNGLDELETEBUFFERSPROC)(GLsizei n, const GLuint *buffers); -GLAPI PFNGLDELETEBUFFERSPROC glad_glDeleteBuffers; -#define glDeleteBuffers glad_glDeleteBuffers -typedef void (APIENTRYP PFNGLGENBUFFERSPROC)(GLsizei n, GLuint *buffers); -GLAPI PFNGLGENBUFFERSPROC glad_glGenBuffers; -#define glGenBuffers glad_glGenBuffers -typedef GLboolean (APIENTRYP PFNGLISBUFFERPROC)(GLuint buffer); -GLAPI PFNGLISBUFFERPROC glad_glIsBuffer; -#define glIsBuffer glad_glIsBuffer -typedef void (APIENTRYP PFNGLBUFFERDATAPROC)(GLenum target, GLsizeiptr size, const void *data, GLenum usage); -GLAPI PFNGLBUFFERDATAPROC glad_glBufferData; -#define glBufferData glad_glBufferData -typedef void (APIENTRYP PFNGLBUFFERSUBDATAPROC)(GLenum target, GLintptr offset, GLsizeiptr size, const void *data); -GLAPI PFNGLBUFFERSUBDATAPROC glad_glBufferSubData; -#define glBufferSubData glad_glBufferSubData -typedef void (APIENTRYP PFNGLGETBUFFERSUBDATAPROC)(GLenum target, GLintptr offset, GLsizeiptr size, void *data); -GLAPI PFNGLGETBUFFERSUBDATAPROC glad_glGetBufferSubData; -#define glGetBufferSubData glad_glGetBufferSubData -typedef void * (APIENTRYP PFNGLMAPBUFFERPROC)(GLenum target, GLenum access); -GLAPI PFNGLMAPBUFFERPROC glad_glMapBuffer; -#define glMapBuffer glad_glMapBuffer -typedef GLboolean (APIENTRYP PFNGLUNMAPBUFFERPROC)(GLenum target); -GLAPI PFNGLUNMAPBUFFERPROC glad_glUnmapBuffer; -#define glUnmapBuffer glad_glUnmapBuffer -typedef void (APIENTRYP PFNGLGETBUFFERPARAMETERIVPROC)(GLenum target, GLenum pname, GLint *params); -GLAPI PFNGLGETBUFFERPARAMETERIVPROC glad_glGetBufferParameteriv; -#define glGetBufferParameteriv 
glad_glGetBufferParameteriv -typedef void (APIENTRYP PFNGLGETBUFFERPOINTERVPROC)(GLenum target, GLenum pname, void **params); -GLAPI PFNGLGETBUFFERPOINTERVPROC glad_glGetBufferPointerv; -#define glGetBufferPointerv glad_glGetBufferPointerv -#endif -#ifndef GL_VERSION_2_0 -#define GL_VERSION_2_0 1 -GLAPI int GLAD_GL_VERSION_2_0; -typedef void (APIENTRYP PFNGLBLENDEQUATIONSEPARATEPROC)(GLenum modeRGB, GLenum modeAlpha); -GLAPI PFNGLBLENDEQUATIONSEPARATEPROC glad_glBlendEquationSeparate; -#define glBlendEquationSeparate glad_glBlendEquationSeparate -typedef void (APIENTRYP PFNGLDRAWBUFFERSPROC)(GLsizei n, const GLenum *bufs); -GLAPI PFNGLDRAWBUFFERSPROC glad_glDrawBuffers; -#define glDrawBuffers glad_glDrawBuffers -typedef void (APIENTRYP PFNGLSTENCILOPSEPARATEPROC)(GLenum face, GLenum sfail, GLenum dpfail, GLenum dppass); -GLAPI PFNGLSTENCILOPSEPARATEPROC glad_glStencilOpSeparate; -#define glStencilOpSeparate glad_glStencilOpSeparate -typedef void (APIENTRYP PFNGLSTENCILFUNCSEPARATEPROC)(GLenum face, GLenum func, GLint ref, GLuint mask); -GLAPI PFNGLSTENCILFUNCSEPARATEPROC glad_glStencilFuncSeparate; -#define glStencilFuncSeparate glad_glStencilFuncSeparate -typedef void (APIENTRYP PFNGLSTENCILMASKSEPARATEPROC)(GLenum face, GLuint mask); -GLAPI PFNGLSTENCILMASKSEPARATEPROC glad_glStencilMaskSeparate; -#define glStencilMaskSeparate glad_glStencilMaskSeparate -typedef void (APIENTRYP PFNGLATTACHSHADERPROC)(GLuint program, GLuint shader); -GLAPI PFNGLATTACHSHADERPROC glad_glAttachShader; -#define glAttachShader glad_glAttachShader -typedef void (APIENTRYP PFNGLBINDATTRIBLOCATIONPROC)(GLuint program, GLuint index, const GLchar *name); -GLAPI PFNGLBINDATTRIBLOCATIONPROC glad_glBindAttribLocation; -#define glBindAttribLocation glad_glBindAttribLocation -typedef void (APIENTRYP PFNGLCOMPILESHADERPROC)(GLuint shader); -GLAPI PFNGLCOMPILESHADERPROC glad_glCompileShader; -#define glCompileShader glad_glCompileShader -typedef GLuint (APIENTRYP PFNGLCREATEPROGRAMPROC)(void); -GLAPI PFNGLCREATEPROGRAMPROC glad_glCreateProgram; -#define glCreateProgram glad_glCreateProgram -typedef GLuint (APIENTRYP PFNGLCREATESHADERPROC)(GLenum type); -GLAPI PFNGLCREATESHADERPROC glad_glCreateShader; -#define glCreateShader glad_glCreateShader -typedef void (APIENTRYP PFNGLDELETEPROGRAMPROC)(GLuint program); -GLAPI PFNGLDELETEPROGRAMPROC glad_glDeleteProgram; -#define glDeleteProgram glad_glDeleteProgram -typedef void (APIENTRYP PFNGLDELETESHADERPROC)(GLuint shader); -GLAPI PFNGLDELETESHADERPROC glad_glDeleteShader; -#define glDeleteShader glad_glDeleteShader -typedef void (APIENTRYP PFNGLDETACHSHADERPROC)(GLuint program, GLuint shader); -GLAPI PFNGLDETACHSHADERPROC glad_glDetachShader; -#define glDetachShader glad_glDetachShader -typedef void (APIENTRYP PFNGLDISABLEVERTEXATTRIBARRAYPROC)(GLuint index); -GLAPI PFNGLDISABLEVERTEXATTRIBARRAYPROC glad_glDisableVertexAttribArray; -#define glDisableVertexAttribArray glad_glDisableVertexAttribArray -typedef void (APIENTRYP PFNGLENABLEVERTEXATTRIBARRAYPROC)(GLuint index); -GLAPI PFNGLENABLEVERTEXATTRIBARRAYPROC glad_glEnableVertexAttribArray; -#define glEnableVertexAttribArray glad_glEnableVertexAttribArray -typedef void (APIENTRYP PFNGLGETACTIVEATTRIBPROC)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name); -GLAPI PFNGLGETACTIVEATTRIBPROC glad_glGetActiveAttrib; -#define glGetActiveAttrib glad_glGetActiveAttrib -typedef void (APIENTRYP PFNGLGETACTIVEUNIFORMPROC)(GLuint program, GLuint index, GLsizei bufSize, GLsizei 
*length, GLint *size, GLenum *type, GLchar *name); -GLAPI PFNGLGETACTIVEUNIFORMPROC glad_glGetActiveUniform; -#define glGetActiveUniform glad_glGetActiveUniform -typedef void (APIENTRYP PFNGLGETATTACHEDSHADERSPROC)(GLuint program, GLsizei maxCount, GLsizei *count, GLuint *shaders); -GLAPI PFNGLGETATTACHEDSHADERSPROC glad_glGetAttachedShaders; -#define glGetAttachedShaders glad_glGetAttachedShaders -typedef GLint (APIENTRYP PFNGLGETATTRIBLOCATIONPROC)(GLuint program, const GLchar *name); -GLAPI PFNGLGETATTRIBLOCATIONPROC glad_glGetAttribLocation; -#define glGetAttribLocation glad_glGetAttribLocation -typedef void (APIENTRYP PFNGLGETPROGRAMIVPROC)(GLuint program, GLenum pname, GLint *params); -GLAPI PFNGLGETPROGRAMIVPROC glad_glGetProgramiv; -#define glGetProgramiv glad_glGetProgramiv -typedef void (APIENTRYP PFNGLGETPROGRAMINFOLOGPROC)(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog); -GLAPI PFNGLGETPROGRAMINFOLOGPROC glad_glGetProgramInfoLog; -#define glGetProgramInfoLog glad_glGetProgramInfoLog -typedef void (APIENTRYP PFNGLGETSHADERIVPROC)(GLuint shader, GLenum pname, GLint *params); -GLAPI PFNGLGETSHADERIVPROC glad_glGetShaderiv; -#define glGetShaderiv glad_glGetShaderiv -typedef void (APIENTRYP PFNGLGETSHADERINFOLOGPROC)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog); -GLAPI PFNGLGETSHADERINFOLOGPROC glad_glGetShaderInfoLog; -#define glGetShaderInfoLog glad_glGetShaderInfoLog -typedef void (APIENTRYP PFNGLGETSHADERSOURCEPROC)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *source); -GLAPI PFNGLGETSHADERSOURCEPROC glad_glGetShaderSource; -#define glGetShaderSource glad_glGetShaderSource -typedef GLint (APIENTRYP PFNGLGETUNIFORMLOCATIONPROC)(GLuint program, const GLchar *name); -GLAPI PFNGLGETUNIFORMLOCATIONPROC glad_glGetUniformLocation; -#define glGetUniformLocation glad_glGetUniformLocation -typedef void (APIENTRYP PFNGLGETUNIFORMFVPROC)(GLuint program, GLint location, GLfloat *params); -GLAPI PFNGLGETUNIFORMFVPROC glad_glGetUniformfv; -#define glGetUniformfv glad_glGetUniformfv -typedef void (APIENTRYP PFNGLGETUNIFORMIVPROC)(GLuint program, GLint location, GLint *params); -GLAPI PFNGLGETUNIFORMIVPROC glad_glGetUniformiv; -#define glGetUniformiv glad_glGetUniformiv -typedef void (APIENTRYP PFNGLGETVERTEXATTRIBDVPROC)(GLuint index, GLenum pname, GLdouble *params); -GLAPI PFNGLGETVERTEXATTRIBDVPROC glad_glGetVertexAttribdv; -#define glGetVertexAttribdv glad_glGetVertexAttribdv -typedef void (APIENTRYP PFNGLGETVERTEXATTRIBFVPROC)(GLuint index, GLenum pname, GLfloat *params); -GLAPI PFNGLGETVERTEXATTRIBFVPROC glad_glGetVertexAttribfv; -#define glGetVertexAttribfv glad_glGetVertexAttribfv -typedef void (APIENTRYP PFNGLGETVERTEXATTRIBIVPROC)(GLuint index, GLenum pname, GLint *params); -GLAPI PFNGLGETVERTEXATTRIBIVPROC glad_glGetVertexAttribiv; -#define glGetVertexAttribiv glad_glGetVertexAttribiv -typedef void (APIENTRYP PFNGLGETVERTEXATTRIBPOINTERVPROC)(GLuint index, GLenum pname, void **pointer); -GLAPI PFNGLGETVERTEXATTRIBPOINTERVPROC glad_glGetVertexAttribPointerv; -#define glGetVertexAttribPointerv glad_glGetVertexAttribPointerv -typedef GLboolean (APIENTRYP PFNGLISPROGRAMPROC)(GLuint program); -GLAPI PFNGLISPROGRAMPROC glad_glIsProgram; -#define glIsProgram glad_glIsProgram -typedef GLboolean (APIENTRYP PFNGLISSHADERPROC)(GLuint shader); -GLAPI PFNGLISSHADERPROC glad_glIsShader; -#define glIsShader glad_glIsShader -typedef void (APIENTRYP PFNGLLINKPROGRAMPROC)(GLuint program); -GLAPI PFNGLLINKPROGRAMPROC glad_glLinkProgram; -#define 
glLinkProgram glad_glLinkProgram -typedef void (APIENTRYP PFNGLSHADERSOURCEPROC)(GLuint shader, GLsizei count, const GLchar *const*string, const GLint *length); -GLAPI PFNGLSHADERSOURCEPROC glad_glShaderSource; -#define glShaderSource glad_glShaderSource -typedef void (APIENTRYP PFNGLUSEPROGRAMPROC)(GLuint program); -GLAPI PFNGLUSEPROGRAMPROC glad_glUseProgram; -#define glUseProgram glad_glUseProgram -typedef void (APIENTRYP PFNGLUNIFORM1FPROC)(GLint location, GLfloat v0); -GLAPI PFNGLUNIFORM1FPROC glad_glUniform1f; -#define glUniform1f glad_glUniform1f -typedef void (APIENTRYP PFNGLUNIFORM2FPROC)(GLint location, GLfloat v0, GLfloat v1); -GLAPI PFNGLUNIFORM2FPROC glad_glUniform2f; -#define glUniform2f glad_glUniform2f -typedef void (APIENTRYP PFNGLUNIFORM3FPROC)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2); -GLAPI PFNGLUNIFORM3FPROC glad_glUniform3f; -#define glUniform3f glad_glUniform3f -typedef void (APIENTRYP PFNGLUNIFORM4FPROC)(GLint location, GLfloat v0, GLfloat v1, GLfloat v2, GLfloat v3); -GLAPI PFNGLUNIFORM4FPROC glad_glUniform4f; -#define glUniform4f glad_glUniform4f -typedef void (APIENTRYP PFNGLUNIFORM1IPROC)(GLint location, GLint v0); -GLAPI PFNGLUNIFORM1IPROC glad_glUniform1i; -#define glUniform1i glad_glUniform1i -typedef void (APIENTRYP PFNGLUNIFORM2IPROC)(GLint location, GLint v0, GLint v1); -GLAPI PFNGLUNIFORM2IPROC glad_glUniform2i; -#define glUniform2i glad_glUniform2i -typedef void (APIENTRYP PFNGLUNIFORM3IPROC)(GLint location, GLint v0, GLint v1, GLint v2); -GLAPI PFNGLUNIFORM3IPROC glad_glUniform3i; -#define glUniform3i glad_glUniform3i -typedef void (APIENTRYP PFNGLUNIFORM4IPROC)(GLint location, GLint v0, GLint v1, GLint v2, GLint v3); -GLAPI PFNGLUNIFORM4IPROC glad_glUniform4i; -#define glUniform4i glad_glUniform4i -typedef void (APIENTRYP PFNGLUNIFORM1FVPROC)(GLint location, GLsizei count, const GLfloat *value); -GLAPI PFNGLUNIFORM1FVPROC glad_glUniform1fv; -#define glUniform1fv glad_glUniform1fv -typedef void (APIENTRYP PFNGLUNIFORM2FVPROC)(GLint location, GLsizei count, const GLfloat *value); -GLAPI PFNGLUNIFORM2FVPROC glad_glUniform2fv; -#define glUniform2fv glad_glUniform2fv -typedef void (APIENTRYP PFNGLUNIFORM3FVPROC)(GLint location, GLsizei count, const GLfloat *value); -GLAPI PFNGLUNIFORM3FVPROC glad_glUniform3fv; -#define glUniform3fv glad_glUniform3fv -typedef void (APIENTRYP PFNGLUNIFORM4FVPROC)(GLint location, GLsizei count, const GLfloat *value); -GLAPI PFNGLUNIFORM4FVPROC glad_glUniform4fv; -#define glUniform4fv glad_glUniform4fv -typedef void (APIENTRYP PFNGLUNIFORM1IVPROC)(GLint location, GLsizei count, const GLint *value); -GLAPI PFNGLUNIFORM1IVPROC glad_glUniform1iv; -#define glUniform1iv glad_glUniform1iv -typedef void (APIENTRYP PFNGLUNIFORM2IVPROC)(GLint location, GLsizei count, const GLint *value); -GLAPI PFNGLUNIFORM2IVPROC glad_glUniform2iv; -#define glUniform2iv glad_glUniform2iv -typedef void (APIENTRYP PFNGLUNIFORM3IVPROC)(GLint location, GLsizei count, const GLint *value); -GLAPI PFNGLUNIFORM3IVPROC glad_glUniform3iv; -#define glUniform3iv glad_glUniform3iv -typedef void (APIENTRYP PFNGLUNIFORM4IVPROC)(GLint location, GLsizei count, const GLint *value); -GLAPI PFNGLUNIFORM4IVPROC glad_glUniform4iv; -#define glUniform4iv glad_glUniform4iv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX2FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX2FVPROC glad_glUniformMatrix2fv; -#define glUniformMatrix2fv glad_glUniformMatrix2fv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX3FVPROC)(GLint 
location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX3FVPROC glad_glUniformMatrix3fv; -#define glUniformMatrix3fv glad_glUniformMatrix3fv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX4FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX4FVPROC glad_glUniformMatrix4fv; -#define glUniformMatrix4fv glad_glUniformMatrix4fv -typedef void (APIENTRYP PFNGLVALIDATEPROGRAMPROC)(GLuint program); -GLAPI PFNGLVALIDATEPROGRAMPROC glad_glValidateProgram; -#define glValidateProgram glad_glValidateProgram -typedef void (APIENTRYP PFNGLVERTEXATTRIB1DPROC)(GLuint index, GLdouble x); -GLAPI PFNGLVERTEXATTRIB1DPROC glad_glVertexAttrib1d; -#define glVertexAttrib1d glad_glVertexAttrib1d -typedef void (APIENTRYP PFNGLVERTEXATTRIB1DVPROC)(GLuint index, const GLdouble *v); -GLAPI PFNGLVERTEXATTRIB1DVPROC glad_glVertexAttrib1dv; -#define glVertexAttrib1dv glad_glVertexAttrib1dv -typedef void (APIENTRYP PFNGLVERTEXATTRIB1FPROC)(GLuint index, GLfloat x); -GLAPI PFNGLVERTEXATTRIB1FPROC glad_glVertexAttrib1f; -#define glVertexAttrib1f glad_glVertexAttrib1f -typedef void (APIENTRYP PFNGLVERTEXATTRIB1FVPROC)(GLuint index, const GLfloat *v); -GLAPI PFNGLVERTEXATTRIB1FVPROC glad_glVertexAttrib1fv; -#define glVertexAttrib1fv glad_glVertexAttrib1fv -typedef void (APIENTRYP PFNGLVERTEXATTRIB1SPROC)(GLuint index, GLshort x); -GLAPI PFNGLVERTEXATTRIB1SPROC glad_glVertexAttrib1s; -#define glVertexAttrib1s glad_glVertexAttrib1s -typedef void (APIENTRYP PFNGLVERTEXATTRIB1SVPROC)(GLuint index, const GLshort *v); -GLAPI PFNGLVERTEXATTRIB1SVPROC glad_glVertexAttrib1sv; -#define glVertexAttrib1sv glad_glVertexAttrib1sv -typedef void (APIENTRYP PFNGLVERTEXATTRIB2DPROC)(GLuint index, GLdouble x, GLdouble y); -GLAPI PFNGLVERTEXATTRIB2DPROC glad_glVertexAttrib2d; -#define glVertexAttrib2d glad_glVertexAttrib2d -typedef void (APIENTRYP PFNGLVERTEXATTRIB2DVPROC)(GLuint index, const GLdouble *v); -GLAPI PFNGLVERTEXATTRIB2DVPROC glad_glVertexAttrib2dv; -#define glVertexAttrib2dv glad_glVertexAttrib2dv -typedef void (APIENTRYP PFNGLVERTEXATTRIB2FPROC)(GLuint index, GLfloat x, GLfloat y); -GLAPI PFNGLVERTEXATTRIB2FPROC glad_glVertexAttrib2f; -#define glVertexAttrib2f glad_glVertexAttrib2f -typedef void (APIENTRYP PFNGLVERTEXATTRIB2FVPROC)(GLuint index, const GLfloat *v); -GLAPI PFNGLVERTEXATTRIB2FVPROC glad_glVertexAttrib2fv; -#define glVertexAttrib2fv glad_glVertexAttrib2fv -typedef void (APIENTRYP PFNGLVERTEXATTRIB2SPROC)(GLuint index, GLshort x, GLshort y); -GLAPI PFNGLVERTEXATTRIB2SPROC glad_glVertexAttrib2s; -#define glVertexAttrib2s glad_glVertexAttrib2s -typedef void (APIENTRYP PFNGLVERTEXATTRIB2SVPROC)(GLuint index, const GLshort *v); -GLAPI PFNGLVERTEXATTRIB2SVPROC glad_glVertexAttrib2sv; -#define glVertexAttrib2sv glad_glVertexAttrib2sv -typedef void (APIENTRYP PFNGLVERTEXATTRIB3DPROC)(GLuint index, GLdouble x, GLdouble y, GLdouble z); -GLAPI PFNGLVERTEXATTRIB3DPROC glad_glVertexAttrib3d; -#define glVertexAttrib3d glad_glVertexAttrib3d -typedef void (APIENTRYP PFNGLVERTEXATTRIB3DVPROC)(GLuint index, const GLdouble *v); -GLAPI PFNGLVERTEXATTRIB3DVPROC glad_glVertexAttrib3dv; -#define glVertexAttrib3dv glad_glVertexAttrib3dv -typedef void (APIENTRYP PFNGLVERTEXATTRIB3FPROC)(GLuint index, GLfloat x, GLfloat y, GLfloat z); -GLAPI PFNGLVERTEXATTRIB3FPROC glad_glVertexAttrib3f; -#define glVertexAttrib3f glad_glVertexAttrib3f -typedef void (APIENTRYP PFNGLVERTEXATTRIB3FVPROC)(GLuint index, const GLfloat *v); -GLAPI PFNGLVERTEXATTRIB3FVPROC 
glad_glVertexAttrib3fv; -#define glVertexAttrib3fv glad_glVertexAttrib3fv -typedef void (APIENTRYP PFNGLVERTEXATTRIB3SPROC)(GLuint index, GLshort x, GLshort y, GLshort z); -GLAPI PFNGLVERTEXATTRIB3SPROC glad_glVertexAttrib3s; -#define glVertexAttrib3s glad_glVertexAttrib3s -typedef void (APIENTRYP PFNGLVERTEXATTRIB3SVPROC)(GLuint index, const GLshort *v); -GLAPI PFNGLVERTEXATTRIB3SVPROC glad_glVertexAttrib3sv; -#define glVertexAttrib3sv glad_glVertexAttrib3sv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4NBVPROC)(GLuint index, const GLbyte *v); -GLAPI PFNGLVERTEXATTRIB4NBVPROC glad_glVertexAttrib4Nbv; -#define glVertexAttrib4Nbv glad_glVertexAttrib4Nbv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4NIVPROC)(GLuint index, const GLint *v); -GLAPI PFNGLVERTEXATTRIB4NIVPROC glad_glVertexAttrib4Niv; -#define glVertexAttrib4Niv glad_glVertexAttrib4Niv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4NSVPROC)(GLuint index, const GLshort *v); -GLAPI PFNGLVERTEXATTRIB4NSVPROC glad_glVertexAttrib4Nsv; -#define glVertexAttrib4Nsv glad_glVertexAttrib4Nsv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4NUBPROC)(GLuint index, GLubyte x, GLubyte y, GLubyte z, GLubyte w); -GLAPI PFNGLVERTEXATTRIB4NUBPROC glad_glVertexAttrib4Nub; -#define glVertexAttrib4Nub glad_glVertexAttrib4Nub -typedef void (APIENTRYP PFNGLVERTEXATTRIB4NUBVPROC)(GLuint index, const GLubyte *v); -GLAPI PFNGLVERTEXATTRIB4NUBVPROC glad_glVertexAttrib4Nubv; -#define glVertexAttrib4Nubv glad_glVertexAttrib4Nubv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4NUIVPROC)(GLuint index, const GLuint *v); -GLAPI PFNGLVERTEXATTRIB4NUIVPROC glad_glVertexAttrib4Nuiv; -#define glVertexAttrib4Nuiv glad_glVertexAttrib4Nuiv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4NUSVPROC)(GLuint index, const GLushort *v); -GLAPI PFNGLVERTEXATTRIB4NUSVPROC glad_glVertexAttrib4Nusv; -#define glVertexAttrib4Nusv glad_glVertexAttrib4Nusv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4BVPROC)(GLuint index, const GLbyte *v); -GLAPI PFNGLVERTEXATTRIB4BVPROC glad_glVertexAttrib4bv; -#define glVertexAttrib4bv glad_glVertexAttrib4bv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4DPROC)(GLuint index, GLdouble x, GLdouble y, GLdouble z, GLdouble w); -GLAPI PFNGLVERTEXATTRIB4DPROC glad_glVertexAttrib4d; -#define glVertexAttrib4d glad_glVertexAttrib4d -typedef void (APIENTRYP PFNGLVERTEXATTRIB4DVPROC)(GLuint index, const GLdouble *v); -GLAPI PFNGLVERTEXATTRIB4DVPROC glad_glVertexAttrib4dv; -#define glVertexAttrib4dv glad_glVertexAttrib4dv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4FPROC)(GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w); -GLAPI PFNGLVERTEXATTRIB4FPROC glad_glVertexAttrib4f; -#define glVertexAttrib4f glad_glVertexAttrib4f -typedef void (APIENTRYP PFNGLVERTEXATTRIB4FVPROC)(GLuint index, const GLfloat *v); -GLAPI PFNGLVERTEXATTRIB4FVPROC glad_glVertexAttrib4fv; -#define glVertexAttrib4fv glad_glVertexAttrib4fv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4IVPROC)(GLuint index, const GLint *v); -GLAPI PFNGLVERTEXATTRIB4IVPROC glad_glVertexAttrib4iv; -#define glVertexAttrib4iv glad_glVertexAttrib4iv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4SPROC)(GLuint index, GLshort x, GLshort y, GLshort z, GLshort w); -GLAPI PFNGLVERTEXATTRIB4SPROC glad_glVertexAttrib4s; -#define glVertexAttrib4s glad_glVertexAttrib4s -typedef void (APIENTRYP PFNGLVERTEXATTRIB4SVPROC)(GLuint index, const GLshort *v); -GLAPI PFNGLVERTEXATTRIB4SVPROC glad_glVertexAttrib4sv; -#define glVertexAttrib4sv glad_glVertexAttrib4sv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4UBVPROC)(GLuint index, const GLubyte *v); -GLAPI 
PFNGLVERTEXATTRIB4UBVPROC glad_glVertexAttrib4ubv; -#define glVertexAttrib4ubv glad_glVertexAttrib4ubv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4UIVPROC)(GLuint index, const GLuint *v); -GLAPI PFNGLVERTEXATTRIB4UIVPROC glad_glVertexAttrib4uiv; -#define glVertexAttrib4uiv glad_glVertexAttrib4uiv -typedef void (APIENTRYP PFNGLVERTEXATTRIB4USVPROC)(GLuint index, const GLushort *v); -GLAPI PFNGLVERTEXATTRIB4USVPROC glad_glVertexAttrib4usv; -#define glVertexAttrib4usv glad_glVertexAttrib4usv -typedef void (APIENTRYP PFNGLVERTEXATTRIBPOINTERPROC)(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void *pointer); -GLAPI PFNGLVERTEXATTRIBPOINTERPROC glad_glVertexAttribPointer; -#define glVertexAttribPointer glad_glVertexAttribPointer -#endif -#ifndef GL_VERSION_2_1 -#define GL_VERSION_2_1 1 -GLAPI int GLAD_GL_VERSION_2_1; -typedef void (APIENTRYP PFNGLUNIFORMMATRIX2X3FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX2X3FVPROC glad_glUniformMatrix2x3fv; -#define glUniformMatrix2x3fv glad_glUniformMatrix2x3fv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX3X2FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX3X2FVPROC glad_glUniformMatrix3x2fv; -#define glUniformMatrix3x2fv glad_glUniformMatrix3x2fv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX2X4FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX2X4FVPROC glad_glUniformMatrix2x4fv; -#define glUniformMatrix2x4fv glad_glUniformMatrix2x4fv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX4X2FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX4X2FVPROC glad_glUniformMatrix4x2fv; -#define glUniformMatrix4x2fv glad_glUniformMatrix4x2fv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX3X4FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX3X4FVPROC glad_glUniformMatrix3x4fv; -#define glUniformMatrix3x4fv glad_glUniformMatrix3x4fv -typedef void (APIENTRYP PFNGLUNIFORMMATRIX4X3FVPROC)(GLint location, GLsizei count, GLboolean transpose, const GLfloat *value); -GLAPI PFNGLUNIFORMMATRIX4X3FVPROC glad_glUniformMatrix4x3fv; -#define glUniformMatrix4x3fv glad_glUniformMatrix4x3fv -#endif -#ifndef GL_VERSION_3_0 -#define GL_VERSION_3_0 1 -GLAPI int GLAD_GL_VERSION_3_0; -typedef void (APIENTRYP PFNGLCOLORMASKIPROC)(GLuint index, GLboolean r, GLboolean g, GLboolean b, GLboolean a); -GLAPI PFNGLCOLORMASKIPROC glad_glColorMaski; -#define glColorMaski glad_glColorMaski -typedef void (APIENTRYP PFNGLGETBOOLEANI_VPROC)(GLenum target, GLuint index, GLboolean *data); -GLAPI PFNGLGETBOOLEANI_VPROC glad_glGetBooleani_v; -#define glGetBooleani_v glad_glGetBooleani_v -typedef void (APIENTRYP PFNGLGETINTEGERI_VPROC)(GLenum target, GLuint index, GLint *data); -GLAPI PFNGLGETINTEGERI_VPROC glad_glGetIntegeri_v; -#define glGetIntegeri_v glad_glGetIntegeri_v -typedef void (APIENTRYP PFNGLENABLEIPROC)(GLenum target, GLuint index); -GLAPI PFNGLENABLEIPROC glad_glEnablei; -#define glEnablei glad_glEnablei -typedef void (APIENTRYP PFNGLDISABLEIPROC)(GLenum target, GLuint index); -GLAPI PFNGLDISABLEIPROC glad_glDisablei; -#define glDisablei glad_glDisablei -typedef GLboolean (APIENTRYP PFNGLISENABLEDIPROC)(GLenum target, GLuint index); -GLAPI PFNGLISENABLEDIPROC glad_glIsEnabledi; -#define glIsEnabledi glad_glIsEnabledi -typedef void (APIENTRYP PFNGLBEGINTRANSFORMFEEDBACKPROC)(GLenum 
-typedef void (APIENTRYP PFNGLBEGINTRANSFORMFEEDBACKPROC)(GLenum primitiveMode);
-GLAPI PFNGLBEGINTRANSFORMFEEDBACKPROC glad_glBeginTransformFeedback;
-#define glBeginTransformFeedback glad_glBeginTransformFeedback
-typedef void (APIENTRYP PFNGLENDTRANSFORMFEEDBACKPROC)(void);
-GLAPI PFNGLENDTRANSFORMFEEDBACKPROC glad_glEndTransformFeedback;
-#define glEndTransformFeedback glad_glEndTransformFeedback
-typedef void (APIENTRYP PFNGLBINDBUFFERRANGEPROC)(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size);
-GLAPI PFNGLBINDBUFFERRANGEPROC glad_glBindBufferRange;
-#define glBindBufferRange glad_glBindBufferRange
-typedef void (APIENTRYP PFNGLBINDBUFFERBASEPROC)(GLenum target, GLuint index, GLuint buffer);
-GLAPI PFNGLBINDBUFFERBASEPROC glad_glBindBufferBase;
-#define glBindBufferBase glad_glBindBufferBase
-typedef void (APIENTRYP PFNGLTRANSFORMFEEDBACKVARYINGSPROC)(GLuint program, GLsizei count, const GLchar *const*varyings, GLenum bufferMode);
-GLAPI PFNGLTRANSFORMFEEDBACKVARYINGSPROC glad_glTransformFeedbackVaryings;
-#define glTransformFeedbackVaryings glad_glTransformFeedbackVaryings
-typedef void (APIENTRYP PFNGLGETTRANSFORMFEEDBACKVARYINGPROC)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLsizei *size, GLenum *type, GLchar *name);
-GLAPI PFNGLGETTRANSFORMFEEDBACKVARYINGPROC glad_glGetTransformFeedbackVarying;
-#define glGetTransformFeedbackVarying glad_glGetTransformFeedbackVarying
-typedef void (APIENTRYP PFNGLCLAMPCOLORPROC)(GLenum target, GLenum clamp);
-GLAPI PFNGLCLAMPCOLORPROC glad_glClampColor;
-#define glClampColor glad_glClampColor
-typedef void (APIENTRYP PFNGLBEGINCONDITIONALRENDERPROC)(GLuint id, GLenum mode);
-GLAPI PFNGLBEGINCONDITIONALRENDERPROC glad_glBeginConditionalRender;
-#define glBeginConditionalRender glad_glBeginConditionalRender
-typedef void (APIENTRYP PFNGLENDCONDITIONALRENDERPROC)(void);
-GLAPI PFNGLENDCONDITIONALRENDERPROC glad_glEndConditionalRender;
-#define glEndConditionalRender glad_glEndConditionalRender
-typedef void (APIENTRYP PFNGLVERTEXATTRIBIPOINTERPROC)(GLuint index, GLint size, GLenum type, GLsizei stride, const void *pointer);
-GLAPI PFNGLVERTEXATTRIBIPOINTERPROC glad_glVertexAttribIPointer;
-#define glVertexAttribIPointer glad_glVertexAttribIPointer
-typedef void (APIENTRYP PFNGLGETVERTEXATTRIBIIVPROC)(GLuint index, GLenum pname, GLint *params);
-GLAPI PFNGLGETVERTEXATTRIBIIVPROC glad_glGetVertexAttribIiv;
-#define glGetVertexAttribIiv glad_glGetVertexAttribIiv
-typedef void (APIENTRYP PFNGLGETVERTEXATTRIBIUIVPROC)(GLuint index, GLenum pname, GLuint *params);
-GLAPI PFNGLGETVERTEXATTRIBIUIVPROC glad_glGetVertexAttribIuiv;
-#define glGetVertexAttribIuiv glad_glGetVertexAttribIuiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI1IPROC)(GLuint index, GLint x);
-GLAPI PFNGLVERTEXATTRIBI1IPROC glad_glVertexAttribI1i;
-#define glVertexAttribI1i glad_glVertexAttribI1i
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI2IPROC)(GLuint index, GLint x, GLint y);
-GLAPI PFNGLVERTEXATTRIBI2IPROC glad_glVertexAttribI2i;
-#define glVertexAttribI2i glad_glVertexAttribI2i
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI3IPROC)(GLuint index, GLint x, GLint y, GLint z);
-GLAPI PFNGLVERTEXATTRIBI3IPROC glad_glVertexAttribI3i;
-#define glVertexAttribI3i glad_glVertexAttribI3i
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4IPROC)(GLuint index, GLint x, GLint y, GLint z, GLint w);
-GLAPI PFNGLVERTEXATTRIBI4IPROC glad_glVertexAttribI4i;
-#define glVertexAttribI4i glad_glVertexAttribI4i
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI1UIPROC)(GLuint index, GLuint x);
-GLAPI PFNGLVERTEXATTRIBI1UIPROC glad_glVertexAttribI1ui;
-#define glVertexAttribI1ui glad_glVertexAttribI1ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI2UIPROC)(GLuint index, GLuint x, GLuint y);
-GLAPI PFNGLVERTEXATTRIBI2UIPROC glad_glVertexAttribI2ui;
-#define glVertexAttribI2ui glad_glVertexAttribI2ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI3UIPROC)(GLuint index, GLuint x, GLuint y, GLuint z);
-GLAPI PFNGLVERTEXATTRIBI3UIPROC glad_glVertexAttribI3ui;
-#define glVertexAttribI3ui glad_glVertexAttribI3ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4UIPROC)(GLuint index, GLuint x, GLuint y, GLuint z, GLuint w);
-GLAPI PFNGLVERTEXATTRIBI4UIPROC glad_glVertexAttribI4ui;
-#define glVertexAttribI4ui glad_glVertexAttribI4ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI1IVPROC)(GLuint index, const GLint *v);
-GLAPI PFNGLVERTEXATTRIBI1IVPROC glad_glVertexAttribI1iv;
-#define glVertexAttribI1iv glad_glVertexAttribI1iv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI2IVPROC)(GLuint index, const GLint *v);
-GLAPI PFNGLVERTEXATTRIBI2IVPROC glad_glVertexAttribI2iv;
-#define glVertexAttribI2iv glad_glVertexAttribI2iv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI3IVPROC)(GLuint index, const GLint *v);
-GLAPI PFNGLVERTEXATTRIBI3IVPROC glad_glVertexAttribI3iv;
-#define glVertexAttribI3iv glad_glVertexAttribI3iv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4IVPROC)(GLuint index, const GLint *v);
-GLAPI PFNGLVERTEXATTRIBI4IVPROC glad_glVertexAttribI4iv;
-#define glVertexAttribI4iv glad_glVertexAttribI4iv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI1UIVPROC)(GLuint index, const GLuint *v);
-GLAPI PFNGLVERTEXATTRIBI1UIVPROC glad_glVertexAttribI1uiv;
-#define glVertexAttribI1uiv glad_glVertexAttribI1uiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI2UIVPROC)(GLuint index, const GLuint *v);
-GLAPI PFNGLVERTEXATTRIBI2UIVPROC glad_glVertexAttribI2uiv;
-#define glVertexAttribI2uiv glad_glVertexAttribI2uiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI3UIVPROC)(GLuint index, const GLuint *v);
-GLAPI PFNGLVERTEXATTRIBI3UIVPROC glad_glVertexAttribI3uiv;
-#define glVertexAttribI3uiv glad_glVertexAttribI3uiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4UIVPROC)(GLuint index, const GLuint *v);
-GLAPI PFNGLVERTEXATTRIBI4UIVPROC glad_glVertexAttribI4uiv;
-#define glVertexAttribI4uiv glad_glVertexAttribI4uiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4BVPROC)(GLuint index, const GLbyte *v);
-GLAPI PFNGLVERTEXATTRIBI4BVPROC glad_glVertexAttribI4bv;
-#define glVertexAttribI4bv glad_glVertexAttribI4bv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4SVPROC)(GLuint index, const GLshort *v);
-GLAPI PFNGLVERTEXATTRIBI4SVPROC glad_glVertexAttribI4sv;
-#define glVertexAttribI4sv glad_glVertexAttribI4sv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4UBVPROC)(GLuint index, const GLubyte *v);
-GLAPI PFNGLVERTEXATTRIBI4UBVPROC glad_glVertexAttribI4ubv;
-#define glVertexAttribI4ubv glad_glVertexAttribI4ubv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBI4USVPROC)(GLuint index, const GLushort *v);
-GLAPI PFNGLVERTEXATTRIBI4USVPROC glad_glVertexAttribI4usv;
-#define glVertexAttribI4usv glad_glVertexAttribI4usv
-typedef void (APIENTRYP PFNGLGETUNIFORMUIVPROC)(GLuint program, GLint location, GLuint *params);
-GLAPI PFNGLGETUNIFORMUIVPROC glad_glGetUniformuiv;
-#define glGetUniformuiv glad_glGetUniformuiv
-typedef void (APIENTRYP PFNGLBINDFRAGDATALOCATIONPROC)(GLuint program, GLuint color, const GLchar *name);
-GLAPI PFNGLBINDFRAGDATALOCATIONPROC glad_glBindFragDataLocation;
-#define glBindFragDataLocation glad_glBindFragDataLocation
-typedef GLint (APIENTRYP PFNGLGETFRAGDATALOCATIONPROC)(GLuint program, const GLchar *name);
-GLAPI PFNGLGETFRAGDATALOCATIONPROC glad_glGetFragDataLocation;
-#define glGetFragDataLocation glad_glGetFragDataLocation
-typedef void (APIENTRYP PFNGLUNIFORM1UIPROC)(GLint location, GLuint v0);
-GLAPI PFNGLUNIFORM1UIPROC glad_glUniform1ui;
-#define glUniform1ui glad_glUniform1ui
-typedef void (APIENTRYP PFNGLUNIFORM2UIPROC)(GLint location, GLuint v0, GLuint v1);
-GLAPI PFNGLUNIFORM2UIPROC glad_glUniform2ui;
-#define glUniform2ui glad_glUniform2ui
-typedef void (APIENTRYP PFNGLUNIFORM3UIPROC)(GLint location, GLuint v0, GLuint v1, GLuint v2);
-GLAPI PFNGLUNIFORM3UIPROC glad_glUniform3ui;
-#define glUniform3ui glad_glUniform3ui
-typedef void (APIENTRYP PFNGLUNIFORM4UIPROC)(GLint location, GLuint v0, GLuint v1, GLuint v2, GLuint v3);
-GLAPI PFNGLUNIFORM4UIPROC glad_glUniform4ui;
-#define glUniform4ui glad_glUniform4ui
-typedef void (APIENTRYP PFNGLUNIFORM1UIVPROC)(GLint location, GLsizei count, const GLuint *value);
-GLAPI PFNGLUNIFORM1UIVPROC glad_glUniform1uiv;
-#define glUniform1uiv glad_glUniform1uiv
-typedef void (APIENTRYP PFNGLUNIFORM2UIVPROC)(GLint location, GLsizei count, const GLuint *value);
-GLAPI PFNGLUNIFORM2UIVPROC glad_glUniform2uiv;
-#define glUniform2uiv glad_glUniform2uiv
-typedef void (APIENTRYP PFNGLUNIFORM3UIVPROC)(GLint location, GLsizei count, const GLuint *value);
-GLAPI PFNGLUNIFORM3UIVPROC glad_glUniform3uiv;
-#define glUniform3uiv glad_glUniform3uiv
-typedef void (APIENTRYP PFNGLUNIFORM4UIVPROC)(GLint location, GLsizei count, const GLuint *value);
-GLAPI PFNGLUNIFORM4UIVPROC glad_glUniform4uiv;
-#define glUniform4uiv glad_glUniform4uiv
-typedef void (APIENTRYP PFNGLTEXPARAMETERIIVPROC)(GLenum target, GLenum pname, const GLint *params);
-GLAPI PFNGLTEXPARAMETERIIVPROC glad_glTexParameterIiv;
-#define glTexParameterIiv glad_glTexParameterIiv
-typedef void (APIENTRYP PFNGLTEXPARAMETERIUIVPROC)(GLenum target, GLenum pname, const GLuint *params);
-GLAPI PFNGLTEXPARAMETERIUIVPROC glad_glTexParameterIuiv;
-#define glTexParameterIuiv glad_glTexParameterIuiv
-typedef void (APIENTRYP PFNGLGETTEXPARAMETERIIVPROC)(GLenum target, GLenum pname, GLint *params);
-GLAPI PFNGLGETTEXPARAMETERIIVPROC glad_glGetTexParameterIiv;
-#define glGetTexParameterIiv glad_glGetTexParameterIiv
-typedef void (APIENTRYP PFNGLGETTEXPARAMETERIUIVPROC)(GLenum target, GLenum pname, GLuint *params);
-GLAPI PFNGLGETTEXPARAMETERIUIVPROC glad_glGetTexParameterIuiv;
-#define glGetTexParameterIuiv glad_glGetTexParameterIuiv
-typedef void (APIENTRYP PFNGLCLEARBUFFERIVPROC)(GLenum buffer, GLint drawbuffer, const GLint *value);
-GLAPI PFNGLCLEARBUFFERIVPROC glad_glClearBufferiv;
-#define glClearBufferiv glad_glClearBufferiv
-typedef void (APIENTRYP PFNGLCLEARBUFFERUIVPROC)(GLenum buffer, GLint drawbuffer, const GLuint *value);
-GLAPI PFNGLCLEARBUFFERUIVPROC glad_glClearBufferuiv;
-#define glClearBufferuiv glad_glClearBufferuiv
-typedef void (APIENTRYP PFNGLCLEARBUFFERFVPROC)(GLenum buffer, GLint drawbuffer, const GLfloat *value);
-GLAPI PFNGLCLEARBUFFERFVPROC glad_glClearBufferfv;
-#define glClearBufferfv glad_glClearBufferfv
-typedef void (APIENTRYP PFNGLCLEARBUFFERFIPROC)(GLenum buffer, GLint drawbuffer, GLfloat depth, GLint stencil);
-GLAPI PFNGLCLEARBUFFERFIPROC glad_glClearBufferfi;
-#define glClearBufferfi glad_glClearBufferfi
-typedef const GLubyte * (APIENTRYP PFNGLGETSTRINGIPROC)(GLenum name, GLuint index);
-GLAPI PFNGLGETSTRINGIPROC glad_glGetStringi;
-#define glGetStringi glad_glGetStringi
-typedef GLboolean (APIENTRYP PFNGLISRENDERBUFFERPROC)(GLuint renderbuffer);
-GLAPI PFNGLISRENDERBUFFERPROC glad_glIsRenderbuffer;
-#define glIsRenderbuffer glad_glIsRenderbuffer
-typedef void (APIENTRYP PFNGLBINDRENDERBUFFERPROC)(GLenum target, GLuint renderbuffer);
-GLAPI PFNGLBINDRENDERBUFFERPROC glad_glBindRenderbuffer;
-#define glBindRenderbuffer glad_glBindRenderbuffer
-typedef void (APIENTRYP PFNGLDELETERENDERBUFFERSPROC)(GLsizei n, const GLuint *renderbuffers);
-GLAPI PFNGLDELETERENDERBUFFERSPROC glad_glDeleteRenderbuffers;
-#define glDeleteRenderbuffers glad_glDeleteRenderbuffers
-typedef void (APIENTRYP PFNGLGENRENDERBUFFERSPROC)(GLsizei n, GLuint *renderbuffers);
-GLAPI PFNGLGENRENDERBUFFERSPROC glad_glGenRenderbuffers;
-#define glGenRenderbuffers glad_glGenRenderbuffers
-typedef void (APIENTRYP PFNGLRENDERBUFFERSTORAGEPROC)(GLenum target, GLenum internalformat, GLsizei width, GLsizei height);
-GLAPI PFNGLRENDERBUFFERSTORAGEPROC glad_glRenderbufferStorage;
-#define glRenderbufferStorage glad_glRenderbufferStorage
-typedef void (APIENTRYP PFNGLGETRENDERBUFFERPARAMETERIVPROC)(GLenum target, GLenum pname, GLint *params);
-GLAPI PFNGLGETRENDERBUFFERPARAMETERIVPROC glad_glGetRenderbufferParameteriv;
-#define glGetRenderbufferParameteriv glad_glGetRenderbufferParameteriv
-typedef GLboolean (APIENTRYP PFNGLISFRAMEBUFFERPROC)(GLuint framebuffer);
-GLAPI PFNGLISFRAMEBUFFERPROC glad_glIsFramebuffer;
-#define glIsFramebuffer glad_glIsFramebuffer
-typedef void (APIENTRYP PFNGLBINDFRAMEBUFFERPROC)(GLenum target, GLuint framebuffer);
-GLAPI PFNGLBINDFRAMEBUFFERPROC glad_glBindFramebuffer;
-#define glBindFramebuffer glad_glBindFramebuffer
-typedef void (APIENTRYP PFNGLDELETEFRAMEBUFFERSPROC)(GLsizei n, const GLuint *framebuffers);
-GLAPI PFNGLDELETEFRAMEBUFFERSPROC glad_glDeleteFramebuffers;
-#define glDeleteFramebuffers glad_glDeleteFramebuffers
-typedef void (APIENTRYP PFNGLGENFRAMEBUFFERSPROC)(GLsizei n, GLuint *framebuffers);
-GLAPI PFNGLGENFRAMEBUFFERSPROC glad_glGenFramebuffers;
-#define glGenFramebuffers glad_glGenFramebuffers
-typedef GLenum (APIENTRYP PFNGLCHECKFRAMEBUFFERSTATUSPROC)(GLenum target);
-GLAPI PFNGLCHECKFRAMEBUFFERSTATUSPROC glad_glCheckFramebufferStatus;
-#define glCheckFramebufferStatus glad_glCheckFramebufferStatus
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURE1DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
-GLAPI PFNGLFRAMEBUFFERTEXTURE1DPROC glad_glFramebufferTexture1D;
-#define glFramebufferTexture1D glad_glFramebufferTexture1D
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURE2DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
-GLAPI PFNGLFRAMEBUFFERTEXTURE2DPROC glad_glFramebufferTexture2D;
-#define glFramebufferTexture2D glad_glFramebufferTexture2D
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURE3DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset);
-GLAPI PFNGLFRAMEBUFFERTEXTURE3DPROC glad_glFramebufferTexture3D;
-#define glFramebufferTexture3D glad_glFramebufferTexture3D
-typedef void (APIENTRYP PFNGLFRAMEBUFFERRENDERBUFFERPROC)(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
-GLAPI PFNGLFRAMEBUFFERRENDERBUFFERPROC glad_glFramebufferRenderbuffer;
-#define glFramebufferRenderbuffer glad_glFramebufferRenderbuffer
-typedef void (APIENTRYP PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC)(GLenum target, GLenum attachment, GLenum pname, GLint *params);
-GLAPI PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVPROC glad_glGetFramebufferAttachmentParameteriv;
-#define glGetFramebufferAttachmentParameteriv glad_glGetFramebufferAttachmentParameteriv
-typedef void (APIENTRYP PFNGLGENERATEMIPMAPPROC)(GLenum target);
-GLAPI PFNGLGENERATEMIPMAPPROC glad_glGenerateMipmap;
-#define glGenerateMipmap glad_glGenerateMipmap
-typedef void (APIENTRYP PFNGLBLITFRAMEBUFFERPROC)(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
-GLAPI PFNGLBLITFRAMEBUFFERPROC glad_glBlitFramebuffer;
-#define glBlitFramebuffer glad_glBlitFramebuffer
-typedef void (APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
-GLAPI PFNGLRENDERBUFFERSTORAGEMULTISAMPLEPROC glad_glRenderbufferStorageMultisample;
-#define glRenderbufferStorageMultisample glad_glRenderbufferStorageMultisample
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURELAYERPROC)(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint layer);
-GLAPI PFNGLFRAMEBUFFERTEXTURELAYERPROC glad_glFramebufferTextureLayer;
-#define glFramebufferTextureLayer glad_glFramebufferTextureLayer
-typedef void * (APIENTRYP PFNGLMAPBUFFERRANGEPROC)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
-GLAPI PFNGLMAPBUFFERRANGEPROC glad_glMapBufferRange;
-#define glMapBufferRange glad_glMapBufferRange
-typedef void (APIENTRYP PFNGLFLUSHMAPPEDBUFFERRANGEPROC)(GLenum target, GLintptr offset, GLsizeiptr length);
-GLAPI PFNGLFLUSHMAPPEDBUFFERRANGEPROC glad_glFlushMappedBufferRange;
-#define glFlushMappedBufferRange glad_glFlushMappedBufferRange
-typedef void (APIENTRYP PFNGLBINDVERTEXARRAYPROC)(GLuint array);
-GLAPI PFNGLBINDVERTEXARRAYPROC glad_glBindVertexArray;
-#define glBindVertexArray glad_glBindVertexArray
-typedef void (APIENTRYP PFNGLDELETEVERTEXARRAYSPROC)(GLsizei n, const GLuint *arrays);
-GLAPI PFNGLDELETEVERTEXARRAYSPROC glad_glDeleteVertexArrays;
-#define glDeleteVertexArrays glad_glDeleteVertexArrays
-typedef void (APIENTRYP PFNGLGENVERTEXARRAYSPROC)(GLsizei n, GLuint *arrays);
-GLAPI PFNGLGENVERTEXARRAYSPROC glad_glGenVertexArrays;
-#define glGenVertexArrays glad_glGenVertexArrays
-typedef GLboolean (APIENTRYP PFNGLISVERTEXARRAYPROC)(GLuint array);
-GLAPI PFNGLISVERTEXARRAYPROC glad_glIsVertexArray;
-#define glIsVertexArray glad_glIsVertexArray
-#endif
-#ifndef GL_VERSION_3_1
-#define GL_VERSION_3_1 1
-GLAPI int GLAD_GL_VERSION_3_1;
-typedef void (APIENTRYP PFNGLDRAWARRAYSINSTANCEDPROC)(GLenum mode, GLint first, GLsizei count, GLsizei instancecount);
-GLAPI PFNGLDRAWARRAYSINSTANCEDPROC glad_glDrawArraysInstanced;
-#define glDrawArraysInstanced glad_glDrawArraysInstanced
-typedef void (APIENTRYP PFNGLDRAWELEMENTSINSTANCEDPROC)(GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei instancecount);
-GLAPI PFNGLDRAWELEMENTSINSTANCEDPROC glad_glDrawElementsInstanced;
-#define glDrawElementsInstanced glad_glDrawElementsInstanced
-typedef void (APIENTRYP PFNGLTEXBUFFERPROC)(GLenum target, GLenum internalformat, GLuint buffer);
-GLAPI PFNGLTEXBUFFERPROC glad_glTexBuffer;
-#define glTexBuffer glad_glTexBuffer
-typedef void (APIENTRYP PFNGLPRIMITIVERESTARTINDEXPROC)(GLuint index);
-GLAPI PFNGLPRIMITIVERESTARTINDEXPROC glad_glPrimitiveRestartIndex;
-#define glPrimitiveRestartIndex glad_glPrimitiveRestartIndex
-typedef void (APIENTRYP PFNGLCOPYBUFFERSUBDATAPROC)(GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
-GLAPI PFNGLCOPYBUFFERSUBDATAPROC glad_glCopyBufferSubData;
-#define glCopyBufferSubData glad_glCopyBufferSubData
-typedef void (APIENTRYP PFNGLGETUNIFORMINDICESPROC)(GLuint program, GLsizei uniformCount, const GLchar *const*uniformNames, GLuint *uniformIndices);
-GLAPI PFNGLGETUNIFORMINDICESPROC glad_glGetUniformIndices;
-#define glGetUniformIndices glad_glGetUniformIndices
-typedef void (APIENTRYP PFNGLGETACTIVEUNIFORMSIVPROC)(GLuint program, GLsizei uniformCount, const GLuint *uniformIndices, GLenum pname, GLint *params);
-GLAPI PFNGLGETACTIVEUNIFORMSIVPROC glad_glGetActiveUniformsiv;
-#define glGetActiveUniformsiv glad_glGetActiveUniformsiv
-typedef void (APIENTRYP PFNGLGETACTIVEUNIFORMNAMEPROC)(GLuint program, GLuint uniformIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformName);
-GLAPI PFNGLGETACTIVEUNIFORMNAMEPROC glad_glGetActiveUniformName;
-#define glGetActiveUniformName glad_glGetActiveUniformName
-typedef GLuint (APIENTRYP PFNGLGETUNIFORMBLOCKINDEXPROC)(GLuint program, const GLchar *uniformBlockName);
-GLAPI PFNGLGETUNIFORMBLOCKINDEXPROC glad_glGetUniformBlockIndex;
-#define glGetUniformBlockIndex glad_glGetUniformBlockIndex
-typedef void (APIENTRYP PFNGLGETACTIVEUNIFORMBLOCKIVPROC)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params);
-GLAPI PFNGLGETACTIVEUNIFORMBLOCKIVPROC glad_glGetActiveUniformBlockiv;
-#define glGetActiveUniformBlockiv glad_glGetActiveUniformBlockiv
-typedef void (APIENTRYP PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC)(GLuint program, GLuint uniformBlockIndex, GLsizei bufSize, GLsizei *length, GLchar *uniformBlockName);
-GLAPI PFNGLGETACTIVEUNIFORMBLOCKNAMEPROC glad_glGetActiveUniformBlockName;
-#define glGetActiveUniformBlockName glad_glGetActiveUniformBlockName
-typedef void (APIENTRYP PFNGLUNIFORMBLOCKBINDINGPROC)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
-GLAPI PFNGLUNIFORMBLOCKBINDINGPROC glad_glUniformBlockBinding;
-#define glUniformBlockBinding glad_glUniformBlockBinding
-#endif
-#ifndef GL_VERSION_3_2
-#define GL_VERSION_3_2 1
-GLAPI int GLAD_GL_VERSION_3_2;
-typedef void (APIENTRYP PFNGLDRAWELEMENTSBASEVERTEXPROC)(GLenum mode, GLsizei count, GLenum type, const void *indices, GLint basevertex);
-GLAPI PFNGLDRAWELEMENTSBASEVERTEXPROC glad_glDrawElementsBaseVertex;
-#define glDrawElementsBaseVertex glad_glDrawElementsBaseVertex
-typedef void (APIENTRYP PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC)(GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const void *indices, GLint basevertex);
-GLAPI PFNGLDRAWRANGEELEMENTSBASEVERTEXPROC glad_glDrawRangeElementsBaseVertex;
-#define glDrawRangeElementsBaseVertex glad_glDrawRangeElementsBaseVertex
-typedef void (APIENTRYP PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC)(GLenum mode, GLsizei count, GLenum type, const void *indices, GLsizei instancecount, GLint basevertex);
-GLAPI PFNGLDRAWELEMENTSINSTANCEDBASEVERTEXPROC glad_glDrawElementsInstancedBaseVertex;
-#define glDrawElementsInstancedBaseVertex glad_glDrawElementsInstancedBaseVertex
-typedef void (APIENTRYP PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC)(GLenum mode, const GLsizei *count, GLenum type, const void *const*indices, GLsizei drawcount, const GLint *basevertex);
-GLAPI PFNGLMULTIDRAWELEMENTSBASEVERTEXPROC glad_glMultiDrawElementsBaseVertex;
-#define glMultiDrawElementsBaseVertex glad_glMultiDrawElementsBaseVertex
-typedef void (APIENTRYP PFNGLPROVOKINGVERTEXPROC)(GLenum mode);
-GLAPI PFNGLPROVOKINGVERTEXPROC glad_glProvokingVertex;
-#define glProvokingVertex glad_glProvokingVertex
-typedef GLsync (APIENTRYP PFNGLFENCESYNCPROC)(GLenum condition, GLbitfield flags);
-GLAPI PFNGLFENCESYNCPROC glad_glFenceSync;
-#define glFenceSync glad_glFenceSync
-typedef GLboolean (APIENTRYP PFNGLISSYNCPROC)(GLsync sync);
-GLAPI PFNGLISSYNCPROC glad_glIsSync;
-#define glIsSync glad_glIsSync
-typedef void (APIENTRYP PFNGLDELETESYNCPROC)(GLsync sync);
-GLAPI PFNGLDELETESYNCPROC glad_glDeleteSync;
-#define glDeleteSync glad_glDeleteSync
-typedef GLenum (APIENTRYP PFNGLCLIENTWAITSYNCPROC)(GLsync sync, GLbitfield flags, GLuint64 timeout);
-GLAPI PFNGLCLIENTWAITSYNCPROC glad_glClientWaitSync;
-#define glClientWaitSync glad_glClientWaitSync
-typedef void (APIENTRYP PFNGLWAITSYNCPROC)(GLsync sync, GLbitfield flags, GLuint64 timeout);
-GLAPI PFNGLWAITSYNCPROC glad_glWaitSync;
-#define glWaitSync glad_glWaitSync
-typedef void (APIENTRYP PFNGLGETINTEGER64VPROC)(GLenum pname, GLint64 *data);
-GLAPI PFNGLGETINTEGER64VPROC glad_glGetInteger64v;
-#define glGetInteger64v glad_glGetInteger64v
-typedef void (APIENTRYP PFNGLGETSYNCIVPROC)(GLsync sync, GLenum pname, GLsizei count, GLsizei *length, GLint *values);
-GLAPI PFNGLGETSYNCIVPROC glad_glGetSynciv;
-#define glGetSynciv glad_glGetSynciv
-typedef void (APIENTRYP PFNGLGETINTEGER64I_VPROC)(GLenum target, GLuint index, GLint64 *data);
-GLAPI PFNGLGETINTEGER64I_VPROC glad_glGetInteger64i_v;
-#define glGetInteger64i_v glad_glGetInteger64i_v
-typedef void (APIENTRYP PFNGLGETBUFFERPARAMETERI64VPROC)(GLenum target, GLenum pname, GLint64 *params);
-GLAPI PFNGLGETBUFFERPARAMETERI64VPROC glad_glGetBufferParameteri64v;
-#define glGetBufferParameteri64v glad_glGetBufferParameteri64v
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTUREPROC)(GLenum target, GLenum attachment, GLuint texture, GLint level);
-GLAPI PFNGLFRAMEBUFFERTEXTUREPROC glad_glFramebufferTexture;
-#define glFramebufferTexture glad_glFramebufferTexture
-typedef void (APIENTRYP PFNGLTEXIMAGE2DMULTISAMPLEPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLboolean fixedsamplelocations);
-GLAPI PFNGLTEXIMAGE2DMULTISAMPLEPROC glad_glTexImage2DMultisample;
-#define glTexImage2DMultisample glad_glTexImage2DMultisample
-typedef void (APIENTRYP PFNGLTEXIMAGE3DMULTISAMPLEPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLsizei depth, GLboolean fixedsamplelocations);
-GLAPI PFNGLTEXIMAGE3DMULTISAMPLEPROC glad_glTexImage3DMultisample;
-#define glTexImage3DMultisample glad_glTexImage3DMultisample
-typedef void (APIENTRYP PFNGLGETMULTISAMPLEFVPROC)(GLenum pname, GLuint index, GLfloat *val);
-GLAPI PFNGLGETMULTISAMPLEFVPROC glad_glGetMultisamplefv;
-#define glGetMultisamplefv glad_glGetMultisamplefv
-typedef void (APIENTRYP PFNGLSAMPLEMASKIPROC)(GLuint maskNumber, GLbitfield mask);
-GLAPI PFNGLSAMPLEMASKIPROC glad_glSampleMaski;
-#define glSampleMaski glad_glSampleMaski
-#endif
-#ifndef GL_VERSION_3_3
-#define GL_VERSION_3_3 1
-GLAPI int GLAD_GL_VERSION_3_3;
-typedef void (APIENTRYP PFNGLBINDFRAGDATALOCATIONINDEXEDPROC)(GLuint program, GLuint colorNumber, GLuint index, const GLchar *name);
-GLAPI PFNGLBINDFRAGDATALOCATIONINDEXEDPROC glad_glBindFragDataLocationIndexed;
-#define glBindFragDataLocationIndexed glad_glBindFragDataLocationIndexed
-typedef GLint (APIENTRYP PFNGLGETFRAGDATAINDEXPROC)(GLuint program, const GLchar *name);
-GLAPI PFNGLGETFRAGDATAINDEXPROC glad_glGetFragDataIndex;
-#define glGetFragDataIndex glad_glGetFragDataIndex
-typedef void (APIENTRYP PFNGLGENSAMPLERSPROC)(GLsizei count, GLuint *samplers);
-GLAPI PFNGLGENSAMPLERSPROC glad_glGenSamplers;
-#define glGenSamplers glad_glGenSamplers
-typedef void (APIENTRYP PFNGLDELETESAMPLERSPROC)(GLsizei count, const GLuint *samplers);
-GLAPI PFNGLDELETESAMPLERSPROC glad_glDeleteSamplers;
-#define glDeleteSamplers glad_glDeleteSamplers
-typedef GLboolean (APIENTRYP PFNGLISSAMPLERPROC)(GLuint sampler);
-GLAPI PFNGLISSAMPLERPROC glad_glIsSampler;
-#define glIsSampler glad_glIsSampler
-typedef void (APIENTRYP PFNGLBINDSAMPLERPROC)(GLuint unit, GLuint sampler);
-GLAPI PFNGLBINDSAMPLERPROC glad_glBindSampler;
-#define glBindSampler glad_glBindSampler
-typedef void (APIENTRYP PFNGLSAMPLERPARAMETERIPROC)(GLuint sampler, GLenum pname, GLint param);
-GLAPI PFNGLSAMPLERPARAMETERIPROC glad_glSamplerParameteri;
-#define glSamplerParameteri glad_glSamplerParameteri
-typedef void (APIENTRYP PFNGLSAMPLERPARAMETERIVPROC)(GLuint sampler, GLenum pname, const GLint *param);
-GLAPI PFNGLSAMPLERPARAMETERIVPROC glad_glSamplerParameteriv;
-#define glSamplerParameteriv glad_glSamplerParameteriv
-typedef void (APIENTRYP PFNGLSAMPLERPARAMETERFPROC)(GLuint sampler, GLenum pname, GLfloat param);
-GLAPI PFNGLSAMPLERPARAMETERFPROC glad_glSamplerParameterf;
-#define glSamplerParameterf glad_glSamplerParameterf
-typedef void (APIENTRYP PFNGLSAMPLERPARAMETERFVPROC)(GLuint sampler, GLenum pname, const GLfloat *param);
-GLAPI PFNGLSAMPLERPARAMETERFVPROC glad_glSamplerParameterfv;
-#define glSamplerParameterfv glad_glSamplerParameterfv
-typedef void (APIENTRYP PFNGLSAMPLERPARAMETERIIVPROC)(GLuint sampler, GLenum pname, const GLint *param);
-GLAPI PFNGLSAMPLERPARAMETERIIVPROC glad_glSamplerParameterIiv;
-#define glSamplerParameterIiv glad_glSamplerParameterIiv
-typedef void (APIENTRYP PFNGLSAMPLERPARAMETERIUIVPROC)(GLuint sampler, GLenum pname, const GLuint *param);
-GLAPI PFNGLSAMPLERPARAMETERIUIVPROC glad_glSamplerParameterIuiv;
-#define glSamplerParameterIuiv glad_glSamplerParameterIuiv
-typedef void (APIENTRYP PFNGLGETSAMPLERPARAMETERIVPROC)(GLuint sampler, GLenum pname, GLint *params);
-GLAPI PFNGLGETSAMPLERPARAMETERIVPROC glad_glGetSamplerParameteriv;
-#define glGetSamplerParameteriv glad_glGetSamplerParameteriv
-typedef void (APIENTRYP PFNGLGETSAMPLERPARAMETERIIVPROC)(GLuint sampler, GLenum pname, GLint *params);
-GLAPI PFNGLGETSAMPLERPARAMETERIIVPROC glad_glGetSamplerParameterIiv;
-#define glGetSamplerParameterIiv glad_glGetSamplerParameterIiv
-typedef void (APIENTRYP PFNGLGETSAMPLERPARAMETERFVPROC)(GLuint sampler, GLenum pname, GLfloat *params);
-GLAPI PFNGLGETSAMPLERPARAMETERFVPROC glad_glGetSamplerParameterfv;
-#define glGetSamplerParameterfv glad_glGetSamplerParameterfv
-typedef void (APIENTRYP PFNGLGETSAMPLERPARAMETERIUIVPROC)(GLuint sampler, GLenum pname, GLuint *params);
-GLAPI PFNGLGETSAMPLERPARAMETERIUIVPROC glad_glGetSamplerParameterIuiv;
-#define glGetSamplerParameterIuiv glad_glGetSamplerParameterIuiv
-typedef void (APIENTRYP PFNGLQUERYCOUNTERPROC)(GLuint id, GLenum target);
-GLAPI PFNGLQUERYCOUNTERPROC glad_glQueryCounter;
-#define glQueryCounter glad_glQueryCounter
-typedef void (APIENTRYP PFNGLGETQUERYOBJECTI64VPROC)(GLuint id, GLenum pname, GLint64 *params);
-GLAPI PFNGLGETQUERYOBJECTI64VPROC glad_glGetQueryObjecti64v;
-#define glGetQueryObjecti64v glad_glGetQueryObjecti64v
-typedef void (APIENTRYP PFNGLGETQUERYOBJECTUI64VPROC)(GLuint id, GLenum pname, GLuint64 *params);
-GLAPI PFNGLGETQUERYOBJECTUI64VPROC glad_glGetQueryObjectui64v;
-#define glGetQueryObjectui64v glad_glGetQueryObjectui64v
-typedef void (APIENTRYP PFNGLVERTEXATTRIBDIVISORPROC)(GLuint index, GLuint divisor);
-GLAPI PFNGLVERTEXATTRIBDIVISORPROC glad_glVertexAttribDivisor;
-#define glVertexAttribDivisor glad_glVertexAttribDivisor
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP1UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
-GLAPI PFNGLVERTEXATTRIBP1UIPROC glad_glVertexAttribP1ui;
-#define glVertexAttribP1ui glad_glVertexAttribP1ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP1UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint *value);
-GLAPI PFNGLVERTEXATTRIBP1UIVPROC glad_glVertexAttribP1uiv;
-#define glVertexAttribP1uiv glad_glVertexAttribP1uiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP2UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
-GLAPI PFNGLVERTEXATTRIBP2UIPROC glad_glVertexAttribP2ui;
-#define glVertexAttribP2ui glad_glVertexAttribP2ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP2UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint *value);
-GLAPI PFNGLVERTEXATTRIBP2UIVPROC glad_glVertexAttribP2uiv;
-#define glVertexAttribP2uiv glad_glVertexAttribP2uiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP3UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
-GLAPI PFNGLVERTEXATTRIBP3UIPROC glad_glVertexAttribP3ui;
-#define glVertexAttribP3ui glad_glVertexAttribP3ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP3UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint *value);
-GLAPI PFNGLVERTEXATTRIBP3UIVPROC glad_glVertexAttribP3uiv;
-#define glVertexAttribP3uiv glad_glVertexAttribP3uiv
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP4UIPROC)(GLuint index, GLenum type, GLboolean normalized, GLuint value);
-GLAPI PFNGLVERTEXATTRIBP4UIPROC glad_glVertexAttribP4ui;
-#define glVertexAttribP4ui glad_glVertexAttribP4ui
-typedef void (APIENTRYP PFNGLVERTEXATTRIBP4UIVPROC)(GLuint index, GLenum type, GLboolean normalized, const GLuint *value);
-GLAPI PFNGLVERTEXATTRIBP4UIVPROC glad_glVertexAttribP4uiv;
-#define glVertexAttribP4uiv glad_glVertexAttribP4uiv
-typedef void (APIENTRYP PFNGLVERTEXP2UIPROC)(GLenum type, GLuint value);
-GLAPI PFNGLVERTEXP2UIPROC glad_glVertexP2ui;
-#define glVertexP2ui glad_glVertexP2ui
-typedef void (APIENTRYP PFNGLVERTEXP2UIVPROC)(GLenum type, const GLuint *value);
-GLAPI PFNGLVERTEXP2UIVPROC glad_glVertexP2uiv;
-#define glVertexP2uiv glad_glVertexP2uiv
-typedef void (APIENTRYP PFNGLVERTEXP3UIPROC)(GLenum type, GLuint value);
-GLAPI PFNGLVERTEXP3UIPROC glad_glVertexP3ui;
-#define glVertexP3ui glad_glVertexP3ui
-typedef void (APIENTRYP PFNGLVERTEXP3UIVPROC)(GLenum type, const GLuint *value);
-GLAPI PFNGLVERTEXP3UIVPROC glad_glVertexP3uiv;
-#define glVertexP3uiv glad_glVertexP3uiv
-typedef void (APIENTRYP PFNGLVERTEXP4UIPROC)(GLenum type, GLuint value);
-GLAPI PFNGLVERTEXP4UIPROC glad_glVertexP4ui;
-#define glVertexP4ui glad_glVertexP4ui
-typedef void (APIENTRYP PFNGLVERTEXP4UIVPROC)(GLenum type, const GLuint *value);
-GLAPI PFNGLVERTEXP4UIVPROC glad_glVertexP4uiv;
-#define glVertexP4uiv glad_glVertexP4uiv
-typedef void (APIENTRYP PFNGLTEXCOORDP1UIPROC)(GLenum type, GLuint coords);
-GLAPI PFNGLTEXCOORDP1UIPROC glad_glTexCoordP1ui;
-#define glTexCoordP1ui glad_glTexCoordP1ui
-typedef void (APIENTRYP PFNGLTEXCOORDP1UIVPROC)(GLenum type, const GLuint *coords);
-GLAPI PFNGLTEXCOORDP1UIVPROC glad_glTexCoordP1uiv;
-#define glTexCoordP1uiv glad_glTexCoordP1uiv
-typedef void (APIENTRYP PFNGLTEXCOORDP2UIPROC)(GLenum type, GLuint coords);
-GLAPI PFNGLTEXCOORDP2UIPROC glad_glTexCoordP2ui;
-#define glTexCoordP2ui glad_glTexCoordP2ui
-typedef void (APIENTRYP PFNGLTEXCOORDP2UIVPROC)(GLenum type, const GLuint *coords);
-GLAPI PFNGLTEXCOORDP2UIVPROC glad_glTexCoordP2uiv;
-#define glTexCoordP2uiv glad_glTexCoordP2uiv
-typedef void (APIENTRYP PFNGLTEXCOORDP3UIPROC)(GLenum type, GLuint coords);
-GLAPI PFNGLTEXCOORDP3UIPROC glad_glTexCoordP3ui;
-#define glTexCoordP3ui glad_glTexCoordP3ui
-typedef void (APIENTRYP PFNGLTEXCOORDP3UIVPROC)(GLenum type, const GLuint *coords);
-GLAPI PFNGLTEXCOORDP3UIVPROC glad_glTexCoordP3uiv;
-#define glTexCoordP3uiv glad_glTexCoordP3uiv
-typedef void (APIENTRYP PFNGLTEXCOORDP4UIPROC)(GLenum type, GLuint coords);
-GLAPI PFNGLTEXCOORDP4UIPROC glad_glTexCoordP4ui;
-#define glTexCoordP4ui glad_glTexCoordP4ui
-typedef void (APIENTRYP PFNGLTEXCOORDP4UIVPROC)(GLenum type, const GLuint *coords);
-GLAPI PFNGLTEXCOORDP4UIVPROC glad_glTexCoordP4uiv;
-#define glTexCoordP4uiv glad_glTexCoordP4uiv
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP1UIPROC)(GLenum texture, GLenum type, GLuint coords);
-GLAPI PFNGLMULTITEXCOORDP1UIPROC glad_glMultiTexCoordP1ui;
-#define glMultiTexCoordP1ui glad_glMultiTexCoordP1ui
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP1UIVPROC)(GLenum texture, GLenum type, const GLuint *coords);
-GLAPI PFNGLMULTITEXCOORDP1UIVPROC glad_glMultiTexCoordP1uiv;
-#define glMultiTexCoordP1uiv glad_glMultiTexCoordP1uiv
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP2UIPROC)(GLenum texture, GLenum type, GLuint coords);
-GLAPI PFNGLMULTITEXCOORDP2UIPROC glad_glMultiTexCoordP2ui;
-#define glMultiTexCoordP2ui glad_glMultiTexCoordP2ui
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP2UIVPROC)(GLenum texture, GLenum type, const GLuint *coords);
-GLAPI PFNGLMULTITEXCOORDP2UIVPROC glad_glMultiTexCoordP2uiv;
-#define glMultiTexCoordP2uiv glad_glMultiTexCoordP2uiv
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP3UIPROC)(GLenum texture, GLenum type, GLuint coords);
-GLAPI PFNGLMULTITEXCOORDP3UIPROC glad_glMultiTexCoordP3ui;
-#define glMultiTexCoordP3ui glad_glMultiTexCoordP3ui
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP3UIVPROC)(GLenum texture, GLenum type, const GLuint *coords);
-GLAPI PFNGLMULTITEXCOORDP3UIVPROC glad_glMultiTexCoordP3uiv;
-#define glMultiTexCoordP3uiv glad_glMultiTexCoordP3uiv
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP4UIPROC)(GLenum texture, GLenum type, GLuint coords);
-GLAPI PFNGLMULTITEXCOORDP4UIPROC glad_glMultiTexCoordP4ui;
-#define glMultiTexCoordP4ui glad_glMultiTexCoordP4ui
-typedef void (APIENTRYP PFNGLMULTITEXCOORDP4UIVPROC)(GLenum texture, GLenum type, const GLuint *coords);
-GLAPI PFNGLMULTITEXCOORDP4UIVPROC glad_glMultiTexCoordP4uiv;
-#define glMultiTexCoordP4uiv glad_glMultiTexCoordP4uiv
-typedef void (APIENTRYP PFNGLNORMALP3UIPROC)(GLenum type, GLuint coords);
-GLAPI PFNGLNORMALP3UIPROC glad_glNormalP3ui;
-#define glNormalP3ui glad_glNormalP3ui
-typedef void (APIENTRYP PFNGLNORMALP3UIVPROC)(GLenum type, const GLuint *coords);
-GLAPI PFNGLNORMALP3UIVPROC glad_glNormalP3uiv;
-#define glNormalP3uiv glad_glNormalP3uiv
-typedef void (APIENTRYP PFNGLCOLORP3UIPROC)(GLenum type, GLuint color);
-GLAPI PFNGLCOLORP3UIPROC glad_glColorP3ui;
-#define glColorP3ui glad_glColorP3ui
-typedef void (APIENTRYP PFNGLCOLORP3UIVPROC)(GLenum type, const GLuint *color);
-GLAPI PFNGLCOLORP3UIVPROC glad_glColorP3uiv;
-#define glColorP3uiv glad_glColorP3uiv
-typedef void (APIENTRYP PFNGLCOLORP4UIPROC)(GLenum type, GLuint color);
-GLAPI PFNGLCOLORP4UIPROC glad_glColorP4ui;
-#define glColorP4ui glad_glColorP4ui
-typedef void (APIENTRYP PFNGLCOLORP4UIVPROC)(GLenum type, const GLuint *color);
-GLAPI PFNGLCOLORP4UIVPROC glad_glColorP4uiv;
-#define glColorP4uiv glad_glColorP4uiv
-typedef void (APIENTRYP PFNGLSECONDARYCOLORP3UIPROC)(GLenum type, GLuint color);
-GLAPI PFNGLSECONDARYCOLORP3UIPROC glad_glSecondaryColorP3ui;
-#define glSecondaryColorP3ui glad_glSecondaryColorP3ui
-typedef void (APIENTRYP PFNGLSECONDARYCOLORP3UIVPROC)(GLenum type, const GLuint *color);
-GLAPI PFNGLSECONDARYCOLORP3UIVPROC glad_glSecondaryColorP3uiv;
-#define glSecondaryColorP3uiv glad_glSecondaryColorP3uiv
-#endif
-#define GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB 0x8242
-#define GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH_ARB 0x8243
-#define GL_DEBUG_CALLBACK_FUNCTION_ARB 0x8244
-#define GL_DEBUG_CALLBACK_USER_PARAM_ARB 0x8245
-#define GL_DEBUG_SOURCE_API_ARB 0x8246
-#define GL_DEBUG_SOURCE_WINDOW_SYSTEM_ARB 0x8247
-#define GL_DEBUG_SOURCE_SHADER_COMPILER_ARB 0x8248
-#define GL_DEBUG_SOURCE_THIRD_PARTY_ARB 0x8249
-#define GL_DEBUG_SOURCE_APPLICATION_ARB 0x824A
-#define GL_DEBUG_SOURCE_OTHER_ARB 0x824B
-#define GL_DEBUG_TYPE_ERROR_ARB 0x824C
-#define GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_ARB 0x824D
-#define GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_ARB 0x824E
-#define GL_DEBUG_TYPE_PORTABILITY_ARB 0x824F
-#define GL_DEBUG_TYPE_PERFORMANCE_ARB 0x8250
-#define GL_DEBUG_TYPE_OTHER_ARB 0x8251
-#define GL_MAX_DEBUG_MESSAGE_LENGTH_ARB 0x9143
-#define GL_MAX_DEBUG_LOGGED_MESSAGES_ARB 0x9144
-#define GL_DEBUG_LOGGED_MESSAGES_ARB 0x9145
-#define GL_DEBUG_SEVERITY_HIGH_ARB 0x9146
-#define GL_DEBUG_SEVERITY_MEDIUM_ARB 0x9147
-#define GL_DEBUG_SEVERITY_LOW_ARB 0x9148
-#define GL_READ_FRAMEBUFFER_EXT 0x8CA8
-#define GL_DRAW_FRAMEBUFFER_EXT 0x8CA9
-#define GL_DRAW_FRAMEBUFFER_BINDING_EXT 0x8CA6
-#define GL_READ_FRAMEBUFFER_BINDING_EXT 0x8CAA
-#define GL_RENDERBUFFER_SAMPLES_EXT 0x8CAB
-#define GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE_EXT 0x8D56
-#define GL_MAX_SAMPLES_EXT 0x8D57
-#define GL_INVALID_FRAMEBUFFER_OPERATION_EXT 0x0506
-#define GL_MAX_RENDERBUFFER_SIZE_EXT 0x84E8
-#define GL_FRAMEBUFFER_BINDING_EXT 0x8CA6
-#define GL_RENDERBUFFER_BINDING_EXT 0x8CA7
-#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT 0x8CD0
-#define GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT 0x8CD1
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT 0x8CD2
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_CUBE_MAP_FACE_EXT 0x8CD3
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_EXT 0x8CD4
-#define GL_FRAMEBUFFER_COMPLETE_EXT 0x8CD5
-#define GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT_EXT 0x8CD6
-#define GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT 0x8CD7
-#define GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS_EXT 0x8CD9
-#define GL_FRAMEBUFFER_INCOMPLETE_FORMATS_EXT 0x8CDA
-#define GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER_EXT 0x8CDB
-#define GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER_EXT 0x8CDC
-#define GL_FRAMEBUFFER_UNSUPPORTED_EXT 0x8CDD
-#define GL_MAX_COLOR_ATTACHMENTS_EXT 0x8CDF
-#define GL_COLOR_ATTACHMENT0_EXT 0x8CE0
-#define GL_COLOR_ATTACHMENT1_EXT 0x8CE1
-#define GL_COLOR_ATTACHMENT2_EXT 0x8CE2
-#define GL_COLOR_ATTACHMENT3_EXT 0x8CE3
-#define GL_COLOR_ATTACHMENT4_EXT 0x8CE4
-#define GL_COLOR_ATTACHMENT5_EXT 0x8CE5
-#define GL_COLOR_ATTACHMENT6_EXT 0x8CE6
-#define GL_COLOR_ATTACHMENT7_EXT 0x8CE7
-#define GL_COLOR_ATTACHMENT8_EXT 0x8CE8
-#define GL_COLOR_ATTACHMENT9_EXT 0x8CE9
-#define GL_COLOR_ATTACHMENT10_EXT 0x8CEA
-#define GL_COLOR_ATTACHMENT11_EXT 0x8CEB
-#define GL_COLOR_ATTACHMENT12_EXT 0x8CEC
-#define GL_COLOR_ATTACHMENT13_EXT 0x8CED
-#define GL_COLOR_ATTACHMENT14_EXT 0x8CEE
-#define GL_COLOR_ATTACHMENT15_EXT 0x8CEF
-#define GL_DEPTH_ATTACHMENT_EXT 0x8D00
-#define GL_STENCIL_ATTACHMENT_EXT 0x8D20
-#define GL_FRAMEBUFFER_EXT 0x8D40
-#define GL_RENDERBUFFER_EXT 0x8D41
-#define GL_RENDERBUFFER_WIDTH_EXT 0x8D42
-#define GL_RENDERBUFFER_HEIGHT_EXT 0x8D43
-#define GL_RENDERBUFFER_INTERNAL_FORMAT_EXT 0x8D44
-#define GL_STENCIL_INDEX1_EXT 0x8D46
-#define GL_STENCIL_INDEX4_EXT 0x8D47
-#define GL_STENCIL_INDEX8_EXT 0x8D48
-#define GL_STENCIL_INDEX16_EXT 0x8D49
-#define GL_RENDERBUFFER_RED_SIZE_EXT 0x8D50
-#define GL_RENDERBUFFER_GREEN_SIZE_EXT 0x8D51
-#define GL_RENDERBUFFER_BLUE_SIZE_EXT 0x8D52
-#define GL_RENDERBUFFER_ALPHA_SIZE_EXT 0x8D53
-#define GL_RENDERBUFFER_DEPTH_SIZE_EXT 0x8D54
-#define GL_RENDERBUFFER_STENCIL_SIZE_EXT 0x8D55
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR 0x9630
-#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR 0x9632
-#define GL_MAX_VIEWS_OVR 0x9631
-#define GL_FRAMEBUFFER_INCOMPLETE_VIEW_TARGETS_OVR 0x9633
-#ifndef GL_ARB_debug_output
-#define GL_ARB_debug_output 1
-GLAPI int GLAD_GL_ARB_debug_output;
-typedef void (APIENTRYP PFNGLDEBUGMESSAGECONTROLARBPROC)(GLenum source, GLenum type, GLenum severity, GLsizei count, const GLuint *ids, GLboolean enabled);
-GLAPI PFNGLDEBUGMESSAGECONTROLARBPROC glad_glDebugMessageControlARB;
-#define glDebugMessageControlARB glad_glDebugMessageControlARB
-typedef void (APIENTRYP PFNGLDEBUGMESSAGEINSERTARBPROC)(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *buf);
-GLAPI PFNGLDEBUGMESSAGEINSERTARBPROC glad_glDebugMessageInsertARB;
-#define glDebugMessageInsertARB glad_glDebugMessageInsertARB
-typedef void (APIENTRYP PFNGLDEBUGMESSAGECALLBACKARBPROC)(GLDEBUGPROCARB callback, const void *userParam);
-GLAPI PFNGLDEBUGMESSAGECALLBACKARBPROC glad_glDebugMessageCallbackARB;
-#define glDebugMessageCallbackARB glad_glDebugMessageCallbackARB
-typedef GLuint (APIENTRYP PFNGLGETDEBUGMESSAGELOGARBPROC)(GLuint count, GLsizei bufSize, GLenum *sources, GLenum *types, GLuint *ids, GLenum *severities, GLsizei *lengths, GLchar *messageLog);
-GLAPI PFNGLGETDEBUGMESSAGELOGARBPROC glad_glGetDebugMessageLogARB;
-#define glGetDebugMessageLogARB glad_glGetDebugMessageLogARB
-#endif
-#ifndef GL_ARB_framebuffer_object
-#define GL_ARB_framebuffer_object 1
-GLAPI int GLAD_GL_ARB_framebuffer_object;
-#endif
-#ifndef GL_EXT_framebuffer_blit
-#define GL_EXT_framebuffer_blit 1
-GLAPI int GLAD_GL_EXT_framebuffer_blit;
-typedef void (APIENTRYP PFNGLBLITFRAMEBUFFEREXTPROC)(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
-GLAPI PFNGLBLITFRAMEBUFFEREXTPROC glad_glBlitFramebufferEXT;
-#define glBlitFramebufferEXT glad_glBlitFramebufferEXT
-#endif
-#ifndef GL_EXT_framebuffer_multisample
-#define GL_EXT_framebuffer_multisample 1
-GLAPI int GLAD_GL_EXT_framebuffer_multisample;
-typedef void (APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)(GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height);
-GLAPI PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC glad_glRenderbufferStorageMultisampleEXT;
-#define glRenderbufferStorageMultisampleEXT glad_glRenderbufferStorageMultisampleEXT
-#endif
-#ifndef GL_EXT_framebuffer_object
-#define GL_EXT_framebuffer_object 1
-GLAPI int GLAD_GL_EXT_framebuffer_object;
-typedef GLboolean (APIENTRYP PFNGLISRENDERBUFFEREXTPROC)(GLuint renderbuffer);
-GLAPI PFNGLISRENDERBUFFEREXTPROC glad_glIsRenderbufferEXT;
-#define glIsRenderbufferEXT glad_glIsRenderbufferEXT
-typedef void (APIENTRYP PFNGLBINDRENDERBUFFEREXTPROC)(GLenum target, GLuint renderbuffer);
-GLAPI PFNGLBINDRENDERBUFFEREXTPROC glad_glBindRenderbufferEXT;
-#define glBindRenderbufferEXT glad_glBindRenderbufferEXT
-typedef void (APIENTRYP PFNGLDELETERENDERBUFFERSEXTPROC)(GLsizei n, const GLuint *renderbuffers);
-GLAPI PFNGLDELETERENDERBUFFERSEXTPROC glad_glDeleteRenderbuffersEXT;
-#define glDeleteRenderbuffersEXT glad_glDeleteRenderbuffersEXT
-typedef void (APIENTRYP PFNGLGENRENDERBUFFERSEXTPROC)(GLsizei n, GLuint *renderbuffers);
-GLAPI PFNGLGENRENDERBUFFERSEXTPROC glad_glGenRenderbuffersEXT;
-#define glGenRenderbuffersEXT glad_glGenRenderbuffersEXT
-typedef void (APIENTRYP PFNGLRENDERBUFFERSTORAGEEXTPROC)(GLenum target, GLenum internalformat, GLsizei width, GLsizei height);
-GLAPI PFNGLRENDERBUFFERSTORAGEEXTPROC glad_glRenderbufferStorageEXT;
-#define glRenderbufferStorageEXT glad_glRenderbufferStorageEXT
-typedef void (APIENTRYP PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC)(GLenum target, GLenum pname, GLint *params);
-GLAPI PFNGLGETRENDERBUFFERPARAMETERIVEXTPROC glad_glGetRenderbufferParameterivEXT;
-#define glGetRenderbufferParameterivEXT glad_glGetRenderbufferParameterivEXT
-typedef GLboolean (APIENTRYP PFNGLISFRAMEBUFFEREXTPROC)(GLuint framebuffer);
-GLAPI PFNGLISFRAMEBUFFEREXTPROC glad_glIsFramebufferEXT;
-#define glIsFramebufferEXT glad_glIsFramebufferEXT
-typedef void (APIENTRYP PFNGLBINDFRAMEBUFFEREXTPROC)(GLenum target, GLuint framebuffer);
-GLAPI PFNGLBINDFRAMEBUFFEREXTPROC glad_glBindFramebufferEXT;
-#define glBindFramebufferEXT glad_glBindFramebufferEXT
-typedef void (APIENTRYP PFNGLDELETEFRAMEBUFFERSEXTPROC)(GLsizei n, const GLuint *framebuffers);
-GLAPI PFNGLDELETEFRAMEBUFFERSEXTPROC glad_glDeleteFramebuffersEXT;
-#define glDeleteFramebuffersEXT glad_glDeleteFramebuffersEXT
-typedef void (APIENTRYP PFNGLGENFRAMEBUFFERSEXTPROC)(GLsizei n, GLuint *framebuffers);
-GLAPI PFNGLGENFRAMEBUFFERSEXTPROC glad_glGenFramebuffersEXT;
-#define glGenFramebuffersEXT glad_glGenFramebuffersEXT
-typedef GLenum (APIENTRYP PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC)(GLenum target);
-GLAPI PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC glad_glCheckFramebufferStatusEXT;
-#define glCheckFramebufferStatusEXT glad_glCheckFramebufferStatusEXT
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURE1DEXTPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
-GLAPI PFNGLFRAMEBUFFERTEXTURE1DEXTPROC glad_glFramebufferTexture1DEXT;
-#define glFramebufferTexture1DEXT glad_glFramebufferTexture1DEXT
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURE2DEXTPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
-GLAPI PFNGLFRAMEBUFFERTEXTURE2DEXTPROC glad_glFramebufferTexture2DEXT;
-#define glFramebufferTexture2DEXT glad_glFramebufferTexture2DEXT
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTURE3DEXTPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level, GLint zoffset);
-GLAPI PFNGLFRAMEBUFFERTEXTURE3DEXTPROC glad_glFramebufferTexture3DEXT;
-#define glFramebufferTexture3DEXT glad_glFramebufferTexture3DEXT
-typedef void (APIENTRYP PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC)(GLenum target, GLenum attachment, GLenum renderbuffertarget, GLuint renderbuffer);
-GLAPI PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC glad_glFramebufferRenderbufferEXT;
-#define glFramebufferRenderbufferEXT glad_glFramebufferRenderbufferEXT
-typedef void (APIENTRYP PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC)(GLenum target, GLenum attachment, GLenum pname, GLint *params);
-GLAPI PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC glad_glGetFramebufferAttachmentParameterivEXT;
-#define glGetFramebufferAttachmentParameterivEXT glad_glGetFramebufferAttachmentParameterivEXT
-typedef void (APIENTRYP PFNGLGENERATEMIPMAPEXTPROC)(GLenum target);
-GLAPI PFNGLGENERATEMIPMAPEXTPROC glad_glGenerateMipmapEXT;
-#define glGenerateMipmapEXT glad_glGenerateMipmapEXT
-#endif
-#ifndef GL_OVR_multiview
-#define GL_OVR_multiview 1
-GLAPI int GLAD_GL_OVR_multiview;
-typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC)(GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
-GLAPI PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC glad_glFramebufferTextureMultiviewOVR;
-#define glFramebufferTextureMultiviewOVR glad_glFramebufferTextureMultiviewOVR
-#endif
-#ifndef GL_OVR_multiview2
-#define GL_OVR_multiview2 1
-GLAPI int GLAD_GL_OVR_multiview2;
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/thirdparty/glad/glad/glx.h b/thirdparty/glad/glad/glx.h
new file
index 0000000000..ac115fa63b
--- /dev/null
+++ b/thirdparty/glad/glad/glx.h
@@ -0,0 +1,605 @@
+/**
+ * Loader generated by glad 2.0.2 on Tue Nov 15 09:49:49 2022
+ *
+ * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0
+ *
+ * Generator: C/C++
+ * Specification: glx
+ * Extensions: 6
+ *
+ * APIs:
+ * - glx=1.4
+ *
+ * Options:
+ * - ALIAS = False
+ * - DEBUG = False
+ * - HEADER_ONLY = False
+ * - LOADER = True
+ * - MX = False
+ * - ON_DEMAND = False
+ *
+ * Commandline:
+ * --api='glx=1.4' --extensions='GLX_ARB_create_context,GLX_ARB_create_context_profile,GLX_ARB_get_proc_address,GLX_EXT_swap_control,GLX_MESA_swap_control,GLX_SGI_swap_control' c --loader
+ *
+ * Online:
+ * http://glad.sh/#api=glx%3D1.4&extensions=GLX_ARB_create_context%2CGLX_ARB_create_context_profile%2CGLX_ARB_get_proc_address%2CGLX_EXT_swap_control%2CGLX_MESA_swap_control%2CGLX_SGI_swap_control&generator=c&options=LOADER
+ *
+ */
+
+#ifndef GLAD_GLX_H_
+#define GLAD_GLX_H_
+
+#ifdef GLX_H
+ #error GLX header already included (API: glx), remove previous include!
+#endif
+#define GLX_H 1
+
+
+#include <X11/X.h>
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+
+#include <glad/gl.h>
+
+#define GLAD_GLX
+#define GLAD_OPTION_GLX_LOADER
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef GLAD_PLATFORM_H_
+#define GLAD_PLATFORM_H_
+
+#ifndef GLAD_PLATFORM_WIN32
+ #if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__MINGW32__)
+ #define GLAD_PLATFORM_WIN32 1
+ #else
+ #define GLAD_PLATFORM_WIN32 0
+ #endif
+#endif
+
+#ifndef GLAD_PLATFORM_APPLE
+ #ifdef __APPLE__
+ #define GLAD_PLATFORM_APPLE 1
+ #else
+ #define GLAD_PLATFORM_APPLE 0
+ #endif
+#endif
+
+#ifndef GLAD_PLATFORM_EMSCRIPTEN
+ #ifdef __EMSCRIPTEN__
+ #define GLAD_PLATFORM_EMSCRIPTEN 1
+ #else
+ #define GLAD_PLATFORM_EMSCRIPTEN 0
+ #endif
+#endif
+
+#ifndef GLAD_PLATFORM_UWP
+ #if defined(_MSC_VER) && !defined(GLAD_INTERNAL_HAVE_WINAPIFAMILY)
+ #ifdef __has_include
+ #if __has_include(<winapifamily.h>)
+ #define GLAD_INTERNAL_HAVE_WINAPIFAMILY 1
+ #endif
+ #elif _MSC_VER >= 1700 && !_USING_V110_SDK71_
+ #define GLAD_INTERNAL_HAVE_WINAPIFAMILY 1
+ #endif
+ #endif
+
+ #ifdef GLAD_INTERNAL_HAVE_WINAPIFAMILY
+ #include <winapifamily.h>
+ #if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+ #define GLAD_PLATFORM_UWP 1
+ #endif
+ #endif
+
+ #ifndef GLAD_PLATFORM_UWP
+ #define GLAD_PLATFORM_UWP 0
+ #endif
+#endif
+
+#ifdef __GNUC__
+ #define GLAD_GNUC_EXTENSION __extension__
+#else
+ #define GLAD_GNUC_EXTENSION
+#endif
+
+#define GLAD_UNUSED(x) (void)(x)
+
+#ifndef GLAD_API_CALL
+ #if defined(GLAD_API_CALL_EXPORT)
+ #if GLAD_PLATFORM_WIN32 || defined(__CYGWIN__)
+ #if defined(GLAD_API_CALL_EXPORT_BUILD)
+ #if defined(__GNUC__)
+ #define GLAD_API_CALL __attribute__ ((dllexport)) extern
+ #else
+ #define GLAD_API_CALL __declspec(dllexport) extern
+ #endif
+ #else
+ #if defined(__GNUC__)
+ #define GLAD_API_CALL __attribute__ ((dllimport)) extern
+ #else
+ #define GLAD_API_CALL __declspec(dllimport) extern
+ #endif
+ #endif
+ #elif defined(__GNUC__) && defined(GLAD_API_CALL_EXPORT_BUILD)
+ #define GLAD_API_CALL __attribute__ ((visibility ("default"))) extern
+ #else
+ #define GLAD_API_CALL extern
+ #endif
+ #else
+ #define GLAD_API_CALL extern
+ #endif
+#endif
+
+#ifdef APIENTRY
+ #define GLAD_API_PTR APIENTRY
+#elif GLAD_PLATFORM_WIN32
+ #define GLAD_API_PTR __stdcall
+#else
+ #define GLAD_API_PTR
+#endif
+
+#ifndef GLAPI
+#define GLAPI GLAD_API_CALL
+#endif
+
+#ifndef GLAPIENTRY
+#define GLAPIENTRY GLAD_API_PTR
+#endif
+
+#define GLAD_MAKE_VERSION(major, minor) (major * 10000 + minor)
+#define GLAD_VERSION_MAJOR(version) (version / 10000)
+#define GLAD_VERSION_MINOR(version) (version % 10000)
+
+#define GLAD_GENERATOR_VERSION "2.0.2"
+
+typedef void (*GLADapiproc)(void);
+
+typedef GLADapiproc (*GLADloadfunc)(const char *name);
+typedef GLADapiproc (*GLADuserptrloadfunc)(void *userptr, const char *name);
+
+typedef void (*GLADprecallback)(const char *name, GLADapiproc apiproc, int len_args, ...);
+typedef void (*GLADpostcallback)(void *ret, const char *name, GLADapiproc apiproc, int len_args, ...);
+
+#endif /* GLAD_PLATFORM_H_ */
+
+#define GLX_ACCUM_ALPHA_SIZE 17
+#define GLX_ACCUM_BLUE_SIZE 16
+#define GLX_ACCUM_BUFFER_BIT 0x00000080
+#define GLX_ACCUM_GREEN_SIZE 15
+#define GLX_ACCUM_RED_SIZE 14
+#define GLX_ALPHA_SIZE 11
+#define GLX_AUX_BUFFERS 7
+#define GLX_AUX_BUFFERS_BIT 0x00000010
+#define GLX_BACK_LEFT_BUFFER_BIT 0x00000004
+#define GLX_BACK_RIGHT_BUFFER_BIT 0x00000008
+#define GLX_BAD_ATTRIBUTE 2
+#define GLX_BAD_CONTEXT 5
+#define GLX_BAD_ENUM 7
+#define GLX_BAD_SCREEN 1
+#define GLX_BAD_VALUE 6
+#define GLX_BAD_VISUAL 4
+#define GLX_BLUE_SIZE 10
+#define GLX_BUFFER_SIZE 2
+#define GLX_BufferSwapComplete 1
+#define GLX_COLOR_INDEX_BIT 0x00000002
+#define GLX_COLOR_INDEX_TYPE 0x8015
+#define GLX_CONFIG_CAVEAT 0x20
+#define GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB 0x00000002
+#define GLX_CONTEXT_CORE_PROFILE_BIT_ARB 0x00000001
+#define GLX_CONTEXT_DEBUG_BIT_ARB 0x00000001
+#define GLX_CONTEXT_FLAGS_ARB 0x2094
+#define GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB 0x00000002
+#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091
+#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092
+#define GLX_CONTEXT_PROFILE_MASK_ARB 0x9126
+#define GLX_DAMAGED 0x8020
+#define GLX_DEPTH_BUFFER_BIT 0x00000020
+#define GLX_DEPTH_SIZE 12
+#define GLX_DIRECT_COLOR 0x8003
+#define GLX_DONT_CARE 0xFFFFFFFF
+#define GLX_DOUBLEBUFFER 5
+#define GLX_DRAWABLE_TYPE 0x8010
+#define GLX_EVENT_MASK 0x801F
+#define GLX_EXTENSIONS 0x3
+#define GLX_EXTENSION_NAME "GLX"
+#define GLX_FBCONFIG_ID 0x8013
+#define GLX_FRONT_LEFT_BUFFER_BIT 0x00000001
+#define GLX_FRONT_RIGHT_BUFFER_BIT 0x00000002
+#define GLX_GRAY_SCALE 0x8006
+#define GLX_GREEN_SIZE 9
+#define GLX_HEIGHT 0x801E
+#define GLX_LARGEST_PBUFFER 0x801C
+#define GLX_LEVEL 3
+#define GLX_MAX_PBUFFER_HEIGHT 0x8017
+#define GLX_MAX_PBUFFER_PIXELS 0x8018
+#define GLX_MAX_PBUFFER_WIDTH 0x8016
+#define GLX_MAX_SWAP_INTERVAL_EXT 0x20F2
+#define GLX_NONE 0x8000
+#define GLX_NON_CONFORMANT_CONFIG 0x800D
+#define GLX_NO_EXTENSION 3
+#define GLX_PBUFFER 0x8023
+#define GLX_PBUFFER_BIT 0x00000004
+#define GLX_PBUFFER_CLOBBER_MASK 0x08000000
+#define GLX_PBUFFER_HEIGHT 0x8040
+#define GLX_PBUFFER_WIDTH 0x8041
+#define GLX_PIXMAP_BIT 0x00000002
+#define GLX_PRESERVED_CONTENTS 0x801B
+#define GLX_PSEUDO_COLOR 0x8004
+#define GLX_PbufferClobber 0
+#define GLX_RED_SIZE 8
+#define GLX_RENDER_TYPE 0x8011
+#define GLX_RGBA 4
+#define GLX_RGBA_BIT 0x00000001
+#define GLX_RGBA_TYPE 0x8014
+#define GLX_SAMPLES 100001
+#define GLX_SAMPLE_BUFFERS 100000
+#define GLX_SAVED 0x8021
+#define GLX_SCREEN 0x800C
+#define GLX_SLOW_CONFIG 0x8001
+#define GLX_STATIC_COLOR 0x8005
+#define GLX_STATIC_GRAY 0x8007
+#define GLX_STENCIL_BUFFER_BIT 0x00000040
+#define GLX_STENCIL_SIZE 13
+#define GLX_STEREO 6
+#define GLX_SWAP_INTERVAL_EXT 0x20F1
+#define GLX_TRANSPARENT_ALPHA_VALUE 0x28
+#define GLX_TRANSPARENT_BLUE_VALUE 0x27
+#define GLX_TRANSPARENT_GREEN_VALUE 0x26
+#define GLX_TRANSPARENT_INDEX 0x8009
+#define GLX_TRANSPARENT_INDEX_VALUE 0x24
+#define GLX_TRANSPARENT_RED_VALUE 0x25
+#define GLX_TRANSPARENT_RGB 0x8008
+#define GLX_TRANSPARENT_TYPE 0x23
+#define GLX_TRUE_COLOR 0x8002
+#define GLX_USE_GL 1
+#define GLX_VENDOR 0x1
+#define GLX_VERSION 0x2
+#define GLX_VISUAL_ID 0x800B
+#define GLX_WIDTH 0x801D
+#define GLX_WINDOW 0x8022
+#define GLX_WINDOW_BIT 0x00000001
+#define GLX_X_RENDERABLE 0x8012
+#define GLX_X_VISUAL_TYPE 0x22
+#define __GLX_NUMBER_EVENTS 17
+
+
+#ifndef GLEXT_64_TYPES_DEFINED
+/* This code block is duplicated in glext.h, so must be protected */
+#define GLEXT_64_TYPES_DEFINED
+/* Define int32_t, int64_t, and uint64_t types for UST/MSC */
+/* (as used in the GLX_OML_sync_control extension). */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#include <inttypes.h>
+#elif defined(__sun__) || defined(__digital__)
+#include <inttypes.h>
+#if defined(__STDC__)
+#if defined(__arch64__) || defined(_LP64)
+typedef long int int64_t;
+typedef unsigned long int uint64_t;
+#else
+typedef long long int int64_t;
+typedef unsigned long long int uint64_t;
+#endif /* __arch64__ */
+#endif /* __STDC__ */
+#elif defined( __VMS ) || defined(__sgi)
+#include <inttypes.h>
+#elif defined(__SCO__) || defined(__USLC__)
+#include <stdint.h>
+#elif defined(__UNIXOS2__) || defined(__SOL64__)
+typedef long int int32_t;
+typedef long long int int64_t;
+typedef unsigned long long int uint64_t;
+#elif defined(_WIN32) && defined(__GNUC__)
+#include <stdint.h>
+#elif defined(_WIN32)
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#else
+/* Fallback if nothing above works */
+#include <inttypes.h>
+#endif
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+
+#else
+
+#endif
+
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1060)
+
+#else
+
+#endif
+
+
+
+
+
+
+
+typedef XID GLXFBConfigID;
+typedef struct __GLXFBConfigRec *GLXFBConfig;
+typedef XID GLXContextID;
+typedef struct __GLXcontextRec *GLXContext;
+typedef XID GLXPixmap;
+typedef XID GLXDrawable;
+typedef XID GLXWindow;
+typedef XID GLXPbuffer;
+typedef void (GLAD_API_PTR *__GLXextFuncPtr)(void);
+typedef XID GLXVideoCaptureDeviceNV;
+typedef unsigned int GLXVideoDeviceNV;
+typedef XID GLXVideoSourceSGIX;
+typedef XID GLXFBConfigIDSGIX;
+typedef struct __GLXFBConfigRec *GLXFBConfigSGIX;
+typedef XID GLXPbufferSGIX;
+typedef struct {
+ int event_type; /* GLX_DAMAGED or GLX_SAVED */
+ int draw_type; /* GLX_WINDOW or GLX_PBUFFER */
+ unsigned long serial; /* # of last request processed by server */
+ Bool send_event; /* true if this came for SendEvent request */
+ Display *display; /* display the event was read from */
+ GLXDrawable drawable; /* XID of Drawable */
+ unsigned int buffer_mask; /* mask indicating which buffers are affected */
+ unsigned int aux_buffer; /* which aux buffer was affected */
+ int x, y;
+ int width, height;
+ int count; /* if nonzero, at least this many more */
+} GLXPbufferClobberEvent;
+typedef struct {
+ int type;
+ unsigned long serial; /* # of last request processed by server */
+ Bool send_event; /* true if this came from a SendEvent request */
+ Display *display; /* Display the event was read from */
+ GLXDrawable drawable; /* drawable on which event was requested in event mask */
+ int event_type;
+ int64_t ust;
+ int64_t msc;
+ int64_t sbc;
+} GLXBufferSwapComplete;
+typedef union __GLXEvent {
+ GLXPbufferClobberEvent glxpbufferclobber;
+ GLXBufferSwapComplete glxbufferswapcomplete;
+ long pad[24];
+} GLXEvent;
+typedef struct {
+ int type;
+ unsigned long serial;
+ Bool send_event;
+ Display *display;
+ int extension;
+ int evtype;
+ GLXDrawable window;
+ Bool stereo_tree;
+} GLXStereoNotifyEventEXT;
+typedef struct {
+ int type;
+ unsigned long serial; /* # of last request processed by server */
+ Bool send_event; /* true if this came for SendEvent request */
+ Display *display; /* display the event was read from */
of Drawable */ + int event_type; /* GLX_DAMAGED_SGIX or GLX_SAVED_SGIX */ + int draw_type; /* GLX_WINDOW_SGIX or GLX_PBUFFER_SGIX */ + unsigned int mask; /* mask indicating which buffers are affected*/ + int x, y; + int width, height; + int count; /* if nonzero, at least this many more */ +} GLXBufferClobberEventSGIX; +typedef struct { + char pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */ + int networkId; +} GLXHyperpipeNetworkSGIX; +typedef struct { + char pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */ + int channel; + unsigned int participationType; + int timeSlice; +} GLXHyperpipeConfigSGIX; +typedef struct { + char pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */ + int srcXOrigin, srcYOrigin, srcWidth, srcHeight; + int destXOrigin, destYOrigin, destWidth, destHeight; +} GLXPipeRect; +typedef struct { + char pipeName[80]; /* Should be [GLX_HYPERPIPE_PIPE_NAME_LENGTH_SGIX] */ + int XOrigin, YOrigin, maxHeight, maxWidth; +} GLXPipeRectLimits; + + +#define GLX_VERSION_1_0 1 +GLAD_API_CALL int GLAD_GLX_VERSION_1_0; +#define GLX_VERSION_1_1 1 +GLAD_API_CALL int GLAD_GLX_VERSION_1_1; +#define GLX_VERSION_1_2 1 +GLAD_API_CALL int GLAD_GLX_VERSION_1_2; +#define GLX_VERSION_1_3 1 +GLAD_API_CALL int GLAD_GLX_VERSION_1_3; +#define GLX_VERSION_1_4 1 +GLAD_API_CALL int GLAD_GLX_VERSION_1_4; +#define GLX_ARB_create_context 1 +GLAD_API_CALL int GLAD_GLX_ARB_create_context; +#define GLX_ARB_create_context_profile 1 +GLAD_API_CALL int GLAD_GLX_ARB_create_context_profile; +#define GLX_ARB_get_proc_address 1 +GLAD_API_CALL int GLAD_GLX_ARB_get_proc_address; +#define GLX_EXT_swap_control 1 +GLAD_API_CALL int GLAD_GLX_EXT_swap_control; +#define GLX_MESA_swap_control 1 +GLAD_API_CALL int GLAD_GLX_MESA_swap_control; +#define GLX_SGI_swap_control 1 +GLAD_API_CALL int GLAD_GLX_SGI_swap_control; + + +typedef GLXFBConfig * (GLAD_API_PTR *PFNGLXCHOOSEFBCONFIGPROC)(Display * dpy, int screen, const int * attrib_list, int * nelements); +typedef XVisualInfo * (GLAD_API_PTR *PFNGLXCHOOSEVISUALPROC)(Display * dpy, int screen, int * attribList); +typedef void (GLAD_API_PTR *PFNGLXCOPYCONTEXTPROC)(Display * dpy, GLXContext src, GLXContext dst, unsigned long mask); +typedef GLXContext (GLAD_API_PTR *PFNGLXCREATECONTEXTPROC)(Display * dpy, XVisualInfo * vis, GLXContext shareList, Bool direct); +typedef GLXContext (GLAD_API_PTR *PFNGLXCREATECONTEXTATTRIBSARBPROC)(Display * dpy, GLXFBConfig config, GLXContext share_context, Bool direct, const int * attrib_list); +typedef GLXPixmap (GLAD_API_PTR *PFNGLXCREATEGLXPIXMAPPROC)(Display * dpy, XVisualInfo * visual, Pixmap pixmap); +typedef GLXContext (GLAD_API_PTR *PFNGLXCREATENEWCONTEXTPROC)(Display * dpy, GLXFBConfig config, int render_type, GLXContext share_list, Bool direct); +typedef GLXPbuffer (GLAD_API_PTR *PFNGLXCREATEPBUFFERPROC)(Display * dpy, GLXFBConfig config, const int * attrib_list); +typedef GLXPixmap (GLAD_API_PTR *PFNGLXCREATEPIXMAPPROC)(Display * dpy, GLXFBConfig config, Pixmap pixmap, const int * attrib_list); +typedef GLXWindow (GLAD_API_PTR *PFNGLXCREATEWINDOWPROC)(Display * dpy, GLXFBConfig config, Window win, const int * attrib_list); +typedef void (GLAD_API_PTR *PFNGLXDESTROYCONTEXTPROC)(Display * dpy, GLXContext ctx); +typedef void (GLAD_API_PTR *PFNGLXDESTROYGLXPIXMAPPROC)(Display * dpy, GLXPixmap pixmap); +typedef void (GLAD_API_PTR *PFNGLXDESTROYPBUFFERPROC)(Display * dpy, GLXPbuffer pbuf); +typedef void (GLAD_API_PTR *PFNGLXDESTROYPIXMAPPROC)(Display * dpy, GLXPixmap pixmap); +typedef void 
(GLAD_API_PTR *PFNGLXDESTROYWINDOWPROC)(Display * dpy, GLXWindow win); +typedef const char * (GLAD_API_PTR *PFNGLXGETCLIENTSTRINGPROC)(Display * dpy, int name); +typedef int (GLAD_API_PTR *PFNGLXGETCONFIGPROC)(Display * dpy, XVisualInfo * visual, int attrib, int * value); +typedef GLXContext (GLAD_API_PTR *PFNGLXGETCURRENTCONTEXTPROC)(void); +typedef Display * (GLAD_API_PTR *PFNGLXGETCURRENTDISPLAYPROC)(void); +typedef GLXDrawable (GLAD_API_PTR *PFNGLXGETCURRENTDRAWABLEPROC)(void); +typedef GLXDrawable (GLAD_API_PTR *PFNGLXGETCURRENTREADDRAWABLEPROC)(void); +typedef int (GLAD_API_PTR *PFNGLXGETFBCONFIGATTRIBPROC)(Display * dpy, GLXFBConfig config, int attribute, int * value); +typedef GLXFBConfig * (GLAD_API_PTR *PFNGLXGETFBCONFIGSPROC)(Display * dpy, int screen, int * nelements); +typedef __GLXextFuncPtr (GLAD_API_PTR *PFNGLXGETPROCADDRESSPROC)(const GLubyte * procName); +typedef __GLXextFuncPtr (GLAD_API_PTR *PFNGLXGETPROCADDRESSARBPROC)(const GLubyte * procName); +typedef void (GLAD_API_PTR *PFNGLXGETSELECTEDEVENTPROC)(Display * dpy, GLXDrawable draw, unsigned long * event_mask); +typedef int (GLAD_API_PTR *PFNGLXGETSWAPINTERVALMESAPROC)(void); +typedef XVisualInfo * (GLAD_API_PTR *PFNGLXGETVISUALFROMFBCONFIGPROC)(Display * dpy, GLXFBConfig config); +typedef Bool (GLAD_API_PTR *PFNGLXISDIRECTPROC)(Display * dpy, GLXContext ctx); +typedef Bool (GLAD_API_PTR *PFNGLXMAKECONTEXTCURRENTPROC)(Display * dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx); +typedef Bool (GLAD_API_PTR *PFNGLXMAKECURRENTPROC)(Display * dpy, GLXDrawable drawable, GLXContext ctx); +typedef int (GLAD_API_PTR *PFNGLXQUERYCONTEXTPROC)(Display * dpy, GLXContext ctx, int attribute, int * value); +typedef void (GLAD_API_PTR *PFNGLXQUERYDRAWABLEPROC)(Display * dpy, GLXDrawable draw, int attribute, unsigned int * value); +typedef Bool (GLAD_API_PTR *PFNGLXQUERYEXTENSIONPROC)(Display * dpy, int * errorb, int * event); +typedef const char * (GLAD_API_PTR *PFNGLXQUERYEXTENSIONSSTRINGPROC)(Display * dpy, int screen); +typedef const char * (GLAD_API_PTR *PFNGLXQUERYSERVERSTRINGPROC)(Display * dpy, int screen, int name); +typedef Bool (GLAD_API_PTR *PFNGLXQUERYVERSIONPROC)(Display * dpy, int * maj, int * min); +typedef void (GLAD_API_PTR *PFNGLXSELECTEVENTPROC)(Display * dpy, GLXDrawable draw, unsigned long event_mask); +typedef void (GLAD_API_PTR *PFNGLXSWAPBUFFERSPROC)(Display * dpy, GLXDrawable drawable); +typedef void (GLAD_API_PTR *PFNGLXSWAPINTERVALEXTPROC)(Display * dpy, GLXDrawable drawable, int interval); +typedef int (GLAD_API_PTR *PFNGLXSWAPINTERVALMESAPROC)(unsigned int interval); +typedef int (GLAD_API_PTR *PFNGLXSWAPINTERVALSGIPROC)(int interval); +typedef void (GLAD_API_PTR *PFNGLXUSEXFONTPROC)(Font font, int first, int count, int list); +typedef void (GLAD_API_PTR *PFNGLXWAITGLPROC)(void); +typedef void (GLAD_API_PTR *PFNGLXWAITXPROC)(void); + +GLAD_API_CALL PFNGLXCHOOSEFBCONFIGPROC glad_glXChooseFBConfig; +#define glXChooseFBConfig glad_glXChooseFBConfig +GLAD_API_CALL PFNGLXCHOOSEVISUALPROC glad_glXChooseVisual; +#define glXChooseVisual glad_glXChooseVisual +GLAD_API_CALL PFNGLXCOPYCONTEXTPROC glad_glXCopyContext; +#define glXCopyContext glad_glXCopyContext +GLAD_API_CALL PFNGLXCREATECONTEXTPROC glad_glXCreateContext; +#define glXCreateContext glad_glXCreateContext +GLAD_API_CALL PFNGLXCREATECONTEXTATTRIBSARBPROC glad_glXCreateContextAttribsARB; +#define glXCreateContextAttribsARB glad_glXCreateContextAttribsARB +GLAD_API_CALL PFNGLXCREATEGLXPIXMAPPROC glad_glXCreateGLXPixmap; +#define glXCreateGLXPixmap 
glad_glXCreateGLXPixmap +GLAD_API_CALL PFNGLXCREATENEWCONTEXTPROC glad_glXCreateNewContext; +#define glXCreateNewContext glad_glXCreateNewContext +GLAD_API_CALL PFNGLXCREATEPBUFFERPROC glad_glXCreatePbuffer; +#define glXCreatePbuffer glad_glXCreatePbuffer +GLAD_API_CALL PFNGLXCREATEPIXMAPPROC glad_glXCreatePixmap; +#define glXCreatePixmap glad_glXCreatePixmap +GLAD_API_CALL PFNGLXCREATEWINDOWPROC glad_glXCreateWindow; +#define glXCreateWindow glad_glXCreateWindow +GLAD_API_CALL PFNGLXDESTROYCONTEXTPROC glad_glXDestroyContext; +#define glXDestroyContext glad_glXDestroyContext +GLAD_API_CALL PFNGLXDESTROYGLXPIXMAPPROC glad_glXDestroyGLXPixmap; +#define glXDestroyGLXPixmap glad_glXDestroyGLXPixmap +GLAD_API_CALL PFNGLXDESTROYPBUFFERPROC glad_glXDestroyPbuffer; +#define glXDestroyPbuffer glad_glXDestroyPbuffer +GLAD_API_CALL PFNGLXDESTROYPIXMAPPROC glad_glXDestroyPixmap; +#define glXDestroyPixmap glad_glXDestroyPixmap +GLAD_API_CALL PFNGLXDESTROYWINDOWPROC glad_glXDestroyWindow; +#define glXDestroyWindow glad_glXDestroyWindow +GLAD_API_CALL PFNGLXGETCLIENTSTRINGPROC glad_glXGetClientString; +#define glXGetClientString glad_glXGetClientString +GLAD_API_CALL PFNGLXGETCONFIGPROC glad_glXGetConfig; +#define glXGetConfig glad_glXGetConfig +GLAD_API_CALL PFNGLXGETCURRENTCONTEXTPROC glad_glXGetCurrentContext; +#define glXGetCurrentContext glad_glXGetCurrentContext +GLAD_API_CALL PFNGLXGETCURRENTDISPLAYPROC glad_glXGetCurrentDisplay; +#define glXGetCurrentDisplay glad_glXGetCurrentDisplay +GLAD_API_CALL PFNGLXGETCURRENTDRAWABLEPROC glad_glXGetCurrentDrawable; +#define glXGetCurrentDrawable glad_glXGetCurrentDrawable +GLAD_API_CALL PFNGLXGETCURRENTREADDRAWABLEPROC glad_glXGetCurrentReadDrawable; +#define glXGetCurrentReadDrawable glad_glXGetCurrentReadDrawable +GLAD_API_CALL PFNGLXGETFBCONFIGATTRIBPROC glad_glXGetFBConfigAttrib; +#define glXGetFBConfigAttrib glad_glXGetFBConfigAttrib +GLAD_API_CALL PFNGLXGETFBCONFIGSPROC glad_glXGetFBConfigs; +#define glXGetFBConfigs glad_glXGetFBConfigs +GLAD_API_CALL PFNGLXGETPROCADDRESSPROC glad_glXGetProcAddress; +#define glXGetProcAddress glad_glXGetProcAddress +GLAD_API_CALL PFNGLXGETPROCADDRESSARBPROC glad_glXGetProcAddressARB; +#define glXGetProcAddressARB glad_glXGetProcAddressARB +GLAD_API_CALL PFNGLXGETSELECTEDEVENTPROC glad_glXGetSelectedEvent; +#define glXGetSelectedEvent glad_glXGetSelectedEvent +GLAD_API_CALL PFNGLXGETSWAPINTERVALMESAPROC glad_glXGetSwapIntervalMESA; +#define glXGetSwapIntervalMESA glad_glXGetSwapIntervalMESA +GLAD_API_CALL PFNGLXGETVISUALFROMFBCONFIGPROC glad_glXGetVisualFromFBConfig; +#define glXGetVisualFromFBConfig glad_glXGetVisualFromFBConfig +GLAD_API_CALL PFNGLXISDIRECTPROC glad_glXIsDirect; +#define glXIsDirect glad_glXIsDirect +GLAD_API_CALL PFNGLXMAKECONTEXTCURRENTPROC glad_glXMakeContextCurrent; +#define glXMakeContextCurrent glad_glXMakeContextCurrent +GLAD_API_CALL PFNGLXMAKECURRENTPROC glad_glXMakeCurrent; +#define glXMakeCurrent glad_glXMakeCurrent +GLAD_API_CALL PFNGLXQUERYCONTEXTPROC glad_glXQueryContext; +#define glXQueryContext glad_glXQueryContext +GLAD_API_CALL PFNGLXQUERYDRAWABLEPROC glad_glXQueryDrawable; +#define glXQueryDrawable glad_glXQueryDrawable +GLAD_API_CALL PFNGLXQUERYEXTENSIONPROC glad_glXQueryExtension; +#define glXQueryExtension glad_glXQueryExtension +GLAD_API_CALL PFNGLXQUERYEXTENSIONSSTRINGPROC glad_glXQueryExtensionsString; +#define glXQueryExtensionsString glad_glXQueryExtensionsString +GLAD_API_CALL PFNGLXQUERYSERVERSTRINGPROC glad_glXQueryServerString; +#define glXQueryServerString 
glad_glXQueryServerString +GLAD_API_CALL PFNGLXQUERYVERSIONPROC glad_glXQueryVersion; +#define glXQueryVersion glad_glXQueryVersion +GLAD_API_CALL PFNGLXSELECTEVENTPROC glad_glXSelectEvent; +#define glXSelectEvent glad_glXSelectEvent +GLAD_API_CALL PFNGLXSWAPBUFFERSPROC glad_glXSwapBuffers; +#define glXSwapBuffers glad_glXSwapBuffers +GLAD_API_CALL PFNGLXSWAPINTERVALEXTPROC glad_glXSwapIntervalEXT; +#define glXSwapIntervalEXT glad_glXSwapIntervalEXT +GLAD_API_CALL PFNGLXSWAPINTERVALMESAPROC glad_glXSwapIntervalMESA; +#define glXSwapIntervalMESA glad_glXSwapIntervalMESA +GLAD_API_CALL PFNGLXSWAPINTERVALSGIPROC glad_glXSwapIntervalSGI; +#define glXSwapIntervalSGI glad_glXSwapIntervalSGI +GLAD_API_CALL PFNGLXUSEXFONTPROC glad_glXUseXFont; +#define glXUseXFont glad_glXUseXFont +GLAD_API_CALL PFNGLXWAITGLPROC glad_glXWaitGL; +#define glXWaitGL glad_glXWaitGL +GLAD_API_CALL PFNGLXWAITXPROC glad_glXWaitX; +#define glXWaitX glad_glXWaitX + + + + + +GLAD_API_CALL int gladLoadGLXUserPtr(Display *display, int screen, GLADuserptrloadfunc load, void *userptr); +GLAD_API_CALL int gladLoadGLX(Display *display, int screen, GLADloadfunc load); + +#ifdef GLAD_GLX + +GLAD_API_CALL int gladLoaderLoadGLX(Display *display, int screen); + +GLAD_API_CALL void gladLoaderUnloadGLX(void); + +#endif +#ifdef __cplusplus +} +#endif +#endif diff --git a/thirdparty/glad/glx.c b/thirdparty/glad/glx.c new file mode 100644 index 0000000000..6391027db2 --- /dev/null +++ b/thirdparty/glad/glx.c @@ -0,0 +1,395 @@ +/** + * SPDX-License-Identifier: (WTFPL OR CC0-1.0) AND Apache-2.0 + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <glad/glx.h> + +#ifndef GLAD_IMPL_UTIL_C_ +#define GLAD_IMPL_UTIL_C_ + +#ifdef _MSC_VER +#define GLAD_IMPL_UTIL_SSCANF sscanf_s +#else +#define GLAD_IMPL_UTIL_SSCANF sscanf +#endif + +#endif /* GLAD_IMPL_UTIL_C_ */ + +#ifdef __cplusplus +extern "C" { +#endif + + + +int GLAD_GLX_VERSION_1_0 = 0; +int GLAD_GLX_VERSION_1_1 = 0; +int GLAD_GLX_VERSION_1_2 = 0; +int GLAD_GLX_VERSION_1_3 = 0; +int GLAD_GLX_VERSION_1_4 = 0; +int GLAD_GLX_ARB_create_context = 0; +int GLAD_GLX_ARB_create_context_profile = 0; +int GLAD_GLX_ARB_get_proc_address = 0; +int GLAD_GLX_EXT_swap_control = 0; +int GLAD_GLX_MESA_swap_control = 0; +int GLAD_GLX_SGI_swap_control = 0; + + + +PFNGLXCHOOSEFBCONFIGPROC glad_glXChooseFBConfig = NULL; +PFNGLXCHOOSEVISUALPROC glad_glXChooseVisual = NULL; +PFNGLXCOPYCONTEXTPROC glad_glXCopyContext = NULL; +PFNGLXCREATECONTEXTPROC glad_glXCreateContext = NULL; +PFNGLXCREATECONTEXTATTRIBSARBPROC glad_glXCreateContextAttribsARB = NULL; +PFNGLXCREATEGLXPIXMAPPROC glad_glXCreateGLXPixmap = NULL; +PFNGLXCREATENEWCONTEXTPROC glad_glXCreateNewContext = NULL; +PFNGLXCREATEPBUFFERPROC glad_glXCreatePbuffer = NULL; +PFNGLXCREATEPIXMAPPROC glad_glXCreatePixmap = NULL; +PFNGLXCREATEWINDOWPROC glad_glXCreateWindow = NULL; +PFNGLXDESTROYCONTEXTPROC glad_glXDestroyContext = NULL; +PFNGLXDESTROYGLXPIXMAPPROC glad_glXDestroyGLXPixmap = NULL; +PFNGLXDESTROYPBUFFERPROC glad_glXDestroyPbuffer = NULL; +PFNGLXDESTROYPIXMAPPROC glad_glXDestroyPixmap = NULL; +PFNGLXDESTROYWINDOWPROC glad_glXDestroyWindow = NULL; +PFNGLXGETCLIENTSTRINGPROC glad_glXGetClientString = NULL; +PFNGLXGETCONFIGPROC glad_glXGetConfig = NULL; +PFNGLXGETCURRENTCONTEXTPROC glad_glXGetCurrentContext = NULL; +PFNGLXGETCURRENTDISPLAYPROC glad_glXGetCurrentDisplay = NULL; +PFNGLXGETCURRENTDRAWABLEPROC glad_glXGetCurrentDrawable = NULL; +PFNGLXGETCURRENTREADDRAWABLEPROC glad_glXGetCurrentReadDrawable = NULL; 
+PFNGLXGETFBCONFIGATTRIBPROC glad_glXGetFBConfigAttrib = NULL; +PFNGLXGETFBCONFIGSPROC glad_glXGetFBConfigs = NULL; +PFNGLXGETPROCADDRESSPROC glad_glXGetProcAddress = NULL; +PFNGLXGETPROCADDRESSARBPROC glad_glXGetProcAddressARB = NULL; +PFNGLXGETSELECTEDEVENTPROC glad_glXGetSelectedEvent = NULL; +PFNGLXGETSWAPINTERVALMESAPROC glad_glXGetSwapIntervalMESA = NULL; +PFNGLXGETVISUALFROMFBCONFIGPROC glad_glXGetVisualFromFBConfig = NULL; +PFNGLXISDIRECTPROC glad_glXIsDirect = NULL; +PFNGLXMAKECONTEXTCURRENTPROC glad_glXMakeContextCurrent = NULL; +PFNGLXMAKECURRENTPROC glad_glXMakeCurrent = NULL; +PFNGLXQUERYCONTEXTPROC glad_glXQueryContext = NULL; +PFNGLXQUERYDRAWABLEPROC glad_glXQueryDrawable = NULL; +PFNGLXQUERYEXTENSIONPROC glad_glXQueryExtension = NULL; +PFNGLXQUERYEXTENSIONSSTRINGPROC glad_glXQueryExtensionsString = NULL; +PFNGLXQUERYSERVERSTRINGPROC glad_glXQueryServerString = NULL; +PFNGLXQUERYVERSIONPROC glad_glXQueryVersion = NULL; +PFNGLXSELECTEVENTPROC glad_glXSelectEvent = NULL; +PFNGLXSWAPBUFFERSPROC glad_glXSwapBuffers = NULL; +PFNGLXSWAPINTERVALEXTPROC glad_glXSwapIntervalEXT = NULL; +PFNGLXSWAPINTERVALMESAPROC glad_glXSwapIntervalMESA = NULL; +PFNGLXSWAPINTERVALSGIPROC glad_glXSwapIntervalSGI = NULL; +PFNGLXUSEXFONTPROC glad_glXUseXFont = NULL; +PFNGLXWAITGLPROC glad_glXWaitGL = NULL; +PFNGLXWAITXPROC glad_glXWaitX = NULL; + + +static void glad_glx_load_GLX_VERSION_1_0( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_VERSION_1_0) return; + glad_glXChooseVisual = (PFNGLXCHOOSEVISUALPROC) load(userptr, "glXChooseVisual"); + glad_glXCopyContext = (PFNGLXCOPYCONTEXTPROC) load(userptr, "glXCopyContext"); + glad_glXCreateContext = (PFNGLXCREATECONTEXTPROC) load(userptr, "glXCreateContext"); + glad_glXCreateGLXPixmap = (PFNGLXCREATEGLXPIXMAPPROC) load(userptr, "glXCreateGLXPixmap"); + glad_glXDestroyContext = (PFNGLXDESTROYCONTEXTPROC) load(userptr, "glXDestroyContext"); + glad_glXDestroyGLXPixmap = (PFNGLXDESTROYGLXPIXMAPPROC) load(userptr, "glXDestroyGLXPixmap"); + glad_glXGetConfig = (PFNGLXGETCONFIGPROC) load(userptr, "glXGetConfig"); + glad_glXGetCurrentContext = (PFNGLXGETCURRENTCONTEXTPROC) load(userptr, "glXGetCurrentContext"); + glad_glXGetCurrentDrawable = (PFNGLXGETCURRENTDRAWABLEPROC) load(userptr, "glXGetCurrentDrawable"); + glad_glXIsDirect = (PFNGLXISDIRECTPROC) load(userptr, "glXIsDirect"); + glad_glXMakeCurrent = (PFNGLXMAKECURRENTPROC) load(userptr, "glXMakeCurrent"); + glad_glXQueryExtension = (PFNGLXQUERYEXTENSIONPROC) load(userptr, "glXQueryExtension"); + glad_glXQueryVersion = (PFNGLXQUERYVERSIONPROC) load(userptr, "glXQueryVersion"); + glad_glXSwapBuffers = (PFNGLXSWAPBUFFERSPROC) load(userptr, "glXSwapBuffers"); + glad_glXUseXFont = (PFNGLXUSEXFONTPROC) load(userptr, "glXUseXFont"); + glad_glXWaitGL = (PFNGLXWAITGLPROC) load(userptr, "glXWaitGL"); + glad_glXWaitX = (PFNGLXWAITXPROC) load(userptr, "glXWaitX"); +} +static void glad_glx_load_GLX_VERSION_1_1( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_VERSION_1_1) return; + glad_glXGetClientString = (PFNGLXGETCLIENTSTRINGPROC) load(userptr, "glXGetClientString"); + glad_glXQueryExtensionsString = (PFNGLXQUERYEXTENSIONSSTRINGPROC) load(userptr, "glXQueryExtensionsString"); + glad_glXQueryServerString = (PFNGLXQUERYSERVERSTRINGPROC) load(userptr, "glXQueryServerString"); +} +static void glad_glx_load_GLX_VERSION_1_2( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_VERSION_1_2) return; + glad_glXGetCurrentDisplay = (PFNGLXGETCURRENTDISPLAYPROC) load(userptr, "glXGetCurrentDisplay"); +} 
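Each glad_glx_load_* helper above resolves its entry points through the caller-supplied load(userptr, name) callback, guarded by the flag set during version and extension detection. A minimal sketch of such a callback driven by dlsym, assuming the handle comes from dlopen; the names example_resolver and libgl are illustrative and not part of the generated file:

    #include <dlfcn.h>

    /* Illustrative resolver: userptr carries a dlopen handle.
       (The generated code wraps the function-pointer cast in
       GLAD_GNUC_EXTENSION to silence the ISO C warning.) */
    static GLADapiproc example_resolver(void *userptr, const char *name) {
        return (GLADapiproc) dlsym(userptr, name);
    }

    /* Possible usage, given an open Display *dpy:
         void *libgl = dlopen("libGL.so.1", RTLD_LAZY | RTLD_LOCAL);
         int ver = gladLoadGLXUserPtr(dpy, DefaultScreen(dpy),
                                      example_resolver, libgl);
       ver is 0 on failure, otherwise GLAD_MAKE_VERSION(major, minor). */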
+static void glad_glx_load_GLX_VERSION_1_3( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_VERSION_1_3) return; + glad_glXChooseFBConfig = (PFNGLXCHOOSEFBCONFIGPROC) load(userptr, "glXChooseFBConfig"); + glad_glXCreateNewContext = (PFNGLXCREATENEWCONTEXTPROC) load(userptr, "glXCreateNewContext"); + glad_glXCreatePbuffer = (PFNGLXCREATEPBUFFERPROC) load(userptr, "glXCreatePbuffer"); + glad_glXCreatePixmap = (PFNGLXCREATEPIXMAPPROC) load(userptr, "glXCreatePixmap"); + glad_glXCreateWindow = (PFNGLXCREATEWINDOWPROC) load(userptr, "glXCreateWindow"); + glad_glXDestroyPbuffer = (PFNGLXDESTROYPBUFFERPROC) load(userptr, "glXDestroyPbuffer"); + glad_glXDestroyPixmap = (PFNGLXDESTROYPIXMAPPROC) load(userptr, "glXDestroyPixmap"); + glad_glXDestroyWindow = (PFNGLXDESTROYWINDOWPROC) load(userptr, "glXDestroyWindow"); + glad_glXGetCurrentReadDrawable = (PFNGLXGETCURRENTREADDRAWABLEPROC) load(userptr, "glXGetCurrentReadDrawable"); + glad_glXGetFBConfigAttrib = (PFNGLXGETFBCONFIGATTRIBPROC) load(userptr, "glXGetFBConfigAttrib"); + glad_glXGetFBConfigs = (PFNGLXGETFBCONFIGSPROC) load(userptr, "glXGetFBConfigs"); + glad_glXGetSelectedEvent = (PFNGLXGETSELECTEDEVENTPROC) load(userptr, "glXGetSelectedEvent"); + glad_glXGetVisualFromFBConfig = (PFNGLXGETVISUALFROMFBCONFIGPROC) load(userptr, "glXGetVisualFromFBConfig"); + glad_glXMakeContextCurrent = (PFNGLXMAKECONTEXTCURRENTPROC) load(userptr, "glXMakeContextCurrent"); + glad_glXQueryContext = (PFNGLXQUERYCONTEXTPROC) load(userptr, "glXQueryContext"); + glad_glXQueryDrawable = (PFNGLXQUERYDRAWABLEPROC) load(userptr, "glXQueryDrawable"); + glad_glXSelectEvent = (PFNGLXSELECTEVENTPROC) load(userptr, "glXSelectEvent"); +} +static void glad_glx_load_GLX_VERSION_1_4( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_VERSION_1_4) return; + glad_glXGetProcAddress = (PFNGLXGETPROCADDRESSPROC) load(userptr, "glXGetProcAddress"); +} +static void glad_glx_load_GLX_ARB_create_context( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_ARB_create_context) return; + glad_glXCreateContextAttribsARB = (PFNGLXCREATECONTEXTATTRIBSARBPROC) load(userptr, "glXCreateContextAttribsARB"); +} +static void glad_glx_load_GLX_ARB_get_proc_address( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_ARB_get_proc_address) return; + glad_glXGetProcAddressARB = (PFNGLXGETPROCADDRESSARBPROC) load(userptr, "glXGetProcAddressARB"); +} +static void glad_glx_load_GLX_EXT_swap_control( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_EXT_swap_control) return; + glad_glXSwapIntervalEXT = (PFNGLXSWAPINTERVALEXTPROC) load(userptr, "glXSwapIntervalEXT"); +} +static void glad_glx_load_GLX_MESA_swap_control( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_MESA_swap_control) return; + glad_glXGetSwapIntervalMESA = (PFNGLXGETSWAPINTERVALMESAPROC) load(userptr, "glXGetSwapIntervalMESA"); + glad_glXSwapIntervalMESA = (PFNGLXSWAPINTERVALMESAPROC) load(userptr, "glXSwapIntervalMESA"); +} +static void glad_glx_load_GLX_SGI_swap_control( GLADuserptrloadfunc load, void* userptr) { + if(!GLAD_GLX_SGI_swap_control) return; + glad_glXSwapIntervalSGI = (PFNGLXSWAPINTERVALSGIPROC) load(userptr, "glXSwapIntervalSGI"); +} + + + +static int glad_glx_has_extension(Display *display, int screen, const char *ext) { +#ifndef GLX_VERSION_1_1 + GLAD_UNUSED(display); + GLAD_UNUSED(screen); + GLAD_UNUSED(ext); +#else + const char *terminator; + const char *loc; + const char *extensions; + + if (glXQueryExtensionsString == NULL) { + return 0; + } + + extensions = 
glXQueryExtensionsString(display, screen); + + if(extensions == NULL || ext == NULL) { + return 0; + } + + while(1) { + loc = strstr(extensions, ext); + if(loc == NULL) + break; + + terminator = loc + strlen(ext); + if((loc == extensions || *(loc - 1) == ' ') && + (*terminator == ' ' || *terminator == '\0')) { + return 1; + } + extensions = terminator; + } +#endif + + return 0; +} + +static GLADapiproc glad_glx_get_proc_from_userptr(void *userptr, const char* name) { + return (GLAD_GNUC_EXTENSION (GLADapiproc (*)(const char *name)) userptr)(name); +} + +static int glad_glx_find_extensions(Display *display, int screen) { + GLAD_GLX_ARB_create_context = glad_glx_has_extension(display, screen, "GLX_ARB_create_context"); + GLAD_GLX_ARB_create_context_profile = glad_glx_has_extension(display, screen, "GLX_ARB_create_context_profile"); + GLAD_GLX_ARB_get_proc_address = glad_glx_has_extension(display, screen, "GLX_ARB_get_proc_address"); + GLAD_GLX_EXT_swap_control = glad_glx_has_extension(display, screen, "GLX_EXT_swap_control"); + GLAD_GLX_MESA_swap_control = glad_glx_has_extension(display, screen, "GLX_MESA_swap_control"); + GLAD_GLX_SGI_swap_control = glad_glx_has_extension(display, screen, "GLX_SGI_swap_control"); + return 1; +} + +static int glad_glx_find_core_glx(Display **display, int *screen) { + int major = 0, minor = 0; + if(*display == NULL) { +#ifdef GLAD_GLX_NO_X11 + GLAD_UNUSED(screen); + return 0; +#else + *display = XOpenDisplay(0); + if (*display == NULL) { + return 0; + } + *screen = XScreenNumberOfScreen(XDefaultScreenOfDisplay(*display)); +#endif + } + glXQueryVersion(*display, &major, &minor); + GLAD_GLX_VERSION_1_0 = (major == 1 && minor >= 0) || major > 1; + GLAD_GLX_VERSION_1_1 = (major == 1 && minor >= 1) || major > 1; + GLAD_GLX_VERSION_1_2 = (major == 1 && minor >= 2) || major > 1; + GLAD_GLX_VERSION_1_3 = (major == 1 && minor >= 3) || major > 1; + GLAD_GLX_VERSION_1_4 = (major == 1 && minor >= 4) || major > 1; + return GLAD_MAKE_VERSION(major, minor); +} + +int gladLoadGLXUserPtr(Display *display, int screen, GLADuserptrloadfunc load, void *userptr) { + int version; + glXQueryVersion = (PFNGLXQUERYVERSIONPROC) load(userptr, "glXQueryVersion"); + if(glXQueryVersion == NULL) return 0; + version = glad_glx_find_core_glx(&display, &screen); + + glad_glx_load_GLX_VERSION_1_0(load, userptr); + glad_glx_load_GLX_VERSION_1_1(load, userptr); + glad_glx_load_GLX_VERSION_1_2(load, userptr); + glad_glx_load_GLX_VERSION_1_3(load, userptr); + glad_glx_load_GLX_VERSION_1_4(load, userptr); + + if (!glad_glx_find_extensions(display, screen)) return 0; + glad_glx_load_GLX_ARB_create_context(load, userptr); + glad_glx_load_GLX_ARB_get_proc_address(load, userptr); + glad_glx_load_GLX_EXT_swap_control(load, userptr); + glad_glx_load_GLX_MESA_swap_control(load, userptr); + glad_glx_load_GLX_SGI_swap_control(load, userptr); + + + return version; +} + +int gladLoadGLX(Display *display, int screen, GLADloadfunc load) { + return gladLoadGLXUserPtr(display, screen, glad_glx_get_proc_from_userptr, GLAD_GNUC_EXTENSION (void*) load); +} + + + +#ifdef GLAD_GLX + +#ifndef GLAD_LOADER_LIBRARY_C_ +#define GLAD_LOADER_LIBRARY_C_ + +#include <stddef.h> +#include <stdlib.h> + +#if GLAD_PLATFORM_WIN32 +#include <windows.h> +#else +#include <dlfcn.h> +#endif + + +static void* glad_get_dlopen_handle(const char *lib_names[], int length) { + void *handle = NULL; + int i; + + for (i = 0; i < length; ++i) { +#if GLAD_PLATFORM_WIN32 + #if GLAD_PLATFORM_UWP + size_t buffer_size = (strlen(lib_names[i]) + 1) * 
sizeof(WCHAR); + LPWSTR buffer = (LPWSTR) malloc(buffer_size); + if (buffer != NULL) { + int ret = MultiByteToWideChar(CP_ACP, 0, lib_names[i], -1, buffer, buffer_size); + if (ret != 0) { + handle = (void*) LoadPackagedLibrary(buffer, 0); + } + free((void*) buffer); + } + #else + handle = (void*) LoadLibraryA(lib_names[i]); + #endif +#else + handle = dlopen(lib_names[i], RTLD_LAZY | RTLD_LOCAL); +#endif + if (handle != NULL) { + return handle; + } + } + + return NULL; +} + +static void glad_close_dlopen_handle(void* handle) { + if (handle != NULL) { +#if GLAD_PLATFORM_WIN32 + FreeLibrary((HMODULE) handle); +#else + dlclose(handle); +#endif + } +} + +static GLADapiproc glad_dlsym_handle(void* handle, const char *name) { + if (handle == NULL) { + return NULL; + } + +#if GLAD_PLATFORM_WIN32 + return (GLADapiproc) GetProcAddress((HMODULE) handle, name); +#else + return GLAD_GNUC_EXTENSION (GLADapiproc) dlsym(handle, name); +#endif +} + +#endif /* GLAD_LOADER_LIBRARY_C_ */ + +typedef void* (GLAD_API_PTR *GLADglxprocaddrfunc)(const char*); + +static GLADapiproc glad_glx_get_proc(void *userptr, const char *name) { + return GLAD_GNUC_EXTENSION ((GLADapiproc (*)(const char *name)) userptr)(name); +} + +static void* _glx_handle; + +static void* glad_glx_dlopen_handle(void) { + static const char *NAMES[] = { +#if defined __CYGWIN__ + "libGL-1.so", +#endif + "libGL.so.1", + "libGL.so" + }; + + if (_glx_handle == NULL) { + _glx_handle = glad_get_dlopen_handle(NAMES, sizeof(NAMES) / sizeof(NAMES[0])); + } + + return _glx_handle; +} + +int gladLoaderLoadGLX(Display *display, int screen) { + int version = 0; + void *handle = NULL; + int did_load = 0; + GLADglxprocaddrfunc loader; + + did_load = _glx_handle == NULL; + handle = glad_glx_dlopen_handle(); + if (handle != NULL) { + loader = (GLADglxprocaddrfunc) glad_dlsym_handle(handle, "glXGetProcAddressARB"); + if (loader != NULL) { + version = gladLoadGLXUserPtr(display, screen, glad_glx_get_proc, GLAD_GNUC_EXTENSION (void*) loader); + } + + if (!version && did_load) { + gladLoaderUnloadGLX(); + } + } + + return version; +} + + +void gladLoaderUnloadGLX() { + if (_glx_handle != NULL) { + glad_close_dlopen_handle(_glx_handle); + _glx_handle = NULL; + } +} + +#endif /* GLAD_GLX */ + +#ifdef __cplusplus +} +#endif diff --git a/thirdparty/libtheora/LICENSE b/thirdparty/libtheora/LICENSE index 5e5ec08469..97e8431790 100644 --- a/thirdparty/libtheora/LICENSE +++ b/thirdparty/libtheora/LICENSE @@ -4,13 +4,13 @@ In addition to and irrespective of the copyright license associated with this software, On2 Technologies, Inc. makes the following statement regarding technology used in this software: - On2 represents and warrants that it shall not assert any rights + On2 represents and warrants that it shall not assert any rights relating to infringement of On2's registered patents, nor initiate any litigation asserting such rights, against any person who, or - entity which utilizes the On2 VP3 Codec Software, including any - use, distribution, and sale of said Software; which make changes, + entity which utilizes the On2 VP3 Codec Software, including any + use, distribution, and sale of said Software; which make changes, modifications, and improvements in said Software; and to use, - distribute, and sell said changes as well as applications for other + distribute, and sell said changes as well as applications for other fields of use. 
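A note on glad_glx_has_extension in the loader above: it does not trust a bare strstr() hit, but walks each match and requires it to be delimited by spaces or by the ends of the extension string. The boundary check matters because one GLX extension name can be a prefix of another; a toy program, with a made-up extension string, shows the failure mode of the naive test:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        /* Hypothetical server extension string, for illustration only. */
        const char *exts = "GLX_EXT_swap_control_tear GLX_ARB_create_context";
        /* The naive substring test falsely reports GLX_EXT_swap_control: */
        printf("%d\n", strstr(exts, "GLX_EXT_swap_control") != NULL); /* prints 1 */
        /* glad's boundary-checked loop reports it absent for the same string. */
        return 0;
    }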
This reference implementation is originally derived from the On2 VP3 diff --git a/thirdparty/libtheora/analyze.c b/thirdparty/libtheora/analyze.c index af01b60dff..19d7612d23 100644 --- a/thirdparty/libtheora/analyze.c +++ b/thirdparty/libtheora/analyze.c @@ -18,12 +18,12 @@ #include <string.h> #include "encint.h" #include "modedec.h" +#if defined(OC_COLLECT_METRICS) +# include "collect.c" +#endif -typedef struct oc_fr_state oc_fr_state; -typedef struct oc_qii_state oc_qii_state; -typedef struct oc_enc_pipeline_state oc_enc_pipeline_state; typedef struct oc_rd_metric oc_rd_metric; typedef struct oc_mode_choice oc_mode_choice; @@ -42,7 +42,7 @@ typedef struct oc_mode_choice oc_mode_choice; This is the inverse of the equivalent table OC_MODE_ALPHABETS in the decoder.*/ static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={ - /*Last MV dominates.*/ + /*Last MV dominates.*/ /*L P M N I G GM 4*/ {3,4,2,0,1,5,6,7}, /*L P N M I G GM 4*/ @@ -87,6 +87,29 @@ static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){ } } +/*Return the cost of coding _mb_mode in the specified scheme.*/ +static int oc_mode_scheme_chooser_scheme_mb_cost( + const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){ + int codebook; + int ri; + codebook=_scheme+1>>3; + /*For any scheme except 0, we can just use the bit cost of the mode's rank + in that scheme.*/ + ri=_chooser->mode_ranks[_scheme][_mb_mode]; + if(_scheme==0){ + int mc; + /*For scheme 0, incrementing the mode count could potentially change the + mode's rank. + Find the index where the mode would be moved to in the optimal list, + and use its bit cost instead of the one for the mode's current + position in the list.*/ + /*We don't actually reorder the list; this is for computing opportunity + cost, not an update.*/ + mc=_chooser->mode_counts[_mb_mode]; + while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--; + } + return OC_MODE_BITS[codebook][ri]; +} /*This is the real purpose of this data structure: not actually selecting a mode scheme, but estimating the cost of coding a given mode given all the @@ -108,46 +131,32 @@ static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser, int best_bits; int mode_bits; int si; - int scheme_bits; + int scheme0_bits; + int scheme1_bits; scheme0=_chooser->scheme_list[0]; scheme1=_chooser->scheme_list[1]; - best_bits=_chooser->scheme_bits[scheme0]; - mode_bits=OC_MODE_BITS[scheme0+1>>3][_chooser->mode_ranks[scheme0][_mb_mode]]; + scheme0_bits=_chooser->scheme_bits[scheme0]; + scheme1_bits=_chooser->scheme_bits[scheme1]; + mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode); /*Typical case: If the difference between the best scheme and the next best is greater than 6 bits, then adding just one mode cannot change which scheme we use.*/ - if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits; + if(scheme1_bits-scheme0_bits>6)return mode_bits; /*Otherwise, check to see if adding this mode selects a different scheme as the best.*/ si=1; - best_bits+=mode_bits; + best_bits=scheme0_bits+mode_bits; do{ - /*For any scheme except 0, we can just use the bit cost of the mode's rank - in that scheme.*/ - if(scheme1!=0){ - scheme_bits=_chooser->scheme_bits[scheme1]+ - OC_MODE_BITS[scheme1+1>>3][_chooser->mode_ranks[scheme1][_mb_mode]]; - } - else{ - int ri; - /*For scheme 0, incrementing the mode count could potentially change the - mode's rank. 
- Find the index where the mode would be moved to in the optimal list, - and use its bit cost instead of the one for the mode's current - position in the list.*/ - /*We don't recompute scheme bits; this is computing opportunity cost, not - an update.*/ - for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0&& - _chooser->mode_counts[_mb_mode]>= - _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--); - scheme_bits=_chooser->scheme_bits[0]+OC_MODE_BITS[0][ri]; - } - if(scheme_bits<best_bits)best_bits=scheme_bits; + int cur_bits; + cur_bits=scheme1_bits+ + oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode); + if(cur_bits<best_bits)best_bits=cur_bits; if(++si>=8)break; scheme1=_chooser->scheme_list[si]; + scheme1_bits=_chooser->scheme_bits[scheme1]; } - while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6); - return best_bits-_chooser->scheme_bits[scheme0]; + while(scheme1_bits-scheme0_bits<=6); + return best_bits-scheme0_bits; } /*Incrementally update the mode counts and per-scheme bit counts and re-order @@ -211,22 +220,6 @@ static int oc_block_run_bits(int _run_count){ -/*State to track coded block flags and their bit cost.*/ -struct oc_fr_state{ - ptrdiff_t bits; - unsigned sb_partial_count:16; - unsigned sb_full_count:16; - unsigned b_coded_count_prev:8; - unsigned b_coded_count:8; - unsigned b_count:8; - signed int sb_partial:2; - signed int sb_full:2; - signed int b_coded_prev:2; - signed int b_coded:2; -}; - - - static void oc_fr_state_init(oc_fr_state *_fr){ _fr->bits=0; _fr->sb_partial_count=0; @@ -234,6 +227,8 @@ static void oc_fr_state_init(oc_fr_state *_fr){ _fr->b_coded_count_prev=0; _fr->b_coded_count=0; _fr->b_count=0; + _fr->sb_prefer_partial=0; + _fr->sb_bits=0; _fr->sb_partial=-1; _fr->sb_full=-1; _fr->b_coded_prev=-1; @@ -241,14 +236,14 @@ static void oc_fr_state_init(oc_fr_state *_fr){ } -static void oc_fr_state_advance_sb(oc_fr_state *_fr, +static int oc_fr_state_sb_cost(const oc_fr_state *_fr, int _sb_partial,int _sb_full){ - ptrdiff_t bits; - int sb_partial_count; - int sb_full_count; - bits=_fr->bits; + int bits; + int sb_partial_count; + int sb_full_count; + bits=0; + sb_partial_count=_fr->sb_partial_count; /*Extend the sb_partial run, or start a new one.*/ - sb_partial_count=_fr->sb_partial; if(_fr->sb_partial==_sb_partial){ if(sb_partial_count>=4129){ bits++; @@ -257,8 +252,7 @@ static void oc_fr_state_advance_sb(oc_fr_state *_fr, else bits-=oc_sb_run_bits(sb_partial_count); } else sb_partial_count=0; - sb_partial_count++; - bits+=oc_sb_run_bits(sb_partial_count); + bits+=oc_sb_run_bits(++sb_partial_count); if(!_sb_partial){ /*Extend the sb_full run, or start a new one.*/ sb_full_count=_fr->sb_full_count; @@ -270,98 +264,161 @@ static void oc_fr_state_advance_sb(oc_fr_state *_fr, else bits-=oc_sb_run_bits(sb_full_count); } else sb_full_count=0; + bits+=oc_sb_run_bits(++sb_full_count); + } + return bits; +} + +static void oc_fr_state_advance_sb(oc_fr_state *_fr, + int _sb_partial,int _sb_full){ + int sb_partial_count; + int sb_full_count; + sb_partial_count=_fr->sb_partial_count; + if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0; + sb_partial_count++; + if(!_sb_partial){ + sb_full_count=_fr->sb_full_count; + if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0; sb_full_count++; - bits+=oc_sb_run_bits(sb_full_count); - _fr->sb_full=_sb_full; _fr->sb_full_count=sb_full_count; + _fr->sb_full=_sb_full; + /*Roll back the partial block state.*/ + _fr->b_coded=_fr->b_coded_prev; + 
_fr->b_coded_count=_fr->b_coded_count_prev; + } + else{ + /*Commit back the partial block state.*/ + _fr->b_coded_prev=_fr->b_coded; + _fr->b_coded_count_prev=_fr->b_coded_count; } - _fr->bits=bits; - _fr->sb_partial=_sb_partial; _fr->sb_partial_count=sb_partial_count; + _fr->sb_partial=_sb_partial; + _fr->b_count=0; + _fr->sb_prefer_partial=0; + _fr->sb_bits=0; } -/*Flush any outstanding block flags for a SB (e.g., one with fewer than 16 - blocks).*/ +/*Commit the state of the current super block and advance to the next.*/ static void oc_fr_state_flush_sb(oc_fr_state *_fr){ - ptrdiff_t bits; - int sb_partial; - int sb_full=sb_full; - int b_coded_count; - int b_coded; - int b_count; + int sb_partial; + int sb_full; + int b_coded_count; + int b_count; b_count=_fr->b_count; - if(b_count>0){ - bits=_fr->bits; - b_coded=_fr->b_coded; - b_coded_count=_fr->b_coded_count; - if(b_coded_count>=b_count){ - /*This SB was fully coded/uncoded; roll back the partial block flags.*/ - bits-=oc_block_run_bits(b_coded_count); - if(b_coded_count>b_count)bits+=oc_block_run_bits(b_coded_count-b_count); - sb_partial=0; - sb_full=b_coded; - b_coded=_fr->b_coded_prev; - b_coded_count=_fr->b_coded_count_prev; - } - else{ - /*It was partially coded.*/ - sb_partial=1; - /*sb_full is unused.*/ + b_coded_count=_fr->b_coded_count; + sb_full=_fr->b_coded; + sb_partial=b_coded_count<b_count; + if(!sb_partial){ + /*If the super block is fully coded/uncoded...*/ + if(_fr->sb_prefer_partial){ + /*So far coding this super block as partial was cheaper anyway.*/ + if(b_coded_count>15||_fr->b_coded_prev<0){ + int sb_bits; + /*If the block run is too long, this will limit how far it can be + extended into the next partial super block. + If we need to extend it farther, we don't want to have to roll all + the way back here (since there could be many full SBs between now + and then), so we disallow this. 
+          Similarly, if this is the start of a stripe, we don't know the +           length of the outstanding block run from the previous stripe.*/ +        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full); +        _fr->bits+=sb_bits-_fr->sb_bits; +        _fr->sb_bits=sb_bits; +      } +      else sb_partial=1; }  - _fr->bits=bits; - _fr->b_coded_count=b_coded_count; - _fr->b_coded_count_prev=b_coded_count; - _fr->b_count=0; - _fr->b_coded=b_coded; - _fr->b_coded_prev=b_coded; - oc_fr_state_advance_sb(_fr,sb_partial,sb_full); }  + oc_fr_state_advance_sb(_fr,sb_partial,sb_full); }  static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){ ptrdiff_t bits; + int sb_bits; int b_coded_count; int b_count; - int sb_partial; - int sb_full=sb_full; - bits=_fr->bits; - /*Extend the b_coded run, or start a new one.*/ + int sb_prefer_partial; + sb_bits=_fr->sb_bits; + bits=_fr->bits-sb_bits; + b_count=_fr->b_count; b_coded_count=_fr->b_coded_count; - if(_fr->b_coded==_b_coded)bits-=oc_block_run_bits(b_coded_count); - else b_coded_count=0; - b_coded_count++; - b_count=_fr->b_count+1; - if(b_count>=16){ - /*We finished a superblock.*/ - if(b_coded_count>=16){ - /*It was fully coded/uncoded; roll back the partial block flags.*/ - if(b_coded_count>16)bits+=oc_block_run_bits(b_coded_count-16); - sb_partial=0; - sb_full=_b_coded; - _b_coded=_fr->b_coded_prev; - b_coded_count=_fr->b_coded_count_prev; + sb_prefer_partial=_fr->sb_prefer_partial; + if(b_coded_count>=b_count){ + int sb_partial_bits; + /*This super block is currently fully coded/uncoded.*/ + if(b_count<=0){ + /*This is the first block in this SB.*/ + b_count=1; + /*Check to see whether it's cheaper to code it partially or fully.*/ + if(_fr->b_coded==_b_coded){ + sb_partial_bits=-oc_block_run_bits(b_coded_count); + sb_partial_bits+=oc_block_run_bits(++b_coded_count); + } + else{ + b_coded_count=1; + sb_partial_bits=2; + } + sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded); + sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded); + sb_prefer_partial=sb_partial_bits<sb_bits; + sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial; + } + else if(_fr->b_coded==_b_coded){ + b_coded_count++; + if(++b_count<16){ + if(sb_prefer_partial){ + /*Check to see if it's cheaper to code it fully.*/ + sb_partial_bits=sb_bits; + sb_partial_bits+=oc_block_run_bits(b_coded_count); + if(b_coded_count>0){ + sb_partial_bits-=oc_block_run_bits(b_coded_count-1); + } + sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded); + sb_prefer_partial=sb_partial_bits<sb_bits; + sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial; + } + /*There's no need to check the converse (whether it's cheaper to code + this SB partially if we were coding it fully), since the cost to + code a SB partially can only increase as we add more blocks, whereas + the cost to code it fully stays constant.*/ + } + else{ + /*If we get to the end and this SB is still full, then force it to be + coded full. 
+ Otherwise we might not be able to extend the block run far enough + into the next partial SB.*/ + if(sb_prefer_partial){ + sb_prefer_partial=0; + sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded); + } + } } else{ - bits+=oc_block_run_bits(b_coded_count); - /*It was partially coded.*/ - sb_partial=1; - /*sb_full is unused.*/ + /*This SB was full, but now must be made partial.*/ + if(!sb_prefer_partial){ + sb_bits=oc_block_run_bits(b_coded_count); + if(b_coded_count>b_count){ + sb_bits-=oc_block_run_bits(b_coded_count-b_count); + } + sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded); + } + b_count++; + b_coded_count=1; + sb_prefer_partial=1; + sb_bits+=2; } - _fr->bits=bits; - _fr->b_coded_count=b_coded_count; - _fr->b_coded_count_prev=b_coded_count; - _fr->b_count=0; - _fr->b_coded=_b_coded; - _fr->b_coded_prev=_b_coded; - oc_fr_state_advance_sb(_fr,sb_partial,sb_full); } else{ - bits+=oc_block_run_bits(b_coded_count); - _fr->bits=bits; - _fr->b_coded_count=b_coded_count; - _fr->b_count=b_count; - _fr->b_coded=_b_coded; + b_count++; + if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count); + else b_coded_count=0; + sb_bits+=oc_block_run_bits(++b_coded_count); } + _fr->bits=bits+sb_bits; + _fr->b_coded_count=b_coded_count; + _fr->b_coded=_b_coded; + _fr->b_count=b_count; + _fr->sb_prefer_partial=sb_prefer_partial; + _fr->sb_bits=sb_bits; } static void oc_fr_skip_block(oc_fr_state *_fr){ @@ -395,16 +452,6 @@ static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){ -struct oc_qii_state{ - ptrdiff_t bits; - unsigned qi01_count:14; - signed int qi01:2; - unsigned qi12_count:14; - signed int qi12:2; -}; - - - static void oc_qii_state_init(oc_qii_state *_qs){ _qs->bits=0; _qs->qi01_count=0; @@ -458,49 +505,17 @@ static void oc_qii_state_advance(oc_qii_state *_qd, -/*Temporary encoder state for the analysis pipeline.*/ -struct oc_enc_pipeline_state{ - int bounding_values[256]; - oc_fr_state fr[3]; - oc_qii_state qs[3]; - /*Condensed dequantization tables.*/ - const ogg_uint16_t *dequant[3][3][2]; - /*Condensed quantization tables.*/ - const oc_iquant *enquant[3][3][2]; - /*Skip SSD storage for the current MCU in each plane.*/ - unsigned *skip_ssd[3]; - /*Coded/uncoded fragment lists for each plane for the current MCU.*/ - ptrdiff_t *coded_fragis[3]; - ptrdiff_t *uncoded_fragis[3]; - ptrdiff_t ncoded_fragis[3]; - ptrdiff_t nuncoded_fragis[3]; - /*The starting fragment for the current MCU in each plane.*/ - ptrdiff_t froffset[3]; - /*The starting row for the current MCU in each plane.*/ - int fragy0[3]; - /*The ending row for the current MCU in each plane.*/ - int fragy_end[3]; - /*The starting superblock for the current MCU in each plane.*/ - unsigned sbi0[3]; - /*The ending superblock for the current MCU in each plane.*/ - unsigned sbi_end[3]; - /*The number of tokens for zzi=1 for each color plane.*/ - int ndct_tokens1[3]; - /*The outstanding eob_run count for zzi=1 for each color plane.*/ - int eob_run1[3]; - /*Whether or not the loop filter is enabled.*/ - int loop_filter; -}; - - static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){ ptrdiff_t *coded_fragis; unsigned mcu_nvsbs; ptrdiff_t mcu_nfrags; + int flimit; int hdec; int vdec; int pli; + int nqis; int qii; + int qi0; int qti; /*Initialize the per-plane coded block flag trackers. 
These are used for bit-estimation purposes only; the real flag bits span @@ -529,24 +544,36 @@ static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){ memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis)); memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis)); /*Set up condensed quantizer tables.*/ + qi0=_enc->state.qis[0]; + nqis=_enc->state.nqis; for(pli=0;pli<3;pli++){ - for(qii=0;qii<_enc->state.nqis;qii++){ + for(qii=0;qii<nqis;qii++){ int qi; qi=_enc->state.qis[qii]; for(qti=0;qti<2;qti++){ - _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti]; - _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti]; + /*Set the DC coefficient in the dequantization table.*/ + _enc->state.dequant_tables[qi][pli][qti][0]= + _enc->dequant_dc[qi0][pli][qti]; + _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti]; + /*Copy over the quantization table.*/ + memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti], + _enc->opt_data.enquant_table_size); } } } + /*Fix up the DC coefficients in the quantization tables.*/ + oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis); /*Initialize the tokenization state.*/ for(pli=0;pli<3;pli++){ _pipe->ndct_tokens1[pli]=0; _pipe->eob_run1[pli]=0; } /*Initialize the bounding value array for the loop filter.*/ - _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state, - _pipe->bounding_values); + flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]]; + _pipe->loop_filter=flimit!=0; + if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit); + /*Clear the temporary DCT scratch space.*/ + memset(_pipe->dct_data,0,sizeof(_pipe->dct_data)); } /*Sets the current MCU stripe to super block row _sby. @@ -585,13 +612,17 @@ static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc, static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc, oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){ - int refi; /*Copy over all the uncoded fragments from this plane and advance the uncoded fragment list.*/ - _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli]; - oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli], - _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli); - _pipe->nuncoded_fragis[_pli]=0; + if(_pipe->nuncoded_fragis[_pli]>0){ + _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli]; + oc_frag_copy_list(&_enc->state, + _enc->state.ref_frame_data[OC_FRAME_SELF], + _enc->state.ref_frame_data[OC_FRAME_PREV], + _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli], + _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs); + _pipe->nuncoded_fragis[_pli]=0; + } /*Perform DC prediction.*/ oc_enc_pred_dc_frag_rows(_enc,_pli, _pipe->fragy0[_pli],_pipe->fragy_end[_pli]); @@ -606,17 +637,18 @@ static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc, _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli]; _pipe->ncoded_fragis[_pli]=0; /*Apply the loop filter if necessary.*/ - refi=_enc->state.ref_frame_idx[OC_FRAME_SELF]; if(_pipe->loop_filter){ - oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values, - refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay); + oc_state_loop_filter_frag_rows(&_enc->state, + _pipe->bounding_values,OC_FRAME_SELF,_pli, + _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay); } else _sdelay=_edelay=0; /*To fill borders, we have an additional two pixel delay, since a fragment in the next row could filter its top edge, using two pixels from a fragment 
in this row. But there's no reason to delay a full fragment between the two.*/ - oc_state_borders_fill_rows(&_enc->state,refi,_pli, + oc_state_borders_fill_rows(&_enc->state, + _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli, (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1), (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1)); } @@ -634,62 +666,62 @@ struct oc_rd_metric{ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc, - oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits, - oc_rd_metric *_mo,oc_token_checkpoint **_stack){ - OC_ALIGN16(ogg_int16_t dct[64]); - OC_ALIGN16(ogg_int16_t data[64]); - ogg_uint16_t dc_dequant; + oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi, + unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo, + oc_fr_state *_fr,oc_token_checkpoint **_stack){ + ogg_int16_t *data; + ogg_int16_t *dct; + ogg_int16_t *idct; + oc_qii_state qs; const ogg_uint16_t *dequant; - const oc_iquant *enquant; + ogg_uint16_t dequant_dc; ptrdiff_t frag_offs; int ystride; const unsigned char *src; const unsigned char *ref; unsigned char *dst; - int frame_type; int nonzero; unsigned uncoded_ssd; unsigned coded_ssd; - int coded_dc; oc_token_checkpoint *checkpoint; oc_fragment *frags; int mb_mode; + int refi; int mv_offs[2]; int nmv_offs; int ac_bits; int borderi; + int nqis; int qti; int qii; - int pi; - int zzi; - int v; - int val; - int d; - int s; int dc; + nqis=_enc->state.nqis; frags=_enc->state.frags; frag_offs=_enc->state.frag_buf_offs[_fragi]; ystride=_enc->state.ref_ystride[_pli]; src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs; borderi=frags[_fragi].borderi; qii=frags[_fragi].qii; + data=_enc->pipe.dct_data; + dct=data+64; + idct=data+128; if(qii&~3){ #if !defined(OC_COLLECT_METRICS) if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){ /*Enable early skip detection.*/ frags[_fragi].coded=0; + frags[_fragi].refi=OC_FRAME_NONE; + oc_fr_skip_block(_fr); return 0; } #endif /*Try and code this block anyway.*/ qii&=3; - frags[_fragi].qii=qii; } + refi=frags[_fragi].refi; mb_mode=frags[_fragi].mb_mode; - ref=_enc->state.ref_frame_data[ - _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs; - dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]] - +frag_offs; + ref=_enc->state.ref_frame_data[refi]+frag_offs; + dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs; /*Motion compensation:*/ switch(mb_mode){ case OC_MODE_INTRA:{ @@ -704,9 +736,9 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc, }break; default:{ const oc_mv *frag_mvs; - frag_mvs=(const oc_mv *)_enc->state.frag_mvs; - nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_pli, - frag_mvs[_fragi][0],frag_mvs[_fragi][1]); + frag_mvs=_enc->state.frag_mvs; + nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs, + _pli,frag_mvs[_fragi]); if(nmv_offs>1){ oc_enc_frag_copy2(_enc,dst, ref+mv_offs[0],ref+mv_offs[1],ystride); @@ -717,126 +749,121 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc, } #if defined(OC_COLLECT_METRICS) { + unsigned sad; unsigned satd; switch(nmv_offs){ - case 0:satd=oc_enc_frag_intra_satd(_enc,src,ystride);break; + case 0:{ + sad=oc_enc_frag_intra_sad(_enc,src,ystride); + satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride); + }break; case 1:{ - satd=oc_enc_frag_satd_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX); + sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX); + satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride); + satd+=abs(dc); }break; default:{ - 
satd=oc_enc_frag_satd_thresh(_enc,src,dst,ystride,UINT_MAX); - } + sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX); + satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride); + satd+=abs(dc); + }break; } + _enc->frag_sad[_fragi]=sad; _enc->frag_satd[_fragi]=satd; } #endif /*Transform:*/ oc_enc_fdct8x8(_enc,dct,data); - /*Quantize the DC coefficient:*/ + /*Quantize:*/ qti=mb_mode!=OC_MODE_INTRA; - enquant=_pipe->enquant[_pli][0][qti]; - dc_dequant=_pipe->dequant[_pli][0][qti][0]; - v=dct[0]; - val=v<<1; - s=OC_SIGNMASK(val); - val+=dc_dequant+s^s; - val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s; - dc=OC_CLAMPI(-580,val,580); - nonzero=0; - /*Quantize the AC coefficients:*/ - dequant=_pipe->dequant[_pli][qii][qti]; - enquant=_pipe->enquant[_pli][qii][qti]; - for(zzi=1;zzi<64;zzi++){ - v=dct[OC_FZIG_ZAG[zzi]]; - d=dequant[zzi]; - val=v<<1; - v=abs(val); - if(v>=d){ - s=OC_SIGNMASK(val); - /*The bias added here rounds ties away from zero, since token - optimization can only decrease the magnitude of the quantized - value.*/ - val+=d+s^s; - /*Note the arithmetic right shift is not guaranteed by ANSI C. - Hopefully no one still uses ones-complement architectures.*/ - val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s; - data[zzi]=OC_CLAMPI(-580,val,580); - nonzero=zzi; - } - else data[zzi]=0; - } + dequant=_enc->dequant[_pli][qii][qti]; + nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]); + dc=data[0]; /*Tokenize.*/ checkpoint=*_stack; - ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1, - _stack,qti?0:3); + if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ + ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct, + nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3); + } + else{ + ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct, + nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3); + } /*Reconstruct. 
TODO: nonzero may need to be adjusted after tokenization.*/ + dequant_dc=dequant[0]; if(nonzero==0){ ogg_int16_t p; int ci; + int qi01; + int qi12; /*We round this dequant product (and not any of the others) because there's no iDCT rounding.*/ - p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5); + p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5); /*LOOP VECTORIZES.*/ for(ci=0;ci<64;ci++)data[ci]=p; + /*We didn't code any AC coefficients, so don't change the quantizer.*/ + qi01=_pipe->qs[_pli].qi01; + qi12=_pipe->qs[_pli].qi12; + if(qi01>0)qii=1+qi12; + else if(qi01>=0)qii=0; } else{ - data[0]=dc*dc_dequant; - oc_idct8x8(&_enc->state,data,nonzero+1); + idct[0]=dc*dequant_dc; + /*Note: This clears idct[] back to zero for the next block.*/ + oc_idct8x8(&_enc->state,data,idct,nonzero+1); + } + frags[_fragi].qii=qii; + if(nqis>1){ + oc_qii_state_advance(&qs,_pipe->qs+_pli,qii); + ac_bits+=qs.bits-_pipe->qs[_pli].bits; } if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data); else{ oc_enc_frag_recon_inter(_enc,dst, nmv_offs==1?ref+mv_offs[0]:dst,ystride,data); } - frame_type=_enc->state.frame_type; + /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/ #if !defined(OC_COLLECT_METRICS) - if(frame_type!=OC_INTRA_FRAME) + if(_fr!=NULL) #endif { /*In retrospect, should we have skipped this block?*/ - oc_enc_frag_sub(_enc,data,src,dst,ystride); - coded_ssd=coded_dc=0; if(borderi<0){ - for(pi=0;pi<64;pi++){ - coded_ssd+=data[pi]*data[pi]; - coded_dc+=data[pi]; - } + coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride); } else{ - ogg_int64_t mask; - mask=_enc->state.borders[borderi].mask; - for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){ - coded_ssd+=data[pi]*data[pi]; - coded_dc+=data[pi]; - } + coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride, + _enc->state.borders[borderi].mask); } /*Scale to match DCT domain.*/ coded_ssd<<=4; - /*We actually only want the AC contribution to the SSD.*/ - coded_ssd-=coded_dc*coded_dc>>2; #if defined(OC_COLLECT_METRICS) _enc->frag_ssd[_fragi]=coded_ssd; } - if(frame_type!=OC_INTRA_FRAME){ + if(_fr!=NULL){ #endif + coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale); uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]]; - if(uncoded_ssd<UINT_MAX){ + if(uncoded_ssd<UINT_MAX&& + /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility + is enabled.*/ + (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){ + int overhead_bits; + overhead_bits=oc_fr_cost1(_fr); /*Although the fragment coding overhead determination is accurate, it is greedy, using very coarse-grained local information. Allowing it to mildly discourage coding turns out to be beneficial, but it's not clear that allowing it to encourage coding through negative coding overhead deltas is useful. 
- For that reason, we disallow negative coding_overheads.*/ - if(_overhead_bits<0)_overhead_bits=0; - if(uncoded_ssd<=coded_ssd+(_overhead_bits+ac_bits)*_enc->lambda&& - /*Don't allow luma blocks to be skipped in 4MV mode when VP3 - compatibility is enabled.*/ - (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){ + For that reason, we disallow negative coding overheads.*/ + if(overhead_bits<0)overhead_bits=0; + if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){ /*Hm, not worth it; roll back.*/ oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint); *_stack=checkpoint; frags[_fragi].coded=0; + frags[_fragi].refi=OC_FRAME_NONE; + oc_fr_skip_block(_fr); return 0; } } @@ -844,15 +871,20 @@ static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc, _mo->uncoded_ac_ssd+=uncoded_ssd; _mo->coded_ac_ssd+=coded_ssd; _mo->ac_bits+=ac_bits; + oc_fr_code_block(_fr); } - oc_qii_state_advance(_pipe->qs+_pli,_pipe->qs+_pli,qii); + /*GCC 4.4.4 generates a warning here because it can't tell that + the init code in the nqis check above will run anytime this + line runs.*/ + if(nqis>1)*(_pipe->qs+_pli)=*&qs; frags[_fragi].dc=dc; frags[_fragi].coded=1; return 1; } -static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc, - oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead){ +static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc, + oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead, + const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){ /*Worst case token stack usage for 4 fragments.*/ oc_token_checkpoint stack[64*4]; oc_token_checkpoint *stackptr; @@ -867,6 +899,7 @@ static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc, oc_fr_state fr_checkpoint; oc_qii_state qs_checkpoint; int mb_mode; + int refi; int ncoded; ptrdiff_t fragi; int bi; @@ -880,78 +913,83 @@ static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc, uncoded_fragis=_pipe->uncoded_fragis[0]; nuncoded_fragis=_pipe->nuncoded_fragis[0]; mb_mode=mb_modes[_mbi]; + refi=OC_FRAME_FOR_MODE(mb_mode); ncoded=0; stackptr=stack; memset(&mo,0,sizeof(mo)); for(bi=0;bi<4;bi++){ fragi=sb_maps[_mbi>>2][_mbi&3][bi]; + frags[fragi].refi=refi; frags[fragi].mb_mode=mb_mode; - if(oc_enc_block_transform_quantize(_enc, - _pipe,0,fragi,oc_fr_cost1(_pipe->fr+0),&mo,&stackptr)){ - oc_fr_code_block(_pipe->fr+0); + if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi, + _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){ coded_fragis[ncoded_fragis++]=fragi; ncoded++; } - else{ - *(uncoded_fragis-++nuncoded_fragis)=fragi; - oc_fr_skip_block(_pipe->fr+0); - } + else *(uncoded_fragis-++nuncoded_fragis)=fragi; } - if(_enc->state.frame_type!=OC_INTRA_FRAME){ - if(ncoded>0&&!mo.dc_flag){ - int cost; - /*Some individual blocks were worth coding. - See if that's still true when accounting for mode and MV overhead.*/ - cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits - +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead); - if(mo.uncoded_ac_ssd<=cost){ - /*Taking macroblock overhead into account, it is not worth coding this - MB.*/ - oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack); - *(_pipe->fr+0)=*&fr_checkpoint; - *(_pipe->qs+0)=*&qs_checkpoint; - for(bi=0;bi<4;bi++){ - fragi=sb_maps[_mbi>>2][_mbi&3][bi]; - if(frags[fragi].coded){ - *(uncoded_fragis-++nuncoded_fragis)=fragi; - frags[fragi].coded=0; - } - oc_fr_skip_block(_pipe->fr+0); + if(ncoded>0&&!mo.dc_flag){ + int cost; + /*Some individual blocks were worth coding. 
+ See if that's still true when accounting for mode and MV overhead.*/ + cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits + +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead); + if(mo.uncoded_ac_ssd<=cost){ + /*Taking macroblock overhead into account, it is not worth coding this + MB.*/ + oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack); + *(_pipe->fr+0)=*&fr_checkpoint; + *(_pipe->qs+0)=*&qs_checkpoint; + for(bi=0;bi<4;bi++){ + fragi=sb_maps[_mbi>>2][_mbi&3][bi]; + if(frags[fragi].coded){ + *(uncoded_fragis-++nuncoded_fragis)=fragi; + frags[fragi].coded=0; + frags[fragi].refi=OC_FRAME_NONE; } - ncoded_fragis-=ncoded; - ncoded=0; + oc_fr_skip_block(_pipe->fr+0); } - } - /*If no luma blocks coded, the mode is forced.*/ - if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV; - /*Assume that a 1MV with a single coded block is always cheaper than a 4MV - with a single coded block. - This may not be strictly true: a 4MV computes chroma MVs using (0,0) for - skipped blocks, while a 1MV does not.*/ - else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){ - mb_modes[_mbi]=OC_MODE_INTER_MV; + ncoded_fragis-=ncoded; + ncoded=0; } } + /*If no luma blocks coded, the mode is forced.*/ + if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV; + /*Assume that a 1MV with a single coded block is always cheaper than a 4MV + with a single coded block. + This may not be strictly true: a 4MV computes chroma MVs using (0,0) for + skipped blocks, while a 1MV does not.*/ + else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){ + mb_modes[_mbi]=OC_MODE_INTER_MV; + } _pipe->ncoded_fragis[0]=ncoded_fragis; _pipe->nuncoded_fragis[0]=nuncoded_fragis; return ncoded; } -static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc, +static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc, oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){ - const oc_sb_map *sb_maps; - oc_sb_flags *sb_flags; - ptrdiff_t *coded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t *uncoded_fragis; - ptrdiff_t nuncoded_fragis; - int sbi; + const ogg_uint16_t *mcu_rd_scale; + const ogg_uint16_t *mcu_rd_iscale; + const oc_sb_map *sb_maps; + oc_sb_flags *sb_flags; + oc_fr_state *fr; + ptrdiff_t *coded_fragis; + ptrdiff_t ncoded_fragis; + ptrdiff_t *uncoded_fragis; + ptrdiff_t nuncoded_fragis; + ptrdiff_t froffset; + int sbi; + fr=_pipe->fr+_pli; + mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale; + mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale; sb_maps=(const oc_sb_map *)_enc->state.sb_maps; sb_flags=_enc->state.sb_flags; coded_fragis=_pipe->coded_fragis[_pli]; ncoded_fragis=_pipe->ncoded_fragis[_pli]; uncoded_fragis=_pipe->uncoded_fragis[_pli]; nuncoded_fragis=_pipe->nuncoded_fragis[_pli]; + froffset=_pipe->froffset[_pli]; for(sbi=_sbi_start;sbi<_sbi_end;sbi++){ /*Worst case token stack usage for 1 fragment.*/ oc_token_checkpoint stack[64]; @@ -964,21 +1002,21 @@ static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc, fragi=sb_maps[sbi][quadi][bi]; if(fragi>=0){ oc_token_checkpoint *stackptr; + unsigned rd_scale; + unsigned rd_iscale; + rd_scale=mcu_rd_scale[fragi-froffset]; + rd_iscale=mcu_rd_iscale[fragi-froffset]; stackptr=stack; - if(oc_enc_block_transform_quantize(_enc, - _pipe,_pli,fragi,oc_fr_cost1(_pipe->fr+_pli),&mo,&stackptr)){ + if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi, + rd_scale,rd_iscale,&mo,fr,&stackptr)){ coded_fragis[ncoded_fragis++]=fragi; - oc_fr_code_block(_pipe->fr+_pli); - } - else{ - *(uncoded_fragis-++nuncoded_fragis)=fragi; - oc_fr_skip_block(_pipe->fr+_pli); } + 
else *(uncoded_fragis-++nuncoded_fragis)=fragi; } } - oc_fr_state_flush_sb(_pipe->fr+_pli); - sb_flags[sbi].coded_fully=_pipe->fr[_pli].sb_full; - sb_flags[sbi].coded_partially=_pipe->fr[_pli].sb_partial; + oc_fr_state_flush_sb(fr); + sb_flags[sbi].coded_fully=fr->sb_full; + sb_flags[sbi].coded_partially=fr->sb_partial; } _pipe->ncoded_fragis[_pli]=ncoded_fragis; _pipe->nuncoded_fragis[_pli]=nuncoded_fragis; @@ -1006,8 +1044,8 @@ static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc, The bit counts and SSD measurements are obtained by examining actual encoded frames, with appropriate lambda values and optimal Huffman codes selected. EOB bits are assigned to the fragment that started the EOB run (as opposed to - dividing them among all the blocks in the run; though the latter approach - seems more theoretically correct, Monty's testing showed a small improvement + dividing them among all the blocks in the run; the latter approach seems + more theoretically correct, but Monty's testing showed a small improvement with the former, though that may have been merely statistical noise). @ARTICLE{Kim03, @@ -1028,11 +1066,63 @@ static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc, +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \ +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE) +static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){ +#if !defined(OC_COLLECT_METRICS) + const +#endif + oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]= + _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD; + int qii; +#if defined(OC_COLLECT_METRICS) + oc_enc_mode_metrics_load(_enc); +#endif + for(qii=0;qii<_enc->state.nqis;qii++){ + int qi; + int pli; + qi=_enc->state.qis[qii]; + for(pli=0;pli<3;pli++){ + int qti; + for(qti=0;qti<2;qti++){ + int log_plq; + int modeline; + int bin; + int dx; + int dq; + log_plq=_enc->log_plq[qi][pli][qti]; + /*Find the pair of rows in the mode table that bracket this quantizer. 
+ If it falls outside the range the table covers, then we just use a + pair on the edge for linear extrapolation.*/ + for(modeline=0;modeline<OC_LOGQ_BINS-1&& + OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++); + /*Interpolate a row for this quantizer.*/ + dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq; + dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti]; + if(dq==0)dq=1; + for(bin=0;bin<OC_COMP_BINS;bin++){ + int y0; + int z0; + int dy; + int dz; + y0=oc_mode_rd_table[modeline][pli][qti][bin].rate; + z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse; + dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0; + dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0; + _enc->mode_rd[qii][pli][qti][bin].rate= + (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767); + _enc->mode_rd[qii][pli][qti][bin].rmse= + (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767); + } + } + } + } +} + /*Estimate the R-D cost of the DCT coefficients given the SATD of a block after prediction.*/ -static unsigned oc_dct_cost2(unsigned *_ssd, - int _qi,int _pli,int _qti,int _satd){ +static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd, + int _qii,int _pli,int _qti,int _satd){ unsigned rmse; + int shift; int bin; int dx; int y0; @@ -1042,20 +1132,279 @@ static unsigned oc_dct_cost2(unsigned *_ssd, /*SATD metrics for chroma planes vary much less than luma, so we scale them by 4 to distribute them into the mode decision bins more evenly.*/ _satd<<=_pli+1&2; - bin=OC_MINI(_satd>>OC_SAD_SHIFT,OC_SAD_BINS-2); - dx=_satd-(bin<<OC_SAD_SHIFT); - y0=OC_MODE_RD[_qi][_pli][_qti][bin].rate; - z0=OC_MODE_RD[_qi][_pli][_qti][bin].rmse; - dy=OC_MODE_RD[_qi][_pli][_qti][bin+1].rate-y0; - dz=OC_MODE_RD[_qi][_pli][_qti][bin+1].rmse-z0; - rmse=OC_MAXI(z0+(dz*dx>>OC_SAD_SHIFT),0); + shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT; + bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2); + dx=_satd-(bin<<shift); + y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate; + z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse; + dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0; + dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0; + rmse=OC_MAXI(z0+(dz*dx>>shift),0); *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE; - return OC_MAXI(y0+(dy*dx>>OC_SAD_SHIFT),0); + return OC_MAXI(y0+(dy*dx>>shift),0); +} + +/*activity_avg must be positive, or flat regions could get a zero weight, which + confounds analysis. 
+ We set the minimum to this value so that it also avoids the need for divide + by zero checks in oc_mb_masking().*/ +# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS) + +static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi, + unsigned _activity[4]){ + const unsigned char *src; + const ptrdiff_t *frag_buf_offs; + const ptrdiff_t *sb_map; + unsigned luma; + int ystride; + ptrdiff_t frag_offs; + ptrdiff_t fragi; + int bi; + frag_buf_offs=_enc->state.frag_buf_offs; + sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; + src=_enc->state.ref_frame_data[OC_FRAME_IO]; + ystride=_enc->state.ref_ystride[0]; + luma=0; + for(bi=0;bi<4;bi++){ + const unsigned char *s; + unsigned x; + unsigned x2; + unsigned act; + int i; + int j; + fragi=sb_map[bi]; + frag_offs=frag_buf_offs[fragi]; + /*TODO: This could be replaced with SATD^2, since we already have to + compute SATD.*/ + x=x2=0; + s=src+frag_offs; + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + unsigned c; + c=s[j]; + x+=c; + x2+=c*c; + } + s+=ystride; + } + luma+=x; + act=(x2<<6)-x*x; + if(act<8<<12){ + /*The region is flat.*/ + act=OC_MINI(act,5<<12); + } + else{ + unsigned e1; + unsigned e2; + unsigned e3; + unsigned e4; + /*Test for an edge. + TODO: There are probably much simpler ways to do this (e.g., it could + probably be combined with the SATD calculation). + Alternatively, we could split the block around the mean and compute the + reduction in variance in each half. + For a Gaussian source the reduction should be + (1-2/pi) ~= 0.36338022763241865692446494650994. + Significantly more reduction is a good indication of a bi-level image. + This has the advantage of identifying, in addition to straight edges, + small text regions, which would otherwise be classified as "texture".*/ + e1=e2=e3=e4=0; + s=src+frag_offs-1; + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j] + +(s+ystride)[j+2]-(s+ystride)[j]); + e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1) + +(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]); + e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1) + +(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]); + e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1) + +(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]); + } + s+=ystride; + } + /*If the largest component of the edge energy is at least 40% of the + total, then classify the block as an edge block.*/ + if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){ + /*act=act_th*(act/act_th)**0.7 + =exp(log(act_th)+0.7*(log(act)-log(act_th))). + Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/ + act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10)); + } + } + _activity[bi]=act; + } + return luma; +} + +static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi, + unsigned _activity[4],const unsigned _intra_satd[12]){ + int bi; + for(bi=0;bi<4;bi++){ + unsigned act; + act=(11*_intra_satd[bi]>>8)*_intra_satd[bi]; + if(act<8<<12){ + /*The region is flat.*/ + act=OC_MINI(act,5<<12); + } + _activity[bi]=act; + } +} + +/*Compute the masking scales for the blocks in a macro block. + All masking is computed from the luma blocks. + We derive scaling factors for the chroma blocks from these, and use the same + ones for all chroma blocks, regardless of the subsampling. + It's possible for luma to be perfectly flat and yet have high chroma energy, + but this is unlikely in non-artificial images, and not a case that has been + addressed by any research to my knowledge. 
+ The output of the masking process is two scale factors, which are fed into + the various R-D optimizations. + The first, rd_scale, is applied to D in the equation + D*rd_scale+lambda*R. + This is the form that must be used to properly combine scores from multiple + blocks, and can be interpreted as scaling distortions by their visibility. + The inverse, rd_iscale, is applied to lambda in the equation + D+rd_iscale*lambda*R. + This is equivalent to the first form within a single block, but much faster + to use when evaluating many possible distortions (e.g., during actual + quantization, where separate distortions are evaluated for every + coefficient). + The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are + used to perform the multiplications with the proper re-scaling for the range + of the scaling factors. + Many researchers apply masking values directly to the quantizers used, and + not to the R-D cost. + Since we generally use MSE for D, rd_scale must use the square of their + values to generate an equivalent effect.*/ +static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5], + const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4], + unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){ + unsigned activity_sum; + unsigned la; + unsigned lb; + unsigned d; + int bi; + int bi_min; + int bi_min2; + /*The ratio lb/la is meant to approximate + ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the + effective luminance masking from~\cite{LKW06} (including the self-masking + deflator). + The following actually turns out to be a pretty good approximation for + _luma>75 or so. + For smaller values luminance does not really follow Weber's Law anyway, and + this approximation gives a much less aggressive bitrate boost in this + region. + Though some researchers claim that contrast sensitivity actually decreases + for very low luminance values, in my experience excessive brightness on + LCDs or buggy color conversions (e.g., treating Y' as full-range instead + of the CCIR 601 range) make artifacts in such regions extremely visible. + We substitute _luma_avg for 128 to allow the strength of the masking to + vary with the actual average image luminance, within certain limits (the + caller has clamped _luma_avg to the range [90,160], inclusive). + @ARTICLE{LKW06, + author="Zhen Liu and Lina J. Karam and Andrew B. Watson", + title="{JPEG2000} Encoding With Perceptual Distortion Control", + journal="{IEEE} Transactions on Image Processing", + volume=15, + number=7, + pages="1763--1778", + month=Jul, + year=2006 + }*/ +#if 0 + la=_luma+4*_luma_avg; + lb=4*_luma+_luma_avg; +#else + /*Disable luminance masking.*/ + la=lb=1; +#endif + activity_sum=0; + for(bi=0;bi<4;bi++){ + unsigned a; + unsigned b; + activity_sum+=_activity[bi]; + /*Apply activity masking.*/ + a=_activity[bi]+4*_activity_avg; + b=4*_activity[bi]+_activity_avg; + d=OC_RD_SCALE(b,1); + /*And luminance masking.*/ + d=(a+(d>>1))/d; + _rd_scale[bi]=(d*la+(lb>>1))/lb; + /*And now the inverse.*/ + d=OC_MAXI(OC_RD_ISCALE(a,1),1); + d=(b+(d>>1))/d; + _rd_iscale[bi]=(d*lb+(la>>1))/la; + } + /*Now compute scaling factors for chroma blocks. 
+ We start by finding the two smallest iscales from the luma blocks.*/ + bi_min=_rd_iscale[1]<_rd_iscale[0]; + bi_min2=1-bi_min; + for(bi=2;bi<4;bi++){ + if(_rd_iscale[bi]<_rd_iscale[bi_min]){ + bi_min2=bi_min; + bi_min=bi; + } + else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi; + } + /*If the minimum iscale is less than 1.0, use the second smallest instead, + and force the value to at least 1.0 (inflating chroma is a waste).*/ + if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2; + d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS); + _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]); + d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS); + _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]); + return activity_sum; +} + +static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi, + unsigned _frag_satd[12]){ + const unsigned char *src; + const ptrdiff_t *frag_buf_offs; + const ptrdiff_t *sb_map; + const oc_mb_map_plane *mb_map; + const unsigned char *map_idxs; + int map_nidxs; + int mapii; + int mapi; + int ystride; + int pli; + int bi; + ptrdiff_t fragi; + ptrdiff_t frag_offs; + unsigned luma; + int dc; + frag_buf_offs=_enc->state.frag_buf_offs; + sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; + src=_enc->state.ref_frame_data[OC_FRAME_IO]; + ystride=_enc->state.ref_ystride[0]; + luma=0; + for(bi=0;bi<4;bi++){ + fragi=sb_map[bi]; + frag_offs=frag_buf_offs[fragi]; + _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); + luma+=dc; + } + mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; + map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; + map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; + /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/ + ystride=_enc->state.ref_ystride[1]; + for(mapii=4;mapii<map_nidxs;mapii++){ + mapi=map_idxs[mapii]; + pli=mapi>>2; + bi=mapi&3; + fragi=mb_map[pli][bi]; + frag_offs=frag_buf_offs[fragi]; + _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); + } + return luma; } /*Select luma block-level quantizers for a MB in an INTRA frame.*/ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc, - const oc_qii_state *_qs,unsigned _mbi){ + const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){ const unsigned char *src; const ptrdiff_t *frag_buf_offs; const oc_sb_map *sb_maps; @@ -1068,6 +1417,7 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc, unsigned rate[4][3]; int prev[3][3]; unsigned satd; + int dc; unsigned best_cost; unsigned best_ssd; unsigned best_rate; @@ -1083,19 +1433,30 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc, ystride=_enc->state.ref_ystride[0]; fragi=sb_maps[_mbi>>2][_mbi&3][0]; frag_offs=frag_buf_offs[fragi]; - satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); + } + else{ + satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride); + } nqis=_enc->state.nqis; lambda=_enc->lambda; for(qii=0;qii<nqis;qii++){ oc_qii_state_advance(qs[0]+qii,_qs,qii); - rate[0][qii]=oc_dct_cost2(ssd[0]+qii,_enc->state.qis[qii],0,0,satd) + rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd) +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE); + ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]); cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda); } for(bi=1;bi<4;bi++){ fragi=sb_maps[_mbi>>2][_mbi&3][bi]; frag_offs=frag_buf_offs[fragi]; - satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + 
satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); + } + else{ + satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride); + } for(qii=0;qii<nqis;qii++){ oc_qii_state qt[3]; unsigned cur_ssd; @@ -1103,7 +1464,8 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc, int best_qij; int qij; oc_qii_state_advance(qt+0,qs[bi-1]+0,qii); - cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,0,satd); + cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd); + cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]); best_ssd=ssd[bi-1][0]+cur_ssd; best_rate=rate[bi-1][0]+cur_rate +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE); @@ -1152,13 +1514,14 @@ static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc, /*Select a block-level quantizer for a single chroma block in an INTRA frame.*/ static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc, - const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi){ + const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){ const unsigned char *src; oc_fragment *frags; ptrdiff_t frag_offs; oc_qii_state qt[3]; unsigned cost[3]; unsigned satd; + int dc; unsigned best_cost; int best_qii; int qii; @@ -1168,16 +1531,30 @@ static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc, src=_enc->state.ref_frame_data[OC_FRAME_IO]; ystride=_enc->state.ref_ystride[_pli]; frag_offs=_enc->state.frag_buf_offs[_fragi]; - satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); + } + else{ + satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride); + } + /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not + worth spending the bits to change the AC quantizer. + TODO: This may be worth revisiting when we separate out DC and AC + predictions from SATD.*/ +#if 0 nqis=_enc->state.nqis; +#else + nqis=1; +#endif lambda=_enc->lambda; best_qii=0; for(qii=0;qii<nqis;qii++){ unsigned cur_rate; unsigned cur_ssd; oc_qii_state_advance(qt+qii,_qs,qii); - cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],_pli,0,satd) + cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd) +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE); + cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale); cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda); } best_cost=cost[0]; @@ -1192,17 +1569,49 @@ static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc, return best_cost; } +static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc, + oc_enc_pipeline_state *_pipe,unsigned _mbi, + const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){ + /*Worst case token stack usage for 4 fragments.*/ + oc_token_checkpoint stack[64*4]; + oc_token_checkpoint *stackptr; + const oc_sb_map *sb_maps; + oc_fragment *frags; + ptrdiff_t *coded_fragis; + ptrdiff_t ncoded_fragis; + ptrdiff_t fragi; + int bi; + sb_maps=(const oc_sb_map *)_enc->state.sb_maps; + frags=_enc->state.frags; + coded_fragis=_pipe->coded_fragis[0]; + ncoded_fragis=_pipe->ncoded_fragis[0]; + stackptr=stack; + for(bi=0;bi<4;bi++){ + fragi=sb_maps[_mbi>>2][_mbi&3][bi]; + frags[fragi].refi=OC_FRAME_SELF; + frags[fragi].mb_mode=OC_MODE_INTRA; + oc_enc_block_transform_quantize(_enc,_pipe,0,fragi, + _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr); + coded_fragis[ncoded_fragis++]=fragi; + } + _pipe->ncoded_fragis[0]=ncoded_fragis; +} + static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc, oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){ - const oc_sb_map *sb_maps; - oc_sb_flags *sb_flags; - ptrdiff_t 
*coded_fragis; - ptrdiff_t ncoded_fragis; - int sbi; + const ogg_uint16_t *mcu_rd_scale; + const ogg_uint16_t *mcu_rd_iscale; + const oc_sb_map *sb_maps; + ptrdiff_t *coded_fragis; + ptrdiff_t ncoded_fragis; + ptrdiff_t froffset; + int sbi; + mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale; + mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale; sb_maps=(const oc_sb_map *)_enc->state.sb_maps; - sb_flags=_enc->state.sb_flags; coded_fragis=_pipe->coded_fragis[_pli]; ncoded_fragis=_pipe->ncoded_fragis[_pli]; + froffset=_pipe->froffset[_pli]; for(sbi=_sbi_start;sbi<_sbi_end;sbi++){ /*Worst case token stack usage for 1 fragment.*/ oc_token_checkpoint stack[64]; @@ -1213,10 +1622,14 @@ static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc, fragi=sb_maps[sbi][quadi][bi]; if(fragi>=0){ oc_token_checkpoint *stackptr; - oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi); + unsigned rd_scale; + unsigned rd_iscale; + rd_scale=mcu_rd_scale[fragi-froffset]; + rd_iscale=mcu_rd_iscale[fragi-froffset]; + oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale); stackptr=stack; - oc_enc_block_transform_quantize(_enc, - _pipe,_pli,fragi,0,NULL,&stackptr); + oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi, + rd_scale,rd_iscale,NULL,NULL,&stackptr); coded_fragis[ncoded_fragis++]=fragi; } } @@ -1226,13 +1639,19 @@ static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc, /*Analysis stage for an INTRA frame.*/ void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){ - oc_enc_pipeline_state pipe; + ogg_int64_t activity_sum; + ogg_int64_t luma_sum; + unsigned activity_avg; + unsigned luma_avg; + const ogg_uint16_t *chroma_rd_scale; + ogg_uint16_t *mcu_rd_scale; + ogg_uint16_t *mcu_rd_iscale; const unsigned char *map_idxs; int nmap_idxs; oc_sb_flags *sb_flags; signed char *mb_modes; const oc_mb_map *mb_maps; - oc_mb_enc_info *embs; + const oc_sb_map *sb_maps; oc_fragment *frags; unsigned stripe_sby; unsigned mcu_nvsbs; @@ -1242,7 +1661,14 @@ void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){ int pli; _enc->state.frame_type=OC_INTRA_FRAME; oc_enc_tokenize_start(_enc); - oc_enc_pipeline_init(_enc,&pipe); + oc_enc_pipeline_init(_enc,&_enc->pipe); + oc_enc_mode_rd_init(_enc); + activity_sum=luma_sum=0; + activity_avg=_enc->activity_avg; + luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8); + chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]]; + mcu_rd_scale=_enc->mcu_rd_scale; + mcu_rd_iscale=_enc->mcu_rd_iscale; /*Choose MVs and MB modes and quantize and code luma. 
Must be done in Hilbert order.*/ map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; @@ -1253,52 +1679,91 @@ void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){ sb_flags=_enc->state.sb_flags; mb_modes=_enc->state.mb_modes; mb_maps=(const oc_mb_map *)_enc->state.mb_maps; - embs=_enc->mb_info; + sb_maps=(const oc_sb_map *)_enc->state.sb_maps; frags=_enc->state.frags; notstart=0; notdone=1; mcu_nvsbs=_enc->mcu_nvsbs; for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){ - unsigned sbi; - unsigned sbi_end; - notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby); - sbi_end=pipe.sbi_end[0]; - for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){ + ptrdiff_t cfroffset; + unsigned sbi; + unsigned sbi_end; + notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby); + sbi_end=_enc->pipe.sbi_end[0]; + cfroffset=_enc->pipe.froffset[1]; + for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){ int quadi; /*Mode addressing is through Y plane, always 4 MB per SB.*/ for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){ + unsigned activity[4]; + unsigned rd_scale[5]; + unsigned rd_iscale[5]; + unsigned luma; unsigned mbi; int mapii; int mapi; int bi; ptrdiff_t fragi; mbi=sbi<<2|quadi; + /*Activity masking.*/ + if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ + luma=oc_mb_activity(_enc,mbi,activity); + } + else{ + unsigned intra_satd[12]; + luma=oc_mb_intra_satd(_enc,mbi,intra_satd); + oc_mb_activity_fast(_enc,mbi,activity,intra_satd); + for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0; + } + activity_sum+=oc_mb_masking(rd_scale,rd_iscale, + chroma_rd_scale,activity,activity_avg,luma,luma_avg); + luma_sum+=luma; /*Motion estimation: - We always do a basic 1MV search for all macroblocks, coded or not, - keyframe or not.*/ - if(!_recode&&_enc->state.curframe_num>0)oc_mcenc_search(_enc,mbi); - oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi); + We do a basic 1MV search for all macroblocks, coded or not, + keyframe or not, unless we aren't using motion estimation at all.*/ + if(!_recode&&_enc->state.curframe_num>0&& + _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){ + oc_mcenc_search(_enc,mbi); + } + if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ + oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale); + } mb_modes[mbi]=OC_MODE_INTRA; - oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,0); + oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe, + mbi,rd_scale,rd_iscale); /*Propagate final MB mode and MVs to the chroma blocks.*/ for(mapii=4;mapii<nmap_idxs;mapii++){ mapi=map_idxs[mapii]; pli=mapi>>2; bi=mapi&3; fragi=mb_maps[mbi][pli][bi]; + frags[fragi].refi=OC_FRAME_SELF; frags[fragi].mb_mode=OC_MODE_INTRA; } + /*Save masking scale factors for chroma blocks.*/ + for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){ + mapi=map_idxs[mapii]; + bi=mapi&3; + fragi=mb_maps[mbi][1][bi]; + mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4]; + mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4]; + } } } - oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone); + oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone); /*Code chroma planes.*/ for(pli=1;pli<3;pli++){ - oc_enc_sb_transform_quantize_intra_chroma(_enc,&pipe, - pli,pipe.sbi0[pli],pipe.sbi_end[pli]); - oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone); + oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe, + pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]); + oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone); } notstart=1; } + /*Compute the average block 
activity and MB luma score for the frame.*/ + _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN, + (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/ + _enc->state.fplanes[0].nfrags)); + _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs); /*Finish filling in the reference frame borders.*/ refi=_enc->state.ref_frame_idx[OC_FRAME_SELF]; for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli); @@ -1339,27 +1804,21 @@ static const unsigned OC_NOSKIP[12]={ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc, oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs, - const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){ + const unsigned _frag_satd[12],const unsigned _skip_ssd[12], + const unsigned _rd_scale[4],int _qti){ oc_fr_state fr; oc_qii_state qs; unsigned ssd; unsigned rate; - int overhead; unsigned satd; unsigned best_ssd; unsigned best_rate; - int best_overhead; int best_fri; int best_qii; - unsigned cur_cost; - unsigned cur_ssd; - unsigned cur_rate; - int cur_overhead; int lambda; int nqis; int nskipped; int bi; - int qii; lambda=_enc->lambda; nqis=_enc->state.nqis; /*We could do a trellis optimization here, but we don't make final skip @@ -1370,26 +1829,36 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc, code the flags, anyway.*/ *&fr=*_fr; *&qs=*_qs; - ssd=rate=overhead=nskipped=0; + ssd=rate=nskipped=0; for(bi=0;bi<4;bi++){ oc_fr_state ft[2]; oc_qii_state qt[3]; unsigned best_cost; + unsigned cur_cost; + unsigned cur_ssd; + unsigned cur_rate; + unsigned cur_overhead; + int qii; satd=_frag_satd[bi]; *(ft+0)=*&fr; oc_fr_code_block(ft+0); - oc_qii_state_advance(qt+0,&qs,0); - best_overhead=(ft[0].bits-fr.bits<<OC_BIT_SCALE); - best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],0,_qti,satd) - +(qt[0].bits-qs.bits<<OC_BIT_SCALE); - best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate+best_overhead,lambda); + cur_overhead=ft[0].bits-fr.bits; + best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd) + +(cur_overhead<<OC_BIT_SCALE); + if(nqis>1){ + oc_qii_state_advance(qt+0,&qs,0); + best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE; + } + best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]); + best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda); best_fri=0; best_qii=0; for(qii=1;qii<nqis;qii++){ oc_qii_state_advance(qt+qii,&qs,qii); - cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd) - +(qt[qii].bits-qs.bits<<OC_BIT_SCALE); - cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate+best_overhead,lambda); + cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd) + +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE); + cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]); + cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda); if(cur_cost<best_cost){ best_cost=cur_cost; best_ssd=cur_ssd; @@ -1397,7 +1866,7 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc, best_qii=qii; } } - if(_skip_ssd[bi]<UINT_MAX&&nskipped<3){ + if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){ *(ft+1)=*&fr; oc_fr_skip_block(ft+1); cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE; @@ -1405,15 +1874,13 @@ static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc, cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda); if(cur_cost<=best_cost){ best_ssd=cur_ssd; - best_rate=0; - best_overhead=cur_overhead; + best_rate=cur_overhead; best_fri=1; best_qii+=4; } } rate+=best_rate; ssd+=best_ssd; - overhead+=best_overhead; *&fr=*(ft+best_fri); if(best_fri==0)*&qs=*(qt+best_qii); else nskipped++; @@ -1421,12 +1888,12 @@ static void 
oc_analyze_mb_mode_luma(oc_enc_ctx *_enc, } _modec->ssd=ssd; _modec->rate=rate; - _modec->overhead=OC_MAXI(overhead,0); } static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc, oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs, - const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){ + const unsigned _frag_satd[12],const unsigned _skip_ssd[12], + unsigned _rd_scale,int _qti){ unsigned ssd; unsigned rate; unsigned satd; @@ -1443,7 +1910,15 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc, int bi; int qii; lambda=_enc->lambda; + /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not + worth spending the bits to change the AC quantizer. + TODO: This may be worth revisiting when we separate out DC and AC + predictions from SATD.*/ +#if 0 nqis=_enc->state.nqis; +#else + nqis=1; +#endif ssd=_modec->ssd; rate=_modec->rate; /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded @@ -1455,13 +1930,15 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc, for(;bi<nblocks;bi++){ unsigned best_cost; satd=_frag_satd[bi]; - best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],pli,_qti,satd) + best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd) +OC_CHROMA_QII_RATE; + best_ssd=OC_RD_SCALE(best_ssd,_rd_scale); best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda); best_qii=0; for(qii=1;qii<nqis;qii++){ - cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd) + cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd) +OC_CHROMA_QII_RATE; + cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale); cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda); if(cur_cost<best_cost){ best_cost=cur_cost; @@ -1470,7 +1947,7 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc, best_qii=qii; } } - if(_skip_ssd[bi]<UINT_MAX){ + if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){ cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE; cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda); if(cur_cost<=best_cost){ @@ -1490,65 +1967,50 @@ static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc, } static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe, - unsigned _mbi,unsigned _ssd[12]){ - OC_ALIGN16(ogg_int16_t buffer[64]); - const unsigned char *src; - const unsigned char *ref; - int ystride; - const oc_fragment *frags; - const ptrdiff_t *frag_buf_offs; - const ptrdiff_t *sb_map; - const oc_mb_map_plane *mb_map; - const unsigned char *map_idxs; - int map_nidxs; - ogg_int64_t mask; - unsigned uncoded_ssd; - int uncoded_dc; - unsigned dc_dequant; - int dc_flag; - int mapii; - int mapi; - int pli; - int bi; - ptrdiff_t fragi; - ptrdiff_t frag_offs; - int borderi; - int pi; + unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){ + const unsigned char *src; + const unsigned char *ref; + int ystride; + const oc_fragment *frags; + const ptrdiff_t *frag_buf_offs; + const ptrdiff_t *sb_map; + const oc_mb_map_plane *mb_map; + const unsigned char *map_idxs; + oc_mv *mvs; + int map_nidxs; + unsigned uncoded_ssd; + int mapii; + int mapi; + int pli; + int bi; + ptrdiff_t fragi; + ptrdiff_t frag_offs; + int borderi; src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]]; + ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; ystride=_enc->state.ref_ystride[0]; frags=_enc->state.frags; frag_buf_offs=_enc->state.frag_buf_offs; sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; - dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][0][1][0]; + mvs=_enc->mb_info[_mbi].block_mv; 
for(bi=0;bi<4;bi++){ fragi=sb_map[bi]; - frag_offs=frag_buf_offs[fragi]; - oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride); borderi=frags[fragi].borderi; - uncoded_ssd=uncoded_dc=0; + frag_offs=frag_buf_offs[fragi]; if(borderi<0){ - for(pi=0;pi<64;pi++){ - uncoded_ssd+=buffer[pi]*buffer[pi]; - uncoded_dc+=buffer[pi]; - } + uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride); } else{ - ogg_int64_t mask; - mask=_enc->state.borders[borderi].mask; - for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){ - uncoded_ssd+=buffer[pi]*buffer[pi]; - uncoded_dc+=buffer[pi]; - } + uncoded_ssd=oc_enc_frag_border_ssd(_enc, + src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask); } - /*Scale to match DCT domain.*/ - uncoded_ssd<<=4; - /*We actually only want the AC contribution to the SSD.*/ - uncoded_ssd-=uncoded_dc*uncoded_dc>>2; - /*DC is a special case; if there's more than a full-quantizer improvement - in the effective DC component, always force-code the block.*/ - dc_flag=abs(uncoded_dc)>dc_dequant<<1; - uncoded_ssd|=-dc_flag; + /*Scale to match DCT domain and RD.*/ + uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]); + /*Motion is a special case; if there is more than a full-pixel motion + against the prior frame, penalize skipping. + TODO: The factor of two here is a kludge, but it tested out better than a + hard limit.*/ + if(mvs[bi]!=0)uncoded_ssd*=2; _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd; } mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; @@ -1556,96 +2018,52 @@ static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe, map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; map_nidxs=(map_nidxs-4>>1)+4; mapii=4; + mvs=_enc->mb_info[_mbi].unref_mv; for(pli=1;pli<3;pli++){ ystride=_enc->state.ref_ystride[pli]; - dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][pli][1][0]; for(;mapii<map_nidxs;mapii++){ mapi=map_idxs[mapii]; bi=mapi&3; fragi=mb_map[pli][bi]; - frag_offs=frag_buf_offs[fragi]; - oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride); borderi=frags[fragi].borderi; - uncoded_ssd=uncoded_dc=0; + frag_offs=frag_buf_offs[fragi]; if(borderi<0){ - for(pi=0;pi<64;pi++){ - uncoded_ssd+=buffer[pi]*buffer[pi]; - uncoded_dc+=buffer[pi]; - } + uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride); } else{ - mask=_enc->state.borders[borderi].mask; - for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){ - uncoded_ssd+=buffer[pi]*buffer[pi]; - uncoded_dc+=buffer[pi]; - } + uncoded_ssd=oc_enc_frag_border_ssd(_enc, + src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask); } - /*Scale to match DCT domain.*/ - uncoded_ssd<<=4; - /*We actually only want the AC contribution to the SSD.*/ - uncoded_ssd-=uncoded_dc*uncoded_dc>>2; - /*DC is a special case; if there's more than a full-quantizer improvement - in the effective DC component, always force-code the block.*/ - dc_flag=abs(uncoded_dc)>dc_dequant<<1; - uncoded_ssd|=-dc_flag; + /*Scale to match DCT domain and RD.*/ + uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]); + /*Motion is a special case; if there is more than a full-pixel motion + against the prior frame, penalize skipping. 
+ TODO: The factor of two here is a kludge, but it tested out better than + a hard limit*/ + if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2; _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd; } map_nidxs=(map_nidxs-4<<1)+4; } } -static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi, - unsigned _frag_satd[12]){ - const unsigned char *src; - const ptrdiff_t *frag_buf_offs; - const ptrdiff_t *sb_map; - const oc_mb_map_plane *mb_map; - const unsigned char *map_idxs; - int map_nidxs; - int mapii; - int mapi; - int ystride; - int pli; - int bi; - ptrdiff_t fragi; - ptrdiff_t frag_offs; - frag_buf_offs=_enc->state.frag_buf_offs; - sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; - src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ystride=_enc->state.ref_ystride[0]; - for(bi=0;bi<4;bi++){ - fragi=sb_map[bi]; - frag_offs=frag_buf_offs[fragi]; - _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride); - } - mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; - map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; - map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; - /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/ - ystride=_enc->state.ref_ystride[1]; - for(mapii=4;mapii<map_nidxs;mapii++){ - mapi=map_idxs[mapii]; - pli=mapi>>2; - bi=mapi&3; - fragi=mb_map[pli][bi]; - frag_offs=frag_buf_offs[fragi]; - _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride); - } -} static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec, unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs, - const unsigned _frag_satd[12],const unsigned _skip_ssd[12]){ - oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0); - oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0); - _modec->overhead+= + const unsigned _frag_satd[12],const unsigned _skip_ssd[12], + const unsigned _rd_scale[5]){ + oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0); + oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs, + _frag_satd,_skip_ssd,_rd_scale[4],0); + _modec->overhead= oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE; oc_mode_set_cost(_modec,_enc->lambda); } static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec, - unsigned _mbi,int _mb_mode,const signed char *_mv, - const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){ + unsigned _mbi,int _mb_mode,oc_mv _mv, + const oc_fr_state *_fr,const oc_qii_state *_qs, + const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){ unsigned frag_satd[12]; const unsigned char *src; const unsigned char *ref; @@ -1658,35 +2076,45 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec, int mapii; int mapi; int mv_offs[2]; - int dx; - int dy; int pli; int bi; ptrdiff_t fragi; ptrdiff_t frag_offs; + int dc; src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ref=_enc->state.ref_frame_data[ - _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]]; + ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)]; ystride=_enc->state.ref_ystride[0]; frag_buf_offs=_enc->state.frag_buf_offs; sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; - dx=_mv[0]; - dy=_mv[1]; _modec->rate=_modec->ssd=0; - if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){ + if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){ for(bi=0;bi<4;bi++){ fragi=sb_map[bi]; frag_offs=frag_buf_offs[fragi]; - frag_satd[bi]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); + 
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); + frag_satd[bi]+=abs(dc); + } + else{ + frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs, + ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); + } } } else{ for(bi=0;bi<4;bi++){ fragi=sb_map[bi]; frag_offs=frag_buf_offs[fragi]; - frag_satd[bi]=oc_enc_frag_satd_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ystride,UINT_MAX); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ystride); + frag_satd[bi]+=abs(dc); + } + else{ + frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs, + ref+frag_offs+mv_offs[0],ystride); + } } } mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; @@ -1694,15 +2122,22 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec, map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/ ystride=_enc->state.ref_ystride[1]; - if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,dx,dy)>1){ + if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){ for(mapii=4;mapii<map_nidxs;mapii++){ mapi=map_idxs[mapii]; pli=mapi>>2; bi=mapi&3; fragi=mb_map[pli][bi]; frag_offs=frag_buf_offs[fragi]; - frag_satd[mapii]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); + frag_satd[mapii]+=abs(dc); + } + else{ + frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs, + ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); + } } } else{ @@ -1712,30 +2147,38 @@ static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec, bi=mapi&3; fragi=mb_map[pli][bi]; frag_offs=frag_buf_offs[fragi]; - frag_satd[mapii]=oc_enc_frag_satd_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ystride,UINT_MAX); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ystride); + frag_satd[mapii]+=abs(dc); + } + else{ + frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs, + ref+frag_offs+mv_offs[0],ystride); + } } } - oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1); - oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1); - _modec->overhead+= + oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1); + oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs, + frag_satd,_skip_ssd,_rd_scale[4],1); + _modec->overhead= oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE; oc_mode_set_cost(_modec,_enc->lambda); } static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec, unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs, - const unsigned _skip_ssd[12]){ - static const oc_mv OC_MV_ZERO; - oc_cost_inter(_enc,_modec,_mbi,_mb_mode,OC_MV_ZERO,_fr,_qs,_skip_ssd); + const unsigned _skip_ssd[12],const unsigned _rd_scale[4]){ + oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale); } static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, - unsigned _mbi,int _mb_mode,const signed char *_mv, - const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){ + unsigned _mbi,int _mb_mode,oc_mv _mv, + const oc_fr_state *_fr,const oc_qii_state 
*_qs,const unsigned _skip_ssd[12], + const unsigned _rd_scale[4]){ int bits0; - oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd); - bits0=OC_MV_BITS[0][_mv[0]+31]+OC_MV_BITS[0][_mv[1]+31]; + oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale); + bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31]; _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12) -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE; oc_mode_set_cost(_modec,_enc->lambda); @@ -1749,7 +2192,7 @@ static const unsigned char OC_MB_PHASE[4][4]={ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs, - const unsigned _skip_ssd[12]){ + const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){ unsigned frag_satd[12]; oc_mv lbmvs[4]; oc_mv cbmvs[4]; @@ -1765,8 +2208,6 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, int mapii; int mapi; int mv_offs[2]; - int dx; - int dy; int pli; int bi; ptrdiff_t fragi; @@ -1774,8 +2215,9 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, int bits0; int bits1; unsigned satd; + int dc; src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]]; + ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; ystride=_enc->state.ref_ystride[0]; frag_buf_offs=_enc->state.frag_buf_offs; frag_mvs=_enc->state.frag_mvs; @@ -1783,41 +2225,36 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, _modec->rate=_modec->ssd=0; for(bi=0;bi<4;bi++){ fragi=mb_map[0][bi]; - dx=_mv[bi][0]; - dy=_mv[bi][1]; /*Save the block MVs as the current ones while we're here; we'll replace them if we don't ultimately choose 4MV mode.*/ - frag_mvs[fragi][0]=(signed char)dx; - frag_mvs[fragi][1]=(signed char)dy; + frag_mvs[fragi]=_mv[bi]; frag_offs=frag_buf_offs[fragi]; - if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){ - satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); + if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){ + satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); } else{ - satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ystride,UINT_MAX); + satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ystride); } - frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd; + frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc); } oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd, - _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,1); + _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1); /*Figure out which blocks are being skipped and give them (0,0) MVs.*/ bits0=0; bits1=0; nqis=_enc->state.nqis; for(bi=0;bi<4;bi++){ - if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis){ - memset(lbmvs+bi,0,sizeof(*lbmvs)); - } + if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0; else{ - memcpy(lbmvs+bi,_mv+bi,sizeof(*lbmvs)); - bits0+=OC_MV_BITS[0][_mv[bi][0]+31]+OC_MV_BITS[0][_mv[bi][1]+31]; + lbmvs[bi]=_mv[bi]; + bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31] + +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31]; bits1+=12; } } - (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs, - (const oc_mv *)lbmvs); + (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs); map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; /*Note: This assumes 
ref_ystride[1]==ref_ystride[2].*/ @@ -1827,23 +2264,22 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, pli=mapi>>2; bi=mapi&3; fragi=mb_map[pli][bi]; - dx=cbmvs[bi][0]; - dy=cbmvs[bi][1]; frag_offs=frag_buf_offs[fragi]; /*TODO: We could save half these calls by re-using the results for the Cb and Cr planes; is it worth it?*/ - if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,dx,dy)>1){ - satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); + if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){ + satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); } else{ - satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs, - ref+frag_offs+mv_offs[0],ystride,UINT_MAX); + satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs, + ref+frag_offs+mv_offs[0],ystride); } - frag_satd[mapii]=satd; + frag_satd[mapii]=satd+abs(dc); } - oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1); - _modec->overhead+= + oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs, + frag_satd,_skip_ssd,_rd_scale[4],1); + _modec->overhead= oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR) +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1) -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE; @@ -1852,12 +2288,18 @@ static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ oc_set_chroma_mvs_func set_chroma_mvs; - oc_enc_pipeline_state pipe; oc_qii_state intra_luma_qs; oc_mv last_mv; oc_mv prior_mv; ogg_int64_t interbits; ogg_int64_t intrabits; + ogg_int64_t activity_sum; + ogg_int64_t luma_sum; + unsigned activity_avg; + unsigned luma_avg; + const ogg_uint16_t *chroma_rd_scale; + ogg_uint16_t *mcu_rd_scale; + ogg_uint16_t *mcu_rd_iscale; const unsigned char *map_idxs; int nmap_idxs; unsigned *coded_mbis; @@ -1871,30 +2313,36 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ oc_mb_enc_info *embs; oc_fragment *frags; oc_mv *frag_mvs; - int qi; unsigned stripe_sby; unsigned mcu_nvsbs; int notstart; int notdone; - int vdec; unsigned sbi; unsigned sbi_end; int refi; int pli; + int sp_level; + sp_level=_enc->sp_level; set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt]; _enc->state.frame_type=OC_INTER_FRAME; oc_mode_scheme_chooser_reset(&_enc->chooser); oc_enc_tokenize_start(_enc); - oc_enc_pipeline_init(_enc,&pipe); + oc_enc_pipeline_init(_enc,&_enc->pipe); + oc_enc_mode_rd_init(_enc); if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs); _enc->mv_bits[0]=_enc->mv_bits[1]=0; interbits=intrabits=0; - last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0; + activity_sum=luma_sum=0; + activity_avg=_enc->activity_avg; + luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8); + chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]]; + mcu_rd_scale=_enc->mcu_rd_scale; + mcu_rd_iscale=_enc->mcu_rd_iscale; + last_mv=prior_mv=0; /*Choose MVs and MB modes and quantize and code luma. 
Must be done in Hilbert order.*/ map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; - qi=_enc->state.qis[0]; coded_mbis=_enc->coded_mbis; uncoded_mbis=coded_mbis+_enc->state.nmbs; ncoded_mbis=0; @@ -1909,37 +2357,51 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ embs=_enc->mb_info; frags=_enc->state.frags; frag_mvs=_enc->state.frag_mvs; - vdec=!(_enc->state.info.pixel_fmt&2); notstart=0; notdone=1; mcu_nvsbs=_enc->mcu_nvsbs; for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){ - notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby); - sbi_end=pipe.sbi_end[0]; - for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){ + ptrdiff_t cfroffset; + notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby); + sbi_end=_enc->pipe.sbi_end[0]; + cfroffset=_enc->pipe.froffset[1]; + for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){ int quadi; /*Mode addressing is through Y plane, always 4 MB per SB.*/ for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){ oc_mode_choice modes[8]; + unsigned activity[4]; + unsigned rd_scale[5]; + unsigned rd_iscale[5]; unsigned skip_ssd[12]; unsigned intra_satd[12]; + unsigned luma; int mb_mv_bits_0; int mb_gmv_bits_0; int inter_mv_pref; int mb_mode; - int dx; - int dy; + int refi; + int mv; unsigned mbi; int mapii; int mapi; int bi; ptrdiff_t fragi; mbi=sbi<<2|quadi; + luma=oc_mb_intra_satd(_enc,mbi,intra_satd); + /*Activity masking.*/ + if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ + oc_mb_activity(_enc,mbi,activity); + } + else oc_mb_activity_fast(_enc,mbi,activity,intra_satd); + luma_sum+=luma; + activity_sum+=oc_mb_masking(rd_scale,rd_iscale, + chroma_rd_scale,activity,activity_avg,luma,luma_avg); /*Motion estimation: We always do a basic 1MV search for all macroblocks, coded or not, keyframe or not.*/ - if(!_recode&&_enc->sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi); - dx=dy=0; + if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi); + mv=0; /*Find the block choice with the lowest estimated coding cost. If a Cb or Cr block is coded but no Y' block from a macro block then the mode MUST be OC_MODE_INTER_NOMV. 
@@ -1948,15 +2410,16 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ /*Block coding cost is estimated from correlated SATD metrics.*/ /*At this point, all blocks that are in frame are still marked coded.*/ if(!_recode){ - memcpy(embs[mbi].unref_mv, - embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv)); + embs[mbi].unref_mv[OC_FRAME_GOLD]= + embs[mbi].analysis_mv[0][OC_FRAME_GOLD]; + embs[mbi].unref_mv[OC_FRAME_PREV]= + embs[mbi].analysis_mv[0][OC_FRAME_PREV]; embs[mbi].refined=0; } - oc_mb_intra_satd(_enc,mbi,intra_satd); /*Estimate the cost of coding this MB in a keyframe.*/ if(_allow_keyframe){ oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi, - pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP); + _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale); intrabits+=modes[OC_MODE_INTRA].rate; for(bi=0;bi<4;bi++){ oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs, @@ -1964,26 +2427,28 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ } } /*Estimate the cost in a delta frame for various modes.*/ - oc_skip_cost(_enc,&pipe,mbi,skip_ssd); - oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi, - OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd); - if(_enc->sp_level<OC_SP_LEVEL_NOMC){ + oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd); + if(sp_level<OC_SP_LEVEL_NOMC){ + oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi, + OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi, - pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd); + _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale); mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi, OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV], - pipe.fr+0,pipe.qs+0,skip_ssd); + _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi, - OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd); + OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi, - OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd); - oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi, - embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd); + OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi, - OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd); + OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi, OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD], - pipe.fr+0,pipe.qs+0,skip_ssd); + _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); /*The explicit MV modes (2,6,7) have not yet gone through halfpel refinement. 
We choose the explicit MV mode that's already furthest ahead on @@ -1991,6 +2456,14 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ We have to be careful to remember which ones we've refined so that we don't refine it again if we re-encode this frame.*/ inter_mv_pref=_enc->lambda*3; + if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ + oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi, + embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); + } + else{ + modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX; + } if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&& modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){ if(!(embs[mbi].refined&0x80)){ @@ -1998,7 +2471,8 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ embs[mbi].refined|=0x80; } oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi, - embs[mbi].ref_mv,pipe.fr+0,pipe.qs+0,skip_ssd); + embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); } else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref< modes[OC_MODE_INTER_MV].cost){ @@ -2008,7 +2482,7 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ } mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi, OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD], - pipe.fr+0,pipe.qs+0,skip_ssd); + _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); } if(!(embs[mbi].refined&0x04)){ oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV); @@ -2016,7 +2490,7 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ } mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi, OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV], - pipe.fr+0,pipe.qs+0,skip_ssd); + _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); /*Finally, pick the mode with the cheapest estimated R-D cost.*/ mb_mode=OC_MODE_INTER_NOMV; if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){ @@ -2046,8 +2520,14 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ } } else{ + oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi, + OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); + oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi, + _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale); oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi, - OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd); + OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, + skip_ssd,rd_scale); mb_mode=OC_MODE_INTER_NOMV; if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){ mb_mode=OC_MODE_INTRA; @@ -2062,67 +2542,55 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ if(mb_mode!=OC_MODE_INTER_MV_FOUR){ switch(mb_mode){ case OC_MODE_INTER_MV:{ - dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0]; - dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1]; - }break; - case OC_MODE_INTER_MV_LAST:{ - dx=last_mv[0]; - dy=last_mv[1]; - }break; - case OC_MODE_INTER_MV_LAST2:{ - dx=prior_mv[0]; - dy=prior_mv[1]; + mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV]; }break; + case OC_MODE_INTER_MV_LAST:mv=last_mv;break; + case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break; case OC_MODE_GOLDEN_MV:{ - dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0]; - dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1]; + mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD]; }break; } for(bi=0;bi<4;bi++){ fragi=mb_maps[mbi][0][bi]; - frag_mvs[fragi][0]=(signed char)dx; - frag_mvs[fragi][1]=(signed char)dy; + frag_mvs[fragi]=mv; } } for(bi=0;bi<4;bi++){ 
fragi=sb_maps[mbi>>2][mbi&3][bi]; frags[fragi].qii=modes[mb_mode].qii[bi]; } - if(oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi, - modes[mb_mode].overhead>>OC_BIT_SCALE)>0){ + if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi, + modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){ int orig_mb_mode; orig_mb_mode=mb_mode; mb_mode=mb_modes[mbi]; + refi=OC_FRAME_FOR_MODE(mb_mode); switch(mb_mode){ case OC_MODE_INTER_MV:{ - memcpy(prior_mv,last_mv,sizeof(prior_mv)); + prior_mv=last_mv; /*If we're backing out from 4MV, find the MV we're actually using.*/ if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){ for(bi=0;;bi++){ fragi=mb_maps[mbi][0][bi]; if(frags[fragi].coded){ - memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv)); - dx=frag_mvs[fragi][0]; - dy=frag_mvs[fragi][1]; + mv=last_mv=frag_mvs[fragi]; break; } } - mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31]; + mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31] + +OC_MV_BITS[0][OC_MV_Y(mv)+31]; } /*Otherwise we used the original analysis MV.*/ - else{ - memcpy(last_mv, - embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv)); - } + else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV]; _enc->mv_bits[0]+=mb_mv_bits_0; _enc->mv_bits[1]+=12; }break; case OC_MODE_INTER_MV_LAST2:{ oc_mv tmp_mv; - memcpy(tmp_mv,prior_mv,sizeof(tmp_mv)); - memcpy(prior_mv,last_mv,sizeof(prior_mv)); - memcpy(last_mv,tmp_mv,sizeof(last_mv)); + tmp_mv=prior_mv; + prior_mv=last_mv; + last_mv=tmp_mv; }break; case OC_MODE_GOLDEN_MV:{ _enc->mv_bits[0]+=mb_gmv_bits_0; @@ -2131,28 +2599,28 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ case OC_MODE_INTER_MV_FOUR:{ oc_mv lbmvs[4]; oc_mv cbmvs[4]; - memcpy(prior_mv,last_mv,sizeof(prior_mv)); + prior_mv=last_mv; for(bi=0;bi<4;bi++){ fragi=mb_maps[mbi][0][bi]; if(frags[fragi].coded){ - memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv)); - memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi])); - _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31] - +OC_MV_BITS[0][frag_mvs[fragi][1]+31]; + lbmvs[bi]=last_mv=frag_mvs[fragi]; + _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31] + +OC_MV_BITS[0][OC_MV_Y(last_mv)+31]; _enc->mv_bits[1]+=12; } /*Replace the block MVs for not-coded blocks with (0,0).*/ - else memset(lbmvs[bi],0,sizeof(lbmvs[bi])); + else lbmvs[bi]=0; } - (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs); + (*set_chroma_mvs)(cbmvs,lbmvs); for(mapii=4;mapii<nmap_idxs;mapii++){ mapi=map_idxs[mapii]; pli=mapi>>2; bi=mapi&3; fragi=mb_maps[mbi][pli][bi]; - frags[fragi].mb_mode=mb_mode; frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii]; - memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi])); + frags[fragi].refi=refi; + frags[fragi].mb_mode=mb_mode; + frag_mvs[fragi]=cbmvs[bi]; } }break; } @@ -2163,7 +2631,8 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ else{ *(uncoded_mbis-++nuncoded_mbis)=mbi; mb_mode=OC_MODE_INTER_NOMV; - dx=dy=0; + refi=OC_FRAME_PREV; + mv=0; } /*Propagate final MB mode and MVs to the chroma blocks. 
This has already been done for 4MV mode, since it requires individual @@ -2174,43 +2643,56 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ pli=mapi>>2; bi=mapi&3; fragi=mb_maps[mbi][pli][bi]; - frags[fragi].mb_mode=mb_mode; /*If we switched from 4MV mode to INTER_MV mode, then the qii values won't have been chosen with the right MV, but it's probably not worth re-estimating them.*/ frags[fragi].qii=modes[mb_mode].qii[mapii]; - frag_mvs[fragi][0]=(signed char)dx; - frag_mvs[fragi][1]=(signed char)dy; + frags[fragi].refi=refi; + frags[fragi].mb_mode=mb_mode; + frag_mvs[fragi]=mv; } } + /*Save masking scale factors for chroma blocks.*/ + for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){ + mapi=map_idxs[mapii]; + bi=mapi&3; + fragi=mb_maps[mbi][1][bi]; + mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4]; + mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4]; + } } - oc_fr_state_flush_sb(pipe.fr+0); - sb_flags[sbi].coded_fully=pipe.fr[0].sb_full; - sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial; + oc_fr_state_flush_sb(_enc->pipe.fr+0); + sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full; + sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial; } - oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone); + oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone); /*Code chroma planes.*/ for(pli=1;pli<3;pli++){ - oc_enc_sb_transform_quantize_chroma(_enc,&pipe, - pli,pipe.sbi0[pli],pipe.sbi_end[pli]); - oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone); + oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe, - pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]); + oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone); } notstart=1; } + /*Update the average block activity and MB luma score for the frame. + We could use a Bessel follower here, but fast reaction is probably almost + always best.*/ + _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN, + (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/ + _enc->state.fplanes[0].nfrags)); + _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs); /*Finish filling in the reference frame borders.*/ refi=_enc->state.ref_frame_idx[OC_FRAME_SELF]; for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli); /*Finish adding flagging overhead costs to inter bit counts to determine if we should have coded a key frame instead.*/ if(_allow_keyframe){ - if(interbits>intrabits)return 1; /*Technically the chroma plane counts are over-estimations, because they don't account for continuing runs from the luma planes, but the - inaccuracy is small.
+ We don't need to add the luma plane coding flag costs, because they are + already included in the MB rate estimates.*/ + for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE; if(interbits>intrabits)return 1; } _enc->ncoded_mbis=ncoded_mbis; @@ -2228,482 +2710,3 @@ int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ } return 0; } - -#if defined(OC_COLLECT_METRICS) -# include <stdio.h> -# include <math.h> - -/*TODO: It may be helpful (for block-level quantizers especially) to separate - out the contributions from AC and DC into separate tables.*/ - -# define OC_ZWEIGHT (0.25) - -static void oc_mode_metrics_add(oc_mode_metrics *_metrics, - double _w,int _satd,int _rate,double _rmse){ - double rate; - /*Accumulate statistics without the scaling; this lets us change the scale - factor yet still use old data.*/ - rate=ldexp(_rate,-OC_BIT_SCALE); - if(_metrics->fragw>0){ - double dsatd; - double drate; - double drmse; - double w; - dsatd=_satd-_metrics->satd/_metrics->fragw; - drate=rate-_metrics->rate/_metrics->fragw; - drmse=_rmse-_metrics->rmse/_metrics->fragw; - w=_metrics->fragw*_w/(_metrics->fragw+_w); - _metrics->satd2+=dsatd*dsatd*w; - _metrics->satdrate+=dsatd*drate*w; - _metrics->rate2+=drate*drate*w; - _metrics->satdrmse+=dsatd*drmse*w; - _metrics->rmse2+=drmse*drmse*w; - } - _metrics->fragw+=_w; - _metrics->satd+=_satd*_w; - _metrics->rate+=rate*_w; - _metrics->rmse+=_rmse*_w; -} - -static void oc_mode_metrics_merge(oc_mode_metrics *_dst, - const oc_mode_metrics *_src,int _n){ - int i; - /*Find a non-empty set of metrics.*/ - for(i=0;i<_n&&_src[i].fragw<=0;i++); - if(i>=_n){ - memset(_dst,0,sizeof(*_dst)); - return; - } - memcpy(_dst,_src+i,sizeof(*_dst)); - /*And iterate over the remaining non-empty sets of metrics.*/ - for(i++;i<_n;i++)if(_src[i].fragw>0){ - double wa; - double wb; - double dsatd; - double drate; - double drmse; - double w; - wa=_dst->fragw; - wb=_src[i].fragw; - dsatd=_src[i].satd/wb-_dst->satd/wa; - drate=_src[i].rate/wb-_dst->rate/wa; - drmse=_src[i].rmse/wb-_dst->rmse/wa; - w=wa*wb/(wa+wb); - _dst->fragw+=_src[i].fragw; - _dst->satd+=_src[i].satd; - _dst->rate+=_src[i].rate; - _dst->rmse+=_src[i].rmse; - _dst->satd2+=_src[i].satd2+dsatd*dsatd*w; - _dst->satdrate+=_src[i].satdrate+dsatd*drate*w; - _dst->rate2+=_src[i].rate2+drate*drate*w; - _dst->satdrmse+=_src[i].satdrmse+dsatd*drmse*w; - _dst->rmse2+=_src[i].rmse2+drmse*drmse*w; - } -} - -/*Compile collected SATD/rate/RMSE metrics into a form that's immediately - useful for mode decision.*/ -static void oc_enc_mode_metrics_update(oc_enc_ctx *_enc,int _qi){ - int pli; - int qti; - oc_restore_fpu(&_enc->state); - /*Convert raw collected data into cleaned up sample points.*/ - for(pli=0;pli<3;pli++){ - for(qti=0;qti<2;qti++){ - double fragw; - int bin0; - int bin1; - int bin; - fragw=0; - bin0=bin1=0; - for(bin=0;bin<OC_SAD_BINS;bin++){ - oc_mode_metrics metrics; - OC_MODE_RD[_qi][pli][qti][bin].rate=0; - OC_MODE_RD[_qi][pli][qti][bin].rmse=0; - /*Find some points on either side of the current bin.*/ - while((bin1<bin+1||fragw<OC_ZWEIGHT)&&bin1<OC_SAD_BINS-1){ - fragw+=OC_MODE_METRICS[_qi][pli][qti][bin1++].fragw; - } - while(bin0+1<bin&&bin0+1<bin1&& - fragw-OC_MODE_METRICS[_qi][pli][qti][bin0].fragw>=OC_ZWEIGHT){ - fragw-=OC_MODE_METRICS[_qi][pli][qti][bin0++].fragw; - } - /*Merge statistics and fit lines.*/ - oc_mode_metrics_merge(&metrics, - OC_MODE_METRICS[_qi][pli][qti]+bin0,bin1-bin0); - if(metrics.fragw>0&&metrics.satd2>0){ - double a; - double b; - double msatd; 
- double mrate; - double mrmse; - double rate; - double rmse; - msatd=metrics.satd/metrics.fragw; - mrate=metrics.rate/metrics.fragw; - mrmse=metrics.rmse/metrics.fragw; - /*Compute the points on these lines corresponding to the actual bin - value.*/ - b=metrics.satdrate/metrics.satd2; - a=mrate-b*msatd; - rate=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_BIT_SCALE); - OC_MODE_RD[_qi][pli][qti][bin].rate= - (ogg_int16_t)OC_CLAMPI(-32768,(int)(rate+0.5),32767); - b=metrics.satdrmse/metrics.satd2; - a=mrmse-b*msatd; - rmse=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_RMSE_SCALE); - OC_MODE_RD[_qi][pli][qti][bin].rmse= - (ogg_int16_t)OC_CLAMPI(-32768,(int)(rmse+0.5),32767); - } - } - } - } -} - - - -/*The following token skipping code used to also be used in the decoder (and - even at one point other places in the encoder). - However, it was obsoleted by other optimizations, and is now only used here. - It has been moved here to avoid generating the code when it's not needed.*/ - -/*Determines the number of blocks or coefficients to be skipped for a given - token value. - _token: The token value to skip. - _extra_bits: The extra bits attached to this token. - Return: A positive value indicates that number of coefficients are to be - skipped in the current block. - Otherwise, the negative of the return value indicates that number of - blocks are to be ended.*/ -typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits); - -/*Handles the simple end of block tokens.*/ -static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){ - int nblocks_adjust; - nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1; - return -_extra_bits-nblocks_adjust; -} - -/*The last EOB token has a special case, where an EOB run of size zero ends all - the remaining blocks in the frame.*/ -static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){ - /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not - yet available everywhere; this should be equivalent.*/ - if(!_extra_bits)return -(~(size_t)0>>1); - return -_extra_bits; -} - -/*Handles the pure zero run tokens.*/ -static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){ - return _extra_bits+1; -} - -/*Handles a normal coefficient value token.*/ -static ptrdiff_t oc_token_skip_val(void){ - return 1; -} - -/*Handles a category 1A zero run/coefficient value combo token.*/ -static ptrdiff_t oc_token_skip_run_cat1a(int _token){ - return _token-OC_DCT_RUN_CAT1A+2; -} - -/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/ -static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){ - int run_cati; - int ncoeffs_mask; - int ncoeffs_adjust; - run_cati=_token-OC_DCT_RUN_CAT1B; - ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati); - ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati); - return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust; -} - -/*A jump table for computing the number of coefficients or blocks to skip for - a given token value. 
- This reduces all the conditional branches, etc., needed to parse these token - values down to one indirect jump.*/ -static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={ - oc_token_skip_eob, - oc_token_skip_eob, - oc_token_skip_eob, - oc_token_skip_eob, - oc_token_skip_eob, - oc_token_skip_eob, - oc_token_skip_eob6, - oc_token_skip_zrl, - oc_token_skip_zrl, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_val, - (oc_token_skip_func)oc_token_skip_run_cat1a, - (oc_token_skip_func)oc_token_skip_run_cat1a, - (oc_token_skip_func)oc_token_skip_run_cat1a, - (oc_token_skip_func)oc_token_skip_run_cat1a, - (oc_token_skip_func)oc_token_skip_run_cat1a, - oc_token_skip_run, - oc_token_skip_run, - oc_token_skip_run, - oc_token_skip_run -}; - -/*Determines the number of blocks or coefficients to be skipped for a given - token value. - _token: The token value to skip. - _extra_bits: The extra bits attached to this token. - Return: A positive value indicates that number of coefficients are to be - skipped in the current block. - Otherwise, the negative of the return value indicates that number of - blocks are to be ended. - 0 will never be returned, so that at least one coefficient in one - block will always be decoded for every token.*/ -static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){ - return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits); -} - - - -void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){ - static const unsigned char OC_ZZI_HUFF_OFFSET[64]={ - 0,16,16,16,16,16,32,32, - 32,32,32,32,32,32,32,48, - 48,48,48,48,48,48,48,48, - 48,48,48,48,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64 - }; - const oc_fragment *frags; - const unsigned *frag_satd; - const unsigned *frag_ssd; - const ptrdiff_t *coded_fragis; - ptrdiff_t ncoded_fragis; - ptrdiff_t fragii; - double fragw; - int qti; - int qii; - int qi; - int pli; - int zzi; - int token; - int eb; - oc_restore_fpu(&_enc->state); - /*Load any existing mode metrics if we haven't already.*/ - if(!oc_has_mode_metrics){ - FILE *fmetrics; - memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS)); - fmetrics=fopen("modedec.stats","rb"); - if(fmetrics!=NULL){ - fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics); - fclose(fmetrics); - } - for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi); - oc_has_mode_metrics=1; - } - qti=_enc->state.frame_type; - frags=_enc->state.frags; - frag_satd=_enc->frag_satd; - frag_ssd=_enc->frag_ssd; - coded_fragis=_enc->state.coded_fragis; - ncoded_fragis=fragii=0; - /*Weight the fragments by the inverse frame size; this prevents HD content - from dominating the statistics.*/ - fragw=1.0/_enc->state.nfrags; - for(pli=0;pli<3;pli++){ - ptrdiff_t ti[64]; - int eob_token[64]; - int eob_run[64]; - /*Set up token indices and eob run counts. 
- We don't bother trying to figure out the real cost of the runs that span - coefficients; instead we use the costs that were available when R-D - token optimization was done.*/ - for(zzi=0;zzi<64;zzi++){ - ti[zzi]=_enc->dct_token_offs[pli][zzi]; - if(ti[zzi]>0){ - token=_enc->dct_tokens[pli][zzi][0]; - eb=_enc->extra_bits[pli][zzi][0]; - eob_token[zzi]=token; - eob_run[zzi]=-oc_dct_token_skip(token,eb); - } - else{ - eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX; - eob_run[zzi]=0; - } - } - /*Scan the list of coded fragments for this plane.*/ - ncoded_fragis+=_enc->state.ncoded_fragis[pli]; - for(;fragii<ncoded_fragis;fragii++){ - ptrdiff_t fragi; - ogg_uint32_t frag_bits; - int huffi; - int skip; - int mb_mode; - unsigned satd; - int bin; - fragi=coded_fragis[fragii]; - frag_bits=0; - for(zzi=0;zzi<64;){ - if(eob_run[zzi]>0){ - /*We've reached the end of the block.*/ - eob_run[zzi]--; - break; - } - huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1] - +OC_ZZI_HUFF_OFFSET[zzi]; - if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){ - /*This token caused an EOB run to be flushed. - Therefore it gets the bits associated with it.*/ - frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits - +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]]; - eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX; - } - token=_enc->dct_tokens[pli][zzi][ti[zzi]]; - eb=_enc->extra_bits[pli][zzi][ti[zzi]]; - ti[zzi]++; - skip=oc_dct_token_skip(token,eb); - if(skip<0){ - eob_token[zzi]=token; - eob_run[zzi]=-skip; - } - else{ - /*A regular DCT value token; accumulate the bits for it.*/ - frag_bits+=_enc->huff_codes[huffi][token].nbits - +OC_DCT_TOKEN_EXTRA_BITS[token]; - zzi+=skip; - } - } - mb_mode=frags[fragi].mb_mode; - qi=_enc->state.qis[frags[fragi].qii]; - satd=frag_satd[fragi]<<(pli+1&2); - bin=OC_MINI(satd>>OC_SAD_SHIFT,OC_SAD_BINS-1); - oc_mode_metrics_add(OC_MODE_METRICS[qi][pli][mb_mode!=OC_MODE_INTRA]+bin, - fragw,satd,frag_bits<<OC_BIT_SCALE,sqrt(frag_ssd[fragi])); - } - } - /*Update global SATD/rate/RMSE estimation matrix.*/ - for(qii=0;qii<_enc->state.nqis;qii++){ - oc_enc_mode_metrics_update(_enc,_enc->state.qis[qii]); - } -} - -void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc){ - FILE *fmetrics; - int qi; - /*Generate sample points for complete list of QI values.*/ - for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi); - fmetrics=fopen("modedec.stats","wb"); - if(fmetrics!=NULL){ - fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics); - fclose(fmetrics); - } - fprintf(stdout, - "/*File generated by libtheora with OC_COLLECT_METRICS" - " defined at compile time.*/\n" - "#if !defined(_modedec_H)\n" - "# define _modedec_H (1)\n" - "\n" - "\n" - "\n" - "# if defined(OC_COLLECT_METRICS)\n" - "typedef struct oc_mode_metrics oc_mode_metrics;\n" - "# endif\n" - "typedef struct oc_mode_rd oc_mode_rd;\n" - "\n" - "\n" - "\n" - "/*The number of extra bits of precision at which to store rate" - " metrics.*/\n" - "# define OC_BIT_SCALE (%i)\n" - "/*The number of extra bits of precision at which to store RMSE metrics.\n" - " This must be at least half OC_BIT_SCALE (rounded up).*/\n" - "# define OC_RMSE_SCALE (%i)\n" - "/*The number of bins to partition statistics into.*/\n" - "# define OC_SAD_BINS (%i)\n" - "/*The number of bits of precision to drop" - " from SAD scores to assign them to a\n" - " bin.*/\n" - "# define OC_SAD_SHIFT (%i)\n" - "\n" - "\n" - "\n" - "# if defined(OC_COLLECT_METRICS)\n" - "struct oc_mode_metrics{\n" - " double fragw;\n" - " double satd;\n" - " double rate;\n" - " double rmse;\n" - " double satd2;\n" - " double satdrate;\n" - 
" double rate2;\n" - " double satdrmse;\n" - " double rmse2;\n" - "};\n" - "\n" - "\n" - "int oc_has_mode_metrics;\n" - "oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];\n" - "# endif\n" - "\n" - "\n" - "\n" - "struct oc_mode_rd{\n" - " ogg_int16_t rate;\n" - " ogg_int16_t rmse;\n" - "};\n" - "\n" - "\n" - "# if !defined(OC_COLLECT_METRICS)\n" - "static const\n" - "# endif\n" - "oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={\n", - OC_BIT_SCALE,OC_RMSE_SCALE,OC_SAD_BINS,OC_SAD_SHIFT); - for(qi=0;qi<64;qi++){ - int pli; - fprintf(stdout," {\n"); - for(pli=0;pli<3;pli++){ - int qti; - fprintf(stdout," {\n"); - for(qti=0;qti<2;qti++){ - int bin; - static const char *pl_names[3]={"Y'","Cb","Cr"}; - static const char *qti_names[2]={"INTRA","INTER"}; - fprintf(stdout," /*%s qi=%i %s*/\n", - pl_names[pli],qi,qti_names[qti]); - fprintf(stdout," {\n"); - fprintf(stdout," "); - for(bin=0;bin<OC_SAD_BINS;bin++){ - if(bin&&!(bin&0x3))fprintf(stdout,"\n "); - fprintf(stdout,"{%5i,%5i}", - OC_MODE_RD[qi][pli][qti][bin].rate, - OC_MODE_RD[qi][pli][qti][bin].rmse); - if(bin+1<OC_SAD_BINS)fprintf(stdout,","); - } - fprintf(stdout,"\n }"); - if(qti<1)fprintf(stdout,","); - fprintf(stdout,"\n"); - } - fprintf(stdout," }"); - if(pli<2)fprintf(stdout,","); - fprintf(stdout,"\n"); - } - fprintf(stdout," }"); - if(qi<63)fprintf(stdout,","); - fprintf(stdout,"\n"); - } - fprintf(stdout, - "};\n" - "\n" - "#endif\n"); -} -#endif diff --git a/thirdparty/libtheora/apiwrapper.c b/thirdparty/libtheora/apiwrapper.c index dc959b8d13..87b4e939f2 100644 --- a/thirdparty/libtheora/apiwrapper.c +++ b/thirdparty/libtheora/apiwrapper.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: apiwrapper.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ diff --git a/thirdparty/libtheora/apiwrapper.h b/thirdparty/libtheora/apiwrapper.h index 93454d7bda..ff45e0a4d6 100644 --- a/thirdparty/libtheora/apiwrapper.h +++ b/thirdparty/libtheora/apiwrapper.h @@ -21,7 +21,7 @@ # include <theora/theora.h> # include "theora/theoradec.h" # include "theora/theoraenc.h" -# include "internal.h" +# include "state.h" typedef struct th_api_wrapper th_api_wrapper; typedef struct th_api_info th_api_info; diff --git a/thirdparty/libtheora/bitpack.c b/thirdparty/libtheora/bitpack.c index 8195003bad..5125dde6b0 100644 --- a/thirdparty/libtheora/bitpack.c +++ b/thirdparty/libtheora/bitpack.c @@ -11,7 +11,7 @@ ******************************************************************** function: packing variable sized words into an octet stream - last mod: $Id: bitpack.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include <string.h> @@ -32,15 +32,18 @@ static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){ const unsigned char *stop; oc_pb_window window; int available; + unsigned shift; + stop=_b->stop; + ptr=_b->ptr; window=_b->window; available=_b->bits; - ptr=_b->ptr; - stop=_b->stop; - while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){ - available+=8; - window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available; + shift=OC_PB_WINDOW_SIZE-available; + while(7<shift&&ptr<stop){ + shift-=8; + window|=(oc_pb_window)*ptr++<<shift; } _b->ptr=ptr; + available=OC_PB_WINDOW_SIZE-shift; if(_bits>available){ if(ptr>=stop){ _b->eof=1; @@ -67,7 +70,7 @@ void oc_pack_adv1(oc_pack_buf *_b){ } /*Here we assume that 0<=_bits&&_bits<=32.*/ -long 
oc_pack_read(oc_pack_buf *_b,int _bits){ +long oc_pack_read_c(oc_pack_buf *_b,int _bits){ oc_pb_window window; int available; long result; @@ -82,12 +85,12 @@ long oc_pack_read(oc_pack_buf *_b,int _bits){ available-=_bits; window<<=1; window<<=_bits-1; - _b->bits=available; _b->window=window; + _b->bits=available; return result; } -int oc_pack_read1(oc_pack_buf *_b){ +int oc_pack_read1_c(oc_pack_buf *_b){ oc_pb_window window; int available; int result; @@ -100,8 +103,8 @@ int oc_pack_read1(oc_pack_buf *_b){ result=window>>OC_PB_WINDOW_SIZE-1; available--; window<<=1; - _b->bits=available; _b->window=window; + _b->bits=available; return result; } diff --git a/thirdparty/libtheora/bitpack.h b/thirdparty/libtheora/bitpack.h index a020a292f5..237b584055 100644 --- a/thirdparty/libtheora/bitpack.h +++ b/thirdparty/libtheora/bitpack.h @@ -16,15 +16,32 @@ ********************************************************************/ #if !defined(_bitpack_H) # define _bitpack_H (1) +# include <stddef.h> # include <limits.h> +# include "internal.h" -typedef unsigned long oc_pb_window; +typedef size_t oc_pb_window; typedef struct oc_pack_buf oc_pack_buf; +/*Custom bitpacker implementations.*/ +# if defined(OC_ARM_ASM) +# include "arm/armbits.h" +# endif + +# if !defined(oc_pack_read) +# define oc_pack_read oc_pack_read_c +# endif +# if !defined(oc_pack_read1) +# define oc_pack_read1 oc_pack_read1_c +# endif +# if !defined(oc_huff_token_decode) +# define oc_huff_token_decode oc_huff_token_decode_c +# endif + # define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT) /*This is meant to be a large, positive constant that can still be efficiently loaded as an immediate (on platforms like ARM, for example). @@ -34,9 +51,9 @@ typedef struct oc_pack_buf oc_pack_buf; struct oc_pack_buf{ - oc_pb_window window; - const unsigned char *ptr; const unsigned char *stop; + const unsigned char *ptr; + oc_pb_window window; int bits; int eof; }; @@ -45,8 +62,8 @@ void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes); int oc_pack_look1(oc_pack_buf *_b); void oc_pack_adv1(oc_pack_buf *_b); /*Here we assume 0<=_bits&&_bits<=32.*/ -long oc_pack_read(oc_pack_buf *_b,int _bits); -int oc_pack_read1(oc_pack_buf *_b); +long oc_pack_read_c(oc_pack_buf *_b,int _bits); +int oc_pack_read1_c(oc_pack_buf *_b); /* returns -1 for read beyond EOF, or the number of whole bytes available */ long oc_pack_bytes_left(oc_pack_buf *_b); diff --git a/thirdparty/libtheora/collect.c b/thirdparty/libtheora/collect.c new file mode 100644 index 0000000000..c0d8a2733f --- /dev/null +++ b/thirdparty/libtheora/collect.c @@ -0,0 +1,974 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: mode selection code + last mod: $Id$ + + ********************************************************************/ +#include <stdio.h> +#include <limits.h> +#include <math.h> +#include <string.h> +#include "collect.h" + +#if defined(OC_COLLECT_METRICS) + +int OC_HAS_MODE_METRICS; +double OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]; +double OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]; +oc_mode_metrics OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS]; +oc_mode_metrics OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS]; +const char *OC_MODE_METRICS_FILENAME="modedec.stats"; + +void oc_mode_metrics_add(oc_mode_metrics *_metrics, + double _w,int _s,int _q,int _r,double _d){ + if(_metrics->w>0){ + double ds; + double dq; + double dr; + double dd; + double ds2; + double dq2; + double s2; + double sq; + double q2; + double sr; + double qr; + double sd; + double qd; + double s2q; + double sq2; + double w; + double wa; + double rwa; + double rwa2; + double rwb; + double rwb2; + double rw2; + double rw3; + double rw4; + wa=_metrics->w; + ds=_s-_metrics->s/wa; + dq=_q-_metrics->q/wa; + dr=_r-_metrics->r/wa; + dd=_d-_metrics->d/wa; + ds2=ds*ds; + dq2=dq*dq; + s2=_metrics->s2; + sq=_metrics->sq; + q2=_metrics->q2; + sr=_metrics->sr; + qr=_metrics->qr; + sd=_metrics->sd; + qd=_metrics->qd; + s2q=_metrics->s2q; + sq2=_metrics->sq2; + w=wa+_w; + rwa=wa/w; + rwb=_w/w; + rwa2=rwa*rwa; + rwb2=rwb*rwb; + rw2=wa*rwb; + rw3=rw2*(rwa2-rwb2); + rw4=_w*rwa2*rwa2+wa*rwb2*rwb2; + _metrics->s2q2+=-2*(ds*sq2+dq*s2q)*rwb + +(ds2*q2+4*ds*dq*sq+dq2*s2)*rwb2+ds2*dq2*rw4; + _metrics->s2q+=(-2*ds*sq-dq*s2)*rwb+ds2*dq*rw3; + _metrics->sq2+=(-ds*q2-2*dq*sq)*rwb+ds*dq2*rw3; + _metrics->sqr+=(-ds*qr-dq*sr-dr*sq)*rwb+ds*dq*dr*rw3; + _metrics->sqd+=(-ds*qd-dq*sd-dd*sq)*rwb+ds*dq*dd*rw3; + _metrics->s2+=ds2*rw2; + _metrics->sq+=ds*dq*rw2; + _metrics->q2+=dq2*rw2; + _metrics->sr+=ds*dr*rw2; + _metrics->qr+=dq*dr*rw2; + _metrics->r2+=dr*dr*rw2; + _metrics->sd+=ds*dd*rw2; + _metrics->qd+=dq*dd*rw2; + _metrics->d2+=dd*dd*rw2; + } + _metrics->w+=_w; + _metrics->s+=_s*_w; + _metrics->q+=_q*_w; + _metrics->r+=_r*_w; + _metrics->d+=_d*_w; +} + +void oc_mode_metrics_merge(oc_mode_metrics *_dst, + const oc_mode_metrics *_src,int _n){ + int i; + /*Find a non-empty set of metrics.*/ + for(i=0;i<_n&&_src[i].w==0;i++); + if(i>=_n){ + memset(_dst,0,sizeof(*_dst)); + return; + } + memcpy(_dst,_src+i,sizeof(*_dst)); + /*And iterate over the remaining non-empty sets of metrics.*/ + for(i++;i<_n;i++)if(_src[i].w!=0){ + double ds; + double dq; + double dr; + double dd; + double ds2; + double dq2; + double s2a; + double s2b; + double sqa; + double sqb; + double q2a; + double q2b; + double sra; + double srb; + double qra; + double qrb; + double sda; + double sdb; + double qda; + double qdb; + double s2qa; + double s2qb; + double sq2a; + double sq2b; + double w; + double wa; + double wb; + double rwa; + double rwb; + double rwa2; + double rwb2; + double rw2; + double rw3; + double rw4; + wa=_dst->w; + wb=_src[i].w; + ds=_src[i].s/wb-_dst->s/wa; + dq=_src[i].q/wb-_dst->q/wa; + dr=_src[i].r/wb-_dst->r/wa; + dd=_src[i].d/wb-_dst->d/wa; + ds2=ds*ds; + dq2=dq*dq; + s2a=_dst->s2; + sqa=_dst->sq; + q2a=_dst->q2; + sra=_dst->sr; + qra=_dst->qr; + sda=_dst->sd; + qda=_dst->qd; + s2qa=_dst->s2q; + sq2a=_dst->sq2; + 
s2b=_src[i].s2; + sqb=_src[i].sq; + q2b=_src[i].q2; + srb=_src[i].sr; + qrb=_src[i].qr; + sdb=_src[i].sd; + qdb=_src[i].qd; + s2qb=_src[i].s2q; + sq2b=_src[i].sq2; + w=wa+wb; + if(w==0)rwa=rwb=0; + else{ + rwa=wa/w; + rwb=wb/w; + } + rwa2=rwa*rwa; + rwb2=rwb*rwb; + rw2=wa*rwb; + rw3=rw2*(rwa2-rwb2); + rw4=wb*rwa2*rwa2+wa*rwb2*rwb2; + /* + (1,1,1) -> + (0,0,0)# + (1,0,0) C(1,1)*C(1,0)*C(1,0)-> d^{1,0,0}*(rwa*B_{0,1,1}-rwb*A_{0,1,1}) + (0,1,0) C(1,0)*C(1,1)*C(1,0)-> d^{0,1,0}*(rwa*B_{1,0,1}-rwb*A_{1,0,1}) + (0,0,1) C(1,0)*C(1,0)*C(1,1)-> d^{0,0,1}*(rwa*B_{1,1,0}-rwb*A_{1,1,0}) + (1,1,0)* + (1,0,1)* + (0,1,1)* + (1,1,1) C(1,1)*C(1,1)*C(1,1)-> d^{1,1,1}*(rwa^3*wb-rwb^3*wa) + (2,1) -> + (0,0)# + (1,0) C(2,1)*C(1,1)->2*d^{1,0}*(rwa*B_{1,1}-rwb*A_{1,1}) + (0,1) C(2,0)*C(1,1)-> d^{0,1}*(rwa*B_{2,0}-rwb*A_{2,0}) + (2,0)* + (1,1)* + (2,1) C(2,2)*C(1,1)-> d^{2,1}*(rwa^3*wb-rwb^3*wa) + (2,2) -> + (0,0)# + (1,0) C(2,1)*C(2,0)->2*d^{1,0}*(rwa*B_{1,2}-rwb*A_{1,2}) + (0,1) C(2,0)*C(2,1)->2*d^{0,1}*(rwa*B_{2,1}-rwb*A_{2,1}) + (2,0) C(2,2)*C(2,0)-> d^{2,0}*(rwa^2*B_{0,2}+rwb^2*A_{0,2}) + (1,1) C(2,1)*C(2,1)->4*d^{1,1}*(rwa^2*B_{1,1}+rwb^2*A_{1,1}) + (0,2) C(2,0)*C(2,2)-> d^{0,2}*(rwa^2*B_{2,0}+rwb^2*A_{2,0}) + (1,2)* + (2,1)* + (2,2) C(2,2)*C(2,2)*d^{2,2}*(rwa^4*wb+rwb^4*wa) + */ + _dst->s2q2+=_src[i].s2q2+2*(ds*(rwa*sq2b-rwb*sq2a)+dq*(rwa*s2qb-rwb*s2qa)) + +ds2*(rwa2*q2b+rwb2*q2a)+4*ds*dq*(rwa2*sqb+rwb2*sqa) + +dq2*(rwa2*s2b+rwb2*s2a)+ds2*dq2*rw4; + _dst->s2q+=_src[i].s2q+2*ds*(rwa*sqb-rwb*sqa) + +dq*(rwa*s2b-rwb*s2a)+ds2*dq*rw3; + _dst->sq2+=_src[i].sq2+ds*(rwa*q2b-rwb*q2a) + +2*dq*(rwa*sqb-rwb*sqa)+ds*dq2*rw3; + _dst->sqr+=_src[i].sqr+ds*(rwa*qrb-rwb*qra)+dq*(rwa*srb-rwb*sra) + +dr*(rwa*sqb-rwb*sqa)+ds*dq*dr*rw3; + _dst->sqd+=_src[i].sqd+ds*(rwa*qdb-rwb*qda)+dq*(rwa*sdb-rwb*sda) + +dd*(rwa*sqb-rwb*sqa)+ds*dq*dd*rw3; + _dst->s2+=_src[i].s2+ds2*rw2; + _dst->sq+=_src[i].sq+ds*dq*rw2; + _dst->q2+=_src[i].q2+dq2*rw2; + _dst->sr+=_src[i].sr+ds*dr*rw2; + _dst->qr+=_src[i].qr+dq*dr*rw2; + _dst->r2+=_src[i].r2+dr*dr*rw2; + _dst->sd+=_src[i].sd+ds*dd*rw2; + _dst->qd+=_src[i].qd+dq*dd*rw2; + _dst->d2+=_src[i].d2+dd*dd*rw2; + _dst->w+=_src[i].w; + _dst->s+=_src[i].s; + _dst->q+=_src[i].q; + _dst->r+=_src[i].r; + _dst->d+=_src[i].d; + } +} + +/*Adjust a single corner of a set of metric bins to minimize the squared + prediction error of R and D. + Each bin is assumed to cover a quad like so: + (s0,q0) (s1,q0) + A----------B + | | + | | + | | + | | + C----------Z + (s0,q1) (s1,q1) + The values A, B, and C are fixed, and Z is the free parameter. + Then, for example, R_i is predicted via bilinear interpolation as + x_i=(s_i-s0)/(s1-s0) + y_i=(q_i-q0)/(q1-q0) + dRds1_i=A+(B-A)*x_i + dRds2_i=C+(Z-C)*x_i + R_i=dRds1_i+(dRds2_i-dRds1_i)*y_i + To find the Z that minimizes the squared prediction error over i, this can + be rewritten as + R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i)=x_i*y_i*Z + Letting X={...,x_i*y_i,...}^T and + Y={...,R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i),...}^T, + the optimal Z is given by Z=(X^T.Y)/(X^T.X). + Now, we need to compute these dot products without actually storing data for + each sample. + Starting with X^T.X, we have + X^T.X = sum(x_i^2*y_i^2) = sum((s_i-s0)^2*(q_i-q0)^2)/((s1-s0)^2*(q1-q0)^2). + Expanding the interior of the sum in a monomial basis of s_i and q_i gives + s0^2*q0^2 *(1) + -2*s0*q0^2*(s_i) + -2*s0^2*q0*(q_i) + +q0^2 *(s_i^2) + +4*s0*q0 *(s_i*q_i) + +s0^2 *(q_i^2) + -2*q0 *(s_i^2*q_i) + -2*s0 *(s_i*q_i^2) + +1 *(s_i^2*q_i^2). 
+ However, computing things directly in this basis leads to gross numerical + errors, as most of the terms will have similar size and destructive + cancellation results. + A much better basis is the central (co-)moment basis: + {1,s_i-sbar,q_i-qbar,(s_i-sbar)^2,(s_i-sbar)*(q_i-qbar),(q_i-qbar)^2, + (s_i-sbar)^2*(q_i-qbar),(s_i-sbar)*(q_i-qbar)^2,(s_i-sbar)^2*(q_i-qbar)^2}, + where sbar and qbar are the average s and q values over the bin, + respectively. + In that basis, letting ds=sbar-s0 and dq=qbar-q0, (s_i-s0)^2*(q_i-q0)^2 is + ds^2*dq^2*(1) + +dq^2 *((s_i-sbar)^2) + +4*ds*dq*((s_i-sbar)*(q_i-qbar)) + +ds^2 *((q_i-qbar)^2) + +2*dq *((s_i-sbar)^2*(q_i-qbar)) + +2*ds *((s_i-sbar)*(q_i-qbar)^2) + +1 *((s_i-sbar)^2*(q_i-qbar)^2). + With these expressions in the central (co-)moment bases, all we need to do + is compute sums over the (co-)moment terms, which can be done + incrementally (see oc_mode_metrics_add() and oc_mode_metrics_merge()), + with no need to store the individual samples. + Now, for X^T.Y, we have + X^T.Y = sum((R_i-A-((B-A)/(s1-s0))*(s_i-s0)-((C-A)/(q1-q0))*(q_i-q0) + -((A-B-C)/((s1-s0)*(q1-q0)))*(s_i-s0)*(q_i-q0))*(s_i-s0)*(q_i-q0))/ + ((s1-s0)*(q1-q0)), + or, rewriting the constants to simplify notation, + X^T.Y = sum((C0+C1*(s_i-s0)+C2*(q_i-q0) + +C3*(s_i-s0)*(q_i-q0)+R_i)*(s_i-s0)*(q_i-q0))/((s1-s0)*(q1-q0)). + Again, converting to the central (co-)moment basis, the interior of the + above sum is + ds*dq*(rbar+C0+C1*ds+C2*dq+C3*ds*dq) *(1) + +(C1*dq+C3*dq^2) *((s_i-sbar)^2) + +(rbar+C0+2*C1*ds+2*C2*dq+4*C3*ds*dq)*((s_i-sbar)*(q_i-qbar)) + +(C2*ds+C3*ds^2) *((q_i-qbar)^2) + +dq *((s_i-sbar)*(r_i-rbar)) + +ds *((q_i-qbar)*(r_i-rbar)) + +(C1+2*C3*dq) *((s_i-sbar)^2*(q_i-qbar)) + +(C2+2*C3*ds) *((s_i-sbar)*(q_i-qbar)^2) + +1 *((s_i-sbar)*(q_i-qbar)*(r_i-rbar)) + +C3 *((s_i-sbar)^2*(q_i-qbar)^2). + You might think it would be easier (if perhaps slightly less robust) to + accumulate terms directly around s0 and q0. 
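The incremental updates referenced above (oc_mode_metrics_add() and oc_mode_metrics_merge()) reduce at second order to a weighted form of Welford's online algorithm. A stripped-down sketch of that second-order case, with illustrative names rather than the library's:

/*Track w=sum(w_i), s=sum(s_i*w_i), and the central moment
  s2=sum((s_i-s/w)^2*w_i) without storing the individual samples.*/
typedef struct{
  double w;
  double s;
  double s2;
}moment_acc;

static void moment_acc_add(moment_acc *_a,double _w,double _s){
  if(_a->w>0){
    double ds;
    double rw;
    /*Distance from the running mean, and the effective pair weight
      w_old*w_new/(w_old+w_new); ds*ds*rw is the entire second-order
      correction, built from small differences instead of large,
      nearly-cancelling sums.*/
    ds=_s-_a->s/_a->w;
    rw=_a->w*_w/(_a->w+_w);
    _a->s2+=ds*ds*rw;
  }
  _a->w+=_w;
  _a->s+=_s*_w;
}

Accumulating around the running mean keeps every correction term small; accumulating directly around the fixed corner (s0,q0) instead, as suggested above, would avoid the basis change.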
+ However, we update each corner of the bins in turn, so we would have to + change basis to move the sums from corner to corner anyway.*/ +double oc_mode_metrics_solve(double *_r,double *_d, + const oc_mode_metrics *_metrics,const int *_s0,const int *_s1, + const int *_q0,const int *_q1, + const double *_ra,const double *_rb,const double *_rc, + const double *_da,const double *_db,const double *_dc,int _n){ + double xx; + double rxy; + double dxy; + double wt; + int i; + xx=rxy=dxy=wt=0; + for(i=0;i<_n;i++)if(_metrics[i].w>0){ + double s10; + double q10; + double sq10; + double ds; + double dq; + double ds2; + double dq2; + double r; + double d; + double s2; + double sq; + double q2; + double sr; + double qr; + double sd; + double qd; + double s2q; + double sq2; + double sqr; + double sqd; + double s2q2; + double c0; + double c1; + double c2; + double c3; + double w; + w=_metrics[i].w; + wt+=w; + s10=_s1[i]-_s0[i]; + q10=_q1[i]-_q0[i]; + sq10=s10*q10; + ds=_metrics[i].s/w-_s0[i]; + dq=_metrics[i].q/w-_q0[i]; + ds2=ds*ds; + dq2=dq*dq; + s2=_metrics[i].s2; + sq=_metrics[i].sq; + q2=_metrics[i].q2; + s2q=_metrics[i].s2q; + sq2=_metrics[i].sq2; + s2q2=_metrics[i].s2q2; + xx+=(dq2*(ds2*w+s2)+4*ds*dq*sq+ds2*q2+2*(dq*s2q+ds*sq2)+s2q2)/(sq10*sq10); + r=_metrics[i].r/w; + sr=_metrics[i].sr; + qr=_metrics[i].qr; + sqr=_metrics[i].sqr; + c0=-_ra[i]; + c1=-(_rb[i]-_ra[i])/s10; + c2=-(_rc[i]-_ra[i])/q10; + c3=-(_ra[i]-_rb[i]-_rc[i])/sq10; + rxy+=(ds*dq*(r+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2 + +(r+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sr+ds*qr + +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqr+c3*s2q2)/sq10; + d=_metrics[i].d/w; + sd=_metrics[i].sd; + qd=_metrics[i].qd; + sqd=_metrics[i].sqd; + c0=-_da[i]; + c1=-(_db[i]-_da[i])/s10; + c2=-(_dc[i]-_da[i])/q10; + c3=-(_da[i]-_db[i]-_dc[i])/sq10; + dxy+=(ds*dq*(d+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2 + +(d+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sd+ds*qd + +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqd+c3*s2q2)/sq10; + } + if(xx>1E-3){ + *_r=rxy/xx; + *_d=dxy/xx; + } + else{ + *_r=0; + *_d=0; + } + return wt; +} + +/*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately + useful for mode decision.*/ +void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS], + int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS], + int _shift,double (*_weight)[3][2][OC_COMP_BINS]){ + int niters; + int prevdr; + int prevdd; + int dr; + int dd; + int pli; + int qti; + int qi; + int si; + dd=dr=INT_MAX; + niters=0; + /*The encoder interpolates rate and RMSE terms bilinearly from an + OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table. + To find the sample values at the grid points that minimize the total + squared prediction error actually requires solving a relatively sparse + linear system with a number of variables equal to the number of grid + points. 
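Each single-variable step of that system has the closed form Z=(X^T.Y)/(X^T.X) derived earlier. A toy version that stores the normalized samples explicitly may make the shape of the computation clearer; the real oc_mode_metrics_solve() above recovers the same dot products from the accumulated moments instead:

/*Fit the free corner Z of one bin from samples (x_i,y_i,r_i), where
  x_i=(s_i-s0)/(s1-s0) and y_i=(q_i-q0)/(q1-q0), with corners A, B, C fixed.*/
static double fit_corner(const double *_x,const double *_y,const double *_r,
 int _n,double _a,double _b,double _c){
  double xx;
  double xy;
  int i;
  xx=xy=0;
  for(i=0;i<_n;i++){
    double p;
    double res;
    p=_x[i]*_y[i];
    /*Residual after subtracting the Z-independent part of the bilinear
      form A+(B-A)*x+(C-A)*y+(A-B-C)*x*y.*/
    res=_r[i]-(_a+(_b-_a)*_x[i]+(_c-_a)*_y[i]+(_a-_b-_c)*p);
    xx+=p*p;
    xy+=p*res;
  }
  /*Guard against an ill-conditioned fit, as the library code does.*/
  return xx>1E-3?xy/xx:0;
}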
+ Instead of writing a general sparse linear system solver, we just use + Gauss-Seidel iteration, i.e., we update one grid point at a time until + they stop changing.*/ + do{ + prevdr=dr; + prevdd=dd; + dd=dr=0; + for(pli=0;pli<3;pli++){ + for(qti=0;qti<2;qti++){ + for(qi=0;qi<OC_LOGQ_BINS;qi++){ + for(si=0;si<OC_COMP_BINS;si++){ + oc_mode_metrics m[4]; + int s0[4]; + int s1[4]; + int q0[4]; + int q1[4]; + double ra[4]; + double rb[4]; + double rc[4]; + double da[4]; + double db[4]; + double dc[4]; + double r; + double d; + int rate; + int rmse; + int ds; + int n; + n=0; + /*Collect the statistics for the (up to) four bins grid point + (si,qi) touches.*/ + if(qi>0&&si>0){ + q0[n]=OC_MODE_LOGQ[qi-1][pli][qti]; + q1[n]=OC_MODE_LOGQ[qi][pli][qti]; + s0[n]=si-1<<_shift; + s1[n]=si<<_shift; + ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE); + da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE); + rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE); + db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE); + rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE); + dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE); + *(m+n++)=*(_metrics[qi-1][pli][qti]+si-1); + } + if(qi>0){ + ds=si+1<OC_COMP_BINS?1:-1; + q0[n]=OC_MODE_LOGQ[qi-1][pli][qti]; + q1[n]=OC_MODE_LOGQ[qi][pli][qti]; + s0[n]=si+ds<<_shift; + s1[n]=si<<_shift; + ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE); + da[n]= + ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); + rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE); + db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE); + rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE); + dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); + *(m+n++)=*(_metrics[qi-1][pli][qti]+si); + } + if(qi+1<OC_LOGQ_BINS&&si>0){ + q0[n]=OC_MODE_LOGQ[qi+1][pli][qti]; + q1[n]=OC_MODE_LOGQ[qi][pli][qti]; + s0[n]=si-1<<_shift; + s1[n]=si<<_shift; + ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE); + da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE); + rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE); + db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE); + rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE); + dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE); + *(m+n++)=*(_metrics[qi][pli][qti]+si-1); + } + if(qi+1<OC_LOGQ_BINS){ + ds=si+1<OC_COMP_BINS?1:-1; + q0[n]=OC_MODE_LOGQ[qi+1][pli][qti]; + q1[n]=OC_MODE_LOGQ[qi][pli][qti]; + s0[n]=si+ds<<_shift; + s1[n]=si<<_shift; + ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE); + da[n]= + ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); + rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE); + db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE); + rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE); + dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE); + *(m+n++)=*(_metrics[qi][pli][qti]+si); + } + /*On the first pass, initialize with a simple weighted average of + the neighboring bins.*/ + if(!OC_HAS_MODE_METRICS&&niters==0){ + double w; + w=r=d=0; + while(n-->0){ + w+=m[n].w; + r+=m[n].r; + d+=m[n].d; + } + r=w>1E-3?r/w:0; + d=w>1E-3?d/w:0; + _weight[qi][pli][qti][si]=w; + } + else{ + /*Update the grid point and save the weight for later.*/ + _weight[qi][pli][qti][si]= + oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n); + } + rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767); + 
rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767); + dr+=abs(rate-_table[qi][pli][qti][si].rate); + dd+=abs(rmse-_table[qi][pli][qti][si].rmse); + _table[qi][pli][qti][si].rate=(ogg_int16_t)rate; + _table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse; + } + } + } + } + } + /*After a fixed number of initial iterations, only iterate so long as the + total change is decreasing. + This ensures we don't oscillate forever, which is a danger, as all of our + results are rounded fairly coarsely.*/ + while((dr>0||dd>0)&&(niters++<_niters_min||(dr<prevdr&&dd<prevdd))); + if(_reweight){ + /*Now, reduce the values of the optimal solution until we get enough + samples in each bin to overcome the constant OC_ZWEIGHT factor. + This encourages sampling under-populated bins and prevents a single large + sample early on from discouraging coding in that bin ever again.*/ + for(pli=0;pli<3;pli++){ + for(qti=0;qti<2;qti++){ + for(qi=0;qi<OC_LOGQ_BINS;qi++){ + for(si=0;si<OC_COMP_BINS;si++){ + double wt; + wt=_weight[qi][pli][qti][si]; + wt/=OC_ZWEIGHT+wt; + _table[qi][pli][qti][si].rate=(ogg_int16_t) + (_table[qi][pli][qti][si].rate*wt+0.5); + _table[qi][pli][qti][si].rmse=(ogg_int16_t) + (_table[qi][pli][qti][si].rmse*wt+0.5); + } + } + } + } + } +} + +/*Dump the in memory mode metrics to a file. + Note this data format isn't portable between different platforms.*/ +void oc_mode_metrics_dump(void){ + FILE *fmetrics; + fmetrics=fopen(OC_MODE_METRICS_FILENAME,"wb"); + if(fmetrics!=NULL){ + (void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics); + (void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics); + (void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics); + fclose(fmetrics); + } +} + +void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name, +#if !defined(OC_COLLECT_METRICS) + const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){ +#else + oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){ +#endif + int qii; + fprintf(_fout, + "# if !defined(OC_COLLECT_METRICS)\n" + "static const\n" + "# endif\n" + "oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name); + for(qii=0;qii<OC_LOGQ_BINS;qii++){ + int pli; + fprintf(_fout," {\n"); + for(pli=0;pli<3;pli++){ + int qti; + fprintf(_fout," {\n"); + for(qti=0;qti<2;qti++){ + int bin; + int qi; + static const char *pl_names[3]={"Y'","Cb","Cr"}; + static const char *qti_names[2]={"INTRA","INTER"}; + qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1); + fprintf(_fout," /*%s qi=%i %s*/\n", + pl_names[pli],qi,qti_names[qti]); + fprintf(_fout," {\n"); + fprintf(_fout," "); + for(bin=0;bin<OC_COMP_BINS;bin++){ + if(bin&&!(bin&0x3))fprintf(_fout,"\n "); + fprintf(_fout,"{%5i,%5i}", + _mode_rd_table[qii][pli][qti][bin].rate, + _mode_rd_table[qii][pli][qti][bin].rmse); + if(bin+1<OC_COMP_BINS)fprintf(_fout,","); + } + fprintf(_fout,"\n }"); + if(qti<1)fprintf(_fout,","); + fprintf(_fout,"\n"); + } + fprintf(_fout," }"); + if(pli<2)fprintf(_fout,","); + fprintf(_fout,"\n"); + } + fprintf(_fout," }"); + if(qii+1<OC_LOGQ_BINS)fprintf(_fout,","); + fprintf(_fout,"\n"); + } + fprintf(_fout, + "};\n" + "\n"); +} + +void oc_mode_metrics_print(FILE *_fout){ + int qii; + fprintf(_fout, + "/*File generated by libtheora with OC_COLLECT_METRICS" + " defined at compile time.*/\n" + "#if !defined(_modedec_H)\n" + "# define _modedec_H (1)\n" + "# include \"encint.h\"\n" + "\n" + "\n" + "\n" + "/*The log of the average quantizer for each of the OC_MODE_RD table rows\n" + " (e.g., for the represented qi's, and each pli and 
qti), in Q10 format.\n" + " The actual statistics used by the encoder will be interpolated from\n" + " that table based on log_plq for the actual quantization matrix used.*/\n" + "# if !defined(OC_COLLECT_METRICS)\n" + "static const\n" + "# endif\n" + "ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n"); + for(qii=0;qii<OC_LOGQ_BINS;qii++){ + fprintf(_fout," { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n", + OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0], + OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1], + qii+1<OC_LOGQ_BINS?",":""); + } + fprintf(_fout, + "};\n" + "\n"); + oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD); + oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD); + fprintf(_fout, + "#endif\n"); +} + + +# if !defined(OC_COLLECT_NO_ENC_FUNCS) +void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){ + oc_restore_fpu(&_enc->state); + /*Load any existing mode metrics if we haven't already.*/ + if(!OC_HAS_MODE_METRICS){ + FILE *fmetrics; + memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD)); + memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD)); + fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb"); + if(fmetrics!=NULL){ + /*Read in the binary structures as written by oc_mode_metrics_dump(). + Note this format isn't portable between different platforms.*/ + (void)fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics); + (void)fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics); + (void)fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics); + fclose(fmetrics); + } + else{ + int qii; + int qi; + int pli; + int qti; + for(qii=0;qii<OC_LOGQ_BINS;qii++){ + qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1); + for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){ + OC_MODE_LOGQ[qii][pli][qti]=_enc->log_plq[qi][pli][qti]; + } + } + } + oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1, + OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD); + oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1, + OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD); + OC_HAS_MODE_METRICS=1; + } +} + +/*The following token skipping code used to also be used in the decoder (and + even at one point other places in the encoder). + However, it was obsoleted by other optimizations, and is now only used here. + It has been moved here to avoid generating the code when it's not needed.*/ + +/*Determines the number of blocks or coefficients to be skipped for a given + token value. + _token: The token value to skip. + _extra_bits: The extra bits attached to this token. + Return: A positive value indicates that number of coefficients are to be + skipped in the current block. 
+ Otherwise, the negative of the return value indicates that number of + blocks are to be ended.*/ +typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits); + +/*Handles the simple end of block tokens.*/ +static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){ + int nblocks_adjust; + nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1; + return -_extra_bits-nblocks_adjust; +} + +/*The last EOB token has a special case, where an EOB run of size zero ends all + the remaining blocks in the frame.*/ +static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){ + /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not + yet available everywhere; this should be equivalent.*/ + if(!_extra_bits)return -(~(size_t)0>>1); + return -_extra_bits; +} + +/*Handles the pure zero run tokens.*/ +static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){ + return _extra_bits+1; +} + +/*Handles a normal coefficient value token.*/ +static ptrdiff_t oc_token_skip_val(void){ + return 1; +} + +/*Handles a category 1A zero run/coefficient value combo token.*/ +static ptrdiff_t oc_token_skip_run_cat1a(int _token){ + return _token-OC_DCT_RUN_CAT1A+2; +} + +/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/ +static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){ + int run_cati; + int ncoeffs_mask; + int ncoeffs_adjust; + run_cati=_token-OC_DCT_RUN_CAT1B; + ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati); + ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati); + return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust; +} + +/*A jump table for computing the number of coefficients or blocks to skip for + a given token value. + This reduces all the conditional branches, etc., needed to parse these token + values down to one indirect jump.*/ +static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={ + oc_token_skip_eob, + oc_token_skip_eob, + oc_token_skip_eob, + oc_token_skip_eob, + oc_token_skip_eob, + oc_token_skip_eob, + oc_token_skip_eob6, + oc_token_skip_zrl, + oc_token_skip_zrl, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_val, + (oc_token_skip_func)oc_token_skip_run_cat1a, + (oc_token_skip_func)oc_token_skip_run_cat1a, + (oc_token_skip_func)oc_token_skip_run_cat1a, + (oc_token_skip_func)oc_token_skip_run_cat1a, + (oc_token_skip_func)oc_token_skip_run_cat1a, + oc_token_skip_run, + oc_token_skip_run, + oc_token_skip_run, + oc_token_skip_run +}; + +/*Determines the number of blocks or coefficients to be skipped for a given + token value. + _token: The token value to skip. + _extra_bits: The extra bits attached to this token. + Return: A positive value indicates that number of coefficients are to be + skipped in the current block. + Otherwise, the negative of the return value indicates that number of + blocks are to be ended. 
+ 0 will never be returned, so that at least one coefficient in one + block will always be decoded for every token.*/ +static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){ + return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits); +} + + +void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){ + static const unsigned char OC_ZZI_HUFF_OFFSET[64]={ + 0,16,16,16,16,16,32,32, + 32,32,32,32,32,32,32,48, + 48,48,48,48,48,48,48,48, + 48,48,48,48,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64 + }; + const oc_fragment *frags; + const unsigned *frag_sad; + const unsigned *frag_satd; + const unsigned *frag_ssd; + const ptrdiff_t *coded_fragis; + ptrdiff_t ncoded_fragis; + ptrdiff_t fragii; + double fragw; + int modelines[3][3][2]; + int qti; + int qii; + int qi; + int pli; + int zzi; + int token; + int eb; + oc_restore_fpu(&_enc->state); + /*Figure out which metric bins to use for this frame's quantizers.*/ + for(qii=0;qii<_enc->state.nqis;qii++){ + for(pli=0;pli<3;pli++){ + for(qti=0;qti<2;qti++){ + int log_plq; + int modeline; + log_plq=_enc->log_plq[_enc->state.qis[qii]][pli][qti]; + for(modeline=0;modeline<OC_LOGQ_BINS-1&& + OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++); + modelines[qii][pli][qti]=modeline; + } + } + } + qti=_enc->state.frame_type; + frags=_enc->state.frags; + frag_sad=_enc->frag_sad; + frag_satd=_enc->frag_satd; + frag_ssd=_enc->frag_ssd; + coded_fragis=_enc->state.coded_fragis; + ncoded_fragis=fragii=0; + /*Weight the fragments by the inverse frame size; this prevents HD content + from dominating the statistics.*/ + fragw=1.0/_enc->state.nfrags; + for(pli=0;pli<3;pli++){ + ptrdiff_t ti[64]; + int eob_token[64]; + int eob_run[64]; + /*Set up token indices and eob run counts. + We don't bother trying to figure out the real cost of the runs that span + coefficients; instead we use the costs that were available when R-D + token optimization was done.*/ + for(zzi=0;zzi<64;zzi++){ + ti[zzi]=_enc->dct_token_offs[pli][zzi]; + if(ti[zzi]>0){ + token=_enc->dct_tokens[pli][zzi][0]; + eb=_enc->extra_bits[pli][zzi][0]; + eob_token[zzi]=token; + eob_run[zzi]=-oc_dct_token_skip(token,eb); + } + else{ + eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX; + eob_run[zzi]=0; + } + } + /*Scan the list of coded fragments for this plane.*/ + ncoded_fragis+=_enc->state.ncoded_fragis[pli]; + for(;fragii<ncoded_fragis;fragii++){ + ptrdiff_t fragi; + int frag_bits; + int huffi; + int skip; + int mb_mode; + unsigned sad; + unsigned satd; + double sqrt_ssd; + int bin; + int qtj; + fragi=coded_fragis[fragii]; + frag_bits=0; + for(zzi=0;zzi<64;){ + if(eob_run[zzi]>0){ + /*We've reached the end of the block.*/ + eob_run[zzi]--; + break; + } + huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1] + +OC_ZZI_HUFF_OFFSET[zzi]; + if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){ + /*This token caused an EOB run to be flushed. 
+ Therefore it gets the bits associated with it.*/ + frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits + +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]]; + eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX; + } + token=_enc->dct_tokens[pli][zzi][ti[zzi]]; + eb=_enc->extra_bits[pli][zzi][ti[zzi]]; + ti[zzi]++; + skip=oc_dct_token_skip(token,eb); + if(skip<0){ + eob_token[zzi]=token; + eob_run[zzi]=-skip; + } + else{ + /*A regular DCT value token; accumulate the bits for it.*/ + frag_bits+=_enc->huff_codes[huffi][token].nbits + +OC_DCT_TOKEN_EXTRA_BITS[token]; + zzi+=skip; + } + } + mb_mode=frags[fragi].mb_mode; + qii=frags[fragi].qii; + qi=_enc->state.qis[qii]; + sad=frag_sad[fragi]<<(pli+1&2); + satd=frag_satd[fragi]<<(pli+1&2); + sqrt_ssd=sqrt(frag_ssd[fragi]); + qtj=mb_mode!=OC_MODE_INTRA; + /*Accumulate statistics. + The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by + OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor + yet still use old data.*/ + bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1); + oc_mode_metrics_add( + OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin, + fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd); + bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1); + oc_mode_metrics_add( + OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin, + fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd); + } + } + /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/ + oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1, + OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD); + oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1, + OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD); +} +# endif + +#endif diff --git a/thirdparty/libtheora/collect.h b/thirdparty/libtheora/collect.h new file mode 100644 index 0000000000..9458b84e3f --- /dev/null +++ b/thirdparty/libtheora/collect.h @@ -0,0 +1,109 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: mode selection code + last mod: $Id$ + + ********************************************************************/ +#if !defined(_collect_H) +# define _collect_H (1) +# include "encint.h" +# if defined(OC_COLLECT_METRICS) +# include <stdio.h> + + + +typedef struct oc_mode_metrics oc_mode_metrics; + + + +/**Sets the file name to load/store mode metrics from/to. + * The file name string is stored by reference, and so must be valid for the + * lifetime of the encoder. + * Mode metric collection uses global tables; do not attempt to perform + * multiple collections at once. + * \param[in] _buf <tt>char[]</tt> The file name. + * \retval TH_EIMPL Not supported by this implementation.*/ +#define TH_ENCCTL_SET_METRICS_FILE (0x8000) + + + +/*Accumulates various weighted sums of the measurements. + w -> weight + s -> SATD + q -> log quantizer + r -> rate (in bits) + d -> RMSE + All of the single letters correspond to direct, weighted sums, e.g., + w=sum(w_i), s=sum(s_i*w_i), etc. + The others correspond to central moments (or co-moments) of the given order, + e.g., sq=sum((s_i-s/w)*(q_i-q/w)*w_i). 
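+  As a small worked example: two samples with w_i=1 and s_i=1,3 give w=2,
+   s=4, and a second central moment of s2=(1-4/2)^2+(3-4/2)^2=2.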
+ Because we need some moments up to fourth order, we use central moments to + minimize the dynamic range and prevent rounding error from dominating the + calculations.*/ +struct oc_mode_metrics{ + double w; + double s; + double q; + double r; + double d; + double s2; + double sq; + double q2; + double sr; + double qr; + double r2; + double sd; + double qd; + double d2; + double s2q; + double sq2; + double sqr; + double sqd; + double s2q2; +}; + + +# define OC_ZWEIGHT (0.25) + +/*TODO: It may be helpful (for block-level quantizers especially) to separate + out the contributions from AC and DC into separate tables.*/ + +extern ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]; +extern oc_mode_rd OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]; +extern oc_mode_rd OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]; + +extern int OC_HAS_MODE_METRICS; +extern oc_mode_metrics OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS]; +extern oc_mode_metrics OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS]; +extern const char *OC_MODE_METRICS_FILENAME; + +void oc_mode_metrics_dump(); +void oc_mode_metrics_print(FILE *_fout); + +void oc_mode_metrics_add(oc_mode_metrics *_metrics, + double _w,int _s,int _q,int _r,double _d); +void oc_mode_metrics_merge(oc_mode_metrics *_dst, + const oc_mode_metrics *_src,int _n); +double oc_mode_metrics_solve(double *_r,double *_d, + const oc_mode_metrics *_metrics,const int *_s0,const int *_s1, + const int *_q0,const int *_q1, + const double *_ra,const double *_rb,const double *_rc, + const double *_da,const double *_db,const double *_dc,int _n); +void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS], + int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS], + int shift,double (*_weight)[3][2][OC_COMP_BINS]); +void oc_enc_mode_metrics_load(oc_enc_ctx *_enc); +void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc); + +# endif +#endif diff --git a/thirdparty/libtheora/dct.h b/thirdparty/libtheora/dct.h index 24ba6f111a..8052ea6bc1 100644 --- a/thirdparty/libtheora/dct.h +++ b/thirdparty/libtheora/dct.h @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: dct.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ diff --git a/thirdparty/libtheora/decinfo.c b/thirdparty/libtheora/decinfo.c index 845eb1361c..a91e740b15 100644 --- a/thirdparty/libtheora/decinfo.c +++ b/thirdparty/libtheora/decinfo.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: decinfo.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -20,6 +20,11 @@ #include <limits.h> #include "decint.h" +/*Only used for fuzzing.*/ +#if defined(HAVE_MEMORY_CONSTRAINT) +static const int MAX_FUZZING_WIDTH = 16384; +static const int MAX_FUZZING_HEIGHT = 16384; +#endif /*Unpacks a series of octets from a given byte array into the pack buffer. @@ -55,8 +60,8 @@ static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){ /*verify we can parse this bitstream version. 
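     (The decoder's own version is given by TH_VERSION_MAJOR and
      TH_VERSION_MINOR, currently 3.2.)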
    We accept earlier minors and all subminors, by spec*/
  if(_info->version_major>TH_VERSION_MAJOR||
-   _info->version_major==TH_VERSION_MAJOR&&
-   _info->version_minor>TH_VERSION_MINOR){
+   (_info->version_major==TH_VERSION_MAJOR&&
+   _info->version_minor>TH_VERSION_MINOR)){
    return TH_EVERSION;
  }
  /*Read the encoded frame description.*/
@@ -82,6 +87,11 @@ static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
   _info->fps_numerator==0||_info->fps_denominator==0){
    return TH_EBADHEADER;
  }
+#if defined(HAVE_MEMORY_CONSTRAINT)
+  if(_info->frame_width>=MAX_FUZZING_WIDTH&&_info->frame_height>=MAX_FUZZING_HEIGHT){
+    return TH_EBADHEADER;
+  }
+#endif
  /*Note: The sense of pic_y is inverted in what we pass back to the
     application compared to how it is stored in the bitstream.
    This is because the bitstream uses a right-handed coordinate system, while
@@ -128,6 +138,10 @@ static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
   _tc->comments*sizeof(_tc->comment_lengths[0]));
  _tc->user_comments=(char **)_ogg_malloc(
   _tc->comments*sizeof(_tc->user_comments[0]));
+  if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
+    _tc->comments=0;
+    return TH_EFAULT;
+  }
  for(i=0;i<_tc->comments;i++){
    len=oc_unpack_length(_opb);
    if(len<0||len>oc_pack_bytes_left(_opb)){
@@ -168,9 +182,23 @@ static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info,
  int ret;
  val=oc_pack_read(_opb,8);
  packtype=(int)val;
-  /*If we're at a data packet and we have received all three headers, we're
-     done.*/
-  if(!(packtype&0x80)&&_info->frame_width>0&&_tc->vendor!=NULL&&*_setup!=NULL){
+  /*If we're at a data packet...*/
+  if(!(packtype&0x80)){
+    /*Check to make sure we received all three headers...
+      If we haven't seen any valid headers, assume this is not actually
+       Theora.*/
+    if(_info->frame_width<=0)return TH_ENOTFORMAT;
+    /*Follow our documentation, which says we'll return TH_EFAULT if this
+       is NULL (_info was checked by our caller).*/
+    if(_tc==NULL)return TH_EFAULT;
+    /*And if any other headers were missing, declare this packet "out of
+       sequence" instead.*/
+    if(_tc->vendor==NULL)return TH_EBADHEADER;
+    /*Don't check this until it's needed, since we allow passing NULL for the
+       arguments that we're not expecting the next header to fill in yet.*/
+    if(_setup==NULL)return TH_EFAULT;
+    if(*_setup==NULL)return TH_EBADHEADER;
+    /*If we got everything, we're done.*/
    return 0;
  }
  /*Check the codec string.*/
diff --git a/thirdparty/libtheora/decint.h b/thirdparty/libtheora/decint.h
index 261b67631a..3cea6b1439 100644
--- a/thirdparty/libtheora/decint.h
+++ b/thirdparty/libtheora/decint.h
@@ -11,7 +11,7 @@
 ********************************************************************

  function:
-  last mod: $Id: decint.h 16503 2009-08-22 18:14:02Z giles $
+  last mod: $Id$

 ********************************************************************/

@@ -19,15 +19,39 @@
#if !defined(_decint_H)
# define _decint_H (1)
# include "theora/theoradec.h"
-# include "internal.h"
+# include "state.h"
# include "bitpack.h"
-
-typedef struct th_setup_info oc_setup_info;
-typedef struct th_dec_ctx oc_dec_ctx;
-
# include "huffdec.h"
# include "dequant.h"

+typedef struct th_setup_info oc_setup_info;
+typedef struct oc_dec_opt_vtable oc_dec_opt_vtable;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct th_dec_ctx oc_dec_ctx;
+
+
+
+/*Decoder-specific accelerated functions.*/
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xdec.h"
+# endif
+
+# if !defined(oc_dec_accel_init)
+#  define oc_dec_accel_init oc_dec_accel_init_c
+# endif
+# if defined(OC_DEC_USE_VTABLE)
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
+ ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
+#  endif
+# else
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
+#  endif
+# endif
+
+
+
 /*Constants for the packet-in state machine specific to the decoder.*/

 /*Next packet to read: Data packet.*/
@@ -37,71 +61,125 @@ typedef struct th_dec_ctx oc_dec_ctx;

struct th_setup_info{
  /*The Huffman codes.*/
-  oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
+  ogg_int16_t  *huff_tables[TH_NHUFFMAN_TABLES];
  /*The quantization parameters.*/
  th_quant_info qinfo;
};

+/*Decoder specific functions with accelerated variants.*/
+struct oc_dec_opt_vtable{
+  void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
+   oc_dec_pipeline_state *_pipe,int _pli);
+};
+
+
+
+struct oc_dec_pipeline_state{
+  /*Decoded DCT coefficients.
+    These are placed here instead of on the stack so that they can persist
+     between blocks, which makes clearing them back to zero much faster when
+     only a few non-zero coefficients were decoded.
+    It requires at least 65 elements because the zig-zag index array uses the
+     65th element as a dumping ground for out-of-range indices to protect us
+     from buffer overflow.
+    We make it fully twice as large so that the second half can serve as the
+     reconstruction buffer, which saves passing another parameter to all the
+     acceleration functions.
+    It also solves problems with 16-byte alignment for NEON on ARM.
+    gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+     alignment, and silently produces incorrect results if you ask for 16.
+    Finally, keeping it off the stack means there's less likely to be a data
+     hazard between the NEON co-processor and the regular ARM core, which avoids
+     unnecessary stalls.*/
+  OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *dequant[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][4];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
+};
+
+
 struct th_dec_ctx{
  /*Shared encoder/decoder state.*/
-  oc_theora_state state;
+  oc_theora_state        state;
  /*Whether or not packets are ready to be emitted.
This takes on negative values while there are remaining header packets to be emitted, reaches 0 when the codec is ready for input, and goes to 1 when a frame has been processed and a data packet is ready.*/ - int packet_state; + int packet_state; /*Buffer in which to assemble packets.*/ - oc_pack_buf opb; + oc_pack_buf opb; /*Huffman decode trees.*/ - oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES]; + ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES]; /*The index of the first token in each plane for each coefficient.*/ - ptrdiff_t ti0[3][64]; + ptrdiff_t ti0[3][64]; /*The number of outstanding EOB runs at the start of each coefficient in each plane.*/ - ptrdiff_t eob_runs[3][64]; + ptrdiff_t eob_runs[3][64]; /*The DCT token lists.*/ - unsigned char *dct_tokens; + unsigned char *dct_tokens; /*The extra bits associated with DCT tokens.*/ - unsigned char *extra_bits; + unsigned char *extra_bits; /*The number of dct tokens unpacked so far.*/ - int dct_tokens_count; + int dct_tokens_count; /*The out-of-loop post-processing level.*/ - int pp_level; + int pp_level; /*The DC scale used for out-of-loop deblocking.*/ - int pp_dc_scale[64]; + int pp_dc_scale[64]; /*The sharpen modifier used for out-of-loop deringing.*/ - int pp_sharp_mod[64]; + int pp_sharp_mod[64]; /*The DC quantization index of each block.*/ - unsigned char *dc_qis; + unsigned char *dc_qis; /*The variance of each block.*/ - int *variances; + int *variances; /*The storage for the post-processed frame buffer.*/ - unsigned char *pp_frame_data; + unsigned char *pp_frame_data; /*Whether or not the post-processsed frame buffer has space for chroma.*/ - int pp_frame_state; + int pp_frame_state; /*The buffer used for the post-processed frame. Note that this is _not_ guaranteed to have the same strides and offsets as the reference frame buffers.*/ - th_ycbcr_buffer pp_frame_buf; + th_ycbcr_buffer pp_frame_buf; /*The striped decode callback function.*/ - th_stripe_callback stripe_cb; + th_stripe_callback stripe_cb; + oc_dec_pipeline_state pipe; +# if defined(OC_DEC_USE_VTABLE) + /*Table for decoder acceleration functions.*/ + oc_dec_opt_vtable opt_vtable; +# endif # if defined(HAVE_CAIRO) /*Output metrics for debugging.*/ - int telemetry; - int telemetry_mbmode; - int telemetry_mv; - int telemetry_qi; - int telemetry_bits; - int telemetry_frame_bytes; - int telemetry_coding_bytes; - int telemetry_mode_bytes; - int telemetry_mv_bytes; - int telemetry_qi_bytes; - int telemetry_dc_bytes; - unsigned char *telemetry_frame_data; + int telemetry_mbmode; + int telemetry_mv; + int telemetry_qi; + int telemetry_bits; + int telemetry_frame_bytes; + int telemetry_coding_bytes; + int telemetry_mode_bytes; + int telemetry_mv_bytes; + int telemetry_qi_bytes; + int telemetry_dc_bytes; + unsigned char *telemetry_frame_data; # endif }; +/*Default pure-C implementations of decoder-specific accelerated functions.*/ +void oc_dec_accel_init_c(oc_dec_ctx *_dec); + +void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec, + oc_dec_pipeline_state *_pipe,int _pli); + #endif diff --git a/thirdparty/libtheora/decode.c b/thirdparty/libtheora/decode.c index bde967b794..fad26e0927 100644 --- a/thirdparty/libtheora/decode.c +++ b/thirdparty/libtheora/decode.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: decode.c 16581 2009-09-25 22:56:16Z gmaxwell $ + last mod: $Id$ ********************************************************************/ @@ -118,7 +118,7 @@ static const unsigned char 
OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={ /*Whether or not an internal token needs any additional extra bits.*/ #define OC_DCT_TOKEN_NEEDS_MORE(token) \ - (token<(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \ + (token<(int)(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \ sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS))) /*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/ @@ -129,7 +129,7 @@ static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={ is not yet available everywhere; this should be equivalent.*/ #define OC_DCT_EOB_FINISH (~(size_t)0>>1) -/*The location of the (6) run legth bits in the code word. +/*The location of the (6) run length bits in the code word. These are placed at index 0 and given 8 bits (even though 6 would suffice) because it may be faster to extract the lower byte on some platforms.*/ #define OC_DCT_CW_RLEN_SHIFT (0) @@ -297,8 +297,6 @@ static const ogg_int32_t OC_DCT_CODE_WORD[92]={ static int oc_sb_run_unpack(oc_pack_buf *_opb){ - long bits; - int ret; /*Coding scheme: Codeword Run Length 0 1 @@ -308,32 +306,26 @@ static int oc_sb_run_unpack(oc_pack_buf *_opb){ 11110xxx 10-17 111110xxxx 18-33 111111xxxxxxxxxxxx 34-4129*/ - bits=oc_pack_read1(_opb); - if(bits==0)return 1; - bits=oc_pack_read(_opb,2); - if((bits&2)==0)return 2+(int)bits; - else if((bits&1)==0){ - bits=oc_pack_read1(_opb); - return 4+(int)bits; - } - bits=oc_pack_read(_opb,3); - if((bits&4)==0)return 6+(int)bits; - else if((bits&2)==0){ - ret=10+((bits&1)<<2); - bits=oc_pack_read(_opb,2); - return ret+(int)bits; - } - else if((bits&1)==0){ - bits=oc_pack_read(_opb,4); - return 18+(int)bits; + static const ogg_int16_t OC_SB_RUN_TREE[22]={ + 4, + -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1), + -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1), + -(3<<8|2),-(3<<8|2),-(3<<8|3),-(3<<8|3), + -(4<<8|4),-(4<<8|5),-(4<<8|2<<4|6-6),17, + 2, + -(2<<8|2<<4|10-6),-(2<<8|2<<4|14-6),-(2<<8|4<<4|18-6),-(2<<8|12<<4|34-6) + }; + int ret; + ret=oc_huff_token_decode(_opb,OC_SB_RUN_TREE); + if(ret>=0x10){ + int offs; + offs=ret&0x1F; + ret=6+offs+(int)oc_pack_read(_opb,ret-offs>>4); } - bits=oc_pack_read(_opb,12); - return 34+(int)bits; + return ret; } static int oc_block_run_unpack(oc_pack_buf *_opb){ - long bits; - long bits2; /*Coding scheme: Codeword Run Length 0x 1-2 @@ -342,26 +334,37 @@ static int oc_block_run_unpack(oc_pack_buf *_opb){ 1110xx 7-10 11110xx 11-14 11111xxxx 15-30*/ - bits=oc_pack_read(_opb,2); - if((bits&2)==0)return 1+(int)bits; - else if((bits&1)==0){ - bits=oc_pack_read1(_opb); - return 3+(int)bits; - } - bits=oc_pack_read(_opb,2); - if((bits&2)==0)return 5+(int)bits; - else if((bits&1)==0){ - bits=oc_pack_read(_opb,2); - return 7+(int)bits; - } - bits=oc_pack_read(_opb,3); - if((bits&4)==0)return 11+bits; - bits2=oc_pack_read(_opb,2); - return 15+((bits&3)<<2)+bits2; + static const ogg_int16_t OC_BLOCK_RUN_TREE[61]={ + 5, + -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1), + -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1), + -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2), + -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2), + -(3<<8|3),-(3<<8|3),-(3<<8|3),-(3<<8|3), + -(3<<8|4),-(3<<8|4),-(3<<8|4),-(3<<8|4), + -(4<<8|5),-(4<<8|5),-(4<<8|6),-(4<<8|6), + 33, 36, 39, 44, + 1,-(1<<8|7),-(1<<8|8), + 1,-(1<<8|9),-(1<<8|10), + 2,-(2<<8|11),-(2<<8|12),-(2<<8|13),-(2<<8|14), + 4, + -(4<<8|15),-(4<<8|16),-(4<<8|17),-(4<<8|18), + -(4<<8|19),-(4<<8|20),-(4<<8|21),-(4<<8|22), + -(4<<8|23),-(4<<8|24),-(4<<8|25),-(4<<8|26), + -(4<<8|27),-(4<<8|28),-(4<<8|29),-(4<<8|30) + }; + return oc_huff_token_decode(_opb,OC_BLOCK_RUN_TREE); } +void 
oc_dec_accel_init_c(oc_dec_ctx *_dec){ +# if defined(OC_DEC_USE_VTABLE) + _dec->opt_vtable.dc_unpredict_mcu_plane= + oc_dec_dc_unpredict_mcu_plane_c; +# endif +} + static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info, const th_setup_info *_setup){ int qti; @@ -371,7 +374,7 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info, ret=oc_state_init(&_dec->state,_info,3); if(ret<0)return ret; ret=oc_huff_trees_copy(_dec->huff_tables, - (const oc_huff_node *const *)_setup->huff_tables); + (const ogg_int16_t *const *)_setup->huff_tables); if(ret<0){ oc_state_clear(&_dec->state); return ret; @@ -406,6 +409,7 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info, } memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits, sizeof(_dec->state.loop_filter_limits)); + oc_dec_accel_init(_dec); _dec->pp_level=OC_PP_LEVEL_DISABLED; _dec->dc_qis=NULL; _dec->variances=NULL; @@ -413,7 +417,6 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info, _dec->stripe_cb.ctx=NULL; _dec->stripe_cb.stripe_decoded=NULL; #if defined(HAVE_CAIRO) - _dec->telemetry=0; _dec->telemetry_bits=0; _dec->telemetry_qi=0; _dec->telemetry_mbmode=0; @@ -504,6 +507,7 @@ static void oc_dec_mark_all_intra(oc_dec_ctx *_dec){ fragi=sb_maps[sbi][quadi][bi]; if(fragi>=0){ frags[fragi].coded=1; + frags[fragi].refi=OC_FRAME_SELF; frags[fragi].mb_mode=OC_MODE_INTRA; coded_fragis[ncoded_fragis++]=fragi; } @@ -595,6 +599,7 @@ static void oc_dec_coded_sb_flags_unpack(oc_dec_ctx *_dec){ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){ const oc_sb_map *sb_maps; const oc_sb_flags *sb_flags; + signed char *mb_modes; oc_fragment *frags; unsigned nsbs; unsigned sbi; @@ -617,6 +622,7 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){ else flag=0; sb_maps=(const oc_sb_map *)_dec->state.sb_maps; sb_flags=_dec->state.sb_flags; + mb_modes=_dec->state.mb_modes; frags=_dec->state.frags; sbi=nsbs=run_count=0; coded_fragis=_dec->state.coded_fragis; @@ -627,7 +633,9 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){ for(;sbi<nsbs;sbi++){ int quadi; for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){ + int quad_coded; int bi; + quad_coded=0; for(bi=0;bi<4;bi++){ ptrdiff_t fragi; fragi=sb_maps[sbi][quadi][bi]; @@ -645,9 +653,13 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){ } if(coded)coded_fragis[ncoded_fragis++]=fragi; else *(uncoded_fragis-++nuncoded_fragis)=fragi; + quad_coded|=coded; frags[fragi].coded=coded; + frags[fragi].refi=OC_FRAME_NONE; } } + /*Remember if there's a coded luma block in this macro block.*/ + if(!pli)mb_modes[sbi<<2|quadi]=quad_coded; } } _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis; @@ -659,33 +671,39 @@ static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){ } +/*Coding scheme: + Codeword Mode Index + 0 0 + 10 1 + 110 2 + 1110 3 + 11110 4 + 111110 5 + 1111110 6 + 1111111 7*/ +static const ogg_int16_t OC_VLC_MODE_TREE[26]={ + 4, + -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0), + -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0), + -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1), + -(3<<8|2),-(3<<8|2),-(4<<8|3),17, + 3, + -(1<<8|4),-(1<<8|4),-(1<<8|4),-(1<<8|4), + -(2<<8|5),-(2<<8|5),-(3<<8|6),-(3<<8|7) +}; -typedef int (*oc_mode_unpack_func)(oc_pack_buf *_opb); - -static int oc_vlc_mode_unpack(oc_pack_buf *_opb){ - long val; - int i; - for(i=0;i<7;i++){ - val=oc_pack_read1(_opb); - if(!val)break; - } - return i; -} - -static int oc_clc_mode_unpack(oc_pack_buf *_opb){ - long val; - val=oc_pack_read(_opb,3); - return (int)val; -} +static 
const ogg_int16_t OC_CLC_MODE_TREE[9]={ + 3, + -(3<<8|0),-(3<<8|1),-(3<<8|2),-(3<<8|3), + -(3<<8|4),-(3<<8|5),-(3<<8|6),-(3<<8|7) +}; /*Unpacks the list of macro block modes for INTER frames.*/ static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){ - const oc_mb_map *mb_maps; signed char *mb_modes; - const oc_fragment *frags; const unsigned char *alphabet; unsigned char scheme0_alphabet[8]; - oc_mode_unpack_func mode_unpack; + const ogg_int16_t *mode_tree; size_t nmbs; size_t mbi; long val; @@ -707,65 +725,80 @@ static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){ alphabet=scheme0_alphabet; } else alphabet=OC_MODE_ALPHABETS[mode_scheme-1]; - if(mode_scheme==7)mode_unpack=oc_clc_mode_unpack; - else mode_unpack=oc_vlc_mode_unpack; + mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE; mb_modes=_dec->state.mb_modes; - mb_maps=(const oc_mb_map *)_dec->state.mb_maps; nmbs=_dec->state.nmbs; - frags=_dec->state.frags; for(mbi=0;mbi<nmbs;mbi++){ - if(mb_modes[mbi]!=OC_MODE_INVALID){ - int bi; - /*Check for a coded luma block in this macro block.*/ - for(bi=0;bi<4&&!frags[mb_maps[mbi][0][bi]].coded;bi++); - /*We found one, decode a mode.*/ - if(bi<4)mb_modes[mbi]=alphabet[(*mode_unpack)(&_dec->opb)]; - /*There were none: INTER_NOMV is forced.*/ - else mb_modes[mbi]=OC_MODE_INTER_NOMV; + if(mb_modes[mbi]>0){ + /*We have a coded luma block; decode a mode.*/ + mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)]; } + /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the + fact that OC_MODE_INTER_NOMV is already 0.*/ } } -typedef int (*oc_mv_comp_unpack_func)(oc_pack_buf *_opb); +static const ogg_int16_t OC_VLC_MV_COMP_TREE[101]={ + 5, + -(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0), + -(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1), + -(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1), + -(4<<8|32+2),-(4<<8|32+2),-(4<<8|32-2),-(4<<8|32-2), + -(4<<8|32+3),-(4<<8|32+3),-(4<<8|32-3),-(4<<8|32-3), + 33, 36, 39, 42, + 45, 50, 55, 60, + 65, 74, 83, 92, + 1,-(1<<8|32+4),-(1<<8|32-4), + 1,-(1<<8|32+5),-(1<<8|32-5), + 1,-(1<<8|32+6),-(1<<8|32-6), + 1,-(1<<8|32+7),-(1<<8|32-7), + 2,-(2<<8|32+8),-(2<<8|32-8),-(2<<8|32+9),-(2<<8|32-9), + 2,-(2<<8|32+10),-(2<<8|32-10),-(2<<8|32+11),-(2<<8|32-11), + 2,-(2<<8|32+12),-(2<<8|32-12),-(2<<8|32+13),-(2<<8|32-13), + 2,-(2<<8|32+14),-(2<<8|32-14),-(2<<8|32+15),-(2<<8|32-15), + 3, + -(3<<8|32+16),-(3<<8|32-16),-(3<<8|32+17),-(3<<8|32-17), + -(3<<8|32+18),-(3<<8|32-18),-(3<<8|32+19),-(3<<8|32-19), + 3, + -(3<<8|32+20),-(3<<8|32-20),-(3<<8|32+21),-(3<<8|32-21), + -(3<<8|32+22),-(3<<8|32-22),-(3<<8|32+23),-(3<<8|32-23), + 3, + -(3<<8|32+24),-(3<<8|32-24),-(3<<8|32+25),-(3<<8|32-25), + -(3<<8|32+26),-(3<<8|32-26),-(3<<8|32+27),-(3<<8|32-27), + 3, + -(3<<8|32+28),-(3<<8|32-28),-(3<<8|32+29),-(3<<8|32-29), + -(3<<8|32+30),-(3<<8|32-30),-(3<<8|32+31),-(3<<8|32-31) +}; + +static const ogg_int16_t OC_CLC_MV_COMP_TREE[65]={ + 6, + -(6<<8|32 +0),-(6<<8|32 -0),-(6<<8|32 +1),-(6<<8|32 -1), + -(6<<8|32 +2),-(6<<8|32 -2),-(6<<8|32 +3),-(6<<8|32 -3), + -(6<<8|32 +4),-(6<<8|32 -4),-(6<<8|32 +5),-(6<<8|32 -5), + -(6<<8|32 +6),-(6<<8|32 -6),-(6<<8|32 +7),-(6<<8|32 -7), + -(6<<8|32 +8),-(6<<8|32 -8),-(6<<8|32 +9),-(6<<8|32 -9), + -(6<<8|32+10),-(6<<8|32-10),-(6<<8|32+11),-(6<<8|32-11), + -(6<<8|32+12),-(6<<8|32-12),-(6<<8|32+13),-(6<<8|32-13), + -(6<<8|32+14),-(6<<8|32-14),-(6<<8|32+15),-(6<<8|32-15), + -(6<<8|32+16),-(6<<8|32-16),-(6<<8|32+17),-(6<<8|32-17), + -(6<<8|32+18),-(6<<8|32-18),-(6<<8|32+19),-(6<<8|32-19), + 
-(6<<8|32+20),-(6<<8|32-20),-(6<<8|32+21),-(6<<8|32-21), + -(6<<8|32+22),-(6<<8|32-22),-(6<<8|32+23),-(6<<8|32-23), + -(6<<8|32+24),-(6<<8|32-24),-(6<<8|32+25),-(6<<8|32-25), + -(6<<8|32+26),-(6<<8|32-26),-(6<<8|32+27),-(6<<8|32-27), + -(6<<8|32+28),-(6<<8|32-28),-(6<<8|32+29),-(6<<8|32-29), + -(6<<8|32+30),-(6<<8|32-30),-(6<<8|32+31),-(6<<8|32-31) +}; -static int oc_vlc_mv_comp_unpack(oc_pack_buf *_opb){ - long bits; - int mask; - int mv; - bits=oc_pack_read(_opb,3); - switch(bits){ - case 0:return 0; - case 1:return 1; - case 2:return -1; - case 3: - case 4:{ - mv=(int)(bits-1); - bits=oc_pack_read1(_opb); - }break; - /*case 5: - case 6: - case 7:*/ - default:{ - mv=1<<bits-3; - bits=oc_pack_read(_opb,bits-2); - mv+=(int)(bits>>1); - bits&=1; - }break; - } - mask=-(int)bits; - return mv+mask^mask; -} -static int oc_clc_mv_comp_unpack(oc_pack_buf *_opb){ - long bits; - int mask; - int mv; - bits=oc_pack_read(_opb,6); - mv=(int)bits>>1; - mask=-((int)bits&1); - return mv+mask^mask; +static oc_mv oc_mv_unpack(oc_pack_buf *_opb,const ogg_int16_t *_tree){ + int dx; + int dy; + dx=oc_huff_token_decode(_opb,_tree)-32; + dy=oc_huff_token_decode(_opb,_tree)-32; + return OC_MV(dx,dy); } /*Unpacks the list of motion vectors for INTER frames, and propagtes the macro @@ -774,105 +807,93 @@ static void oc_dec_mv_unpack_and_frag_modes_fill(oc_dec_ctx *_dec){ const oc_mb_map *mb_maps; const signed char *mb_modes; oc_set_chroma_mvs_func set_chroma_mvs; - oc_mv_comp_unpack_func mv_comp_unpack; + const ogg_int16_t *mv_comp_tree; oc_fragment *frags; oc_mv *frag_mvs; const unsigned char *map_idxs; int map_nidxs; - oc_mv last_mv[2]; + oc_mv last_mv; + oc_mv prior_mv; oc_mv cbmvs[4]; size_t nmbs; size_t mbi; long val; set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt]; val=oc_pack_read1(&_dec->opb); - mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack; + mv_comp_tree=val?OC_CLC_MV_COMP_TREE:OC_VLC_MV_COMP_TREE; map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt]; map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt]; - memset(last_mv,0,sizeof(last_mv)); + prior_mv=last_mv=0; frags=_dec->state.frags; frag_mvs=_dec->state.frag_mvs; mb_maps=(const oc_mb_map *)_dec->state.mb_maps; mb_modes=_dec->state.mb_modes; nmbs=_dec->state.nmbs; for(mbi=0;mbi<nmbs;mbi++){ - int mb_mode; + int mb_mode; mb_mode=mb_modes[mbi]; if(mb_mode!=OC_MODE_INVALID){ - oc_mv mbmv; - ptrdiff_t fragi; - int coded[13]; - int codedi; - int ncoded; - int mapi; - int mapii; - /*Search for at least one coded fragment.*/ - ncoded=mapii=0; - do{ - mapi=map_idxs[mapii]; - fragi=mb_maps[mbi][mapi>>2][mapi&3]; - if(frags[fragi].coded)coded[ncoded++]=mapi; - } - while(++mapii<map_nidxs); - if(ncoded<=0)continue; - switch(mb_mode){ - case OC_MODE_INTER_MV_FOUR:{ - oc_mv lbmvs[4]; - int bi; - /*Mark the tail of the list, so we don't accidentally go past it.*/ - coded[ncoded]=-1; - for(bi=codedi=0;bi<4;bi++){ - if(coded[codedi]==bi){ - codedi++; - fragi=mb_maps[mbi][0][bi]; - frags[fragi].mb_mode=mb_mode; - lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb); - lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb); - memcpy(frag_mvs[fragi],lbmvs[bi],sizeof(lbmvs[bi])); - } - else lbmvs[bi][0]=lbmvs[bi][1]=0; - } - if(codedi>0){ - memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1])); - memcpy(last_mv[0],lbmvs[coded[codedi-1]],sizeof(last_mv[0])); + oc_mv mbmv; + ptrdiff_t fragi; + int mapi; + int mapii; + int refi; + if(mb_mode==OC_MODE_INTER_MV_FOUR){ + oc_mv lbmvs[4]; + int bi; + prior_mv=last_mv; + for(bi=0;bi<4;bi++){ + 
fragi=mb_maps[mbi][0][bi]; + if(frags[fragi].coded){ + frags[fragi].refi=OC_FRAME_PREV; + frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR; + lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree); + frag_mvs[fragi]=lbmvs[bi]; } - if(codedi<ncoded){ - (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs); - for(;codedi<ncoded;codedi++){ - mapi=coded[codedi]; - bi=mapi&3; - fragi=mb_maps[mbi][mapi>>2][bi]; - frags[fragi].mb_mode=mb_mode; - memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(cbmvs[bi])); - } + else lbmvs[bi]=0; + } + (*set_chroma_mvs)(cbmvs,lbmvs); + for(mapii=4;mapii<map_nidxs;mapii++){ + mapi=map_idxs[mapii]; + bi=mapi&3; + fragi=mb_maps[mbi][mapi>>2][bi]; + if(frags[fragi].coded){ + frags[fragi].refi=OC_FRAME_PREV; + frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR; + frag_mvs[fragi]=cbmvs[bi]; } - }break; - case OC_MODE_INTER_MV:{ - memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1])); - mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb); - mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb); - }break; - case OC_MODE_INTER_MV_LAST:memcpy(mbmv,last_mv[0],sizeof(mbmv));break; - case OC_MODE_INTER_MV_LAST2:{ - memcpy(mbmv,last_mv[1],sizeof(mbmv)); - memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1])); - memcpy(last_mv[0],mbmv,sizeof(last_mv[0])); - }break; - case OC_MODE_GOLDEN_MV:{ - mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb); - mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb); - }break; - default:memset(mbmv,0,sizeof(mbmv));break; + } } - /*4MV mode fills in the fragments itself. - For all other modes we can use this common code.*/ - if(mb_mode!=OC_MODE_INTER_MV_FOUR){ - for(codedi=0;codedi<ncoded;codedi++){ - mapi=coded[codedi]; + else{ + switch(mb_mode){ + case OC_MODE_INTER_MV:{ + prior_mv=last_mv; + last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree); + }break; + case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break; + case OC_MODE_INTER_MV_LAST2:{ + mbmv=prior_mv; + prior_mv=last_mv; + last_mv=mbmv; + }break; + case OC_MODE_GOLDEN_MV:{ + mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree); + }break; + default:mbmv=0;break; + } + /*Fill in the MVs for the fragments.*/ + refi=OC_FRAME_FOR_MODE(mb_mode); + mapii=0; + do{ + mapi=map_idxs[mapii]; fragi=mb_maps[mbi][mapi>>2][mapi&3]; - frags[fragi].mb_mode=mb_mode; - memcpy(frag_mvs[fragi],mbmv,sizeof(mbmv)); + if(frags[fragi].coded){ + frags[fragi].refi=refi; + frags[fragi].mb_mode=mb_mode; + frag_mvs[fragi]=mbmv; + } } + while(++mapii<map_nidxs); } } } @@ -1181,6 +1202,9 @@ static void oc_dec_residual_tokens_unpack(oc_dec_ctx *_dec){ static int oc_dec_postprocess_init(oc_dec_ctx *_dec){ + /*musl libc malloc()/realloc() calls might use floating point, so make sure + we've cleared the MMX state for them.*/ + oc_restore_fpu(&_dec->state); /*pp_level 0: disabled; free any memory used and return*/ if(_dec->pp_level<=OC_PP_LEVEL_DISABLED){ if(_dec->dc_qis!=NULL){ @@ -1301,34 +1325,16 @@ static int oc_dec_postprocess_init(oc_dec_ctx *_dec){ } - -typedef struct{ - int bounding_values[256]; - ptrdiff_t ti[3][64]; - ptrdiff_t eob_runs[3][64]; - const ptrdiff_t *coded_fragis[3]; - const ptrdiff_t *uncoded_fragis[3]; - ptrdiff_t ncoded_fragis[3]; - ptrdiff_t nuncoded_fragis[3]; - const ogg_uint16_t *dequant[3][3][2]; - int fragy0[3]; - int fragy_end[3]; - int pred_last[3][3]; - int mcu_nvfrags; - int loop_filter; - int pp_level; -}oc_dec_pipeline_state; - - - /*Initialize the main decoding pipeline.*/ static void oc_dec_pipeline_init(oc_dec_ctx *_dec, oc_dec_pipeline_state *_pipe){ const ptrdiff_t *coded_fragis; const ptrdiff_t *uncoded_fragis; + int 
flimit; int pli; int qii; int qti; + int zzi; /*If chroma is sub-sampled in the vertical direction, we have to decode two super block rows of Y' for each super block row of Cb and Cr.*/ _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2); @@ -1360,8 +1366,9 @@ static void oc_dec_pipeline_init(oc_dec_ctx *_dec, /*Set the previous DC predictor to 0 for all color planes and frame types.*/ memset(_pipe->pred_last,0,sizeof(_pipe->pred_last)); /*Initialize the bounding value array for the loop filter.*/ - _pipe->loop_filter=!oc_state_loop_filter_init(&_dec->state, - _pipe->bounding_values); + flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]]; + _pipe->loop_filter=flimit!=0; + if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit); /*Initialize any buffers needed for post-processing. We also save the current post-processing level, to guard against the user changing it from a callback.*/ @@ -1374,13 +1381,15 @@ static void oc_dec_pipeline_init(oc_dec_ctx *_dec, _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]], sizeof(_dec->pp_frame_buf[0])*3); } + /*Clear down the DCT coefficient buffer for the first block.*/ + for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0; } /*Undo the DC prediction in a single plane of an MCU (one or two super block rows). As a side effect, the number of coded and uncoded fragments in this plane of the MCU is also computed.*/ -static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec, +void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec, oc_dec_pipeline_state *_pipe,int _pli){ const oc_fragment_plane *fplane; oc_fragment *frags; @@ -1408,9 +1417,9 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec, predictor for the same reference frame.*/ for(fragx=0;fragx<nhfrags;fragx++,fragi++){ if(frags[fragi].coded){ - int ref; - ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode); - pred_last[ref]=frags[fragi].dc+=pred_last[ref]; + int refi; + refi=frags[fragi].refi; + pred_last[refi]=frags[fragi].dc+=pred_last[refi]; ncoded_fragis++; } } @@ -1423,27 +1432,24 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec, u_frags=frags-nhfrags; l_ref=-1; ul_ref=-1; - u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1; + u_ref=u_frags[fragi].refi; for(fragx=0;fragx<nhfrags;fragx++,fragi++){ int ur_ref; if(fragx+1>=nhfrags)ur_ref=-1; - else{ - ur_ref=u_frags[fragi+1].coded? - OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1; - } + else ur_ref=u_frags[fragi+1].refi; if(frags[fragi].coded){ int pred; - int ref; - ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode); + int refi; + refi=frags[fragi].refi; /*We break out a separate case based on which of our neighbors use the same reference frames. 
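        (The switch key below packs the left, upper-left, upper, and
         upper-right matches into bits 0-3: e.g., key 1 means only the left
         neighbor shares this reference, so its DC value is used directly,
         while key 10 averages the upper-left and upper-right DC values.)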
This is somewhat faster than trying to make a generic case which handles all of them, since it reduces lots of poorly predicted jumps to one switch statement, and also lets a number of the multiplications be optimized out by strength reduction.*/ - switch((l_ref==ref)|(ul_ref==ref)<<1| - (u_ref==ref)<<2|(ur_ref==ref)<<3){ - default:pred=pred_last[ref];break; + switch((l_ref==refi)|(ul_ref==refi)<<1| + (u_ref==refi)<<2|(ur_ref==refi)<<3){ + default:pred=pred_last[refi];break; case 1: case 3:pred=frags[fragi-1].dc;break; case 2:pred=u_frags[fragi-1].dc;break; @@ -1455,6 +1461,7 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec, case 9: case 11: case 13:{ + /*The TI compiler mis-compiles this line.*/ pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128; }break; case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break; @@ -1476,9 +1483,9 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec, else if(abs(pred-p1)>128)pred=p1; }break; } - pred_last[ref]=frags[fragi].dc+=pred; + pred_last[refi]=frags[fragi].dc+=pred; ncoded_fragis++; - l_ref=ref; + l_ref=refi; } else l_ref=-1; ul_ref=u_ref; @@ -1495,7 +1502,7 @@ static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec, /*Reconstructs all coded fragments in a single MCU (one or two super block rows). This requires that each coded fragment have a proper macro block mode and - motion vector (if not in INTRA mode), and have it's DC value decoded, with + motion vector (if not in INTRA mode), and have its DC value decoded, with the DC prediction process reversed, and the number of coded and uncoded fragments in this plane of the MCU be counted. The token lists for each color plane and coefficient should also be filled @@ -1522,16 +1529,11 @@ static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec, eob_runs=_pipe->eob_runs[_pli]; for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0]; for(fragii=0;fragii<ncoded_fragis;fragii++){ - /*This array is made one element larger because the zig-zag index array - uses the final element as a dumping ground for out-of-range indices - to protect us from buffer overflow.*/ - OC_ALIGN8(ogg_int16_t dct_coeffs[65]); const ogg_uint16_t *ac_quant; ptrdiff_t fragi; int last_zzi; int zzi; fragi=coded_fragis[fragii]; - for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0; qti=frags[fragi].mb_mode!=OC_MODE_INTRA; ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti]; /*Decode the AC coefficients.*/ @@ -1568,18 +1570,19 @@ static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec, eob_runs[zzi]=eob; ti[zzi]=lti; zzi+=rlen; - dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]); + _pipe->dct_coeffs[dct_fzig_zag[zzi]]= + (ogg_int16_t)(coeff*(int)ac_quant[zzi]); zzi+=!eob; } } /*TODO: zzi should be exactly 64 here. If it's not, we should report some kind of warning.*/ zzi=OC_MINI(zzi,64); - dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc; + _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc; /*last_zzi is always initialized. 
If your compiler thinks otherwise, it is dumb.*/ oc_state_frag_recon(&_dec->state,fragi,_pli, - dct_coeffs,last_zzi,dc_quant[qti]); + _pipe->dct_coeffs,last_zzi,dc_quant[qti]); } _pipe->coded_fragis[_pli]+=ncoded_fragis; /*Right now the reconstructed MCU has only the coded blocks in it.*/ @@ -1593,9 +1596,14 @@ static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec, code, and the hard case (high bitrate, high resolution) is handled correctly.*/ /*Copy the uncoded blocks from the previous reference frame.*/ - _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli]; - oc_state_frag_copy_list(&_dec->state,_pipe->uncoded_fragis[_pli], - _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli); + if(_pipe->nuncoded_fragis[_pli]>0){ + _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli]; + oc_frag_copy_list(&_dec->state, + _dec->state.ref_frame_data[OC_FRAME_SELF], + _dec->state.ref_frame_data[OC_FRAME_PREV], + _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli], + _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs); + } } /*Filter a horizontal block edge.*/ @@ -1953,9 +1961,9 @@ static void oc_dec_dering_frag_rows(oc_dec_ctx *_dec,th_img_plane *_img, th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){ oc_dec_ctx *dec; if(_info==NULL||_setup==NULL)return NULL; - dec=_ogg_malloc(sizeof(*dec)); + dec=oc_aligned_malloc(sizeof(*dec),16); if(dec==NULL||oc_dec_init(dec,_info,_setup)<0){ - _ogg_free(dec); + oc_aligned_free(dec); return NULL; } dec->state.curframe_num=0; @@ -1965,7 +1973,7 @@ th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){ void th_decode_free(th_dec_ctx *_dec){ if(_dec!=NULL){ oc_dec_clear(_dec); - _ogg_free(_dec); + oc_aligned_free(_dec); } } @@ -2013,28 +2021,24 @@ int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf, case TH_DECCTL_SET_TELEMETRY_MBMODE:{ if(_dec==NULL||_buf==NULL)return TH_EFAULT; if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; _dec->telemetry_mbmode=*(int *)_buf; return 0; }break; case TH_DECCTL_SET_TELEMETRY_MV:{ if(_dec==NULL||_buf==NULL)return TH_EFAULT; if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; _dec->telemetry_mv=*(int *)_buf; return 0; }break; case TH_DECCTL_SET_TELEMETRY_QI:{ if(_dec==NULL||_buf==NULL)return TH_EFAULT; if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; _dec->telemetry_qi=*(int *)_buf; return 0; }break; case TH_DECCTL_SET_TELEMETRY_BITS:{ if(_dec==NULL||_buf==NULL)return TH_EFAULT; if(_buf_sz!=sizeof(int))return TH_EINVAL; - _dec->telemetry=1; _dec->telemetry_bits=*(int *)_buf; return 0; }break; @@ -2047,63 +2051,751 @@ int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf, buffers (i.e., decoding did not start on a key frame). 
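     (This situation typically arises when a seek, or the start of capture,
      lands on an arbitrary inter frame.)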
We initialize them to a solid gray here.*/ static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){ - th_info *info; - size_t yplane_sz; - size_t cplane_sz; - int yhstride; - int yheight; - int chstride; - int cheight; + th_info *info; + size_t yplane_sz; + size_t cplane_sz; + ptrdiff_t yoffset; + int yhstride; + int yheight; + int chstride; + int cheight; _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0; _dec->state.ref_frame_idx[OC_FRAME_PREV]=0; - _dec->state.ref_frame_idx[OC_FRAME_SELF]=1; + _dec->state.ref_frame_idx[OC_FRAME_SELF]=0; + _dec->state.ref_frame_data[OC_FRAME_GOLD]= + _dec->state.ref_frame_data[OC_FRAME_PREV]= + _dec->state.ref_frame_data[OC_FRAME_SELF]= + _dec->state.ref_frame_bufs[0][0].data; + memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[0], + sizeof(_dec->pp_frame_buf[0])*3); info=&_dec->state.info; - yhstride=info->frame_width+2*OC_UMV_PADDING; + yhstride=abs(_dec->state.ref_ystride[0]); yheight=info->frame_height+2*OC_UMV_PADDING; - chstride=yhstride>>!(info->pixel_fmt&1); + chstride=abs(_dec->state.ref_ystride[1]); cheight=yheight>>!(info->pixel_fmt&2); - yplane_sz=yhstride*(size_t)yheight; + yplane_sz=yhstride*(size_t)yheight+16; cplane_sz=chstride*(size_t)cheight; - memset(_dec->state.ref_frame_data[0],0x80,yplane_sz+2*cplane_sz); + yoffset=yhstride*(ptrdiff_t)(yheight-OC_UMV_PADDING-1)+OC_UMV_PADDING; + memset(_dec->state.ref_frame_data[0]-yoffset,0x80,yplane_sz+2*cplane_sz); +} + +#if defined(HAVE_CAIRO) +static void oc_render_telemetry(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr, + int _telemetry){ + /*Stuff the plane into cairo.*/ + cairo_surface_t *cs; + unsigned char *data; + unsigned char *y_row; + unsigned char *u_row; + unsigned char *v_row; + unsigned char *rgb_row; + int cstride; + int w; + int h; + int x; + int y; + int hdec; + int vdec; + w=_ycbcr[0].width; + h=_ycbcr[0].height; + hdec=!(_dec->state.info.pixel_fmt&1); + vdec=!(_dec->state.info.pixel_fmt&2); + /*Lazy data buffer init. + We could try to re-use the post-processing buffer, which would save + memory, but complicate the allocation logic there. 
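+    (For scale: the buffer below holds one full-resolution luma plane plus
+     two subsampled chroma planes, e.g. 640*480+2*320*240 bytes, about
+     450kB, for 640x480 4:2:0 video.)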
+ I don't think anyone cares about memory usage when using telemetry; it is + not meant for embedded devices.*/ + if(_dec->telemetry_frame_data==NULL){ + _dec->telemetry_frame_data=_ogg_malloc( + (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data)); + if(_dec->telemetry_frame_data==NULL)return; + } + cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h); + /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/ + data=cairo_image_surface_get_data(cs); + if(data==NULL){ + cairo_surface_destroy(cs); + return; + } + cstride=cairo_image_surface_get_stride(cs); + y_row=_ycbcr[0].data; + u_row=_ycbcr[1].data; + v_row=_ycbcr[2].data; + rgb_row=data; + for(y=0;y<h;y++){ + for(x=0;x<w;x++){ + int r; + int g; + int b; + r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200; + g=(3827562*y_row[x]-1287801*u_row[x>>hdec] + -2672387*v_row[x>>hdec]+447306710)/3287200; + b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600; + rgb_row[4*x+0]=OC_CLAMP255(b); + rgb_row[4*x+1]=OC_CLAMP255(g); + rgb_row[4*x+2]=OC_CLAMP255(r); + } + y_row+=_ycbcr[0].stride; + u_row+=_ycbcr[1].stride&-((y&1)|!vdec); + v_row+=_ycbcr[2].stride&-((y&1)|!vdec); + rgb_row+=cstride; + } + /*Draw coded identifier for each macroblock (stored in Hilbert order).*/ + { + cairo_t *c; + const oc_fragment *frags; + oc_mv *frag_mvs; + const signed char *mb_modes; + oc_mb_map *mb_maps; + size_t nmbs; + size_t mbi; + int row2; + int col2; + int qim[3]={0,0,0}; + if(_dec->state.nqis==2){ + int bqi; + bqi=_dec->state.qis[0]; + if(_dec->state.qis[1]>bqi)qim[1]=1; + if(_dec->state.qis[1]<bqi)qim[1]=-1; + } + if(_dec->state.nqis==3){ + int bqi; + int cqi; + int dqi; + bqi=_dec->state.qis[0]; + cqi=_dec->state.qis[1]; + dqi=_dec->state.qis[2]; + if(cqi>bqi&&dqi>bqi){ + if(dqi>cqi){ + qim[1]=1; + qim[2]=2; + } + else{ + qim[1]=2; + qim[2]=1; + } + } + else if(cqi<bqi&&dqi<bqi){ + if(dqi<cqi){ + qim[1]=-1; + qim[2]=-2; + } + else{ + qim[1]=-2; + qim[2]=-1; + } + } + else{ + if(cqi<bqi)qim[1]=-1; + else qim[1]=1; + if(dqi<bqi)qim[2]=-1; + else qim[2]=1; + } + } + c=cairo_create(cs); + frags=_dec->state.frags; + frag_mvs=_dec->state.frag_mvs; + mb_modes=_dec->state.mb_modes; + mb_maps=_dec->state.mb_maps; + nmbs=_dec->state.nmbs; + row2=0; + col2=0; + for(mbi=0;mbi<nmbs;mbi++){ + float x; + float y; + int bi; + y=h-(row2+((col2+1>>1)&1))*16-16; + x=(col2>>1)*16; + cairo_set_line_width(c,1.); + /*Keyframe (all intra) red box.*/ + if(_dec->state.frame_type==OC_INTRA_FRAME){ + if(_dec->telemetry_mbmode&0x02){ + cairo_set_source_rgba(c,1.,0,0,.5); + cairo_rectangle(c,x+2.5,y+2.5,11,11); + cairo_stroke_preserve(c); + cairo_set_source_rgba(c,1.,0,0,.25); + cairo_fill(c); + } + } + else{ + ptrdiff_t fragi; + int frag_mvx; + int frag_mvy; + for(bi=0;bi<4;bi++){ + fragi=mb_maps[mbi][0][bi]; + if(fragi>=0&&frags[fragi].coded){ + frag_mvx=OC_MV_X(frag_mvs[fragi]); + frag_mvy=OC_MV_Y(frag_mvs[fragi]); + break; + } + } + if(bi<4){ + switch(mb_modes[mbi]){ + case OC_MODE_INTRA:{ + if(_dec->telemetry_mbmode&0x02){ + cairo_set_source_rgba(c,1.,0,0,.5); + cairo_rectangle(c,x+2.5,y+2.5,11,11); + cairo_stroke_preserve(c); + cairo_set_source_rgba(c,1.,0,0,.25); + cairo_fill(c); + } + }break; + case OC_MODE_INTER_NOMV:{ + if(_dec->telemetry_mbmode&0x01){ + cairo_set_source_rgba(c,0,0,1.,.5); + cairo_rectangle(c,x+2.5,y+2.5,11,11); + cairo_stroke_preserve(c); + cairo_set_source_rgba(c,0,0,1.,.25); + cairo_fill(c); + } + }break; + case OC_MODE_INTER_MV:{ + if(_dec->telemetry_mbmode&0x04){ + cairo_rectangle(c,x+2.5,y+2.5,11,11); + 
cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x04){
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_INTER_MV_LAST:{
+                if(_dec->telemetry_mbmode&0x08){
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_move_to(c,x+13.5,y+2.5);
+                  cairo_line_to(c,x+2.5,y+8);
+                  cairo_line_to(c,x+13.5,y+13.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x08){
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_INTER_MV_LAST2:{
+                if(_dec->telemetry_mbmode&0x10){
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_move_to(c,x+8,y+2.5);
+                  cairo_line_to(c,x+2.5,y+8);
+                  cairo_line_to(c,x+8,y+13.5);
+                  cairo_move_to(c,x+13.5,y+2.5);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_line_to(c,x+13.5,y+13.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x10){
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_GOLDEN_NOMV:{
+                if(_dec->telemetry_mbmode&0x20){
+                  cairo_set_source_rgba(c,1.,1.,0,.5);
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,1.,1.,0,.25);
+                  cairo_fill(c);
+                }
+              }break;
+              case OC_MODE_GOLDEN_MV:{
+                if(_dec->telemetry_mbmode&0x40){
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_set_source_rgba(c,1.,1.,0,.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x40){
+                  cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_INTER_MV_FOUR:{
+                if(_dec->telemetry_mbmode&0x80){
+                  cairo_rectangle(c,x+2.5,y+2.5,4,4);
+                  cairo_rectangle(c,x+9.5,y+2.5,4,4);
+                  cairo_rectangle(c,x+2.5,y+9.5,4,4);
+                  cairo_rectangle(c,x+9.5,y+9.5,4,4);
+                  cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_stroke(c);
+                }
+                /*4mv is odd, coded in raster order.*/
+                fragi=mb_maps[mbi][0][0];
+                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+4+frag_mvx,y+12-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+4+frag_mvx*.66,y+12-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+4+frag_mvx*.33,y+12-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+4,y+12);
+                  cairo_stroke(c);
+                }
+                fragi=mb_maps[mbi][0][1];
+                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+12+frag_mvx,y+12-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+12+frag_mvx*.66,y+12-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+12+frag_mvx*.33,y+12-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+12,y+12);
+                  cairo_stroke(c);
+                }
+                fragi=mb_maps[mbi][0][2];
+                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+4+frag_mvx,y+4-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+4+frag_mvx*.66,y+4-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+4+frag_mvx*.33,y+4-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+4,y+4);
+                  cairo_stroke(c);
+                }
+                fragi=mb_maps[mbi][0][3];
+                if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                  frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                  cairo_move_to(c,x+12+frag_mvx,y+4-frag_mvy);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+12+frag_mvx*.66,y+4-frag_mvy*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+12+frag_mvx*.33,y+4-frag_mvy*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+12,y+4);
+                  cairo_stroke(c);
+                }
+              }break;
+            }
+          }
+        }
+        /*qii illustration.*/
+        if(_dec->telemetry_qi&0x2){
+          cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE);
+          for(bi=0;bi<4;bi++){
+            ptrdiff_t fragi;
+            int       qiv;
+            int       xp;
+            int       yp;
+            xp=x+(bi&1)*8;
+            yp=y+8-(bi&2)*4;
+            fragi=mb_maps[mbi][0][bi];
+            if(fragi>=0&&frags[fragi].coded){
+              qiv=qim[frags[fragi].qii];
+              cairo_set_line_width(c,3.);
+              cairo_set_source_rgba(c,0.,0.,0.,.5);
+              switch(qiv){
+                /*Double plus:*/
+                case 2:{
+                  if((bi&1)^((bi&2)>>1)){
+                    cairo_move_to(c,xp+2.5,yp+1.5);
+                    cairo_line_to(c,xp+2.5,yp+3.5);
+                    cairo_move_to(c,xp+1.5,yp+2.5);
+                    cairo_line_to(c,xp+3.5,yp+2.5);
+                    cairo_move_to(c,xp+5.5,yp+4.5);
+                    cairo_line_to(c,xp+5.5,yp+6.5);
+                    cairo_move_to(c,xp+4.5,yp+5.5);
+                    cairo_line_to(c,xp+6.5,yp+5.5);
+                    cairo_stroke_preserve(c);
+                    cairo_set_source_rgba(c,0.,1.,1.,1.);
+                  }
+                  else{
+                    cairo_move_to(c,xp+5.5,yp+1.5);
+                    cairo_line_to(c,xp+5.5,yp+3.5);
+                    cairo_move_to(c,xp+4.5,yp+2.5);
+                    cairo_line_to(c,xp+6.5,yp+2.5);
+                    cairo_move_to(c,xp+2.5,yp+4.5);
+                    cairo_line_to(c,xp+2.5,yp+6.5);
+                    cairo_move_to(c,xp+1.5,yp+5.5);
+                    cairo_line_to(c,xp+3.5,yp+5.5);
+                    cairo_stroke_preserve(c);
+                    cairo_set_source_rgba(c,0.,1.,1.,1.);
+                  }
+                }break;
+                /*Double minus:*/
+                case -2:{
+                  cairo_move_to(c,xp+2.5,yp+2.5);
+                  cairo_line_to(c,xp+5.5,yp+2.5);
+                  cairo_move_to(c,xp+2.5,yp+5.5);
+                  cairo_line_to(c,xp+5.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,1.,1.,1.,1.);
+                }break;
+                /*Plus:*/
+                case 1:{
+                  if((bi&2)==0)yp-=2;
+                  if((bi&1)==0)xp-=2;
+                  cairo_move_to(c,xp+4.5,yp+2.5);
+                  cairo_line_to(c,xp+4.5,yp+6.5);
+                  cairo_move_to(c,xp+2.5,yp+4.5);
+                  cairo_line_to(c,xp+6.5,yp+4.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,.1,1.,.3,1.);
+                  break;
+                }
+                /*Fall through.*/
+                /*Minus:*/
+                case -1:{
+ cairo_move_to(c,xp+2.5,yp+4.5); + cairo_line_to(c,xp+6.5,yp+4.5); + cairo_stroke_preserve(c); + cairo_set_source_rgba(c,1.,.3,.1,1.); + }break; + default:continue; + } + cairo_set_line_width(c,1.); + cairo_stroke(c); + } + } + } + col2++; + if((col2>>1)>=_dec->state.nhmbs){ + col2=0; + row2+=2; + } + } + /*Bit usage indicator[s]:*/ + if(_dec->telemetry_bits){ + int widths[6]; + int fpsn; + int fpsd; + int mult; + int fullw; + int padw; + int i; + fpsn=_dec->state.info.fps_numerator; + fpsd=_dec->state.info.fps_denominator; + mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits); + fullw=250.f*h*fpsd*mult/fpsn; + padw=w-24; + /*Header and coded block bits.*/ + if(_dec->telemetry_frame_bytes<0|| + _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){ + _dec->telemetry_frame_bytes=0; + } + if(_dec->telemetry_coding_bytes<0|| + _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){ + _dec->telemetry_coding_bytes=0; + } + if(_dec->telemetry_mode_bytes<0|| + _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){ + _dec->telemetry_mode_bytes=0; + } + if(_dec->telemetry_mv_bytes<0|| + _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){ + _dec->telemetry_mv_bytes=0; + } + if(_dec->telemetry_qi_bytes<0|| + _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){ + _dec->telemetry_qi_bytes=0; + } + if(_dec->telemetry_dc_bytes<0|| + _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){ + _dec->telemetry_dc_bytes=0; + } + widths[0]=padw* + (_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw; + widths[1]=padw* + (_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw; + widths[2]=padw* + (_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw; + widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw; + widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw; + widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw; + for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w; + cairo_set_source_rgba(c,.0,.0,.0,.6); + cairo_rectangle(c,10,h-33,widths[0]+1,5); + cairo_rectangle(c,10,h-29,widths[1]+1,5); + cairo_rectangle(c,10,h-25,widths[2]+1,5); + cairo_rectangle(c,10,h-21,widths[3]+1,5); + cairo_rectangle(c,10,h-17,widths[4]+1,5); + cairo_rectangle(c,10,h-13,widths[5]+1,5); + cairo_fill(c); + cairo_set_source_rgb(c,1,0,0); + cairo_rectangle(c,10.5,h-32.5,widths[0],4); + cairo_fill(c); + cairo_set_source_rgb(c,0,1,0); + cairo_rectangle(c,10.5,h-28.5,widths[1],4); + cairo_fill(c); + cairo_set_source_rgb(c,0,0,1); + cairo_rectangle(c,10.5,h-24.5,widths[2],4); + cairo_fill(c); + cairo_set_source_rgb(c,.6,.4,.0); + cairo_rectangle(c,10.5,h-20.5,widths[3],4); + cairo_fill(c); + cairo_set_source_rgb(c,.3,.3,.3); + cairo_rectangle(c,10.5,h-16.5,widths[4],4); + cairo_fill(c); + cairo_set_source_rgb(c,.5,.5,.8); + cairo_rectangle(c,10.5,h-12.5,widths[5],4); + cairo_fill(c); + } + /*Master qi indicator[s]:*/ + if(_dec->telemetry_qi&0x1){ + cairo_text_extents_t extents; + char buffer[10]; + int p; + int y; + p=0; + y=h-7.5; + if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10; + buffer[p++]=48+_dec->state.qis[0]%10; + if(_dec->state.nqis>=2){ + buffer[p++]=' '; + if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10; + buffer[p++]=48+_dec->state.qis[1]%10; + } + if(_dec->state.nqis==3){ + buffer[p++]=' '; + if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10; + buffer[p++]=48+_dec->state.qis[2]%10; + } + buffer[p++]='\0'; + cairo_select_font_face(c,"sans", + CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD); + cairo_set_font_size(c,18); + 
cairo_text_extents(c,buffer,&extents); + cairo_set_source_rgb(c,1,1,1); + cairo_move_to(c,w-extents.x_advance-10,y); + cairo_show_text(c,buffer); + cairo_set_source_rgb(c,0,0,0); + cairo_move_to(c,w-extents.x_advance-10,y); + cairo_text_path(c,buffer); + cairo_set_line_width(c,.8); + cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND); + cairo_stroke(c); + } + cairo_destroy(c); + } + /*Out of the Cairo plane into the telemetry YUV buffer.*/ + _ycbcr[0].data=_dec->telemetry_frame_data; + _ycbcr[0].stride=_ycbcr[0].width; + _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride; + _ycbcr[1].stride=_ycbcr[1].width; + _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride; + _ycbcr[2].stride=_ycbcr[2].width; + y_row=_ycbcr[0].data; + u_row=_ycbcr[1].data; + v_row=_ycbcr[2].data; + rgb_row=data; + /*This is one of the few places it's worth handling chroma on a + case-by-case basis.*/ + switch(_dec->state.info.pixel_fmt){ + case TH_PF_420:{ + for(y=0;y<h;y+=2){ + unsigned char *y_row2; + unsigned char *rgb_row2; + y_row2=y_row+_ycbcr[0].stride; + rgb_row2=rgb_row+cstride; + for(x=0;x<w;x+=2){ + int y; + int u; + int v; + y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] + +24966*rgb_row[4*x+0]+4207500)/255000; + y_row[x]=OC_CLAMP255(y); + y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5] + +24966*rgb_row[4*x+4]+4207500)/255000; + y_row[x+1]=OC_CLAMP255(y); + y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1] + +24966*rgb_row2[4*x+0]+4207500)/255000; + y_row2[x]=OC_CLAMP255(y); + y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5] + +24966*rgb_row2[4*x+4]+4207500)/255000; + y_row2[x+1]=OC_CLAMP255(y); + u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6] + +rgb_row2[4*x+2]+rgb_row2[4*x+6]) + -16436*(rgb_row[4*x+1]+rgb_row[4*x+5] + +rgb_row2[4*x+1]+rgb_row2[4*x+5]) + +24808*(rgb_row[4*x+0]+rgb_row[4*x+4] + +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930; + v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6] + +rgb_row2[4*x+2]+rgb_row2[4*x+6]) + -32872*(rgb_row[4*x+1]+rgb_row[4*x+5] + +rgb_row2[4*x+1]+rgb_row2[4*x+5]) + -6384*(rgb_row[4*x+0]+rgb_row[4*x+4] + +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510; + u_row[x>>1]=OC_CLAMP255(u); + v_row[x>>1]=OC_CLAMP255(v); + } + y_row+=_ycbcr[0].stride<<1; + u_row+=_ycbcr[1].stride; + v_row+=_ycbcr[2].stride; + rgb_row+=cstride<<1; + } + }break; + case TH_PF_422:{ + for(y=0;y<h;y++){ + for(x=0;x<w;x+=2){ + int y; + int u; + int v; + y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] + +24966*rgb_row[4*x+0]+4207500)/255000; + y_row[x]=OC_CLAMP255(y); + y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5] + +24966*rgb_row[4*x+4]+4207500)/255000; + y_row[x+1]=OC_CLAMP255(y); + u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6]) + -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]) + +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930; + v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6]) + -65744*(rgb_row[4*x+1]+rgb_row[4*x+5]) + -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510; + u_row[x>>1]=OC_CLAMP255(u); + v_row[x>>1]=OC_CLAMP255(v); + } + y_row+=_ycbcr[0].stride; + u_row+=_ycbcr[1].stride; + v_row+=_ycbcr[2].stride; + rgb_row+=cstride; + } + }break; + /*case TH_PF_444:*/ + default:{ + for(y=0;y<h;y++){ + for(x=0;x<w;x++){ + int y; + int u; + int v; + y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] + +24966*rgb_row[4*x+0]+4207500)/255000; + u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1] + +99232*rgb_row[4*x+0]+29032005)/225930; + v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1] + -25536*rgb_row[4*x+0]+45940035)/357510; + y_row[x]=OC_CLAMP255(y); + u_row[x]=OC_CLAMP255(u); + v_row[x]=OC_CLAMP255(v); + } + y_row+=_ycbcr[0].stride; + 
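The integer luma conversion above is BT.601 "studio swing" arithmetic scaled by 1000: Y' = 16 + (65.481*R' + 128.553*G' + 24.966*B')/255, where 65.481 = 219*0.299 and so on, and 4207500 = 16*255000 + 255000/2 folds the +16 offset and round-to-nearest into a single additive constant. (In the 4:2:0 case the chroma coefficients appear divided by 4, e.g. -8372 = -33488/4, because four RGB pixels are averaged per chroma sample.) A small sketch checking the integer form against the floating-point definition:

    #include <stdio.h>

    /* Sketch: verify the fixed-point Y' conversion used above against
       the BT.601 studio-swing formula it encodes. */
    int main(void){
      int r=200,g=100,b=50;  /* arbitrary sample pixel */
      int yi=(65481*r+128553*g+24966*b+4207500)/255000;
      double yf=16.+(65.481*r+128.553*g+24.966*b)/255.;
      printf("integer: %d, floating point: %.3f\n",yi,yf);
      return 0; /* prints 123 vs 122.667: rounded to nearest */
    }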
u_row+=_ycbcr[1].stride; + v_row+=_ycbcr[2].stride; + rgb_row+=cstride; + } + }break; + } + /*Finished. + Destroy the surface.*/ + cairo_surface_destroy(cs); } +#endif int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op, ogg_int64_t *_granpos){ int ret; if(_dec==NULL||_op==NULL)return TH_EFAULT; /*A completely empty packet indicates a dropped frame and is treated exactly - like an inter frame with no coded blocks. - Only proceed if we have a non-empty packet.*/ - if(_op->bytes!=0){ - oc_dec_pipeline_state pipe; - th_ycbcr_buffer stripe_buf; - int stripe_fragy; - int refi; - int pli; - int notstart; - int notdone; + like an inter frame with no coded blocks.*/ + if(_op->bytes==0){ + _dec->state.frame_type=OC_INTER_FRAME; + _dec->state.ntotal_coded_fragis=0; + } + else{ oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes); + ret=oc_dec_frame_header_unpack(_dec); + if(ret<0)return ret; + if(_dec->state.frame_type==OC_INTRA_FRAME)oc_dec_mark_all_intra(_dec); + else oc_dec_coded_flags_unpack(_dec); + } + /*If there have been no reference frames, and we need one, initialize one.*/ + if(_dec->state.frame_type!=OC_INTRA_FRAME&& + (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0|| + _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){ + oc_dec_init_dummy_frame(_dec); + } + /*If this was an inter frame with no coded blocks...*/ + if(_dec->state.ntotal_coded_fragis<=0){ + /*Just update the granule position and return.*/ + _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<< + _dec->state.info.keyframe_granule_shift) + +(_dec->state.curframe_num-_dec->state.keyframe_num); + _dec->state.curframe_num++; + if(_granpos!=NULL)*_granpos=_dec->state.granpos; + return TH_DUPFRAME; + } + else{ + th_ycbcr_buffer stripe_buf; + int stripe_fragy; + int refi; + int pli; + int notstart; + int notdone; +#ifdef HAVE_CAIRO + int telemetry; + /*Save the current telemetry state. 
+ This prevents it from being modified in the middle of decoding this + frame, which could cause us to skip calls to the striped decoding + callback.*/ + telemetry=_dec->telemetry_mbmode||_dec->telemetry_mv|| + _dec->telemetry_qi||_dec->telemetry_bits; +#endif + /*Select a free buffer to use for the reconstructed version of this frame.*/ + for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]|| + refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++); + _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi; + _dec->state.ref_frame_data[OC_FRAME_SELF]= + _dec->state.ref_frame_bufs[refi][0].data; #if defined(HAVE_CAIRO) _dec->telemetry_frame_bytes=_op->bytes; #endif - ret=oc_dec_frame_header_unpack(_dec); - if(ret<0)return ret; - /*Select a free buffer to use for the reconstructed version of this - frame.*/ - if(_dec->state.frame_type!=OC_INTRA_FRAME&& - (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0|| - _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){ - /*No reference frames yet!*/ - oc_dec_init_dummy_frame(_dec); - refi=_dec->state.ref_frame_idx[OC_FRAME_SELF]; - } - else{ - for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]|| - refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++); - _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi; - } if(_dec->state.frame_type==OC_INTRA_FRAME){ - oc_dec_mark_all_intra(_dec); _dec->state.keyframe_num=_dec->state.curframe_num; #if defined(HAVE_CAIRO) _dec->telemetry_coding_bytes= @@ -2112,7 +2804,6 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op, #endif } else{ - oc_dec_coded_flags_unpack(_dec); #if defined(HAVE_CAIRO) _dec->telemetry_coding_bytes=oc_pack_bytes_left(&_dec->opb); #endif @@ -2160,15 +2851,15 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op, An application callback allows further application processing (blitting to video memory, color conversion, etc.) 
to also use the data while it's in cache.*/ - oc_dec_pipeline_init(_dec,&pipe); + oc_dec_pipeline_init(_dec,&_dec->pipe); oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf); notstart=0; notdone=1; - for(stripe_fragy=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){ + for(stripe_fragy=0;notdone;stripe_fragy+=_dec->pipe.mcu_nvfrags){ int avail_fragy0; int avail_fragy_end; avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags; - notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end; + notdone=stripe_fragy+_dec->pipe.mcu_nvfrags<avail_fragy_end; for(pli=0;pli<3;pli++){ oc_fragment_plane *fplane; int frag_shift; @@ -2179,45 +2870,46 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op, /*Compute the first and last fragment row of the current MCU for this plane.*/ frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2); - pipe.fragy0[pli]=stripe_fragy>>frag_shift; - pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags, - pipe.fragy0[pli]+(pipe.mcu_nvfrags>>frag_shift)); - oc_dec_dc_unpredict_mcu_plane(_dec,&pipe,pli); - oc_dec_frags_recon_mcu_plane(_dec,&pipe,pli); + _dec->pipe.fragy0[pli]=stripe_fragy>>frag_shift; + _dec->pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags, + _dec->pipe.fragy0[pli]+(_dec->pipe.mcu_nvfrags>>frag_shift)); + oc_dec_dc_unpredict_mcu_plane(_dec,&_dec->pipe,pli); + oc_dec_frags_recon_mcu_plane(_dec,&_dec->pipe,pli); sdelay=edelay=0; - if(pipe.loop_filter){ + if(_dec->pipe.loop_filter){ sdelay+=notstart; edelay+=notdone; - oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values, - refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay); + oc_state_loop_filter_frag_rows(&_dec->state, + _dec->pipe.bounding_values,OC_FRAME_SELF,pli, + _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay); } /*To fill the borders, we have an additional two pixel delay, since a fragment in the next row could filter its top edge, using two pixels from a fragment in this row. 
But there's no reason to delay a full fragment between the two.*/ oc_state_borders_fill_rows(&_dec->state,refi,pli, - (pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1), - (pipe.fragy_end[pli]-edelay<<3)-(edelay<<1)); + (_dec->pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1), + (_dec->pipe.fragy_end[pli]-edelay<<3)-(edelay<<1)); /*Out-of-loop post-processing.*/ pp_offset=3*(pli!=0); - if(pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){ + if(_dec->pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){ /*Perform de-blocking in one plane.*/ sdelay+=notstart; edelay+=notdone; oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf, _dec->state.ref_frame_bufs[refi],pli, - pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay); - if(pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){ + _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay); + if(_dec->pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){ /*Perform de-ringing in one plane.*/ sdelay+=notstart; edelay+=notdone; oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli, - pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay); + _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay); } } /*If no post-processing is done, we still need to delay a row for the loop filter, thanks to the strange filtering order VP3 chose.*/ - else if(pipe.loop_filter){ + else if(_dec->pipe.loop_filter){ sdelay+=notstart; edelay+=notdone; } @@ -2226,11 +2918,16 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op, doubled, but luma might have more post-processing filters enabled than chroma, so we don't know up front which one is the limiting factor.*/ - avail_fragy0=OC_MINI(avail_fragy0,pipe.fragy0[pli]-sdelay<<frag_shift); + avail_fragy0=OC_MINI(avail_fragy0, + _dec->pipe.fragy0[pli]-sdelay<<frag_shift); avail_fragy_end=OC_MINI(avail_fragy_end, - pipe.fragy_end[pli]-edelay<<frag_shift); + _dec->pipe.fragy_end[pli]-edelay<<frag_shift); } +#ifdef HAVE_CAIRO + if(_dec->stripe_cb.stripe_decoded!=NULL&&!telemetry){ +#else if(_dec->stripe_cb.stripe_decoded!=NULL){ +#endif /*The callback might want to use the FPU, so let's make sure they can. We violate all kinds of ABI restrictions by not doing this until now, but none of them actually matter since we don't use floating @@ -2252,692 +2949,44 @@ int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op, _dec->state.ref_frame_idx[OC_FRAME_GOLD]= _dec->state.ref_frame_idx[OC_FRAME_PREV]= _dec->state.ref_frame_idx[OC_FRAME_SELF]; + _dec->state.ref_frame_data[OC_FRAME_GOLD]= + _dec->state.ref_frame_data[OC_FRAME_PREV]= + _dec->state.ref_frame_data[OC_FRAME_SELF]; } else{ /*Otherwise, just replace the previous reference frame.*/ _dec->state.ref_frame_idx[OC_FRAME_PREV]= _dec->state.ref_frame_idx[OC_FRAME_SELF]; + _dec->state.ref_frame_data[OC_FRAME_PREV]= + _dec->state.ref_frame_data[OC_FRAME_SELF]; } /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG gamma values, if nothing else).*/ oc_restore_fpu(&_dec->state); +#ifdef HAVE_CAIRO + /*If telemetry ioctls are active, we need to draw to the output buffer.*/ + if(telemetry){ + oc_render_telemetry(_dec,stripe_buf,telemetry); + oc_ycbcr_buffer_flip(_dec->pp_frame_buf,stripe_buf); + /*If we had a striped decoding callback, we skipped calling it above + (because the telemetry wasn't rendered yet). 
+ Call it now with the whole frame.*/ + if(_dec->stripe_cb.stripe_decoded!=NULL){ + (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx, + stripe_buf,0,_dec->state.fplanes[0].nvfrags); + } + } +#endif #if defined(OC_DUMP_IMAGES) - /*Don't dump images for dropped frames.*/ + /*We only dump images if there were some coded blocks.*/ oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec"); #endif return 0; } - else{ - if(_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0|| - _dec->state.ref_frame_idx[OC_FRAME_PREV]<0){ - int refi; - /*No reference frames yet!*/ - oc_dec_init_dummy_frame(_dec); - refi=_dec->state.ref_frame_idx[OC_FRAME_PREV]; - _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi; - memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[refi], - sizeof(_dec->pp_frame_buf[0])*3); - } - /*Just update the granule position and return.*/ - _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<< - _dec->state.info.keyframe_granule_shift) - +(_dec->state.curframe_num-_dec->state.keyframe_num); - _dec->state.curframe_num++; - if(_granpos!=NULL)*_granpos=_dec->state.granpos; - return TH_DUPFRAME; - } } int th_decode_ycbcr_out(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr){ if(_dec==NULL||_ycbcr==NULL)return TH_EFAULT; oc_ycbcr_buffer_flip(_ycbcr,_dec->pp_frame_buf); -#if defined(HAVE_CAIRO) - /*If telemetry ioctls are active, we need to draw to the output buffer. - Stuff the plane into cairo.*/ - if(_dec->telemetry){ - cairo_surface_t *cs; - unsigned char *data; - unsigned char *y_row; - unsigned char *u_row; - unsigned char *v_row; - unsigned char *rgb_row; - int cstride; - int w; - int h; - int x; - int y; - int hdec; - int vdec; - w=_ycbcr[0].width; - h=_ycbcr[0].height; - hdec=!(_dec->state.info.pixel_fmt&1); - vdec=!(_dec->state.info.pixel_fmt&2); - /*Lazy data buffer init. - We could try to re-use the post-processing buffer, which would save - memory, but complicate the allocation logic there. 
- I don't think anyone cares about memory usage when using telemetry; it is - not meant for embedded devices.*/ - if(_dec->telemetry_frame_data==NULL){ - _dec->telemetry_frame_data=_ogg_malloc( - (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data)); - if(_dec->telemetry_frame_data==NULL)return 0; - } - cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h); - /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/ - data=cairo_image_surface_get_data(cs); - if(data==NULL){ - cairo_surface_destroy(cs); - return 0; - } - cstride=cairo_image_surface_get_stride(cs); - y_row=_ycbcr[0].data; - u_row=_ycbcr[1].data; - v_row=_ycbcr[2].data; - rgb_row=data; - for(y=0;y<h;y++){ - for(x=0;x<w;x++){ - int r; - int g; - int b; - r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200; - g=(3827562*y_row[x]-1287801*u_row[x>>hdec] - -2672387*v_row[x>>hdec]+447306710)/3287200; - b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600; - rgb_row[4*x+0]=OC_CLAMP255(b); - rgb_row[4*x+1]=OC_CLAMP255(g); - rgb_row[4*x+2]=OC_CLAMP255(r); - } - y_row+=_ycbcr[0].stride; - u_row+=_ycbcr[1].stride&-((y&1)|!vdec); - v_row+=_ycbcr[2].stride&-((y&1)|!vdec); - rgb_row+=cstride; - } - /*Draw coded identifier for each macroblock (stored in Hilbert order).*/ - { - cairo_t *c; - const oc_fragment *frags; - oc_mv *frag_mvs; - const signed char *mb_modes; - oc_mb_map *mb_maps; - size_t nmbs; - size_t mbi; - int row2; - int col2; - int qim[3]={0,0,0}; - if(_dec->state.nqis==2){ - int bqi; - bqi=_dec->state.qis[0]; - if(_dec->state.qis[1]>bqi)qim[1]=1; - if(_dec->state.qis[1]<bqi)qim[1]=-1; - } - if(_dec->state.nqis==3){ - int bqi; - int cqi; - int dqi; - bqi=_dec->state.qis[0]; - cqi=_dec->state.qis[1]; - dqi=_dec->state.qis[2]; - if(cqi>bqi&&dqi>bqi){ - if(dqi>cqi){ - qim[1]=1; - qim[2]=2; - } - else{ - qim[1]=2; - qim[2]=1; - } - } - else if(cqi<bqi&&dqi<bqi){ - if(dqi<cqi){ - qim[1]=-1; - qim[2]=-2; - } - else{ - qim[1]=-2; - qim[2]=-1; - } - } - else{ - if(cqi<bqi)qim[1]=-1; - else qim[1]=1; - if(dqi<bqi)qim[2]=-1; - else qim[2]=1; - } - } - c=cairo_create(cs); - frags=_dec->state.frags; - frag_mvs=_dec->state.frag_mvs; - mb_modes=_dec->state.mb_modes; - mb_maps=_dec->state.mb_maps; - nmbs=_dec->state.nmbs; - row2=0; - col2=0; - for(mbi=0;mbi<nmbs;mbi++){ - float x; - float y; - int bi; - y=h-(row2+((col2+1>>1)&1))*16-16; - x=(col2>>1)*16; - cairo_set_line_width(c,1.); - /*Keyframe (all intra) red box.*/ - if(_dec->state.frame_type==OC_INTRA_FRAME){ - if(_dec->telemetry_mbmode&0x02){ - cairo_set_source_rgba(c,1.,0,0,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,0,0,.25); - cairo_fill(c); - } - } - else{ - const signed char *frag_mv; - ptrdiff_t fragi; - for(bi=0;bi<4;bi++){ - fragi=mb_maps[mbi][0][bi]; - if(fragi>=0&&frags[fragi].coded){ - frag_mv=frag_mvs[fragi]; - break; - } - } - if(bi<4){ - switch(mb_modes[mbi]){ - case OC_MODE_INTRA:{ - if(_dec->telemetry_mbmode&0x02){ - cairo_set_source_rgba(c,1.,0,0,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,0,0,.25); - cairo_fill(c); - } - }break; - case OC_MODE_INTER_NOMV:{ - if(_dec->telemetry_mbmode&0x01){ - cairo_set_source_rgba(c,0,0,1.,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,0,0,1.,.25); - cairo_fill(c); - } - }break; - case OC_MODE_INTER_MV:{ - if(_dec->telemetry_mbmode&0x04){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,0,1.,0,.5); - 
cairo_stroke(c); - } - if(_dec->telemetry_mv&0x04){ - cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_INTER_MV_LAST:{ - if(_dec->telemetry_mbmode&0x08){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,0,1.,0,.5); - cairo_move_to(c,x+13.5,y+2.5); - cairo_line_to(c,x+2.5,y+8); - cairo_line_to(c,x+13.5,y+13.5); - cairo_stroke(c); - } - if(_dec->telemetry_mv&0x08){ - cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_INTER_MV_LAST2:{ - if(_dec->telemetry_mbmode&0x10){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,0,1.,0,.5); - cairo_move_to(c,x+8,y+2.5); - cairo_line_to(c,x+2.5,y+8); - cairo_line_to(c,x+8,y+13.5); - cairo_move_to(c,x+13.5,y+2.5); - cairo_line_to(c,x+8,y+8); - cairo_line_to(c,x+13.5,y+13.5); - cairo_stroke(c); - } - if(_dec->telemetry_mv&0x10){ - cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_GOLDEN_NOMV:{ - if(_dec->telemetry_mbmode&0x20){ - cairo_set_source_rgba(c,1.,1.,0,.5); - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,1.,0,.25); - cairo_fill(c); - } - }break; - case OC_MODE_GOLDEN_MV:{ - if(_dec->telemetry_mbmode&0x40){ - cairo_rectangle(c,x+2.5,y+2.5,11,11); - cairo_set_source_rgba(c,1.,1.,0,.5); - cairo_stroke(c); - } - if(_dec->telemetry_mv&0x40){ - cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+8,y+8); - cairo_stroke(c); - } - }break; - case OC_MODE_INTER_MV_FOUR:{ - if(_dec->telemetry_mbmode&0x80){ - cairo_rectangle(c,x+2.5,y+2.5,4,4); - cairo_rectangle(c,x+9.5,y+2.5,4,4); - cairo_rectangle(c,x+2.5,y+9.5,4,4); - cairo_rectangle(c,x+9.5,y+9.5,4,4); - cairo_set_source_rgba(c,0,1.,0,.5); - cairo_stroke(c); - } - /*4mv is odd, coded in raster order.*/ - fragi=mb_maps[mbi][0][0]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mv=frag_mvs[fragi]; - cairo_move_to(c,x+4+frag_mv[0],y+12-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+4+frag_mv[0]*.66,y+12-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+4+frag_mv[0]*.33,y+12-frag_mv[1]*.33); - 
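Note the representation change running through this hunk: the removed code reads motion vectors as frag_mv[0]/frag_mv[1] (an oc_mv used to be a two-element signed char array), while the replacement code goes through the OC_MV_X()/OC_MV_Y() accessors, oc_mv having become a single packed 16-bit value (compare the typedef oc_mv oc_mv2[2]; in the encint.h hunk below). A hedged sketch of such a packing; the exact layout used here (x in the low byte, y in the high byte) is an assumption for illustration, not a statement of upstream's encoding:

    #include <stdio.h>

    typedef short mv_t;  /* stand-in for upstream's ogg_int16_t oc_mv */

    /* Hypothetical packed-MV accessors in the style of OC_MV_X/OC_MV_Y. */
    #define MV_PACK(_x,_y) ((mv_t)(((_y)<<8)|((_x)&0xFF)))
    #define MV_X(_mv)      ((signed char)((_mv)&0xFF))
    #define MV_Y(_mv)      ((_mv)>>8)

    int main(void){
      mv_t mv=MV_PACK(-3,5);
      printf("x=%d y=%d\n",MV_X(mv),MV_Y(mv));  /* prints x=-3 y=5 */
      return 0;
    }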
cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+4,y+12); - cairo_stroke(c); - } - fragi=mb_maps[mbi][0][1]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mv=frag_mvs[fragi]; - cairo_move_to(c,x+12+frag_mv[0],y+12-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+12+frag_mv[0]*.66,y+12-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+12+frag_mv[0]*.33,y+12-frag_mv[1]*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+12,y+12); - cairo_stroke(c); - } - fragi=mb_maps[mbi][0][2]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mv=frag_mvs[fragi]; - cairo_move_to(c,x+4+frag_mv[0],y+4-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+4+frag_mv[0]*.66,y+4-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+4+frag_mv[0]*.33,y+4-frag_mv[1]*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+4,y+4); - cairo_stroke(c); - } - fragi=mb_maps[mbi][0][3]; - if(frags[fragi].coded&&_dec->telemetry_mv&0x80){ - frag_mv=frag_mvs[fragi]; - cairo_move_to(c,x+12+frag_mv[0],y+4-frag_mv[1]); - cairo_set_source_rgba(c,1.,1.,1.,.9); - cairo_set_line_width(c,3.); - cairo_line_to(c,x+12+frag_mv[0]*.66,y+4-frag_mv[1]*.66); - cairo_stroke_preserve(c); - cairo_set_line_width(c,2.); - cairo_line_to(c,x+12+frag_mv[0]*.33,y+4-frag_mv[1]*.33); - cairo_stroke_preserve(c); - cairo_set_line_width(c,1.); - cairo_line_to(c,x+12,y+4); - cairo_stroke(c); - } - }break; - } - } - } - /*qii illustration.*/ - if(_dec->telemetry_qi&0x2){ - cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE); - for(bi=0;bi<4;bi++){ - ptrdiff_t fragi; - int qiv; - int xp; - int yp; - xp=x+(bi&1)*8; - yp=y+8-(bi&2)*4; - fragi=mb_maps[mbi][0][bi]; - if(fragi>=0&&frags[fragi].coded){ - qiv=qim[frags[fragi].qii]; - cairo_set_line_width(c,3.); - cairo_set_source_rgba(c,0.,0.,0.,.5); - switch(qiv){ - /*Double plus:*/ - case 2:{ - if((bi&1)^((bi&2)>>1)){ - cairo_move_to(c,xp+2.5,yp+1.5); - cairo_line_to(c,xp+2.5,yp+3.5); - cairo_move_to(c,xp+1.5,yp+2.5); - cairo_line_to(c,xp+3.5,yp+2.5); - cairo_move_to(c,xp+5.5,yp+4.5); - cairo_line_to(c,xp+5.5,yp+6.5); - cairo_move_to(c,xp+4.5,yp+5.5); - cairo_line_to(c,xp+6.5,yp+5.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,0.,1.,1.,1.); - } - else{ - cairo_move_to(c,xp+5.5,yp+1.5); - cairo_line_to(c,xp+5.5,yp+3.5); - cairo_move_to(c,xp+4.5,yp+2.5); - cairo_line_to(c,xp+6.5,yp+2.5); - cairo_move_to(c,xp+2.5,yp+4.5); - cairo_line_to(c,xp+2.5,yp+6.5); - cairo_move_to(c,xp+1.5,yp+5.5); - cairo_line_to(c,xp+3.5,yp+5.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,0.,1.,1.,1.); - } - }break; - /*Double minus:*/ - case -2:{ - cairo_move_to(c,xp+2.5,yp+2.5); - cairo_line_to(c,xp+5.5,yp+2.5); - cairo_move_to(c,xp+2.5,yp+5.5); - cairo_line_to(c,xp+5.5,yp+5.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,1.,1.,1.); - }break; - /*Plus:*/ - case 1:{ - if(bi&2==0)yp-=2; - if(bi&1==0)xp-=2; - cairo_move_to(c,xp+4.5,yp+2.5); - cairo_line_to(c,xp+4.5,yp+6.5); - cairo_move_to(c,xp+2.5,yp+4.5); - cairo_line_to(c,xp+6.5,yp+4.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,.1,1.,.3,1.); - break; - } - /*Fall through.*/ - /*Minus:*/ - case -1:{ - cairo_move_to(c,xp+2.5,yp+4.5); - cairo_line_to(c,xp+6.5,yp+4.5); - cairo_stroke_preserve(c); - cairo_set_source_rgba(c,1.,.3,.1,1.); - }break; - 
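The substantive fix in the qii "Plus" case above is operator precedence: == binds tighter than &, so the removed tests bi&2==0 and bi&1==0 parse as bi&(2==0), i.e. bi&0, which is always false, and the marker offsets were never applied; the new code tests (bi&2)==0 and (bi&1)==0. A minimal demonstration:

    #include <stdio.h>

    /* Shows why the hunk adds parentheses: bi&2==0 is bi&(2==0). */
    int main(void){
      int bi;
      for(bi=0;bi<4;bi++){
        printf("bi=%d: bi&2==0 -> %d, (bi&2)==0 -> %d\n",
         bi,bi&2==0,(bi&2)==0);
      }
      return 0; /* first column is always 0; second is 1,1,0,0 */
    }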
default:continue; - } - cairo_set_line_width(c,1.); - cairo_stroke(c); - } - } - } - col2++; - if((col2>>1)>=_dec->state.nhmbs){ - col2=0; - row2+=2; - } - } - /*Bit usage indicator[s]:*/ - if(_dec->telemetry_bits){ - int widths[6]; - int fpsn; - int fpsd; - int mult; - int fullw; - int padw; - int i; - fpsn=_dec->state.info.fps_numerator; - fpsd=_dec->state.info.fps_denominator; - mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits); - fullw=250.f*h*fpsd*mult/fpsn; - padw=w-24; - /*Header and coded block bits.*/ - if(_dec->telemetry_frame_bytes<0|| - _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){ - _dec->telemetry_frame_bytes=0; - } - if(_dec->telemetry_coding_bytes<0|| - _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_coding_bytes=0; - } - if(_dec->telemetry_mode_bytes<0|| - _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_mode_bytes=0; - } - if(_dec->telemetry_mv_bytes<0|| - _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_mv_bytes=0; - } - if(_dec->telemetry_qi_bytes<0|| - _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_qi_bytes=0; - } - if(_dec->telemetry_dc_bytes<0|| - _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){ - _dec->telemetry_dc_bytes=0; - } - widths[0]=padw*(_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw; - widths[1]=padw*(_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw; - widths[2]=padw*(_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw; - widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw; - widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw; - widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw; - for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w; - cairo_set_source_rgba(c,.0,.0,.0,.6); - cairo_rectangle(c,10,h-33,widths[0]+1,5); - cairo_rectangle(c,10,h-29,widths[1]+1,5); - cairo_rectangle(c,10,h-25,widths[2]+1,5); - cairo_rectangle(c,10,h-21,widths[3]+1,5); - cairo_rectangle(c,10,h-17,widths[4]+1,5); - cairo_rectangle(c,10,h-13,widths[5]+1,5); - cairo_fill(c); - cairo_set_source_rgb(c,1,0,0); - cairo_rectangle(c,10.5,h-32.5,widths[0],4); - cairo_fill(c); - cairo_set_source_rgb(c,0,1,0); - cairo_rectangle(c,10.5,h-28.5,widths[1],4); - cairo_fill(c); - cairo_set_source_rgb(c,0,0,1); - cairo_rectangle(c,10.5,h-24.5,widths[2],4); - cairo_fill(c); - cairo_set_source_rgb(c,.6,.4,.0); - cairo_rectangle(c,10.5,h-20.5,widths[3],4); - cairo_fill(c); - cairo_set_source_rgb(c,.3,.3,.3); - cairo_rectangle(c,10.5,h-16.5,widths[4],4); - cairo_fill(c); - cairo_set_source_rgb(c,.5,.5,.8); - cairo_rectangle(c,10.5,h-12.5,widths[5],4); - cairo_fill(c); - } - /*Master qi indicator[s]:*/ - if(_dec->telemetry_qi&0x1){ - cairo_text_extents_t extents; - char buffer[10]; - int p; - int y; - p=0; - y=h-7.5; - if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10; - buffer[p++]=48+_dec->state.qis[0]%10; - if(_dec->state.nqis>=2){ - buffer[p++]=' '; - if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10; - buffer[p++]=48+_dec->state.qis[1]%10; - } - if(_dec->state.nqis==3){ - buffer[p++]=' '; - if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10; - buffer[p++]=48+_dec->state.qis[2]%10; - } - buffer[p++]='\0'; - cairo_select_font_face(c,"sans", - CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD); - cairo_set_font_size(c,18); - cairo_text_extents(c,buffer,&extents); - cairo_set_source_rgb(c,1,1,1); - cairo_move_to(c,w-extents.x_advance-10,y); - cairo_show_text(c,buffer); - 
cairo_set_source_rgb(c,0,0,0); - cairo_move_to(c,w-extents.x_advance-10,y); - cairo_text_path(c,buffer); - cairo_set_line_width(c,.8); - cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND); - cairo_stroke(c); - } - cairo_destroy(c); - } - /*Out of the Cairo plane into the telemetry YUV buffer.*/ - _ycbcr[0].data=_dec->telemetry_frame_data; - _ycbcr[0].stride=_ycbcr[0].width; - _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride; - _ycbcr[1].stride=_ycbcr[1].width; - _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride; - _ycbcr[2].stride=_ycbcr[2].width; - y_row=_ycbcr[0].data; - u_row=_ycbcr[1].data; - v_row=_ycbcr[2].data; - rgb_row=data; - /*This is one of the few places it's worth handling chroma on a - case-by-case basis.*/ - switch(_dec->state.info.pixel_fmt){ - case TH_PF_420:{ - for(y=0;y<h;y+=2){ - unsigned char *y_row2; - unsigned char *rgb_row2; - y_row2=y_row+_ycbcr[0].stride; - rgb_row2=rgb_row+cstride; - for(x=0;x<w;x+=2){ - int y; - int u; - int v; - y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] - +24966*rgb_row[4*x+0]+4207500)/255000; - y_row[x]=OC_CLAMP255(y); - y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5] - +24966*rgb_row[4*x+4]+4207500)/255000; - y_row[x+1]=OC_CLAMP255(y); - y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1] - +24966*rgb_row2[4*x+0]+4207500)/255000; - y_row2[x]=OC_CLAMP255(y); - y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5] - +24966*rgb_row2[4*x+4]+4207500)/255000; - y_row2[x+1]=OC_CLAMP255(y); - u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6] - +rgb_row2[4*x+2]+rgb_row2[4*x+6]) - -16436*(rgb_row[4*x+1]+rgb_row[4*x+5] - +rgb_row2[4*x+1]+rgb_row2[4*x+5]) - +24808*(rgb_row[4*x+0]+rgb_row[4*x+4] - +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930; - v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6] - +rgb_row2[4*x+2]+rgb_row2[4*x+6]) - -32872*(rgb_row[4*x+1]+rgb_row[4*x+5] - +rgb_row2[4*x+1]+rgb_row2[4*x+5]) - -6384*(rgb_row[4*x+0]+rgb_row[4*x+4] - +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510; - u_row[x>>1]=OC_CLAMP255(u); - v_row[x>>1]=OC_CLAMP255(v); - } - y_row+=_ycbcr[0].stride<<1; - u_row+=_ycbcr[1].stride; - v_row+=_ycbcr[2].stride; - rgb_row+=cstride<<1; - } - }break; - case TH_PF_422:{ - for(y=0;y<h;y++){ - for(x=0;x<w;x+=2){ - int y; - int u; - int v; - y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] - +24966*rgb_row[4*x+0]+4207500)/255000; - y_row[x]=OC_CLAMP255(y); - y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5] - +24966*rgb_row[4*x+4]+4207500)/255000; - y_row[x+1]=OC_CLAMP255(y); - u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6]) - -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]) - +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930; - v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6]) - -65744*(rgb_row[4*x+1]+rgb_row[4*x+5]) - -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510; - u_row[x>>1]=OC_CLAMP255(u); - v_row[x>>1]=OC_CLAMP255(v); - } - y_row+=_ycbcr[0].stride; - u_row+=_ycbcr[1].stride; - v_row+=_ycbcr[2].stride; - rgb_row+=cstride; - } - }break; - /*case TH_PF_444:*/ - default:{ - for(y=0;y<h;y++){ - for(x=0;x<w;x++){ - int y; - int u; - int v; - y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1] - +24966*rgb_row[4*x+0]+4207500)/255000; - u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1] - +99232*rgb_row[4*x+0]+29032005)/225930; - v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1] - -25536*rgb_row[4*x+0]+45940035)/357510; - y_row[x]=OC_CLAMP255(y); - u_row[x]=OC_CLAMP255(u); - v_row[x]=OC_CLAMP255(v); - } - y_row+=_ycbcr[0].stride; - u_row+=_ycbcr[1].stride; - v_row+=_ycbcr[2].stride; - rgb_row+=cstride; - } - }break; - } - /*Finished. 
- Destroy the surface.*/ - cairo_surface_destroy(cs); - } -#endif return 0; } diff --git a/thirdparty/libtheora/dequant.c b/thirdparty/libtheora/dequant.c index e554872d4e..860536f72d 100644 --- a/thirdparty/libtheora/dequant.c +++ b/thirdparty/libtheora/dequant.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: dequant.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ diff --git a/thirdparty/libtheora/dequant.h b/thirdparty/libtheora/dequant.h index ef25838e35..9d6cd6be56 100644 --- a/thirdparty/libtheora/dequant.h +++ b/thirdparty/libtheora/dequant.h @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: dequant.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ diff --git a/thirdparty/libtheora/encfrag.c b/thirdparty/libtheora/encfrag.c index bb814c8e4a..0e18111ac7 100644 --- a/thirdparty/libtheora/encfrag.c +++ b/thirdparty/libtheora/encfrag.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: encfrag.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include <stdlib.h> @@ -19,11 +19,6 @@ #include "encint.h" -void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64], - const unsigned char *_src,const unsigned char *_ref,int _ystride){ - (*_enc->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride); -} - void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src, const unsigned char *_ref,int _ystride){ int i; @@ -35,11 +30,6 @@ void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src, } } -void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64], - const unsigned char *_src,int _ystride){ - (*_enc->opt_vtable.frag_sub_128)(_diff,_src,_ystride); -} - void oc_enc_frag_sub_128_c(ogg_int16_t *_diff, const unsigned char *_src,int _ystride){ int i; @@ -50,11 +40,6 @@ void oc_enc_frag_sub_128_c(ogg_int16_t *_diff, } } -unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_x, - const unsigned char *_y,int _ystride){ - return (*_enc->opt_vtable.frag_sad)(_x,_y,_ystride); -} - unsigned oc_enc_frag_sad_c(const unsigned char *_src, const unsigned char *_ref,int _ystride){ unsigned sad; @@ -69,12 +54,6 @@ unsigned oc_enc_frag_sad_c(const unsigned char *_src, return sad; } -unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref,int _ystride, - unsigned _thresh){ - return (*_enc->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh); -} - unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src, const unsigned char *_ref,int _ystride,unsigned _thresh){ unsigned sad; @@ -90,13 +69,6 @@ unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src, return sad; } -unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref1, - const unsigned char *_ref2,int _ystride,unsigned _thresh){ - return (*_enc->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride, - _thresh); -} - unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src, const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, unsigned _thresh){ @@ -114,6 +86,27 @@ unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src, return sad; } +unsigned 
oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride){ + const unsigned char *src = _src; + unsigned dc; + unsigned sad; + int i; + dc=0; + for(i=8;i-->0;){ + int j; + for(j=0;j<8;j++)dc+=src[j]; + src+=_ystride; + } + dc=dc+32>>6; + sad=0; + for(i=8;i-->0;){ + int j; + for(j=0;j<8;j++)sad+=abs(_src[j]-dc); + _src+=_ystride; + } + return sad; +} + static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src, const unsigned char *_ref,int _ystride){ int i; @@ -269,19 +262,20 @@ static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src, } } -unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){ - unsigned sad; - int t0; - int t1; - int t2; - int t3; - int t4; - int t5; - int t6; - int t7; - int r; - int i; - sad=0; +unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){ + unsigned sad; + int dc; + int t0; + int t1; + int t2; + int t3; + int t4; + int t5; + int t6; + int t7; + int r; + int i; + sad=dc=0; for(i=0;i<8;i++){ /*Hadamard stage 1:*/ t0=_buf[i*8+0]+_buf[i*8+4]; @@ -306,7 +300,7 @@ unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){ t5+=t7; t7=r-t7; /*Hadamard stage 3:*/ - r=abs(t0+t1); + r=abs(t0+t1)&-(i>0); r+=abs(t0-t1); r+=abs(t2+t3); r+=abs(t2-t3); @@ -315,54 +309,61 @@ unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){ r+=abs(t6+t7); r+=abs(t6-t7); sad+=r; - if(sad>_thresh)break; } + dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7]; + *_dc=dc; return sad; } -unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref,int _ystride, - unsigned _thresh){ - return (*_enc->opt_vtable.frag_satd_thresh)(_src,_ref,_ystride,_thresh); -} - -unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh){ +unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src, + const unsigned char *_ref,int _ystride){ ogg_int16_t buf[64]; oc_diff_hadamard(buf,_src,_ref,_ystride); - return oc_hadamard_sad_thresh(buf,_thresh); -} - -unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref1, - const unsigned char *_ref2,int _ystride,unsigned _thresh){ - return (*_enc->opt_vtable.frag_satd2_thresh)(_src,_ref1,_ref2,_ystride, - _thresh); + return oc_hadamard_sad(_dc,buf); } -unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh){ +unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){ ogg_int16_t buf[64]; oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride); - return oc_hadamard_sad_thresh(buf,_thresh); + return oc_hadamard_sad(_dc,buf); } -unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc, +unsigned oc_enc_frag_intra_satd_c(int *_dc, const unsigned char *_src,int _ystride){ - return (*_enc->opt_vtable.frag_intra_satd)(_src,_ystride); -} - -unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride){ ogg_int16_t buf[64]; oc_intra_hadamard(buf,_src,_ystride); - return oc_hadamard_sad_thresh(buf,UINT_MAX) - -abs(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]); + return oc_hadamard_sad(_dc,buf); } -void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst, - const unsigned char *_src1,const unsigned char *_src2,int _ystride){ - (*_enc->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride); 
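In the reworked oc_hadamard_sad above, r=abs(t0+t1)&-(i>0) excludes the block's DC from the SATD sum without a branch: for i==0 that term is the sum of the whole first row of the half-transformed block, i.e. its DC, and -(i>0) evaluates to all-ones for i>0 but zero for i==0. The DC is accumulated explicitly instead and returned through *_dc, replacing the old pattern of calling oc_hadamard_sad_thresh(buf,UINT_MAX) and subtracting abs(buf[0]+...+buf[7]) afterward. A tiny demonstration of the masking idiom:

    #include <stdio.h>
    #include <stdlib.h>

    /* -(cond) is 0 or ~0, so ANDing with it keeps or drops a term
       branch-free; oc_hadamard_sad uses this to skip the DC term. */
    int main(void){
      int i;
      for(i=0;i<3;i++)printf("i=%d: term=%d\n",i,abs(-42)&-(i>0));
      return 0; /* prints 0, 42, 42 */
    }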
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src, + const unsigned char *_ref,int _ystride){ + unsigned ret; + int y; + int x; + ret=0; + for(y=0;y<8;y++){ + for(x=0;x<8;x++)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]); + _src+=_ystride; + _ref+=_ystride; + } + return ret; +} + +unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src, + const unsigned char *_ref,int _ystride,ogg_int64_t _mask){ + unsigned ret; + int y; + int x; + ret=0; + for(y=0;y<8;y++){ + for(x=0;x<8;x++,_mask>>=1){ + if(_mask&1)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]); + } + _src+=_ystride; + _ref+=_ystride; + } + return ret; } void oc_enc_frag_copy2_c(unsigned char *_dst, @@ -376,13 +377,3 @@ void oc_enc_frag_copy2_c(unsigned char *_dst, _src2+=_ystride; } } - -void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc, - unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]){ - (*_enc->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue); -} - -void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){ - (*_enc->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue); -} diff --git a/thirdparty/libtheora/encinfo.c b/thirdparty/libtheora/encinfo.c index 83be1dae72..41db6bad45 100644 --- a/thirdparty/libtheora/encinfo.c +++ b/thirdparty/libtheora/encinfo.c @@ -1,6 +1,6 @@ #include <stdlib.h> #include <string.h> -#include "internal.h" +#include "state.h" #include "enquant.h" #include "huffenc.h" diff --git a/thirdparty/libtheora/encint.h b/thirdparty/libtheora/encint.h index 97897d5a04..d25de4b8f6 100644 --- a/thirdparty/libtheora/encint.h +++ b/thirdparty/libtheora/encint.h @@ -11,17 +11,13 @@ ******************************************************************** function: - last mod: $Id: encint.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #if !defined(_encint_H) # define _encint_H (1) -# if defined(HAVE_CONFIG_H) -# include "config.h" -# endif # include "theora/theoraenc.h" -# include "internal.h" -# include "ocintrin.h" +# include "state.h" # include "mathops.h" # include "enquant.h" # include "huffenc.h" @@ -32,8 +28,13 @@ typedef oc_mv oc_mv2[2]; typedef struct oc_enc_opt_vtable oc_enc_opt_vtable; +typedef struct oc_enc_opt_data oc_enc_opt_data; typedef struct oc_mb_enc_info oc_mb_enc_info; typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser; +typedef struct oc_fr_state oc_fr_state; +typedef struct oc_qii_state oc_qii_state; +typedef struct oc_enc_pipeline_state oc_enc_pipeline_state; +typedef struct oc_mode_rd oc_mode_rd; typedef struct oc_iir_filter oc_iir_filter; typedef struct oc_frame_metrics oc_frame_metrics; typedef struct oc_rc_state oc_rc_state; @@ -42,6 +43,170 @@ typedef struct oc_token_checkpoint oc_token_checkpoint; +/*Encoder-specific accelerated functions.*/ +# if defined(OC_X86_ASM) +# if defined(_MSC_VER) +# include "x86_vc/x86enc.h" +# else +# include "x86/x86enc.h" +# endif +# endif +# if defined(OC_ARM_ASM) +# include "arm/armenc.h" +# endif + +# if !defined(oc_enc_accel_init) +# define oc_enc_accel_init oc_enc_accel_init_c +# endif +# if defined(OC_ENC_USE_VTABLE) +# if !defined(oc_enc_frag_sub) +# define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \ + ((*(_enc)->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride)) +# endif +# if !defined(oc_enc_frag_sub_128) +# define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \ + ((*(_enc)->opt_vtable.frag_sub_128)(_diff,_src,_ystride)) +# endif +# if !defined(oc_enc_frag_sad) 
+# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \ + ((*(_enc)->opt_vtable.frag_sad)(_src,_ref,_ystride)) +# endif +# if !defined(oc_enc_frag_sad_thresh) +# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \ + ((*(_enc)->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh)) +# endif +# if !defined(oc_enc_frag_sad2_thresh) +# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \ + ((*(_enc)->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh)) +# endif +# if !defined(oc_enc_frag_intra_sad) +# define oc_enc_frag_intra_sad(_enc,_src,_ystride) \ + ((*(_enc)->opt_vtable.frag_intra_sad)(_src,_ystride)) +# endif +# if !defined(oc_enc_frag_satd) +# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \ + ((*(_enc)->opt_vtable.frag_satd)(_dc,_src,_ref,_ystride)) +# endif +# if !defined(oc_enc_frag_satd2) +# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \ + ((*(_enc)->opt_vtable.frag_satd2)(_dc,_src,_ref1,_ref2,_ystride)) +# endif +# if !defined(oc_enc_frag_intra_satd) +# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \ + ((*(_enc)->opt_vtable.frag_intra_satd)(_dc,_src,_ystride)) +# endif +# if !defined(oc_enc_frag_ssd) +# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \ + ((*(_enc)->opt_vtable.frag_ssd)(_src,_ref,_ystride)) +# endif +# if !defined(oc_enc_frag_border_ssd) +# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \ + ((*(_enc)->opt_vtable.frag_border_ssd)(_src,_ref,_ystride,_mask)) +# endif +# if !defined(oc_enc_frag_copy2) +# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \ + ((*(_enc)->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride)) +# endif +# if !defined(oc_enc_enquant_table_init) +# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \ + ((*(_enc)->opt_vtable.enquant_table_init)(_enquant,_dequant)) +# endif +# if !defined(oc_enc_enquant_table_fixup) +# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \ + ((*(_enc)->opt_vtable.enquant_table_fixup)(_enquant,_nqis)) +# endif +# if !defined(oc_enc_quantize) +# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \ + ((*(_enc)->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant)) +# endif +# if !defined(oc_enc_frag_recon_intra) +# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \ + ((*(_enc)->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue)) +# endif +# if !defined(oc_enc_frag_recon_inter) +# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \ + ((*(_enc)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue)) +# endif +# if !defined(oc_enc_fdct8x8) +# define oc_enc_fdct8x8(_enc,_y,_x) \ + ((*(_enc)->opt_vtable.fdct8x8)(_y,_x)) +# endif +# else +# if !defined(oc_enc_frag_sub) +# define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \ + oc_enc_frag_sub_c(_diff,_src,_ref,_ystride) +# endif +# if !defined(oc_enc_frag_sub_128) +# define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \ + oc_enc_frag_sub_128_c(_diff,_src,_ystride) +# endif +# if !defined(oc_enc_frag_sad) +# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \ + oc_enc_frag_sad_c(_src,_ref,_ystride) +# endif +# if !defined(oc_enc_frag_sad_thresh) +# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \ + oc_enc_frag_sad_thresh_c(_src,_ref,_ystride,_thresh) +# endif +# if !defined(oc_enc_frag_sad2_thresh) +# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \ + oc_enc_frag_sad2_thresh_c(_src,_ref1,_ref2,_ystride,_thresh) +# endif +# if !defined(oc_enc_frag_intra_sad) +# define 
oc_enc_frag_intra_sad(_enc,_src,_ystride) \ + oc_enc_frag_intra_sad_c(_src,_ystride) +# endif +# if !defined(oc_enc_frag_satd) +# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \ + oc_enc_frag_satd_c(_dc,_src,_ref,_ystride) +# endif +# if !defined(oc_enc_frag_satd2) +# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \ + oc_enc_frag_satd2_c(_dc,_src,_ref1,_ref2,_ystride) +# endif +# if !defined(oc_enc_frag_intra_satd) +# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \ + oc_enc_frag_intra_satd_c(_dc,_src,_ystride) +# endif +# if !defined(oc_enc_frag_ssd) +# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \ + oc_enc_frag_ssd_c(_src,_ref,_ystride) +# endif +# if !defined(oc_enc_frag_border_ssd) +# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \ + oc_enc_frag_border_ssd_c(_src,_ref,_ystride,_mask) +# endif +# if !defined(oc_enc_frag_copy2) +# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \ + oc_enc_frag_copy2_c(_dst,_src1,_src2,_ystride) +# endif +# if !defined(oc_enc_enquant_table_init) +# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \ + oc_enc_enquant_table_init_c(_enquant,_dequant) +# endif +# if !defined(oc_enc_enquant_table_fixup) +# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \ + oc_enc_enquant_table_fixup_c(_enquant,_nqis) +# endif +# if !defined(oc_enc_quantize) +# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \ + oc_enc_quantize_c(_qdct,_dct,_dequant,_enquant) +# endif +# if !defined(oc_enc_frag_recon_intra) +# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \ + oc_frag_recon_intra_c(_dst,_ystride,_residue) +# endif +# if !defined(oc_enc_frag_recon_inter) +# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \ + oc_frag_recon_inter_c(_dst,_src,_ystride,_residue) +# endif +# if !defined(oc_enc_fdct8x8) +# define oc_enc_fdct8x8(_enc,_y,_x) oc_enc_fdct8x8_c(_y,_x) +# endif +# endif + + + /*Constants for the packet-out state machine specific to the encoder.*/ /*Next packet to emit: Data packet, but none are ready yet.*/ @@ -50,13 +215,61 @@ typedef struct oc_token_checkpoint oc_token_checkpoint; #define OC_PACKET_READY (1) /*All features enabled.*/ -#define OC_SP_LEVEL_SLOW (0) +#define OC_SP_LEVEL_SLOW (0) /*Enable early skip.*/ -#define OC_SP_LEVEL_EARLY_SKIP (1) +#define OC_SP_LEVEL_EARLY_SKIP (1) +/*Use analysis shortcuts, single quantizer, and faster tokenization.*/ +#define OC_SP_LEVEL_FAST_ANALYSIS (2) +/*Use SAD instead of SATD*/ +#define OC_SP_LEVEL_NOSATD (3) /*Disable motion compensation.*/ -#define OC_SP_LEVEL_NOMC (2) +#define OC_SP_LEVEL_NOMC (4) /*Maximum valid speed level.*/ -#define OC_SP_LEVEL_MAX (2) +#define OC_SP_LEVEL_MAX (4) + + +/*The number of extra bits of precision at which to store rate metrics.*/ +# define OC_BIT_SCALE (6) +/*The number of extra bits of precision at which to store RMSE metrics. + This must be at least half OC_BIT_SCALE (rounded up).*/ +# define OC_RMSE_SCALE (5) +/*The number of quantizer bins to partition statistics into.*/ +# define OC_LOGQ_BINS (8) +/*The number of SAD/SATD bins to partition statistics into.*/ +# define OC_COMP_BINS (24) +/*The number of bits of precision to drop from SAD and SATD scores + to assign them to a bin.*/ +# define OC_SAD_SHIFT (6) +# define OC_SATD_SHIFT (9) + +/*Masking is applied by scaling the D used in R-D optimization (via rd_scale) + or the lambda parameter (via rd_iscale). 
+ These are only equivalent within a single block; when more than one block is + being considered, the former is the interpretation used.*/ + +/*This must be at least 4 for OC_RD_SKIP_SCALE() to work below.*/ +# define OC_RD_SCALE_BITS (12-OC_BIT_SCALE) +# define OC_RD_ISCALE_BITS (11) + +/*This macro is applied to _ssd values with just 4 bits of headroom + ((15-OC_RMSE_SCALE)*2+OC_BIT_SCALE+2); since we want to allow rd_scales as + large as 16, and need additional fractional bits, our only recourse that + doesn't lose precision on blocks with very small SSDs is to use a wider + multiply.*/ +# if LONG_MAX>2147483647 +# define OC_RD_SCALE(_ssd,_rd_scale) \ + ((unsigned)((unsigned long)(_ssd)*(_rd_scale) \ + +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS)) +# else +# define OC_RD_SCALE(_ssd,_rd_scale) \ + (((_ssd)>>OC_RD_SCALE_BITS)*(_rd_scale) \ + +(((_ssd)&(1<<OC_RD_SCALE_BITS)-1)*(_rd_scale) \ + +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS)) +# endif +# define OC_RD_SKIP_SCALE(_ssd,_rd_scale) \ + ((_ssd)*(_rd_scale)+((1<<OC_RD_SCALE_BITS-4)>>1)>>OC_RD_SCALE_BITS-4) +# define OC_RD_ISCALE(_lambda,_rd_iscale) \ + ((_lambda)*(_rd_iscale)+((1<<OC_RD_ISCALE_BITS)>>1)>>OC_RD_ISCALE_BITS) /*The bits used for each of the MB mode codebooks.*/ @@ -78,6 +291,10 @@ extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30]; /*Encoder specific functions with accelerated variants.*/ struct oc_enc_opt_vtable{ + void (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src, + const unsigned char *_ref,int _ystride); + void (*frag_sub_128)(ogg_int16_t _diff[64], + const unsigned char *_src,int _ystride); unsigned (*frag_sad)(const unsigned char *_src, const unsigned char *_ref,int _ystride); unsigned (*frag_sad_thresh)(const unsigned char *_src, @@ -85,18 +302,23 @@ struct oc_enc_opt_vtable{ unsigned (*frag_sad2_thresh)(const unsigned char *_src, const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, unsigned _thresh); - unsigned (*frag_satd_thresh)(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh); - unsigned (*frag_satd2_thresh)(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh); - unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride); - void (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src, + unsigned (*frag_intra_sad)(const unsigned char *_src,int _ystride); + unsigned (*frag_satd)(int *_dc,const unsigned char *_src, const unsigned char *_ref,int _ystride); - void (*frag_sub_128)(ogg_int16_t _diff[64], - const unsigned char *_src,int _ystride); + unsigned (*frag_satd2)(int *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride); + unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride); + unsigned (*frag_ssd)(const unsigned char *_src, + const unsigned char *_ref,int _ystride); + unsigned (*frag_border_ssd)(const unsigned char *_src, + const unsigned char *_ref,int _ystride,ogg_int64_t _mask); void (*frag_copy2)(unsigned char *_dst, const unsigned char *_src1,const unsigned char *_src2,int _ystride); + void (*enquant_table_init)(void *_enquant, + const ogg_uint16_t _dequant[64]); + void (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis); + int (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64], + const ogg_uint16_t _dequant[64],const void *_enquant); void (*frag_recon_intra)(unsigned char *_dst,int _ystride, const ogg_int16_t _residue[64]); void (*frag_recon_inter)(unsigned char 
*_dst, @@ -105,7 +327,19 @@ struct oc_enc_opt_vtable{ }; -void oc_enc_vtable_init(oc_enc_ctx *_enc); +/*Encoder specific data that varies according to which variants of the above + functions are used.*/ +struct oc_enc_opt_data{ + /*The size of a single quantizer table. + This must be a multiple of enquant_table_alignment.*/ + size_t enquant_table_size; + /*The alignment required for the quantizer tables. + This must be a positive power of two.*/ + int enquant_table_alignment; +}; + + +void oc_enc_accel_init(oc_enc_ctx *_enc); @@ -158,7 +392,7 @@ struct oc_mode_scheme_chooser{ corresponds to the ranks above.*/ unsigned char scheme0_list[OC_NMODES]; /*The number of times each mode has been chosen so far.*/ - int mode_counts[OC_NMODES]; + unsigned mode_counts[OC_NMODES]; /*The list of mode coding schemes, sorted in ascending order of bit cost.*/ unsigned char scheme_list[8]; /*The number of bits used by each mode coding scheme.*/ @@ -170,6 +404,106 @@ void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser); +/*State to track coded block flags and their bit cost. + We use opportunity cost to measure the bits required to code or skip the next + block, using the cheaper of the cost to code it fully or partially, so long + as both are possible.*/ +struct oc_fr_state{ + /*The number of bits required for the coded block flags so far this frame.*/ + ptrdiff_t bits; + /*The length of the current run for the partial super block flag, not + including the current super block.*/ + unsigned sb_partial_count:16; + /*The length of the current run for the full super block flag, not + including the current super block.*/ + unsigned sb_full_count:16; + /*The length of the coded block flag run when the current super block + started.*/ + unsigned b_coded_count_prev:6; + /*The coded block flag when the current super block started.*/ + signed int b_coded_prev:2; + /*The length of the current coded block flag run.*/ + unsigned b_coded_count:6; + /*The current coded block flag.*/ + signed int b_coded:2; + /*The number of blocks processed in the current super block.*/ + unsigned b_count:5; + /*Whether or not it is cheaper to code the current super block partially, + even if it could still be coded fully.*/ + unsigned sb_prefer_partial:1; + /*Whether the last super block was coded partially.*/ + signed int sb_partial:2; + /*The number of bits required for the flags for the current super block.*/ + unsigned sb_bits:6; + /*Whether the last non-partial super block was coded fully.*/ + signed int sb_full:2; +}; + + + +struct oc_qii_state{ + ptrdiff_t bits; + unsigned qi01_count:14; + signed int qi01:2; + unsigned qi12_count:14; + signed int qi12:2; +}; + + + +/*Temporary encoder state for the analysis pipeline.*/ +struct oc_enc_pipeline_state{ + /*DCT coefficient storage. 
+ This is kept off the stack because a) gcc can't align things on the stack + reliably on ARM, and b) it avoids (unintentional) data hazards between + ARM and NEON code.*/ + OC_ALIGN16(ogg_int16_t dct_data[64*3]); + OC_ALIGN16(signed char bounding_values[256]); + oc_fr_state fr[3]; + oc_qii_state qs[3]; + /*Skip SSD storage for the current MCU in each plane.*/ + unsigned *skip_ssd[3]; + /*Coded/uncoded fragment lists for each plane for the current MCU.*/ + ptrdiff_t *coded_fragis[3]; + ptrdiff_t *uncoded_fragis[3]; + ptrdiff_t ncoded_fragis[3]; + ptrdiff_t nuncoded_fragis[3]; + /*The starting fragment for the current MCU in each plane.*/ + ptrdiff_t froffset[3]; + /*The starting row for the current MCU in each plane.*/ + int fragy0[3]; + /*The ending row for the current MCU in each plane.*/ + int fragy_end[3]; + /*The starting superblock for the current MCU in each plane.*/ + unsigned sbi0[3]; + /*The ending superblock for the current MCU in each plane.*/ + unsigned sbi_end[3]; + /*The number of tokens for zzi=1 for each color plane.*/ + int ndct_tokens1[3]; + /*The outstanding eob_run count for zzi=1 for each color plane.*/ + int eob_run1[3]; + /*Whether or not the loop filter is enabled.*/ + int loop_filter; +}; + + + +/*Statistics used to estimate R-D cost of a block in a given coding mode. + See modedec.h for more details.*/ +struct oc_mode_rd{ + /*The expected bits used by the DCT tokens, shifted by OC_BIT_SCALE.*/ + ogg_int16_t rate; + /*The expected square root of the sum of squared errors, shifted by + OC_RMSE_SCALE.*/ + ogg_int16_t rmse; +}; + +# if defined(OC_COLLECT_METRICS) +# include "collect.h" +# endif + + + /*A 2nd order low-pass Bessel follower. We use this for rate control because it has fast reaction time, but is critically damped.*/ @@ -190,6 +524,8 @@ struct oc_frame_metrics{ unsigned dup_count:31; /*The frame type from pass 1.*/ unsigned frame_type:1; + /*The frame activity average from pass 1.*/ + unsigned activity_avg; }; @@ -335,10 +671,15 @@ struct th_enc_ctx{ size_t mv_bits[2]; /*The mode scheme chooser for estimating mode coding costs.*/ oc_mode_scheme_chooser chooser; + /*Temporary encoder state for the analysis pipeline.*/ + oc_enc_pipeline_state pipe; /*The number of vertical super blocks in an MCU.*/ int mcu_nvsbs; /*The SSD error for skipping each fragment in the current MCU.*/ unsigned *mcu_skip_ssd; + /*The masking scale factors for chroma blocks in the current MCU.*/ + ogg_uint16_t *mcu_rd_scale; + ogg_uint16_t *mcu_rd_iscale; /*The DCT token lists for each coefficient and each plane.*/ unsigned char **dct_tokens[3]; /*The extra bits associated with each DCT token.*/ @@ -350,8 +691,10 @@ struct th_enc_ctx{ /*The offset of the first DCT token for each coefficient for each plane.*/ unsigned char dct_token_offs[3][64]; /*The last DC coefficient for each plane and reference frame.*/ - int dc_pred_last[3][3]; + int dc_pred_last[3][4]; #if defined(OC_COLLECT_METRICS) + /*Fragment SAD statistics for MB mode estimation metrics.*/ + unsigned *frag_sad; /*Fragment SATD statistics for MB mode estimation metrics.*/ unsigned *frag_satd; /*Fragment SSD statistics for MB mode estimation metrics.*/ @@ -359,32 +702,56 @@ struct th_enc_ctx{ #endif /*The R-D optimization parameter.*/ int lambda; + /*The average block "activity" of the previous frame.*/ + unsigned activity_avg; + /*The average MB luma of the previous frame.*/ + unsigned luma_avg; /*The huffman tables in use.*/ th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]; /*The quantization parameters in 
use.*/ th_quant_info qinfo; - oc_iquant *enquant_tables[64][3][2]; - oc_iquant_table enquant_table_data[64][3][2]; - /*An "average" quantizer for each quantizer type (INTRA or INTER) and qi - value. - This is used to paramterize the rate control decisions. + /*The original DC coefficients saved off from the dequantization tables.*/ + ogg_uint16_t dequant_dc[64][3][2]; + /*Condensed dequantization tables.*/ + const ogg_uint16_t *dequant[3][3][2]; + /*Condensed quantization tables.*/ + void *enquant[3][3][2]; + /*The full set of quantization tables.*/ + void *enquant_tables[64][3][2]; + /*Storage for the quantization tables.*/ + unsigned char *enquant_table_data; + /*An "average" quantizer for each frame type (INTRA or INTER) and qi value. + This is used to parameterize the rate control decisions. They are kept in the log domain to simplify later processing. - Keep in mind these are DCT domain quantizers, and so are scaled by an - additional factor of 4 from the pixel domain.*/ + These are DCT domain quantizers, and so are scaled by an additional factor + of 4 from the pixel domain.*/ ogg_int64_t log_qavg[2][64]; + /*The "average" quantizer further partitioned by color plane. + This is used to parameterize mode decision. + These are DCT domain quantizers, and so are scaled by an additional factor + of 4 from the pixel domain.*/ + ogg_int16_t log_plq[64][3][2]; + /*The R-D scale factors to apply to chroma blocks for a given frame type + (INTRA or INTER) and qi value. + The first is the "D" modifier (rd_scale), while the second is the "lambda" + modifier (rd_iscale).*/ + ogg_uint16_t chroma_rd_scale[2][64][2]; + /*The interpolated mode decision R-D lookup tables for the current + quantizers, color plane, and quantization type.*/ + oc_mode_rd mode_rd[3][3][2][OC_COMP_BINS]; /*The buffer state used to drive rate control.*/ oc_rc_state rc; +# if defined(OC_ENC_USE_VTABLE) /*Table for encoder acceleration functions.*/ oc_enc_opt_vtable opt_vtable; +# endif + /*Table for encoder data used by accelerated functions.*/ + oc_enc_opt_data opt_data; }; void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode); int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode); -#if defined(OC_COLLECT_METRICS) -void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc); -void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc); -#endif @@ -415,8 +782,13 @@ struct oc_token_checkpoint{ void oc_enc_tokenize_start(oc_enc_ctx *_enc); int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, - ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct, - int _zzi,oc_token_checkpoint **_stack,int _acmin); + ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in, + const ogg_uint16_t *_dequant,const ogg_int16_t *_dct, + int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin); +int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, + ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in, + const ogg_uint16_t *_dequant,const ogg_int16_t *_dct, + int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin); void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc, const oc_token_checkpoint *_stack,int _n); void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc, @@ -436,45 +808,13 @@ int oc_state_flushheader(oc_theora_state *_state,int *_packet_state, -/*Encoder-specific accelerated functions.*/ -void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64], - const unsigned char *_src,const unsigned char *_ref,int _ystride); -void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64], - const
unsigned char *_src,int _ystride); -unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src, - const unsigned char *_ref,int _ystride); -unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref,int _ystride, - unsigned _thresh); -unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref1, - const unsigned char *_ref2,int _ystride,unsigned _thresh); -unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref,int _ystride, - unsigned _thresh); -unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc, - const unsigned char *_src,const unsigned char *_ref1, - const unsigned char *_ref2,int _ystride,unsigned _thresh); -unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc, - const unsigned char *_src,int _ystride); -void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst, - const unsigned char *_src1,const unsigned char *_src2,int _ystride); -void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc, - unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]); -void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]); -void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64], - const ogg_int16_t _x[64]); - -/*Default pure-C implementations.*/ -void oc_enc_vtable_init_c(oc_enc_ctx *_enc); +/*Default pure-C implementations of encoder-specific accelerated functions.*/ +void oc_enc_accel_init_c(oc_enc_ctx *_enc); void oc_enc_frag_sub_c(ogg_int16_t _diff[64], const unsigned char *_src,const unsigned char *_ref,int _ystride); void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64], const unsigned char *_src,int _ystride); -void oc_enc_frag_copy2_c(unsigned char *_dst, - const unsigned char *_src1,const unsigned char *_src2,int _ystride); unsigned oc_enc_frag_sad_c(const unsigned char *_src, const unsigned char *_ref,int _ystride); unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src, @@ -482,12 +822,24 @@ unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src, unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src, const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, unsigned _thresh); -unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh); -unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh); -unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride); +unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride); +unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src, + const unsigned char *_ref,int _ystride); +unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride); +unsigned oc_enc_frag_intra_satd_c(int *_dc, + const unsigned char *_src,int _ystride); +unsigned oc_enc_frag_ssd_c(const unsigned char *_src, + const unsigned char *_ref,int _ystride); +unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src, + const unsigned char *_ref,int _ystride,ogg_int64_t _mask); +void oc_enc_frag_copy2_c(unsigned char *_dst, + const unsigned char *_src1,const unsigned char *_src2,int _ystride); +void oc_enc_enquant_table_init_c(void *_enquant, + const ogg_uint16_t _dequant[64]); +void oc_enc_enquant_table_fixup_c(void 
*_enquant[3][3][2],int _nqis); +int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64], + const ogg_uint16_t _dequant[64],const void *_enquant); void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]); #endif diff --git a/thirdparty/libtheora/encode.c b/thirdparty/libtheora/encode.c index 0c5ea6a172..3309f97c03 100644 --- a/thirdparty/libtheora/encode.c +++ b/thirdparty/libtheora/encode.c @@ -11,15 +11,13 @@ ******************************************************************** function: - last mod: $Id: encode.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include <stdlib.h> #include <string.h> #include "encint.h" -#if defined(OC_X86_ASM) -# include "x86/x86enc.h" -#endif +#include "dequant.h" @@ -288,12 +286,12 @@ const th_quant_info TH_DEF_QUANT_INFO={ 28, 25, 24, 22, 20, 17, 14, 10 }, { - 30,25,20,20,15,15,14,14, - 13,13,12,12,11,11,10,10, - 9, 9, 8, 8, 7, 7, 7, 7, - 6, 6, 6, 6, 5, 5, 5, 5, - 4, 4, 4, 4, 3, 3, 3, 3, + 15,12, 9, 8, 6, 6, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, @@ -623,11 +621,15 @@ static void oc_enc_mb_modes_pack(oc_enc_ctx *_enc){ } } -static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,int _dx,int _dy){ +static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,oc_mv _mv){ + int dx; + int dy; + dx=OC_MV_X(_mv); + dy=OC_MV_Y(_mv); oggpackB_write(&_enc->opb, - OC_MV_CODES[_mv_scheme][_dx+31],OC_MV_BITS[_mv_scheme][_dx+31]); + OC_MV_CODES[_mv_scheme][dx+31],OC_MV_BITS[_mv_scheme][dx+31]); oggpackB_write(&_enc->opb, - OC_MV_CODES[_mv_scheme][_dy+31],OC_MV_BITS[_mv_scheme][_dy+31]); + OC_MV_CODES[_mv_scheme][dy+31],OC_MV_BITS[_mv_scheme][dy+31]); } static void oc_enc_mvs_pack(oc_enc_ctx *_enc){ @@ -650,7 +652,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){ mb_modes=_enc->state.mb_modes; mb_maps=(const oc_mb_map *)_enc->state.mb_maps; frags=_enc->state.frags; - frag_mvs=(const oc_mv *)_enc->state.frag_mvs; + frag_mvs=_enc->state.frag_mvs; for(mbii=0;mbii<ncoded_mbis;mbii++){ ptrdiff_t fragi; unsigned mbi; @@ -662,8 +664,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){ for(bi=0;;bi++){ fragi=mb_maps[mbi][0][bi]; if(frags[fragi].coded){ - oc_enc_mv_pack(_enc,mv_scheme, - frag_mvs[fragi][0],frag_mvs[fragi][1]); + oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]); /*Only code a single MV for this macro block.*/ break; } @@ -673,8 +674,7 @@ static void oc_enc_mvs_pack(oc_enc_ctx *_enc){ for(bi=0;bi<4;bi++){ fragi=mb_maps[mbi][0][bi]; if(frags[fragi].coded){ - oc_enc_mv_pack(_enc,mv_scheme, - frag_mvs[fragi][0],frag_mvs[fragi][1]); + oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]); /*Keep coding all the MVs for this macro block.*/ } } @@ -863,11 +863,55 @@ static void oc_enc_residual_tokens_pack(oc_enc_ctx *_enc){ } } +/*Packs an explicit drop frame, instead of using the more efficient 0-byte + packet. + This is only enabled in VP3-compatibility mode, even though it is not + strictly required for VP3 compatibility (VP3 could be encoded in AVI, which + also supports dropping frames by inserting 0 byte packets). + However, almost every _Theora_ player used to get this wrong (and many still + do), and it wasn't until we started shipping a post-VP3 encoder that + actually used non-VP3 features that this began to be discovered and fixed, + despite being in the standard since 2004. 
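For context, a consumer-side sketch of what this buys (assuming only the public libtheora/libogg API, th_encode_packetout() and ogg_stream_packetin(), with enc and os an existing encoder context and Ogg stream state; not part of the patch): a muxer must submit every packet the encoder emits, including 0-byte dropped-frame packets, since each packet, even an empty one, stands for one frame in the stream.

  ogg_packet op;
  /*Drain all packets for this frame.
    op.bytes==0 marks a dropped frame, unless VP3 compatibility forced an
    explicit packet like the one built by oc_enc_drop_frame_pack() below.*/
  while(th_encode_packetout(enc,0,&op)>0){
    ogg_stream_packetin(&os,&op);
  }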
+ The pack buffer must be reset before calling this function.*/ +static void oc_enc_drop_frame_pack(oc_enc_ctx *_enc){ + unsigned nsbs; + /*Mark this as a data packet.*/ + oggpackB_write(&_enc->opb,0,1); + /*Output the frame type (key frame or delta frame).*/ + oggpackB_write(&_enc->opb,OC_INTER_FRAME,1); + /*Write out the current qi list. + We always use just 1 qi, to avoid wasting bits on the others.*/ + oggpackB_write(&_enc->opb,_enc->state.qis[0],6); + oggpackB_write(&_enc->opb,0,1); + /*Coded block flags: everything is uncoded.*/ + nsbs=_enc->state.nsbs; + /*No partially coded SBs.*/ + oggpackB_write(&_enc->opb,0,1); + oc_sb_run_pack(&_enc->opb,nsbs,0,1); + /*No fully coded SBs.*/ + oggpackB_write(&_enc->opb,0,1); + oc_sb_run_pack(&_enc->opb,nsbs,0,1); + /*MB modes: just need to write which scheme to use. + Since we have no coded MBs, we can pick any of them except 0, which would + require writing out an additional mode list.*/ + oggpackB_write(&_enc->opb,7,3); + /*MVs: just need to write which scheme to use. + We can pick either one, since we have no MVs.*/ + oggpackB_write(&_enc->opb,1,1); + /*Write the chosen DC token tables.*/ + oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][0],4); + oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][1],4); + /*Write the chosen AC token tables.*/ + oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][0],4); + oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][1],4); +} + static void oc_enc_frame_pack(oc_enc_ctx *_enc){ + /*musl libc malloc()/realloc() calls might use floating point, so make sure + we've cleared the MMX state for them.*/ + oc_restore_fpu(&_enc->state); oggpackB_reset(&_enc->opb); - /*Only proceed if we have some coded blocks. - If there are no coded blocks, we can drop this frame simply by emitting a - 0 byte packet.*/ + /*Only proceed if we have some coded blocks.*/ if(_enc->state.ntotal_coded_fragis>0){ oc_enc_frame_header_pack(_enc); if(_enc->state.frame_type==OC_INTER_FRAME){ @@ -880,6 +924,10 @@ static void oc_enc_frame_pack(oc_enc_ctx *_enc){ oc_enc_tokenize_finish(_enc); oc_enc_residual_tokens_pack(_enc); } + /*If there are no coded blocks, we can drop this frame simply by emitting a + 0-byte packet. + We emit an inter frame with no coded blocks in VP3-compatibility mode.*/ + else if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc); /*Success: Mark the packet as ready to be flushed.*/ _enc->packet_state=OC_PACKET_READY; #if defined(OC_COLLECT_METRICS) @@ -888,21 +936,31 @@ static void oc_enc_frame_pack(oc_enc_ctx *_enc){ } -void oc_enc_vtable_init_c(oc_enc_ctx *_enc){ +void oc_enc_accel_init_c(oc_enc_ctx *_enc){ /*The implementations prefixed with oc_enc_ are encoder-specific.
The rest we re-use from the decoder.*/ +# if defined(OC_ENC_USE_VTABLE) + _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c; + _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c; _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c; _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c; _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c; - _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_c; - _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_c; + _enc->opt_vtable.frag_intra_sad=oc_enc_frag_intra_sad_c; + _enc->opt_vtable.frag_satd=oc_enc_frag_satd_c; + _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_c; _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c; - _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c; - _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c; + _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_c; + _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_c; _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c; + _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c; + _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_c; + _enc->opt_vtable.quantize=oc_enc_quantize_c; _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c; _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c; _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c; +# endif + _enc->opt_data.enquant_table_size=64*sizeof(oc_iquant); + _enc->opt_data.enquant_table_alignment=16; } /*Initialize the macro block neighbor lists for MC analysis. @@ -1003,6 +1061,55 @@ static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc, return 0; } +static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc, + const th_quant_info *_qinfo){ + unsigned char *etd; + size_t ets; + int align; + int qii; + int qi; + int pli; + int qti; + for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){ + _enc->state.dequant_tables[qi][pli][qti]= + _enc->state.dequant_table_data[qi][pli][qti]; + } + /*Initialize the dequantization tables.*/ + oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo); + /*And save off the DC values.*/ + for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){ + _enc->dequant_dc[qi][pli][qti]=_enc->state.dequant_tables[qi][pli][qti][0]; + } + /*Set up storage for the quantization tables.*/ + etd=_enc->enquant_table_data; + ets=_enc->opt_data.enquant_table_size; + align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1; + etd+=align; + /*Set up the main tables.*/ + for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){ + _enc->enquant_tables[qi][pli][qti]=etd; + oc_enc_enquant_table_init(_enc,etd, + _enc->state.dequant_tables[qi][pli][qti]); + etd+=ets; + } + /*Set up storage for the local copies we modify for each frame.*/ + for(pli=0;pli<3;pli++)for(qii=0;qii<3;qii++)for(qti=0;qti<2;qti++){ + _enc->enquant[pli][qii][qti]=etd; + etd+=ets; + } +} + +/*Updates the encoder state after the quantization parameters have been + changed.*/ +static void oc_enc_quant_params_updated(oc_enc_ctx *_enc, + const th_quant_info *_qinfo){ + oc_enc_enquant_tables_init(_enc,_qinfo); + memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits, + sizeof(_enc->state.loop_filter_limits)); + oc_enquant_qavg_init(_enc->log_qavg,_enc->log_plq,_enc->chroma_rd_scale, + _enc->state.dequant_tables,_enc->state.info.pixel_fmt); +} + /*Sets the quantization parameters to use. This may only be called before the setup header is written. If it is called multiple times, only the last call has any effect. 
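The OC_ENC_USE_VTABLE guard above implies that call sites choose between the function table and the plain C implementations at compile time. A hedged sketch of that dispatch idiom, modeled on the frag_sub entry (the macro is illustrative, not quoted from the patch):

  # if defined(OC_ENC_USE_VTABLE)
  #  define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
     ((*(_enc)->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride))
  # else
  #  define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
     oc_enc_frag_sub_c(_diff,_src,_ref,_ystride)
  # endif

Both configurations present the same call shape, so accelerated and plain builds can share every call site.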
@@ -1012,25 +1119,20 @@ static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc, will be used.*/ static int oc_enc_set_quant_params(oc_enc_ctx *_enc, const th_quant_info *_qinfo){ - int qi; - int pli; - int qti; + th_quant_info old_qinfo; + int ret; if(_enc==NULL)return TH_EFAULT; if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL; if(_qinfo==NULL)_qinfo=&TH_DEF_QUANT_INFO; - /*TODO: Analyze for packing purposes instead of just doing a shallow copy.*/ - memcpy(&_enc->qinfo,_qinfo,sizeof(_enc->qinfo)); - for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){ - _enc->state.dequant_tables[qi][pli][qti]= - _enc->state.dequant_table_data[qi][pli][qti]; - _enc->enquant_tables[qi][pli][qti]=_enc->enquant_table_data[qi][pli][qti]; + memcpy(&old_qinfo,&_enc->qinfo,sizeof(old_qinfo)); + ret=oc_quant_params_clone(&_enc->qinfo,_qinfo); + if(ret<0){ + oc_quant_params_clear(&_enc->qinfo); + memcpy(&_enc->qinfo,&old_qinfo,sizeof(old_qinfo)); + return ret; } - oc_enquant_tables_init(_enc->state.dequant_tables, - _enc->enquant_tables,_qinfo); - memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits, - sizeof(_enc->state.loop_filter_limits)); - oc_enquant_qavg_init(_enc->log_qavg,_enc->state.dequant_tables, - _enc->state.info.pixel_fmt); + else oc_quant_params_clear(&old_qinfo); + oc_enc_quant_params_updated(_enc,_qinfo); return 0; } @@ -1039,6 +1141,7 @@ static void oc_enc_clear(oc_enc_ctx *_enc); static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){ th_info info; size_t mcu_nmbs; + ptrdiff_t mcu_ncfrags; ptrdiff_t mcu_nfrags; int hdec; int vdec; @@ -1053,8 +1156,9 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){ if(info.quality<0)info.quality=32; if(info.target_bitrate<0)info.target_bitrate=0; /*Initialize the shared encoder/decoder state.*/ - ret=oc_state_init(&_enc->state,&info,4); + ret=oc_state_init(&_enc->state,&info,6); if(ret<0)return ret; + oc_enc_accel_init(_enc); _enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info)); _enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc)); _enc->coded_mbis= @@ -1065,9 +1169,14 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){ super block rows of Y' for each super block row of Cb and Cr.*/ _enc->mcu_nvsbs=1<<vdec; mcu_nmbs=_enc->mcu_nvsbs*_enc->state.fplanes[0].nhsbs*(size_t)4; - mcu_nfrags=4*mcu_nmbs+(8*mcu_nmbs>>hdec+vdec); + mcu_ncfrags=mcu_nmbs<<3-(hdec+vdec); + mcu_nfrags=4*mcu_nmbs+mcu_ncfrags; _enc->mcu_skip_ssd=(unsigned *)_ogg_malloc( mcu_nfrags*sizeof(*_enc->mcu_skip_ssd)); + _enc->mcu_rd_scale=(ogg_uint16_t *)_ogg_malloc( + (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_scale)); + _enc->mcu_rd_iscale=(ogg_uint16_t *)_ogg_malloc( + (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_iscale)); for(pli=0;pli<3;pli++){ _enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64, _enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens)); @@ -1075,34 +1184,22 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){ _enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits)); } #if defined(OC_COLLECT_METRICS) + _enc->frag_sad=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_sad)); _enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd)); _enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd)); #endif -#if defined(OC_X86_ASM) - oc_enc_vtable_init_x86(_enc); -#else - oc_enc_vtable_init_c(_enc); -#endif + _enc->enquant_table_data=(unsigned char *)_ogg_malloc( + (64+3)*3*2*_enc->opt_data.enquant_table_size + +_enc->opt_data.enquant_table_alignment-1); 
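The allocation above reserves enquant_table_alignment-1 bytes of slack so the tables can start on an aligned boundary. A self-contained sketch of the pointer round-up that oc_enc_enquant_tables_init() performs on the base pointer (the helper name is illustrative; assumes C99 <stdint.h>):

  #include <stdint.h>
  /*Rounds _p up to the next multiple of _align, a power of two.
    Only safe when the buffer was over-allocated by _align-1 bytes.*/
  static unsigned char *align_ptr_sketch(unsigned char *_p,size_t _align){
    return (unsigned char *)(((uintptr_t)_p+_align-1)&~(uintptr_t)(_align-1));
  }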
_enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift; _enc->state.qis[0]=_enc->state.info.quality; _enc->state.nqis=1; + _enc->activity_avg=90<<12; + _enc->luma_avg=128<<8; oc_rc_state_init(&_enc->rc,_enc); oggpackB_writeinit(&_enc->opb); - if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL|| - _enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL|| - _enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL|| - _enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL|| - _enc->extra_bits[2]==NULL -#if defined(OC_COLLECT_METRICS) - ||_enc->frag_satd==NULL||_enc->frag_ssd==NULL -#endif - ){ - oc_enc_clear(_enc); - return TH_EFAULT; - } - oc_mode_scheme_chooser_init(&_enc->chooser); - oc_enc_mb_info_init(_enc); - memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs)); + memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes)); + memset(_enc->qinfo.qi_ranges,0,sizeof(_enc->qinfo.qi_ranges)); /*Reset the packet-out state machine.*/ _enc->packet_state=OC_PACKET_INFO_HDR; _enc->dup_count=0; @@ -1114,26 +1211,45 @@ static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){ _enc->vp3_compatible=0; /*No INTER frames coded yet.*/ _enc->coded_inter_frame=0; - memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes)); - oc_enc_set_quant_params(_enc,NULL); + if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL + ||_enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL + ||_enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL + ||_enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL + ||_enc->extra_bits[2]==NULL +#if defined(OC_COLLECT_METRICS) + ||_enc->frag_sad==NULL||_enc->frag_satd==NULL||_enc->frag_ssd==NULL +#endif + ||oc_enc_set_quant_params(_enc,NULL)<0){ + oc_enc_clear(_enc); + return TH_EFAULT; + } + oc_mode_scheme_chooser_init(&_enc->chooser); + oc_enc_mb_info_init(_enc); + memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs)); return 0; } static void oc_enc_clear(oc_enc_ctx *_enc){ int pli; oc_rc_state_clear(&_enc->rc); -#if defined(OC_COLLECT_METRICS) - oc_enc_mode_metrics_dump(_enc); -#endif oggpackB_writeclear(&_enc->opb); + oc_quant_params_clear(&_enc->qinfo); + _ogg_free(_enc->enquant_table_data); #if defined(OC_COLLECT_METRICS) + /*Save the collected metrics from this run. 
+ Use tools/process_modedec_stats to actually generate modedec.h from the + resulting file.*/ + oc_mode_metrics_dump(); _ogg_free(_enc->frag_ssd); _ogg_free(_enc->frag_satd); + _ogg_free(_enc->frag_sad); #endif for(pli=3;pli-->0;){ oc_free_2d(_enc->extra_bits[pli]); oc_free_2d(_enc->dct_tokens[pli]); } + _ogg_free(_enc->mcu_rd_iscale); + _ogg_free(_enc->mcu_rd_scale); _ogg_free(_enc->mcu_skip_ssd); _ogg_free(_enc->coded_mbis); _ogg_free(_enc->frag_dc); @@ -1145,10 +1261,14 @@ static void oc_enc_drop_frame(th_enc_ctx *_enc){ /*Use the previous frame's reconstruction.*/ _enc->state.ref_frame_idx[OC_FRAME_SELF]= _enc->state.ref_frame_idx[OC_FRAME_PREV]; + _enc->state.ref_frame_data[OC_FRAME_SELF]= + _enc->state.ref_frame_data[OC_FRAME_PREV]; /*Flag motion vector analysis about the frame drop.*/ _enc->prevframe_dropped=1; /*Zero the packet.*/ oggpackB_reset(&_enc->opb); + /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/ + if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc); } static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){ @@ -1222,9 +1342,9 @@ static void oc_enc_set_granpos(oc_enc_ctx *_enc){ th_enc_ctx *th_encode_alloc(const th_info *_info){ oc_enc_ctx *enc; if(_info==NULL)return NULL; - enc=_ogg_malloc(sizeof(*enc)); + enc=oc_aligned_malloc(sizeof(*enc),16); if(enc==NULL||oc_enc_init(enc,_info)<0){ - _ogg_free(enc); + oc_aligned_free(enc); return NULL; } return enc; @@ -1233,7 +1353,7 @@ th_enc_ctx *th_encode_alloc(const th_info *_info){ void th_encode_free(th_enc_ctx *_enc){ if(_enc!=NULL){ oc_enc_clear(_enc); - _ogg_free(_enc); + oc_aligned_free(_enc); } } @@ -1272,12 +1392,17 @@ int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){ }break; case TH_ENCCTL_SET_VP3_COMPATIBLE:{ int vp3_compatible; + int ret; if(_enc==NULL||_buf==NULL)return TH_EFAULT; if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL; + /*Try this before we change anything else, because it can fail.*/ + ret=oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO); + /*If we can't allocate enough memory, don't change any of the state.*/ + if(ret==TH_EFAULT)return ret; vp3_compatible=*(int *)_buf; _enc->vp3_compatible=vp3_compatible; if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0; - if(oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO)<0)vp3_compatible=0; + if(ret<0)vp3_compatible=0; if(_enc->state.info.pixel_fmt!=TH_PF_420|| _enc->state.info.pic_width<_enc->state.info.frame_width|| _enc->state.info.pic_height<_enc->state.info.frame_height|| @@ -1386,6 +1511,44 @@ int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){ } return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz); }break; + case TH_ENCCTL_SET_COMPAT_CONFIG:{ + unsigned char buf[7]; + oc_pack_buf opb; + th_quant_info qinfo; + th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]; + int ret; + int i; + if(_enc==NULL||_buf==NULL)return TH_EFAULT; + if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL; + oc_pack_readinit(&opb,_buf,_buf_sz); + /*Validate the setup packet header.*/ + for(i=0;i<7;i++)buf[i]=(unsigned char)oc_pack_read(&opb,8); + if(!(buf[0]&0x80)||memcmp(buf+1,"theora",6)!=0)return TH_ENOTFORMAT; + if(buf[0]!=0x82)return TH_EBADHEADER; + /*Reads its contents.*/ + ret=oc_quant_params_unpack(&opb,&qinfo); + if(ret<0){ + oc_quant_params_clear(&qinfo); + return ret; + } + ret=oc_huff_codes_unpack(&opb,huff_codes); + if(ret<0){ + oc_quant_params_clear(&qinfo); + return ret; + } + /*Install the new state.*/ + oc_quant_params_clear(&_enc->qinfo); + 
memcpy(&_enc->qinfo,&qinfo,sizeof(qinfo)); + oc_enc_quant_params_updated(_enc,&qinfo); + memcpy(_enc->huff_codes,huff_codes,sizeof(_enc->huff_codes)); + return 0; + } +#if defined(OC_COLLECT_METRICS) + case TH_ENCCTL_SET_METRICS_FILE:{ + OC_MODE_METRICS_FILENAME=(const char *)_buf; + return 0; + } +#endif default:return TH_EIMPL; } } @@ -1477,6 +1640,12 @@ static void oc_img_plane_copy_pad(th_img_plane *_dst,th_img_plane *_src, int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){ th_ycbcr_buffer img; + int frame_width; + int frame_height; + int pic_width; + int pic_height; + int pic_x; + int pic_y; int cframe_width; int cframe_height; int cpic_width; @@ -1492,53 +1661,94 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){ if(_enc==NULL||_img==NULL)return TH_EFAULT; if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL; if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL; - if((ogg_uint32_t)_img[0].width!=_enc->state.info.frame_width|| - (ogg_uint32_t)_img[0].height!=_enc->state.info.frame_height){ - return TH_EINVAL; - } hdec=!(_enc->state.info.pixel_fmt&1); vdec=!(_enc->state.info.pixel_fmt&2); - cframe_width=_enc->state.info.frame_width>>hdec; - cframe_height=_enc->state.info.frame_height>>vdec; - if(_img[1].width!=cframe_width||_img[2].width!=cframe_width|| - _img[1].height!=cframe_height||_img[2].height!=cframe_height){ - return TH_EINVAL; - } - /*Step 2: Copy the input to our internal buffer. - This lets us add padding, if necessary, so we don't have to worry about - dereferencing possibly invalid addresses, and allows us to use the same - strides and fragment offsets for both the input frame and the reference - frames.*/ + frame_width=_enc->state.info.frame_width; + frame_height=_enc->state.info.frame_height; + pic_x=_enc->state.info.pic_x; + pic_y=_enc->state.info.pic_y; + pic_width=_enc->state.info.pic_width; + pic_height=_enc->state.info.pic_height; + cframe_width=frame_width>>hdec; + cframe_height=frame_height>>vdec; + cpic_x=pic_x>>hdec; + cpic_y=pic_y>>vdec; + cpic_width=(pic_x+pic_width+hdec>>hdec)-cpic_x; + cpic_height=(pic_y+pic_height+vdec>>vdec)-cpic_y; /*Flip the input buffer upside down.*/ oc_ycbcr_buffer_flip(img,_img); - oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+0,img+0, - _enc->state.info.pic_x,_enc->state.info.pic_y, - _enc->state.info.pic_width,_enc->state.info.pic_height); - cpic_x=_enc->state.info.pic_x>>hdec; - cpic_y=_enc->state.info.pic_y>>vdec; - cpic_width=(_enc->state.info.pic_x+_enc->state.info.pic_width+hdec>>hdec) - -cpic_x; - cpic_height=(_enc->state.info.pic_y+_enc->state.info.pic_height+vdec>>vdec) - -cpic_y; - for(pli=1;pli<3;pli++){ - oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+pli,img+pli, - cpic_x,cpic_y,cpic_width,cpic_height); + if(img[0].width!=frame_width||img[0].height!=frame_height|| + img[1].width!=cframe_width||img[2].width!=cframe_width|| + img[1].height!=cframe_height||img[2].height!=cframe_height){ + /*The buffer does not match the frame size. + Check to see if it matches the picture size.*/ + if(img[0].width!=pic_width||img[0].height!=pic_height|| + img[1].width!=cpic_width||img[2].width!=cpic_width|| + img[1].height!=cpic_height||img[2].height!=cpic_height){ + /*It doesn't; we don't know how to handle it.*/ + return TH_EINVAL; + } + /*Adjust the pointers to address a full frame. 
+ We still only use the picture region, however.*/ + img[0].data-=pic_y*(ptrdiff_t)img[0].stride+pic_x; + img[1].data-=cpic_y*(ptrdiff_t)img[1].stride+cpic_x; + img[2].data-=cpic_y*(ptrdiff_t)img[2].stride+cpic_x; } - /*Step 3: Update the buffer state.*/ + /*Step 2: Update the buffer state.*/ if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){ _enc->state.ref_frame_idx[OC_FRAME_PREV]= _enc->state.ref_frame_idx[OC_FRAME_SELF]; + _enc->state.ref_frame_data[OC_FRAME_PREV]= + _enc->state.ref_frame_data[OC_FRAME_SELF]; if(_enc->state.frame_type==OC_INTRA_FRAME){ /*The new frame becomes both the previous and gold reference frames.*/ _enc->state.keyframe_num=_enc->state.curframe_num; _enc->state.ref_frame_idx[OC_FRAME_GOLD]= _enc->state.ref_frame_idx[OC_FRAME_SELF]; + _enc->state.ref_frame_data[OC_FRAME_GOLD]= + _enc->state.ref_frame_data[OC_FRAME_SELF]; + } + } + if(_enc->state.ref_frame_idx[OC_FRAME_IO]>=0&&_enc->prevframe_dropped==0){ + _enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG]= + _enc->state.ref_frame_idx[OC_FRAME_IO]; + _enc->state.ref_frame_data[OC_FRAME_PREV_ORIG]= + _enc->state.ref_frame_data[OC_FRAME_IO]; + if(_enc->state.frame_type==OC_INTRA_FRAME){ + /*The new input frame becomes both the previous and gold + original-reference frames.*/ + _enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]= + _enc->state.ref_frame_idx[OC_FRAME_IO]; + _enc->state.ref_frame_data[OC_FRAME_GOLD_ORIG]= + _enc->state.ref_frame_data[OC_FRAME_IO]; } } + /*Select a free buffer to use for the incoming frame*/ + for(refi=3;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]|| + refi==_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG];refi++); + _enc->state.ref_frame_idx[OC_FRAME_IO]=refi; + _enc->state.ref_frame_data[OC_FRAME_IO]= + _enc->state.ref_frame_bufs[refi][0].data; + /*Step 3: Copy the input to our internal buffer. 
+ This lets us add padding, so we don't have to worry about dereferencing + possibly invalid addresses, and allows us to use the same strides and + fragment offsets for both the input frame and the reference frames.*/ + oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+0,img+0, + pic_x,pic_y,pic_width,pic_height); + oc_state_borders_fill_rows(&_enc->state,refi,0,0,frame_height); + oc_state_borders_fill_caps(&_enc->state,refi,0); + for(pli=1;pli<3;pli++){ + oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+pli,img+pli, + cpic_x,cpic_y,cpic_width,cpic_height); + oc_state_borders_fill_rows(&_enc->state,refi,pli,0,cframe_height); + oc_state_borders_fill_caps(&_enc->state,refi,pli); + } /*Select a free buffer to use for the reconstructed version of this frame.*/ for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]|| refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++); _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi; + _enc->state.ref_frame_data[OC_FRAME_SELF]= + _enc->state.ref_frame_bufs[refi][0].data; _enc->state.curframe_num+=_enc->prev_dup_count+1; /*Step 4: Compress the frame.*/ /*Start with a keyframe, and don't allow the generation of invalid files that @@ -1575,11 +1785,11 @@ int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){ } int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){ + unsigned char *packet; if(_enc==NULL||_op==NULL)return TH_EFAULT; if(_enc->packet_state==OC_PACKET_READY){ _enc->packet_state=OC_PACKET_EMPTY; if(_enc->rc.twopass!=1){ - unsigned char *packet; packet=oggpackB_get_buffer(&_enc->opb); /*If there's no packet, malloc failed while writing; it's lost forever.*/ if(packet==NULL)return TH_EFAULT; @@ -1595,8 +1805,22 @@ int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){ else if(_enc->packet_state==OC_PACKET_EMPTY){ if(_enc->nqueued_dups>0){ _enc->nqueued_dups--; - _op->packet=NULL; - _op->bytes=0; + /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/ + if(_enc->vp3_compatible){ + oggpackB_reset(&_enc->opb); + oc_enc_drop_frame_pack(_enc); + packet=oggpackB_get_buffer(&_enc->opb); + /*If there's no packet, malloc failed while writing; it's lost + forever.*/ + if(packet==NULL)return TH_EFAULT; + _op->packet=packet; + _op->bytes=oggpackB_bytes(&_enc->opb); + } + /*Otherwise emit a 0-byte packet.*/ + else{ + _op->packet=NULL; + _op->bytes=0; + } } else{ if(_last_p)_enc->packet_state=OC_PACKET_DONE; diff --git a/thirdparty/libtheora/encoder_disabled.c b/thirdparty/libtheora/encoder_disabled.c index 0cbf6645ac..ba6d995505 100644 --- a/thirdparty/libtheora/encoder_disabled.c +++ b/thirdparty/libtheora/encoder_disabled.c @@ -11,12 +11,15 @@ ******************************************************************** function: - last mod: $Id: encoder_disabled.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include "apiwrapper.h" #include "encint.h" +const th_quant_info TH_VP31_QUANT_INFO = {}; +const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]; + th_enc_ctx *th_encode_alloc(const th_info *_info){ return NULL; } diff --git a/thirdparty/libtheora/enquant.c b/thirdparty/libtheora/enquant.c index 3372fed221..8fd220edd7 100644 --- a/thirdparty/libtheora/enquant.c +++ b/thirdparty/libtheora/enquant.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: enquant.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ 
********************************************************************/ #include <stdlib.h> @@ -20,6 +20,69 @@ +int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src){ + int i; + memcpy(_dst,_src,sizeof(*_dst)); + memset(_dst->qi_ranges,0,sizeof(_dst->qi_ranges)); + for(i=0;i<6;i++){ + int nranges; + int qti; + int pli; + int qtj; + int plj; + int pdup; + int qdup; + qti=i/3; + pli=i%3; + qtj=(i-1)/3; + plj=(i-1)%3; + nranges=_src->qi_ranges[qti][pli].nranges; + /*Check for those duplicates that can be cleanly handled by + oc_quant_params_clear().*/ + pdup=i>0&&nranges<=_src->qi_ranges[qtj][plj].nranges; + qdup=qti>0&&nranges<=_src->qi_ranges[0][pli].nranges; + _dst->qi_ranges[qti][pli].nranges=nranges; + if(pdup&&_src->qi_ranges[qti][pli].sizes==_src->qi_ranges[qtj][plj].sizes){ + _dst->qi_ranges[qti][pli].sizes=_dst->qi_ranges[qtj][plj].sizes; + } + else if(qdup&&_src->qi_ranges[1][pli].sizes==_src->qi_ranges[0][pli].sizes){ + _dst->qi_ranges[1][pli].sizes=_dst->qi_ranges[0][pli].sizes; + } + else{ + int *sizes; + sizes=(int *)_ogg_malloc(nranges*sizeof(*sizes)); + /*Note: The caller is responsible for cleaning up any partially + constructed qinfo.*/ + if(sizes==NULL)return TH_EFAULT; + memcpy(sizes,_src->qi_ranges[qti][pli].sizes,nranges*sizeof(*sizes)); + _dst->qi_ranges[qti][pli].sizes=sizes; + } + if(pdup&&_src->qi_ranges[qti][pli].base_matrices== + _src->qi_ranges[qtj][plj].base_matrices){ + _dst->qi_ranges[qti][pli].base_matrices= + _dst->qi_ranges[qtj][plj].base_matrices; + } + else if(qdup&&_src->qi_ranges[1][pli].base_matrices== + _src->qi_ranges[0][pli].base_matrices){ + _dst->qi_ranges[1][pli].base_matrices= + _dst->qi_ranges[0][pli].base_matrices; + } + else{ + th_quant_base *base_matrices; + base_matrices=(th_quant_base *)_ogg_malloc( + (nranges+1)*sizeof(*base_matrices)); + /*Note: The caller is responsible for cleaning up any partially + constructed qinfo.*/ + if(base_matrices==NULL)return TH_EFAULT; + memcpy(base_matrices,_src->qi_ranges[qti][pli].base_matrices, + (nranges+1)*sizeof(*base_matrices)); + _dst->qi_ranges[qti][pli].base_matrices= + (const th_quant_base *)base_matrices; + } + } + return 0; +} + void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){ const th_quant_ranges *qranges; const th_quant_base *base_mats[2*3*64]; @@ -119,7 +182,7 @@ void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){ } } -static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){ +void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){ ogg_uint32_t t; int l; _d<<=1; @@ -129,48 +192,61 @@ static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){ _this->l=l; } -/*See comments at oc_dequant_tables_init() for how the quantization tables' - storage should be initialized.*/ -void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2], - oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo){ - int qi; +void oc_enc_enquant_table_init_c(void *_enquant, + const ogg_uint16_t _dequant[64]){ + oc_iquant *enquant; + int zzi; + /*In the original VP3.2 code, the rounding offset and the size of the + dead zone around 0 were controlled by a "sharpness" parameter. + We now R-D optimize the tokens for each block after quantization, + so the rounding offset should always be 1/2, and an explicit dead + zone is unnecessary. 
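The (m,l) pairs stored in oc_iquant turn the per-coefficient division in quantization into a multiply, an add, and two shifts. As an illustration with hand-picked constants (not the values oc_iquant_init() computes, which also fold in the doubling of the quantizer): for a divisor of 7, m==9363 and l==3 give exact truncating division of any 16-bit input.

  /*Illustrative only: ((m*v>>16)+v)>>l==v/7 for 0<=v<65536 with m=9363 and
    l=3, because (m+0x10000)*7 lies in ((1<<19),(1<<19)+(7<<3)], satisfying
    the usual round-up reciprocal condition.*/
  static int div7_sketch(ogg_uint32_t _v){
    return (int)(((9363*_v>>16)+_v)>>3);
  }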
+ Hence, all of that VP3.2 code is gone from here, and the remaining + floating point code has been implemented as equivalent integer + code with exact precision.*/ + enquant=(oc_iquant *)_enquant; + for(zzi=0;zzi<64;zzi++)oc_iquant_init(enquant+zzi,_dequant[zzi]); +} + +void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis){ int pli; + int qii; int qti; - /*Initialize the dequantization tables first.*/ - oc_dequant_tables_init(_dequant,NULL,_qinfo); - /*Derive the quantization tables directly from the dequantization tables.*/ - for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){ - int zzi; - int plj; - int qtj; - int dupe; - dupe=0; - for(qtj=0;qtj<=qti;qtj++){ - for(plj=0;plj<(qtj<qti?3:pli);plj++){ - if(_dequant[qi][pli][qti]==_dequant[qi][plj][qtj]){ - dupe=1; - break; - } - } - if(dupe)break; - } - if(dupe){ - _enquant[qi][pli][qti]=_enquant[qi][plj][qtj]; - continue; - } - /*In the original VP3.2 code, the rounding offset and the size of the - dead zone around 0 were controlled by a "sharpness" parameter. - We now R-D optimize the tokens for each block after quantization, - so the rounding offset should always be 1/2, and an explicit dead - zone is unnecessary. - Hence, all of that VP3.2 code is gone from here, and the remaining - floating point code has been implemented as equivalent integer - code with exact precision.*/ - for(zzi=0;zzi<64;zzi++){ - oc_iquant_init(_enquant[qi][pli][qti]+zzi, - _dequant[qi][pli][qti][zzi]); + for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){ + *((oc_iquant *)_enquant[pli][qii][qti])= + *((oc_iquant *)_enquant[pli][0][qti]); + } +} + +int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64], + const ogg_uint16_t _dequant[64],const void *_enquant){ + const oc_iquant *enquant; + int nonzero; + int zzi; + int val; + int d; + int s; + enquant=(const oc_iquant *)_enquant; + nonzero=0; + for(zzi=0;zzi<64;zzi++){ + val=_dct[zzi]; + d=_dequant[zzi]; + val=val<<1; + if(abs(val)>=d){ + s=OC_SIGNMASK(val); + /*The bias added here rounds ties away from zero, since token + optimization can only decrease the magnitude of the quantized + value.*/ + val+=d+s^s; + /*Note the arithmetic right shift is not guaranteed by ANSI C. + Hopefully no one still uses ones-complement architectures.*/ + val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s; + _qdct[zzi]=(ogg_int16_t)val; + nonzero=zzi; } + else _qdct[zzi]=0; } + return nonzero; } @@ -226,7 +302,7 @@ static const ogg_uint16_t OC_RPSD[2][64]={ relative to the total, scaled by 2**16, for each pixel format. These values were measured after motion-compensated prediction, before quantization, over a large set of test video encoded at all possible rates. - TODO: These values are only from INTER frames; it should be re-measured for + TODO: These values are only from INTER frames; they should be re-measured for INTRA frames.*/ static const ogg_uint16_t OC_PCD[4][3]={ {59926, 3038, 2572}, @@ -236,38 +312,58 @@ static const ogg_uint16_t OC_PCD[4][3]={ }; -/*Compute an "average" quantizer for each qi level. - We do one for INTER and one for INTRA, since their behavior is very - different, but average across chroma channels. +/*Compute "average" quantizers for each qi level to use for rate control. + We do one for each color channel, as well as an average across color + channels, separately for INTER and INTRA, since their behavior is very + different. 
The basic approach is to compute a harmonic average of the squared quantizer, weighted by the expected squared magnitude of the DCT coefficients. Under the (not quite true) assumption that DCT coefficients are Laplacian-distributed, this preserves the product Q*lambda, where lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter (not to be confused with the lambda used in R-D optimization throughout most of the - rest of the code). - The value Q*lambda completely determines the entropy of the coefficients.*/ + rest of the code), when the distributions from multiple coefficients are + pooled. + The value Q*lambda completely determines the entropy of coefficients drawn + from a Laplacian distribution, and thus the expected bitrate.*/ void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64], + ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _chroma_rd_scale[2][64][2], ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt){ int qi; int pli; int qti; int ci; for(qti=0;qti<2;qti++)for(qi=0;qi<64;qi++){ - ogg_int64_t q2; + ogg_int64_t q2; + ogg_uint32_t qp[3]; + ogg_uint32_t cqp; + ogg_uint32_t d; q2=0; for(pli=0;pli<3;pli++){ - ogg_uint32_t qp; - qp=0; + qp[pli]=0; for(ci=0;ci<64;ci++){ unsigned rq; unsigned qd; qd=_dequant[qi][pli][qti][OC_IZIG_ZAG[ci]]; rq=(OC_RPSD[qti][ci]+(qd>>1))/qd; - qp+=rq*(ogg_uint32_t)rq; + qp[pli]+=rq*(ogg_uint32_t)rq; } - q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp; + q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp[pli]; + /*plq=1.0/sqrt(qp)*/ + _log_plq[qi][pli][qti]= + (ogg_int16_t)(OC_Q10(32)-oc_blog32_q10(qp[pli])>>1); } + d=OC_PCD[_pixel_fmt][1]+OC_PCD[_pixel_fmt][2]; + cqp=(ogg_uint32_t)((OC_PCD[_pixel_fmt][1]*(ogg_int64_t)qp[1]+ + OC_PCD[_pixel_fmt][2]*(ogg_int64_t)qp[2]+(d>>1))/d); + /*chroma_rd_scale=clamp(0.25,cqp/qp[0],4)*/ + d=OC_MAXI(qp[0]+(1<<OC_RD_SCALE_BITS-1)>>OC_RD_SCALE_BITS,1); + d=OC_CLAMPI(1<<OC_RD_SCALE_BITS-2,(cqp+(d>>1))/d,4<<OC_RD_SCALE_BITS); + _chroma_rd_scale[qti][qi][0]=(ogg_int16_t)d; + /*chroma_rd_iscale=clamp(0.25,qp[0]/cqp,4)*/ + d=OC_MAXI(OC_RD_ISCALE(cqp,1),1); + d=OC_CLAMPI(1<<OC_RD_ISCALE_BITS-2,(qp[0]+(d>>1))/d,4<<OC_RD_ISCALE_BITS); + _chroma_rd_scale[qti][qi][1]=(ogg_int16_t)d; /*qavg=1.0/sqrt(q2).*/ _log_qavg[qti][qi]=OC_Q57(48)-oc_blog64(q2)>>1; } diff --git a/thirdparty/libtheora/enquant.h b/thirdparty/libtheora/enquant.h index d62df10d1a..e5f78144cc 100644 --- a/thirdparty/libtheora/enquant.h +++ b/thirdparty/libtheora/enquant.h @@ -14,14 +14,13 @@ struct oc_iquant{ ogg_int16_t l; }; -typedef oc_iquant oc_iquant_table[64]; - +int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src); void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo); -void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2], - oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo); +void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d); void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64], + ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _pl_rd_scale[2][64][2], ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt); #endif diff --git a/thirdparty/libtheora/fdct.c b/thirdparty/libtheora/fdct.c index dc3a66f245..9c2f8b0446 100644 --- a/thirdparty/libtheora/fdct.c +++ b/thirdparty/libtheora/fdct.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: fdct.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include "encint.h" @@ -120,11 +120,6 @@ static void oc_fdct8(ogg_int16_t _y[8],const 
ogg_int16_t *_x){ _y[7]=v; } -void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64], - const ogg_int16_t _x[64]){ - (*_enc->opt_vtable.fdct8x8)(_y,_x); -} - /*Performs a forward 8x8 Type-II DCT transform. The output is scaled by a factor of 4 relative to the orthonormal version of the transform. @@ -152,7 +147,7 @@ void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ /*Round the result back to the external working precision (which is still scaled by four relative to the orthogonal result). TODO: We should just update the external working precision.*/ - for(i=0;i<64;i++)_y[i]=w[i]+2>>2; + for(i=0;i<64;i++)_y[i]=w[OC_FZIG_ZAG[i]]+2>>2; } diff --git a/thirdparty/libtheora/fragment.c b/thirdparty/libtheora/fragment.c index 15372e9d9f..14c38be507 100644 --- a/thirdparty/libtheora/fragment.c +++ b/thirdparty/libtheora/fragment.c @@ -11,17 +11,12 @@ ******************************************************************** function: - last mod: $Id: fragment.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include <string.h> #include "internal.h" -void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst, - const unsigned char *_src,int _ystride){ - (*_state->opt_vtable.frag_copy)(_dst,_src,_ystride); -} - void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){ int i; for(i=8;i-->0;){ @@ -31,9 +26,24 @@ void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){ } } -void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst, - int _ystride,const ogg_int16_t _residue[64]){ - _state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue); +/*Copies the fragments specified by the lists of fragment indices from one + frame to another. + _dst_frame: The reference frame to copy to. + _src_frame: The reference frame to copy from. + _ystride: The row stride of the reference frames. + _fragis: A pointer to a list of fragment indices. + _nfragis: The number of fragment indices to copy. 
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/ +void oc_frag_copy_list_c(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ + ptrdiff_t fragii; + for(fragii=0;fragii<_nfragis;fragii++){ + ptrdiff_t frag_buf_off; + frag_buf_off=_frag_buf_offs[_fragis[fragii]]; + oc_frag_copy_c(_dst_frame+frag_buf_off, + _src_frame+frag_buf_off,_ystride); + } } void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride, @@ -46,11 +56,6 @@ void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride, } } -void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){ - _state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue); -} - void oc_frag_recon_inter_c(unsigned char *_dst, const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){ int i; @@ -62,12 +67,6 @@ void oc_frag_recon_inter_c(unsigned char *_dst, } } -void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst, - const unsigned char *_src1,const unsigned char *_src2,int _ystride, - const ogg_int16_t _residue[64]){ - _state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue); -} - void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1, const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){ int i; @@ -80,8 +79,4 @@ void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1, } } -void oc_restore_fpu(const oc_theora_state *_state){ - _state->opt_vtable.restore_fpu(); -} - void oc_restore_fpu_c(void){} diff --git a/thirdparty/libtheora/huffdec.c b/thirdparty/libtheora/huffdec.c index 8cf27f0341..5a83c5f150 100644 --- a/thirdparty/libtheora/huffdec.c +++ b/thirdparty/libtheora/huffdec.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: huffdec.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -22,14 +22,60 @@ #include "decint.h" -/*The ANSI offsetof macro is broken on some platforms (e.g., older DECs).*/ -#define _ogg_offsetof(_type,_field)\ - ((size_t)((char *)&((_type *)0)->_field-(char *)0)) -/*The number of internal tokens associated with each of the spec tokens.*/ -static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={ - 1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8 -}; +/*Instead of storing every branching in the tree, subtrees can be collapsed + into one node, with a table of size 1<<nbits pointing directly to its + descendants nbits levels down. + This allows more than one bit to be read at a time, and avoids following all + the intermediate branches with next to no increased code complexity once + the collapsed tree has been built. + We do _not_ require that a subtree be complete to be collapsed, but instead + store duplicate pointers in the table, and record the actual depth of the + node below its parent. + This tells us the number of bits to advance the stream after reaching it. + + This turns out to be equivalent to the method described in \cite{Hash95},
+ If the codewords were sorted by length (so-called ``canonical-codes''), they + could be decoded much faster via either Lindell and Moffat's approach or + Hashemian's Condensed Huffman Code approach, the latter of which has an + extremely small memory footprint. + We can't use Choueka et al.'s finite state machine approach, which is + extremely fast, because we can't allow multiple symbols to be output at a + time; the codebook can and does change between symbols. + It also has very large memory requirements, which impairs cache coherency. + + We store the tree packed in an array of 16-bit integers (words). + Each node consists of a single word, followed consecutively by two or more + indices of its children. + Let n be the value of this first word. + This is the number of bits that need to be read to traverse the node, and + must be positive. + 1<<n entries follow in the array, each an index to a child node. + If the child is positive, then it is the index of another internal node in + the table. + If the child is negative or zero, then it is a leaf node. + These are stored directly in the child pointer to save space, since they only + require a single word. + If a leaf node would have been encountered before reading n bits, then it is + duplicated the necessary number of times in this table. + Leaf nodes pack both a token value and their actual depth in the tree. + The token in the leaf node is (-leaf&255). + The number of bits that need to be consumed to reach the leaf, starting from + the current node, is (-leaf>>8). + + @ARTICLE{Hash95, + author="Reza Hashemian", + title="Memory Efficient and High-Speed Search {Huffman} Coding", + journal="{IEEE} Transactions on Communications", + volume=43, + number=10, + pages="2576--2581", + month=Oct, + year=1995 + }*/ + + /*The map from external spec-defined tokens to internal tokens. This is constructed so that any extra bits read with the original token value @@ -99,391 +145,371 @@ static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={ 40 }; -/*These three functions are really part of the bitpack.c module, but - they are only used here. - Declaring local static versions so they can be inlined saves considerable - function call overhead.*/ - -static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){ - const unsigned char *ptr; - const unsigned char *stop; - oc_pb_window window; - int available; - window=_b->window; - available=_b->bits; - ptr=_b->ptr; - stop=_b->stop; - /*This version of _refill() doesn't bother setting eof because we won't - check for it after we've started decoding DCT tokens.*/ - if(ptr>=stop)available=OC_LOTS_OF_BITS; - while(available<=OC_PB_WINDOW_SIZE-8){ - available+=8; - window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available; - if(ptr>=stop)available=OC_LOTS_OF_BITS; - } - _b->ptr=ptr; - if(_bits>available)window|=*ptr>>(available&7); - _b->bits=available; - return window; -} - - -/*Read in bits without advancing the bit pointer. - Here we assume 0<=_bits&&_bits<=32.*/ -static long oc_pack_look(oc_pack_buf *_b,int _bits){ - oc_pb_window window; - int available; - long result; - window=_b->window; - available=_b->bits; - if(_bits==0)return 0; - if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits); - result=window>>OC_PB_WINDOW_SIZE-_bits; - return result; -} - -/*Advance the bit pointer.*/ -static void oc_pack_adv(oc_pack_buf *_b,int _bits){ - /*We ignore the special cases for _bits==0 and _bits==32 here, since they are - never used actually used. 
- OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read - 32 bits in a single go, and would require a 32 GB lookup table (assuming - 8 byte pointers, since 4 byte pointers couldn't fit such a table).*/ - _b->window<<=_bits; - _b->bits-=_bits; -} +/*The log base 2 of number of internal tokens associated with each of the spec + tokens (i.e., how many of the extra bits are folded into the token value). + Increasing the maximum value beyond 3 will enlarge the amount of stack + required for tree construction.*/ +static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={ + 0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3 +}; -/*The log_2 of the size of a lookup table is allowed to grow to relative to - the number of unique nodes it contains. - E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is - wasted (each node will have an amortized cost of at most 20 bytes when using - 4-byte pointers). +/*The size a lookup table is allowed to grow to relative to the number of + unique nodes it contains. + E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is + wasted (1/4 of the space must be used). Larger numbers can decode tokens with fewer read operations, while smaller - numbers may save more space (requiring as little as 8 bytes amortized per - node, though there will be more nodes). + numbers may save more space. With a sample file: 32233473 read calls are required when no tree collapsing is done (100.0%). - 19269269 read calls are required when OC_HUFF_SLUSH is 0 (59.8%). - 11144969 read calls are required when OC_HUFF_SLUSH is 1 (34.6%). - 10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%). - 10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%). - Since a value of 1 gets us the vast majority of the speed-up with only a - small amount of wasted memory, this is what we use.*/ -#define OC_HUFF_SLUSH (1) - - -/*Determines the size in bytes of a Huffman tree node that represents a - subtree of depth _nbits. - _nbits: The depth of the subtree. - If this is 0, the node is a leaf node. - Otherwise 1<<_nbits pointers are allocated for children. - Return: The number of bytes required to store the node.*/ -static size_t oc_huff_node_size(int _nbits){ - size_t size; - size=_ogg_offsetof(oc_huff_node,nodes); - if(_nbits>0)size+=sizeof(oc_huff_node *)*(1<<_nbits); - return size; -} - -static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){ - oc_huff_node *ret; - ret=(oc_huff_node *)*_storage; - ret->nbits=(unsigned char)_nbits; - (*_storage)+=_size; - return ret; -} - - -/*Determines the size in bytes of a Huffman tree. - _nbits: The depth of the subtree. - If this is 0, the node is a leaf node. - Otherwise storage for 1<<_nbits pointers are added for children. - Return: The number of bytes required to store the tree.*/ -static size_t oc_huff_tree_size(const oc_huff_node *_node){ - size_t size; - size=oc_huff_node_size(_node->nbits); - if(_node->nbits){ - int nchildren; - int i; - nchildren=1<<_node->nbits; - for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){ - size+=oc_huff_tree_size(_node->nodes[i]); - } - } - return size; -} - - -/*Unpacks a sub-tree from the given buffer. - _opb: The buffer to unpack from. - _binodes: The nodes to store the sub-tree in. - _nbinodes: The number of nodes available for the sub-tree. 
- Return: 0 on success, or a negative value on error.*/ -static int oc_huff_tree_unpack(oc_pack_buf *_opb, - oc_huff_node *_binodes,int _nbinodes){ - oc_huff_node *binode; - long bits; - int nused; - if(_nbinodes<1)return TH_EBADHEADER; - binode=_binodes; - nused=0; - bits=oc_pack_read1(_opb); - if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER; - /*Read an internal node:*/ - if(!bits){ - int ret; - nused++; - binode->nbits=1; - binode->depth=1; - binode->nodes[0]=_binodes+nused; - ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused); - if(ret>=0){ - nused+=ret; - binode->nodes[1]=_binodes+nused; - ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused); - } - if(ret<0)return ret; - nused+=ret; - } - /*Read a leaf node:*/ - else{ - int ntokens; - int token; - int i; - bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS); + 19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%). + 11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%). + 10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%). + 10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%). + Since a value of 2 gets us the vast majority of the speed-up with only a + small amount of wasted memory, this is what we use. + This value must be less than 128, or you could create a tree with more than + 32767 entries, which would overflow the 16-bit words used to index it.*/ +#define OC_HUFF_SLUSH (2) +/*The root of the tree is on the fast path, and a larger value here is more + beneficial than elsewhere in the tree. + 7 appears to give the best performance, trading off between increased use of + the single-read fast path and cache footprint for the tables, though + obviously this will depend on your cache size. + Using 7 here, the VP3 tables are about twice as large compared to using 2.*/ +#define OC_ROOT_HUFF_SLUSH (7) + + + +/*Unpacks a Huffman codebook. + _opb: The buffer to unpack from. + _tokens: Stores a list of internal tokens, in the order they were found in + the codebook, and the lengths of their corresponding codewords. + This is enough to completely define the codebook, while minimizing + stack usage and avoiding temporary allocations (for platforms + where free() is a no-op). 
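/*The removed oc_huff_node_size() above sizes nodes with the classic
  variable-length struct idiom: offsetof() up to the trailing array, plus
  room for 1<<nbits child pointers, even though the struct only declares two.
  A generic sketch of the idiom with invented names; indexing past the
  declared bound is only meaningful because the allocation reserves the full
  table.*/
#include <stddef.h>
#include <stdlib.h>

typedef struct vl_node_sketch vl_node_sketch;
struct vl_node_sketch{
  unsigned char nbits;
  vl_node_sketch *nodes[2]; /*Really 1<<nbits entries; see the alloc below.*/
};

static vl_node_sketch *vl_node_alloc_sketch(int _nbits){
  size_t size;
  size=offsetof(vl_node_sketch,nodes);
  if(_nbits>0)size+=sizeof(vl_node_sketch *)*((size_t)1<<_nbits);
  return (vl_node_sketch *)calloc(1,size);
}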
+ Return: The number of internal tokens in the codebook, or a negative value + on error.*/ +int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){ + ogg_uint32_t code; + int len; + int ntokens; + int nleaves; + code=0; + len=ntokens=nleaves=0; + for(;;){ + long bits; + bits=oc_pack_read1(_opb); + /*Only process nodes so long as there's more bits in the buffer.*/ if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER; - /*Find out how many internal tokens we translate this external token into.*/ - ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits]; - if(_nbinodes<2*ntokens-1)return TH_EBADHEADER; - /*Fill in a complete binary tree pointing to the internal tokens.*/ - for(i=1;i<ntokens;i<<=1){ - int j; - binode=_binodes+nused; - nused+=i; - for(j=0;j<i;j++){ - binode[j].nbits=1; - binode[j].depth=1; - binode[j].nodes[0]=_binodes+nused+2*j; - binode[j].nodes[1]=_binodes+nused+2*j+1; - } + /*Read an internal node:*/ + if(!bits){ + len++; + /*Don't allow codewords longer than 32 bits.*/ + if(len>32)return TH_EBADHEADER; } - /*And now the leaf nodes with those tokens.*/ - token=OC_DCT_TOKEN_MAP[bits]; - for(i=0;i<ntokens;i++){ - binode=_binodes+nused++; - binode->nbits=0; - binode->depth=1; - binode->token=token+i; + /*Read a leaf node:*/ + else{ + ogg_uint32_t code_bit; + int neb; + int nentries; + int token; + /*Don't allow more than 32 spec-tokens per codebook.*/ + if(++nleaves>32)return TH_EBADHEADER; + bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS); + neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits]; + token=OC_DCT_TOKEN_MAP[bits]; + nentries=1<<neb; + while(nentries-->0){ + _tokens[ntokens][0]=(unsigned char)token++; + _tokens[ntokens][1]=(unsigned char)(len+neb); + ntokens++; + } + code_bit=0x80000000U>>len-1; + while(len>0&&(code&code_bit)){ + code^=code_bit; + code_bit<<=1; + len--; + } + if(len<=0)break; + code|=code_bit; } } - return nused; -} - -/*Finds the depth of shortest branch of the given sub-tree. - The tree must be binary. - _binode: The root of the given sub-tree. - _binode->nbits must be 0 or 1. - Return: The smallest depth of a leaf node in this sub-tree. - 0 indicates this sub-tree is a leaf node.*/ -static int oc_huff_tree_mindepth(oc_huff_node *_binode){ - int depth0; - int depth1; - if(_binode->nbits==0)return 0; - depth0=oc_huff_tree_mindepth(_binode->nodes[0]); - depth1=oc_huff_tree_mindepth(_binode->nodes[1]); - return OC_MINI(depth0,depth1)+1; -} - -/*Finds the number of internal nodes at a given depth, plus the number of - leaves at that depth or shallower. - The tree must be binary. - _binode: The root of the given sub-tree. - _binode->nbits must be 0 or 1. - Return: The number of entries that would be contained in a jump table of the - given depth.*/ -static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){ - if(_binode->nbits==0||_depth<=0)return 1; - else{ - return oc_huff_tree_occupancy(_binode->nodes[0],_depth-1)+ - oc_huff_tree_occupancy(_binode->nodes[1],_depth-1); - } + return ntokens; } -/*Makes a copy of the given Huffman tree. - _node: The Huffman tree to copy. 
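/*The unpack loop above enumerates codewords depth-first by incrementing an
  MSB-aligned code: strip trailing 1 bits (ascending the tree), then take the
  first 0 branch. A self-contained sketch of that step with an invented name;
  it returns the new codeword length, or 0 once the whole tree is done.
  Starting from code 000, successive calls visit 001, 01, 1, then stop.*/
static int oc_next_code_sketch(ogg_uint32_t *_code,int _len){
  ogg_uint32_t code_bit;
  if(_len<=0)return 0; /*Guard added here for the degenerate one-leaf tree.*/
  code_bit=0x80000000U>>_len-1;
  while(_len>0&&(*_code&code_bit)){
    *_code^=code_bit;
    code_bit<<=1;
    _len--;
  }
  if(_len>0)*_code|=code_bit;
  return _len;
}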
- Return: The copy of the Huffman tree.*/ -static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node, - char **_storage){ - oc_huff_node *ret; - ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits); - ret->depth=_node->depth; - if(_node->nbits){ - int nchildren; - int i; - int inext; - nchildren=1<<_node->nbits; - for(i=0;i<nchildren;){ - ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage); - inext=i+(1<<_node->nbits-ret->nodes[i]->depth); - while(++i<inext)ret->nodes[i]=ret->nodes[i-1]; +/*Count how many tokens would be required to fill a subtree at depth _depth. + _tokens: A list of internal tokens, in the order they are found in the + codebook, and the lengths of their corresponding codewords. + _depth: The depth of the desired node in the corresponding tree structure. + Return: The number of tokens that belong to that subtree.*/ +static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){ + ogg_uint32_t code; + int ti; + code=0; + ti=0; + do{ + if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth; + else{ + /*Because of the expanded internal tokens, we can have codewords as long + as 35 bits. + A single recursion here is enough to advance past them.*/ + code++; + ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31); } } - else ret->token=_node->token; - return ret; + while(code<0x80000000U); + return ti; } -static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){ - size_t size; - int mindepth; - int depth; - int loccupancy; - int occupancy; - if(_binode->nbits!=0&&_depth>0){ - return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+ - oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1); - } - depth=mindepth=oc_huff_tree_mindepth(_binode); - occupancy=1<<mindepth; +/*Compute the number of bits to use for a collapsed tree node at the given + depth. + _tokens: A list of internal tokens, in the order they are found in the + codebook, and the lengths of their corresponding codewords. + _ntokens: The number of tokens corresponding to this tree node. + _depth: The depth of this tree node. + Return: The number of bits to use for a collapsed tree node rooted here. + This is always at least one, even if this was a leaf node.*/ +static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2], + int _ntokens,int _depth){ + int got_leaves; + int loccupancy; + int occupancy; + int slush; + int nbits; + int best_nbits; + slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH; + /*It's legal to have a tree with just a single node, which requires no bits + to decode and always returns the same token. + However, no encoder actually does this (yet). + To avoid a special case in oc_huff_token_decode(), we force the number of + lookahead bits to be at least one. 
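/*oc_huff_subtree_tokens() above walks codeword space in Q31 fixed point,
  where 1.0 is 0x80000000. The same trick checks the Kraft equality for an
  entire codebook: a full binary prefix code satisfies sum(2^-len)==1.
  Invented helper for illustration; it assumes every length is 1..31, which
  keeps the shifts well defined for this sketch.*/
static int oc_kraft_full_sketch(const unsigned char *_lens,int _nlens){
  ogg_uint32_t sum;
  int i;
  sum=0;
  for(i=0;i<_nlens;i++)sum+=0x80000000U>>_lens[i];
  return sum==0x80000000U;
}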
+ This will produce a tree that looks ahead one bit and then advances the + stream zero bits.*/ + nbits=1; + occupancy=2; + got_leaves=1; do{ + int ti; + if(got_leaves)best_nbits=nbits; + nbits++; + got_leaves=0; loccupancy=occupancy; - occupancy=oc_huff_tree_occupancy(_binode,++depth); - } - while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0)); - depth--; - size=oc_huff_node_size(depth); - if(depth>0){ - size+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1); - size+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1); + for(occupancy=ti=0;ti<_ntokens;occupancy++){ + if(_tokens[ti][1]<_depth+nbits)ti++; + else if(_tokens[ti][1]==_depth+nbits){ + got_leaves=1; + ti++; + } + else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits); + } } - return size; + while(occupancy>loccupancy&&occupancy*slush>=1<<nbits); + return best_nbits; } -static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode, - char **_storage); - -/*Fills the given nodes table with all the children in the sub-tree at the - given depth. - The nodes in the sub-tree with a depth less than that stored in the table - are freed. - The sub-tree must be binary and complete up until the given depth. - _nodes: The nodes table to fill. - _binode: The root of the sub-tree to fill it with. - _binode->nbits must be 0 or 1. - _level: The current level in the table. - 0 indicates that the current node should be stored, regardless of - whether it is a leaf node or an internal node. - _depth: The depth of the nodes to fill the table with, relative to their - parent.*/ -static void oc_huff_node_fill(oc_huff_node **_nodes, - oc_huff_node *_binode,int _level,int _depth,char **_storage){ - if(_level<=0||_binode->nbits==0){ - int i; - _binode->depth=(unsigned char)(_depth-_level); - _nodes[0]=oc_huff_tree_collapse(_binode,_storage); - for(i=1;i<1<<_level;i++)_nodes[i]=_nodes[0]; - } - else{ - _level--; - oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth,_storage); - _nodes+=1<<_level; - oc_huff_node_fill(_nodes,_binode->nodes[1],_level,_depth,_storage); - } +/*Determines the size in words of a Huffman tree node that represents a + subtree of depth _nbits. + _nbits: The depth of the subtree. + This must be greater than zero. + Return: The number of words required to store the node.*/ +static size_t oc_huff_node_size(int _nbits){ + return 1+(1<<_nbits); } -/*Finds the largest complete sub-tree rooted at the current node and collapses - it into a single node. - This procedure is then applied recursively to all the children of that node. - _binode: The root of the sub-tree to collapse. - _binode->nbits must be 0 or 1. - Return: The new root of the collapsed sub-tree.*/ -static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode, - char **_storage){ - oc_huff_node *root; - size_t size; - int mindepth; - int depth; - int loccupancy; - int occupancy; - depth=mindepth=oc_huff_tree_mindepth(_binode); - occupancy=1<<mindepth; +/*Produces a collapsed-tree representation of the given token list. + _tree: The storage for the collapsed Huffman tree. + This may be NULL to compute the required storage size instead of + constructing the tree. + _tokens: A list of internal tokens, in the order they are found in the + codebook, and the lengths of their corresponding codewords. + _ntokens: The number of tokens corresponding to this tree node. 
+ Return: The number of words required to store the tree.*/ +static size_t oc_huff_tree_collapse(ogg_int16_t *_tree, + unsigned char _tokens[][2],int _ntokens){ + ogg_int16_t node[34]; + unsigned char depth[34]; + unsigned char last[34]; + size_t ntree; + int ti; + int l; + depth[0]=0; + last[0]=(unsigned char)(_ntokens-1); + ntree=0; + ti=0; + l=0; do{ - loccupancy=occupancy; - occupancy=oc_huff_tree_occupancy(_binode,++depth); + int nbits; + nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]); + node[l]=(ogg_int16_t)ntree; + ntree+=oc_huff_node_size(nbits); + if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits; + do{ + while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){ + if(_tree!=NULL){ + ogg_int16_t leaf; + int nentries; + nentries=1<<depth[l]+nbits-_tokens[ti][1]; + leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]); + while(nentries-->0)_tree[node[l]++]=leaf; + } + ti++; + } + if(ti<=last[l]){ + /*We need to recurse*/ + depth[l+1]=(unsigned char)(depth[l]+nbits); + if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree; + l++; + last[l]= + (unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1); + break; + } + /*Pop back up a level of recursion.*/ + else if(l-->0)nbits=depth[l+1]-depth[l]; + } + while(l>=0); } - while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0)); - depth--; - if(depth<=1)return oc_huff_tree_copy(_binode,_storage); - size=oc_huff_node_size(depth); - root=oc_huff_node_init(_storage,size,depth); - root->depth=_binode->depth; - oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage); - return root; + while(l>=0); + return ntree; } /*Unpacks a set of Huffman trees, and reduces them to a collapsed representation. _opb: The buffer to unpack the trees from. _nodes: The table to fill with the Huffman trees. - Return: 0 on success, or a negative value on error.*/ + Return: 0 on success, or a negative value on error. + The caller is responsible for cleaning up any partially initialized + _nodes on failure.*/ int oc_huff_trees_unpack(oc_pack_buf *_opb, - oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){ + ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){ int i; for(i=0;i<TH_NHUFFMAN_TABLES;i++){ - oc_huff_node nodes[511]; - char *storage; - size_t size; - int ret; + unsigned char tokens[256][2]; + int ntokens; + ogg_int16_t *tree; + size_t size; /*Unpack the full tree into a temporary buffer.*/ - ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes)); - if(ret<0)return ret; - /*Figure out how big the collapsed tree will be.*/ - size=oc_huff_tree_collapse_size(nodes,0); - storage=(char *)_ogg_calloc(1,size); - if(storage==NULL)return TH_EFAULT; - /*And collapse it.*/ - _nodes[i]=oc_huff_tree_collapse(nodes,&storage); + ntokens=oc_huff_tree_unpack(_opb,tokens); + if(ntokens<0)return ntokens; + /*Figure out how big the collapsed tree will be and allocate space for it.*/ + size=oc_huff_tree_collapse(NULL,tokens,ntokens); + /*This should never happen; if it does it means you set OC_HUFF_SLUSH or + OC_ROOT_HUFF_SLUSH too large.*/ + if(size>32767)return TH_EIMPL; + tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree)); + if(tree==NULL)return TH_EFAULT; + /*Construct the collapsed the tree.*/ + oc_huff_tree_collapse(tree,tokens,ntokens); + _nodes[i]=tree; } return 0; } +/*Determines the size in words of a Huffman subtree. + _tree: The complete Huffman tree. + _node: The index of the root of the desired subtree. 
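/*oc_huff_trees_unpack() above uses a two-pass idiom: the same builder runs
  once with a NULL destination to measure the output, then again to fill the
  buffer, so sizing and construction can never disagree. A toy sketch of the
  pattern with invented names:*/
static size_t build_squares_sketch(ogg_int16_t *_dst,int _n){
  size_t count;
  int i;
  count=0;
  for(i=0;i<_n;i++){
    if(_dst!=NULL)_dst[count]=(ogg_int16_t)(i*i);
    count++;
  }
  return count;
}
/*Callers size first, then fill:
    size=build_squares_sketch(NULL,n);
    buf=(ogg_int16_t *)_ogg_malloc(size*sizeof(*buf));
    if(buf!=NULL)build_squares_sketch(buf,n);*/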
+ Return: The number of words required to store the tree.*/ +static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){ + size_t size; + int nchildren; + int n; + int i; + n=_tree[_node]; + size=oc_huff_node_size(n); + nchildren=1<<n; + i=0; + do{ + int child; + child=_tree[_node+i+1]; + if(child<=0)i+=1<<n-(-child>>8); + else{ + size+=oc_huff_tree_size(_tree,child); + i++; + } + } + while(i<nchildren); + return size; +} + /*Makes a copy of the given set of Huffman trees. _dst: The array to store the copy in. _src: The array of trees to copy.*/ -int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES], - const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){ +int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES], + const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){ + int total; int i; + total=0; for(i=0;i<TH_NHUFFMAN_TABLES;i++){ - size_t size; - char *storage; - size=oc_huff_tree_size(_src[i]); - storage=(char *)_ogg_calloc(1,size); - if(storage==NULL){ + size_t size; + size=oc_huff_tree_size(_src[i],0); + total+=size; + _dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i])); + if(_dst[i]==NULL){ while(i-->0)_ogg_free(_dst[i]); return TH_EFAULT; } - _dst[i]=oc_huff_tree_copy(_src[i],&storage); + memcpy(_dst[i],_src[i],size*sizeof(*_dst[i])); } return 0; } /*Frees the memory used by a set of Huffman trees. _nodes: The array of trees to free.*/ -void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){ +void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){ int i; for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]); } + /*Unpacks a single token using the given Huffman tree. _opb: The buffer to unpack the token from. _node: The tree to unpack the token with. Return: The token value.*/ -int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){ - long bits; - while(_node->nbits!=0){ - bits=oc_pack_look(_opb,_node->nbits); - _node=_node->nodes[bits]; - oc_pack_adv(_opb,_node->depth); +int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){ + const unsigned char *ptr; + const unsigned char *stop; + oc_pb_window window; + int available; + long bits; + int node; + int n; + ptr=_opb->ptr; + window=_opb->window; + stop=_opb->stop; + available=_opb->bits; + node=0; + for(;;){ + n=_tree[node]; + if(n>available){ + unsigned shift; + shift=OC_PB_WINDOW_SIZE-available; + do{ + /*We don't bother setting eof because we won't check for it after we've + started decoding DCT tokens.*/ + if(ptr>=stop){ + shift=(unsigned)-OC_LOTS_OF_BITS; + break; + } + shift-=8; + window|=(oc_pb_window)*ptr++<<shift; + } + while(shift>=8); + /*Note: We never request more than 24 bits, so there's no need to fill in + the last partial byte here.*/ + available=OC_PB_WINDOW_SIZE-shift; + } + bits=window>>OC_PB_WINDOW_SIZE-n; + node=_tree[node+1+bits]; + if(node<=0)break; + window<<=n; + available-=n; } - return _node->token; + node=-node; + n=node>>8; + window<<=n; + available-=n; + _opb->ptr=ptr; + _opb->window=window; + _opb->bits=available; + return node&255; } diff --git a/thirdparty/libtheora/huffdec.h b/thirdparty/libtheora/huffdec.h index d7ffa0e99b..03d25dcd1e 100644 --- a/thirdparty/libtheora/huffdec.h +++ b/thirdparty/libtheora/huffdec.h @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: huffdec.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -22,71 +22,11 @@ -typedef struct oc_huff_node oc_huff_node; - 
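/*oc_huff_token_decode_c() above keeps an MSB-aligned bit window: peeking n
  bits is one shift and advancing is a shift plus a count update. A minimal
  self-contained reader in the same style, assuming a plain 32-bit window
  (the real oc_pb_window may be wider) and 1<=n<=24, matching the "never
  request more than 24 bits" note above. Names invented for illustration.*/
typedef struct{
  const unsigned char *ptr;
  const unsigned char *stop;
  ogg_uint32_t window; /*MSB-aligned lookahead bits.*/
  int bits;            /*Number of valid bits in window.*/
}oc_bit_reader_sketch;

static unsigned oc_br_peek_sketch(oc_bit_reader_sketch *_br,int _n){
  /*Top up the window a byte at a time; past the end it pads with zeros.*/
  while(_br->bits<=24&&_br->ptr<_br->stop){
    _br->window|=(ogg_uint32_t)*_br->ptr++<<24-_br->bits;
    _br->bits+=8;
  }
  return (unsigned)(_br->window>>32-_n);
}

static void oc_br_adv_sketch(oc_bit_reader_sketch *_br,int _n){
  _br->window<<=_n;
  _br->bits-=_n;
}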
-/*A node in the Huffman tree. - Instead of storing every branching in the tree, subtrees can be collapsed - into one node, with a table of size 1<<nbits pointing directly to its - descedents nbits levels down. - This allows more than one bit to be read at a time, and avoids following all - the intermediate branches with next to no increased code complexity once - the collapsed tree has been built. - We do _not_ require that a subtree be complete to be collapsed, but instead - store duplicate pointers in the table, and record the actual depth of the - node below its parent. - This tells us the number of bits to advance the stream after reaching it. - - This turns out to be equivalent to the method described in \cite{Hash95}, - without the requirement that codewords be sorted by length. - If the codewords were sorted by length (so-called ``canonical-codes''), they - could be decoded much faster via either Lindell and Moffat's approach or - Hashemian's Condensed Huffman Code approach, the latter of which has an - extremely small memory footprint. - We can't use Choueka et al.'s finite state machine approach, which is - extremely fast, because we can't allow multiple symbols to be output at a - time; the codebook can and does change between symbols. - It also has very large memory requirements, which impairs cache coherency. - - @ARTICLE{Hash95, - author="Reza Hashemian", - title="Memory Efficient and High-Speed Search {Huffman} Coding", - journal="{IEEE} Transactions on Communications", - volume=43, - number=10, - pages="2576--2581", - month=Oct, - year=1995 - }*/ -struct oc_huff_node{ - /*The number of bits of the code needed to descend through this node. - 0 indicates a leaf node. - Otherwise there are 1<<nbits nodes in the nodes table, which can be - indexed by reading nbits bits from the stream.*/ - unsigned char nbits; - /*The value of a token stored in a leaf node. - The value in non-leaf nodes is undefined.*/ - unsigned char token; - /*The depth of the current node, relative to its parent in the collapsed - tree. - This can be less than its parent's nbits value, in which case there are - 1<<nbits-depth copies of this node in the table, and the bitstream should - only be advanced depth bits after reaching this node.*/ - unsigned char depth; - /*The table of child nodes. - The ACTUAL size of this array is 1<<nbits, despite what the declaration - below claims. 
- The exception is that for leaf nodes the size is 0.*/ - oc_huff_node *nodes[2]; -}; - - - int oc_huff_trees_unpack(oc_pack_buf *_opb, - oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]); -int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES], - const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]); -void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]); -int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node); - + ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]); +int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES], + const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]); +void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]); +int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node); #endif diff --git a/thirdparty/libtheora/huffenc.c b/thirdparty/libtheora/huffenc.c index bf624e0523..77ab584a19 100644 --- a/thirdparty/libtheora/huffenc.c +++ b/thirdparty/libtheora/huffenc.c @@ -859,9 +859,10 @@ int oc_huff_codes_pack(oggpack_buffer *_opb, /*First, find the maximum code length so we can align all the bit patterns.*/ maxlen=_codes[i][0].nbits; - for(j=1;j<TH_NDCT_TOKENS;j++){ - maxlen=OC_MAXI(_codes[i][j].nbits,maxlen); - } + for(j=1;j<TH_NDCT_TOKENS;j++)maxlen=OC_MAXI(_codes[i][j].nbits,maxlen); + /*It's improbable that a code with more than 32 bits could pass the + validation below, but abort early in any case.*/ + if(maxlen>32)return TH_EINVAL; mask=(1<<(maxlen>>1)<<(maxlen+1>>1))-1; /*Copy over the codes into our temporary workspace. The bit patterns are aligned, and the original entry each code is from @@ -877,34 +878,89 @@ int oc_huff_codes_pack(oggpack_buffer *_opb, /*For each leaf of the tree:*/ bpos=maxlen; for(j=0;j<TH_NDCT_TOKENS;j++){ - int bit; - /*If this code has any bits at all.*/ - if(entries[j].shift<maxlen){ - /*Descend into the tree, writing a bit for each branch.*/ - for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1); - /*Mark this as a leaf node, and write its value.*/ - oggpackB_write(_opb,1,1); - oggpackB_write(_opb,entries[j].token,5); - /*For each 1 branch we've descended, back up the tree until we reach a - 0 branch.*/ - bit=1<<bpos; - for(;entries[j].pattern&bit;bpos++)bit<<=1; - /*Validate the code.*/ - if(j+1<TH_NDCT_TOKENS){ - mask=~(bit-1)<<1; - /*The next entry should have a 1 bit where we had a 0, and should - match our code above that bit. - This verifies both fullness and prefix-freeness simultaneously.*/ - if(!(entries[j+1].pattern&bit)|| - (entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){ - return TH_EINVAL; - } + ogg_uint32_t bit; + /*Fail if this code has no bits at all. + Technically a codebook with a single 0-bit entry is legal, but the + encoder currently does not support codebooks which do not contain all + the tokens.*/ + if(entries[j].shift>=maxlen)return TH_EINVAL; + /*Descend into the tree, writing a bit for each branch.*/ + for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1); + /*Mark this as a leaf node, and write its value.*/ + oggpackB_write(_opb,1,1); + oggpackB_write(_opb,entries[j].token,5); + /*For each 1 branch we've descended, back up the tree until we reach a + 0 branch.*/ + bit=(ogg_uint32_t)1<<bpos; + for(;entries[j].pattern&bit;bpos++)bit<<=1; + /*Validate the code.*/ + if(j+1<TH_NDCT_TOKENS){ + mask=~(bit-1)<<1; + /*The next entry should have a 1 bit where we had a 0, and should + match our code above that bit. 
+ This verifies both fullness and prefix-freeness simultaneously.*/ + if(!(entries[j+1].pattern&bit)|| + (entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){ + return TH_EINVAL; + } + } + /*If there are no more codes, we should have ascended back to the top + of the tree.*/ + else if(bpos<maxlen)return TH_EINVAL; + } + } + return 0; +} + +/*This is used to copy the configuration of an existing setup header for use by + the encoder. + The decoder uses a completely different data structure for the Huffman + codebooks.*/ +int oc_huff_codes_unpack(oc_pack_buf *_opb, + th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){ + int i; + for(i=0;i<TH_NHUFFMAN_TABLES;i++){ + ogg_uint32_t code; + int len; + int nleaves; + code=0; + len=nleaves=0; + memset(_codes[i],0,TH_NDCT_TOKENS*sizeof(*_codes[i])); + for(;;){ + long bits; + bits=oc_pack_read1(_opb); + /*Only process nodes so long as there's more bits in the buffer.*/ + if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER; + /*Read an internal node:*/ + if(!bits){ + len++; + /*Don't allow codewords longer than 32 bits.*/ + if(len>32)return TH_EBADHEADER; + } + /*Read a leaf node:*/ + else{ + ogg_uint32_t code_bit; + /*Don't allow more than 32 tokens per codebook.*/ + if(++nleaves>32)return TH_EBADHEADER; + bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS); + /*The current encoder does not support codebooks that do not contain + all of the tokens.*/ + if(_codes[i][bits].nbits>0)return TH_EINVAL; + _codes[i][bits].pattern=code>>32-len; + _codes[i][bits].nbits=len; + code_bit=0x80000000U>>len-1; + while(len>0&&(code&code_bit)){ + code^=code_bit; + code_bit<<=1; + len--; } - /*If there are no more codes, we should have ascended back to the top - of the tree.*/ - else if(bpos<maxlen)return TH_EINVAL; + if(len<=0)break; + code|=code_bit; } } + /*The current encoder does not support codebooks that do not contain all of + the tokens.*/ + if(nleaves<32)return TH_EINVAL; } return 0; } diff --git a/thirdparty/libtheora/huffenc.h b/thirdparty/libtheora/huffenc.h index c5a3956f1f..0554cc4060 100644 --- a/thirdparty/libtheora/huffenc.h +++ b/thirdparty/libtheora/huffenc.h @@ -1,6 +1,7 @@ #if !defined(_huffenc_H) # define _huffenc_H (1) # include "huffman.h" +# include "bitpack.h" @@ -15,5 +16,7 @@ extern const th_huff_code int oc_huff_codes_pack(oggpack_buffer *_opb, const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]); +int oc_huff_codes_unpack(oc_pack_buf *_opb, + th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]); #endif diff --git a/thirdparty/libtheora/huffman.h b/thirdparty/libtheora/huffman.h index 36cf7572e5..eb805866b9 100644 --- a/thirdparty/libtheora/huffman.h +++ b/thirdparty/libtheora/huffman.h @@ -11,12 +11,12 @@ ******************************************************************** function: - last mod: $Id: huffman.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #if !defined(_huffman_H) -# define _hufffman_H (1) +# define _huffman_H (1) # include "theora/codec.h" # include "ocintrin.h" diff --git a/thirdparty/libtheora/idct.c b/thirdparty/libtheora/idct.c index 0e68ac7658..838e3ad8ca 100644 --- a/thirdparty/libtheora/idct.c +++ b/thirdparty/libtheora/idct.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -231,18 +231,18 @@ static void idct8_1(ogg_int16_t *_y,const 
ogg_int16_t _x[1]){ _y: The buffer to store the result in. This may be the same as _x. _x: The input coefficients.*/ -static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - const ogg_int16_t *in; - ogg_int16_t *end; - ogg_int16_t *out; - ogg_int16_t w[64]; +static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + ogg_int16_t w[64]; + int i; /*Transform rows of x into columns of w.*/ idct8_2(w,_x); idct8_1(w+1,_x+8); /*Transform rows of w into columns of y.*/ - for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in); + for(i=0;i<8;i++)idct8_2(_y+i,w+i*8); /*Adjust for the scale factor.*/ - for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4); + for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); + /*Clear input data for next block.*/ + _x[0]=_x[1]=_x[8]=0; } /*Performs an inverse 8x8 Type-II DCT transform. @@ -260,20 +260,20 @@ static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ _y: The buffer to store the result in. This may be the same as _x. _x: The input coefficients.*/ -static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - const ogg_int16_t *in; - ogg_int16_t *end; - ogg_int16_t *out; - ogg_int16_t w[64]; +static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + ogg_int16_t w[64]; + int i; /*Transform rows of x into columns of w.*/ idct8_4(w,_x); idct8_3(w+1,_x+8); idct8_2(w+2,_x+16); idct8_1(w+3,_x+24); /*Transform rows of w into columns of y.*/ - for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in); + for(i=0;i<8;i++)idct8_4(_y+i,w+i*8); /*Adjust for the scale factor.*/ - for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4); + for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); + /*Clear input data for next block.*/ + _x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0; } /*Performs an inverse 8x8 Type-II DCT transform. @@ -282,28 +282,23 @@ static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ _y: The buffer to store the result in. This may be the same as _x. _x: The input coefficients.*/ -static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - const ogg_int16_t *in; - ogg_int16_t *end; - ogg_int16_t *out; - ogg_int16_t w[64]; +static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + ogg_int16_t w[64]; + int i; /*Transform rows of x into columns of w.*/ - for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in); + for(i=0;i<8;i++)idct8(w+i,_x+i*8); /*Transform rows of w into columns of y.*/ - for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in); + for(i=0;i<8;i++)idct8(_y+i,w+i*8); /*Adjust for the scale factor.*/ - for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4); -} - -void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64], - int _last_zzi){ - (*_state->opt_vtable.idct8x8)(_y,_last_zzi); + for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); + /*Clear input data for next block.*/ + for(i=0;i<64;i++)_x[i]=0; } /*Performs an inverse 8x8 Type-II DCT transform. The input is assumed to be scaled by a factor of 4 relative to orthonormal version of the transform.*/ -void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){ +void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ /*_last_zzi is subtly different from an actual count of the number of coefficients we decoded for this block. It contains the value of zzi BEFORE the final token in the block was @@ -329,7 +324,7 @@ void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){ gets. 
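/*The reworked oc_idct8x8_* variants above now take the coefficient block _x
  separately and clear just the entries each variant actually consumed (e.g.
  _x[0]=_x[1]=_x[8]=0 on the 3-coefficient path), leaving the caller's buffer
  all-zero for the next block without a 64-entry memset. A sketch of the idea
  in isolation; the helper name and index-list parameter are invented:*/
static void oc_sparse_clear_sketch(ogg_int16_t _x[64],
 const unsigned char *_nzi,int _n){
  int i;
  /*_nzi lists the only indices that may be nonzero.*/
  for(i=0;i<_n;i++)_x[_nzi[i]]=0;
}
/*For the _last_zzi<=3 path the list is {0,1,8}: three stores, not 64.*/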
Needless to say we inherited this approach from VP3.*/ /*Then perform the iDCT.*/ - if(_last_zzi<3)oc_idct8x8_3(_y,_y); - else if(_last_zzi<10)oc_idct8x8_10(_y,_y); - else oc_idct8x8_slow(_y,_y); + if(_last_zzi<=3)oc_idct8x8_3(_y,_x); + else if(_last_zzi<=10)oc_idct8x8_10(_y,_x); + else oc_idct8x8_slow(_y,_x); } diff --git a/thirdparty/libtheora/info.c b/thirdparty/libtheora/info.c index 6b9762978b..e5cecd2de5 100644 --- a/thirdparty/libtheora/info.c +++ b/thirdparty/libtheora/info.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: info.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -54,7 +54,7 @@ void th_comment_init(th_comment *_tc){ memset(_tc,0,sizeof(*_tc)); } -void th_comment_add(th_comment *_tc,char *_comment){ +void th_comment_add(th_comment *_tc,const char *_comment){ char **user_comments; int *comment_lengths; int comment_len; @@ -75,7 +75,7 @@ void th_comment_add(th_comment *_tc,char *_comment){ _tc->user_comments[_tc->comments]=NULL; } -void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){ +void th_comment_add_tag(th_comment *_tc,const char *_tag,const char *_val){ char *comment; int tag_len; int val_len; @@ -91,7 +91,7 @@ void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){ _ogg_free(comment); } -char *th_comment_query(th_comment *_tc,char *_tag,int _count){ +char *th_comment_query(th_comment *_tc,const char *_tag,int _count){ long i; int found; int tag_len; @@ -107,7 +107,7 @@ char *th_comment_query(th_comment *_tc,char *_tag,int _count){ return NULL; } -int th_comment_query_count(th_comment *_tc,char *_tag){ +int th_comment_query_count(th_comment *_tc,const char *_tag){ long i; int tag_len; int count; diff --git a/thirdparty/libtheora/internal.c b/thirdparty/libtheora/internal.c index 0fe4f63e72..afbb6efae7 100644 --- a/thirdparty/libtheora/internal.c +++ b/thirdparty/libtheora/internal.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: internal.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -97,79 +97,29 @@ int oc_ilog(unsigned _v){ -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with chroma decimated in the X and Y directions - (4:2:0). - _cbmvs: The chroma block-level motion vectors to fill in. - _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - int dx; - int dy; - dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0]; - dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1]; - _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2); - _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2); -} - -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with chroma decimated in the Y direction. - _cbmvs: The chroma block-level motion vectors to fill in. 
- _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - int dx; - int dy; - dx=_lbmvs[0][0]+_lbmvs[2][0]; - dy=_lbmvs[0][1]+_lbmvs[2][1]; - _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1); - _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1); - dx=_lbmvs[1][0]+_lbmvs[3][0]; - dy=_lbmvs[1][1]+_lbmvs[3][1]; - _cbmvs[1][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1); - _cbmvs[1][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1); -} - -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with chroma decimated in the X direction (4:2:2). - _cbmvs: The chroma block-level motion vectors to fill in. - _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - int dx; - int dy; - dx=_lbmvs[0][0]+_lbmvs[1][0]; - dy=_lbmvs[0][1]+_lbmvs[1][1]; - _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1); - _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1); - dx=_lbmvs[2][0]+_lbmvs[3][0]; - dy=_lbmvs[2][1]+_lbmvs[3][1]; - _cbmvs[2][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1); - _cbmvs[2][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1); +void *oc_aligned_malloc(size_t _sz,size_t _align){ + unsigned char *p; + if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL; + p=(unsigned char *)_ogg_malloc(_sz+_align); + if(p!=NULL){ + int offs; + offs=((p-(unsigned char *)0)-1&_align-1); + p[offs]=offs; + p+=offs+1; + } + return p; } -/*The function used to fill in the chroma plane motion vectors for a macro - block when 4 different motion vectors are specified in the luma plane. - This version is for use with no chroma decimation (4:4:4). - _cbmvs: The chroma block-level motion vectors to fill in. - _lmbmv: The luma macro-block level motion vector to fill in for use in - prediction. 
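/*oc_aligned_malloc() above over-allocates by _align bytes, advances the
  pointer until it is aligned, and stashes the distance back to the raw
  allocation in the byte immediately before the block it returns;
  oc_aligned_free() just below reads that byte to recover the original
  pointer. A usage sketch; the size and alignment are arbitrary examples.*/
static int oc_aligned_malloc_demo_sketch(void){
  void *p;
  p=oc_aligned_malloc(4096,16);
  if(p==NULL)return -1;
  /*p is 16-byte aligned; the offset byte lives at ((unsigned char *)p)[-1].*/
  oc_aligned_free(p);
  return 0;
}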
- _lbmvs: The luma block-level motion vectors.*/ -static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ - memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0])); +void oc_aligned_free(void *_ptr){ + unsigned char *p; + p=(unsigned char *)_ptr; + if(p!=NULL){ + int offs; + offs=*--p; + _ogg_free(p-offs); + } } -/*A table of functions used to fill in the chroma plane motion vectors for a - macro block when 4 different motion vectors are specified in the luma - plane.*/ -const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={ - (oc_set_chroma_mvs_func)oc_set_chroma_mvs00, - (oc_set_chroma_mvs_func)oc_set_chroma_mvs01, - (oc_set_chroma_mvs_func)oc_set_chroma_mvs10, - (oc_set_chroma_mvs_func)oc_set_chroma_mvs11 -}; - - void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){ size_t rowsz; @@ -181,7 +131,6 @@ void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){ datsz=rowsz*_height; /*Alloc array and row pointers.*/ ret=(char *)_ogg_malloc(datsz+colsz); - if(ret==NULL)return NULL; /*Initialize the array.*/ if(ret!=NULL){ size_t i; @@ -204,7 +153,6 @@ void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz){ datsz=rowsz*_height; /*Alloc array and row pointers.*/ ret=(char *)_ogg_calloc(datsz+colsz,1); - if(ret==NULL)return NULL; /*Initialize the array.*/ if(ret!=NULL){ size_t i; diff --git a/thirdparty/libtheora/internal.h b/thirdparty/libtheora/internal.h index d81263e13e..53c77b88be 100644 --- a/thirdparty/libtheora/internal.h +++ b/thirdparty/libtheora/internal.h @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: internal.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #if !defined(_internal_H) @@ -19,10 +19,20 @@ # include <stdlib.h> # include <limits.h> # if defined(HAVE_CONFIG_H) -# include <config.h> +# include "config.h" # endif # include "theora/codec.h" # include "theora/theora.h" +# include "ocintrin.h" + +# if !defined(__GNUC_PREREQ) +# if defined(__GNUC__)&&defined(__GNUC_MINOR__) +# define __GNUC_PREREQ(_maj,_min) \ + ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min)) +# else +# define __GNUC_PREREQ(_maj,_min) 0 +# endif +# endif # if defined(_MSC_VER) /*Disable missing EMMS warnings.*/ @@ -31,24 +41,25 @@ # pragma warning(disable:4554) # endif /*You, too, gcc.*/ -# if defined(__GNUC_PREREQ) -# if __GNUC_PREREQ(4,2) -# pragma GCC diagnostic ignored "-Wparentheses" -# endif +# if __GNUC_PREREQ(4,2) +# pragma GCC diagnostic ignored "-Wparentheses" # endif -# include "ocintrin.h" -# include "huffman.h" -# include "quant.h" - -/*Some assembly constructs require aligned operands.*/ -# if defined(OC_X86_ASM) +/*Some assembly constructs require aligned operands. + The following macros are _only_ intended for structure member declarations. + Although they will sometimes work on stack variables, gcc will often silently + ignore them. + A separate set of macros could be made for manual stack alignment, but we + don't actually require it anywhere.*/ +# if defined(OC_X86_ASM)||defined(OC_ARM_ASM) # if defined(__GNUC__) # define OC_ALIGN8(expr) expr __attribute__((aligned(8))) # define OC_ALIGN16(expr) expr __attribute__((aligned(16))) # elif defined(_MSC_VER) # define OC_ALIGN8(expr) __declspec (align(8)) expr # define OC_ALIGN16(expr) __declspec (align(16)) expr +# else +# error "Alignment macros required for this platform." 
# endif # endif # if !defined(OC_ALIGN8) @@ -60,19 +71,8 @@ -typedef struct oc_sb_flags oc_sb_flags; -typedef struct oc_border_info oc_border_info; -typedef struct oc_fragment oc_fragment; -typedef struct oc_fragment_plane oc_fragment_plane; -typedef struct oc_base_opt_vtable oc_base_opt_vtable; -typedef struct oc_base_opt_data oc_base_opt_data; -typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable; -typedef struct oc_theora_state oc_theora_state; - - - /*This library's version.*/ -# define OC_VENDOR_STRING "Xiph.Org libtheora 1.1 20090822 (Thusnelda)" +# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)" /*Theora bitstream version.*/ # define TH_VERSION_MAJOR (3) @@ -83,315 +83,6 @@ typedef struct oc_theora_state oc_theora_state; ((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \ (_info)->version_subminor>=(_sub))) -/*A keyframe.*/ -#define OC_INTRA_FRAME (0) -/*A predicted frame.*/ -#define OC_INTER_FRAME (1) -/*A frame of unknown type (frame type decision has not yet been made).*/ -#define OC_UNKWN_FRAME (-1) - -/*The amount of padding to add to the reconstructed frame buffers on all - sides. - This is used to allow unrestricted motion vectors without special casing. - This must be a multiple of 2.*/ -#define OC_UMV_PADDING (16) - -/*Frame classification indices.*/ -/*The previous golden frame.*/ -#define OC_FRAME_GOLD (0) -/*The previous frame.*/ -#define OC_FRAME_PREV (1) -/*The current frame.*/ -#define OC_FRAME_SELF (2) - -/*The input or output buffer.*/ -#define OC_FRAME_IO (3) - -/*Macroblock modes.*/ -/*Macro block is invalid: It is never coded.*/ -#define OC_MODE_INVALID (-1) -/*Encoded difference from the same macro block in the previous frame.*/ -#define OC_MODE_INTER_NOMV (0) -/*Encoded with no motion compensated prediction.*/ -#define OC_MODE_INTRA (1) -/*Encoded difference from the previous frame offset by the given motion - vector.*/ -#define OC_MODE_INTER_MV (2) -/*Encoded difference from the previous frame offset by the last coded motion - vector.*/ -#define OC_MODE_INTER_MV_LAST (3) -/*Encoded difference from the previous frame offset by the second to last - coded motion vector.*/ -#define OC_MODE_INTER_MV_LAST2 (4) -/*Encoded difference from the same macro block in the previous golden - frame.*/ -#define OC_MODE_GOLDEN_NOMV (5) -/*Encoded difference from the previous golden frame offset by the given motion - vector.*/ -#define OC_MODE_GOLDEN_MV (6) -/*Encoded difference from the previous frame offset by the individual motion - vectors given for each block.*/ -#define OC_MODE_INTER_MV_FOUR (7) -/*The number of (coded) modes.*/ -#define OC_NMODES (8) - -/*Determines the reference frame used for a given MB mode.*/ -#define OC_FRAME_FOR_MODE(_x) \ - OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \ - OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x)) - -/*Constants for the packet state machine common between encoder and decoder.*/ - -/*Next packet to emit/read: Codec info header.*/ -#define OC_PACKET_INFO_HDR (-3) -/*Next packet to emit/read: Comment header.*/ -#define OC_PACKET_COMMENT_HDR (-2) -/*Next packet to emit/read: Codec setup header.*/ -#define OC_PACKET_SETUP_HDR (-1) -/*No more packets to emit/read.*/ -#define OC_PACKET_DONE (INT_MAX) - - - -/*Super blocks are 32x32 segments of pixels in a single color plane indexed - in image order. 
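/*The removed OC_FRAME_FOR_MODE macro above packs an eight-entry, 4-bit-wide
  lookup table into one 32-bit constant and selects a nibble at use time.
  OC_UNIBBLE_TABLE32 itself is defined in ocintrin.h; the equivalent form
  sketched here is an assumption for illustration only.*/
#define OC_NIBBLE_TABLE32_SKETCH(_v0,_v1,_v2,_v3,_v4,_v5,_v6,_v7,_i) \
 (((_v0)|(_v1)<<4|(_v2)<<8|(_v3)<<12|(_v4)<<16|(_v5)<<20|(_v6)<<24| \
  (ogg_uint32_t)(_v7)<<28)>>((_i)<<2)&0xF)
/*E.g., the OC_FRAME_FOR_MODE table above is (1,2,1,1,1,0,0,1); selecting
  entry 3 (OC_MODE_INTER_MV_LAST) yields OC_FRAME_PREV==1.*/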
- Internally, super blocks are broken up into four quadrants, each of which - contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels. - Quadrants, and the blocks within them, are indexed in a special order called - a "Hilbert curve" within the super block. - - In order to differentiate between the Hilbert-curve indexing strategy and - the regular image order indexing strategy, blocks indexed in image order - are called "fragments". - Fragments are indexed in image order, left to right, then bottom to top, - from Y' plane to Cb plane to Cr plane. - - The co-located fragments in all image planes corresponding to the location - of a single quadrant of a luma plane super block form a macro block. - Thus there is only a single set of macro blocks for all planes, each of which - contains between 6 and 12 fragments, depending on the pixel format. - Therefore macro block information is kept in a separate set of arrays from - super blocks to avoid unused space in the other planes. - The lists are indexed in super block order. - That is, the macro block corresponding to the macro block mbi in (luma plane) - super block sbi is at index (sbi<<2|mbi). - Thus the number of macro blocks in each dimension is always twice the number - of super blocks, even when only an odd number fall inside the coded frame. - These "extra" macro blocks are just an artifact of our internal data layout, - and not part of the coded stream; they are flagged with a negative MB mode.*/ - - - -/*A single quadrant of the map from a super block to fragment numbers.*/ -typedef ptrdiff_t oc_sb_map_quad[4]; -/*A map from a super block to fragment numbers.*/ -typedef oc_sb_map_quad oc_sb_map[4]; -/*A single plane of the map from a macro block to fragment numbers.*/ -typedef ptrdiff_t oc_mb_map_plane[4]; -/*A map from a macro block to fragment numbers.*/ -typedef oc_mb_map_plane oc_mb_map[3]; -/*A motion vector.*/ -typedef signed char oc_mv[2]; - - - -/*Super block information.*/ -struct oc_sb_flags{ - unsigned char coded_fully:1; - unsigned char coded_partially:1; - unsigned char quad_valid:4; -}; - - - -/*Information about a fragment which intersects the border of the displayable - region. - This marks which pixels belong to the displayable region.*/ -struct oc_border_info{ - /*A bit mask marking which pixels are in the displayable region. - Pixel (x,y) corresponds to bit (y<<3|x).*/ - ogg_int64_t mask; - /*The number of pixels in the displayable region. - This is always positive, and always less than 64.*/ - int npixels; -}; - - - -/*Fragment information.*/ -struct oc_fragment{ - /*A flag indicating whether or not this fragment is coded.*/ - unsigned coded:1; - /*A flag indicating that this entire fragment lies outside the displayable - region of the frame. - Note the contrast with an invalid macro block, which is outside the coded - frame, not just the displayable one. - There are no fragments outside the coded frame by construction.*/ - unsigned invalid:1; - /*The index of the quality index used for this fragment's AC coefficients.*/ - unsigned qii:6; - /*The mode of the macroblock this fragment belongs to.*/ - unsigned mb_mode:3; - /*The index of the associated border information for fragments which lie - partially outside the displayable region. - For fragments completely inside or outside this region, this is -1. 
- Note that the C standard requires an explicit signed keyword for bitfield - types, since some compilers may treat them as unsigned without it.*/ - signed int borderi:5; - /*The prediction-corrected DC component. - Note that the C standard requires an explicit signed keyword for bitfield - types, since some compilers may treat them as unsigned without it.*/ - signed int dc:16; -}; - - - -/*A description of each fragment plane.*/ -struct oc_fragment_plane{ - /*The number of fragments in the horizontal direction.*/ - int nhfrags; - /*The number of fragments in the vertical direction.*/ - int nvfrags; - /*The offset of the first fragment in the plane.*/ - ptrdiff_t froffset; - /*The total number of fragments in the plane.*/ - ptrdiff_t nfrags; - /*The number of super blocks in the horizontal direction.*/ - unsigned nhsbs; - /*The number of super blocks in the vertical direction.*/ - unsigned nvsbs; - /*The offset of the first super block in the plane.*/ - unsigned sboffset; - /*The total number of super blocks in the plane.*/ - unsigned nsbs; -}; - - - -/*The shared (encoder and decoder) functions that have accelerated variants.*/ -struct oc_base_opt_vtable{ - void (*frag_copy)(unsigned char *_dst, - const unsigned char *_src,int _ystride); - void (*frag_recon_intra)(unsigned char *_dst,int _ystride, - const ogg_int16_t _residue[64]); - void (*frag_recon_inter)(unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]); - void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]); - void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi); - void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant); - void (*state_frag_copy_list)(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli); - void (*state_loop_filter_frag_rows)(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); - void (*restore_fpu)(void); -}; - -/*The shared (encoder and decoder) tables that vary according to which variants - of the above functions are used.*/ -struct oc_base_opt_data{ - const unsigned char *dct_fzig_zag; -}; - - -/*State information common to both the encoder and decoder.*/ -struct oc_theora_state{ - /*The stream information.*/ - th_info info; - /*Table for shared accelerated functions.*/ - oc_base_opt_vtable opt_vtable; - /*Table for shared data used by accelerated functions.*/ - oc_base_opt_data opt_data; - /*CPU flags to detect the presence of extended instruction sets.*/ - ogg_uint32_t cpu_flags; - /*The fragment plane descriptions.*/ - oc_fragment_plane fplanes[3]; - /*The list of fragments, indexed in image order.*/ - oc_fragment *frags; - /*The the offset into the reference frame buffer to the upper-left pixel of - each fragment.*/ - ptrdiff_t *frag_buf_offs; - /*The motion vector for each fragment.*/ - oc_mv *frag_mvs; - /*The total number of fragments in a single frame.*/ - ptrdiff_t nfrags; - /*The list of super block maps, indexed in image order.*/ - oc_sb_map *sb_maps; - /*The list of super block flags, indexed in image order.*/ - oc_sb_flags *sb_flags; - /*The total number of super blocks in a single frame.*/ - unsigned nsbs; - /*The fragments from each color plane that belong to each macro block. - Fragments are stored in image order (left to right then top to bottom). 
- When chroma components are decimated, the extra fragments have an index of - -1.*/ - oc_mb_map *mb_maps; - /*The list of macro block modes. - A negative number indicates the macro block lies entirely outside the - coded frame.*/ - signed char *mb_modes; - /*The number of macro blocks in the X direction.*/ - unsigned nhmbs; - /*The number of macro blocks in the Y direction.*/ - unsigned nvmbs; - /*The total number of macro blocks.*/ - size_t nmbs; - /*The list of coded fragments, in coded order. - Uncoded fragments are stored in reverse order from the end of the list.*/ - ptrdiff_t *coded_fragis; - /*The number of coded fragments in each plane.*/ - ptrdiff_t ncoded_fragis[3]; - /*The total number of coded fragments.*/ - ptrdiff_t ntotal_coded_fragis; - /*The index of the buffers being used for each OC_FRAME_* reference frame.*/ - int ref_frame_idx[4]; - /*The actual buffers used for the previously decoded frames.*/ - th_ycbcr_buffer ref_frame_bufs[4]; - /*The storage for the reference frame buffers.*/ - unsigned char *ref_frame_data[4]; - /*The strides for each plane in the reference frames.*/ - int ref_ystride[3]; - /*The number of unique border patterns.*/ - int nborders; - /*The unique border patterns for all border fragments. - The borderi field of fragments which straddle the border indexes this - list.*/ - oc_border_info borders[16]; - /*The frame number of the last keyframe.*/ - ogg_int64_t keyframe_num; - /*The frame number of the current frame.*/ - ogg_int64_t curframe_num; - /*The granpos of the current frame.*/ - ogg_int64_t granpos; - /*The type of the current frame.*/ - unsigned char frame_type; - /*The bias to add to the frame count when computing granule positions.*/ - unsigned char granpos_bias; - /*The number of quality indices used in the current frame.*/ - unsigned char nqis; - /*The quality indices of the current frame.*/ - unsigned char qis[3]; - /*The dequantization tables, stored in zig-zag order, and indexed by - qi, pli, qti, and zzi.*/ - ogg_uint16_t *dequant_tables[64][3][2]; - OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]); - /*Loop filter strength parameters.*/ - unsigned char loop_filter_limits[64]; -}; - - - -/*The function type used to fill in the chroma plane motion vectors for a - macro block when 4 different motion vectors are specified in the luma - plane. - _cbmvs: The chroma block-level motion vectors to fill in. - _lmbmv: The luma macro-block level motion vector to fill in for use in - prediction. 
- _lbmvs: The luma block-level motion vectors.*/ -typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]); - /*A map from the index in the zig zag scan to the coefficient number in a @@ -409,14 +100,12 @@ extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12]; /*The number of indices in the oc_mb_map array that can be valid for each of the various chroma decimation types.*/ extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS]; -/*A table of functions used to fill in the Cb,Cr plane motion vectors for a - macro block when 4 different motion vectors are specified in the luma - plane.*/ -extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]; int oc_ilog(unsigned _v); +void *oc_aligned_malloc(size_t _sz,size_t _align); +void oc_aligned_free(void *_ptr); void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz); void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz); void oc_free_2d(void *_ptr); @@ -424,86 +113,4 @@ void oc_free_2d(void *_ptr); void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst, const th_ycbcr_buffer _src); -int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs); -void oc_state_clear(oc_theora_state *_state); -void oc_state_vtable_init_c(oc_theora_state *_state); -void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli, - int _y0,int _yend); -void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli); -void oc_state_borders_fill(oc_theora_state *_state,int _refi); -void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx, - th_ycbcr_buffer _img); -int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby); -int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], - int _pli,int _dx,int _dy); - -int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv); -void oc_state_loop_filter(oc_theora_state *_state,int _frame); -#if defined(OC_DUMP_IMAGES) -int oc_state_dump_frame(const oc_theora_state *_state,int _frame, - const char *_suf); -#endif - -/*Shared accelerated functions.*/ -void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst, - const unsigned char *_src,int _ystride); -void oc_frag_recon_intra(const oc_theora_state *_state, - unsigned char *_dst,int _dst_ystride,const ogg_int16_t _residue[64]); -void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]); -void oc_frag_recon_inter2(const oc_theora_state *_state, - unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2, - int _ystride,const ogg_int16_t _residue[64]); -void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],int _last_zzi); -void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_state_frag_copy_list(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli); -void oc_state_loop_filter_frag_rows(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); -void oc_restore_fpu(const oc_theora_state *_state); - -/*Default pure-C implementations.*/ -void oc_frag_copy_c(unsigned char *_dst, - const unsigned char *_src,int _src_ystride); -void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride, - const ogg_int16_t _residue[64]); -void oc_frag_recon_inter_c(unsigned char *_dst, - const unsigned char *_src,int _ystride,const 
ogg_int16_t _residue[64]); -void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]); -void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi); -void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_state_frag_copy_list_c(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli); -void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); -void oc_restore_fpu_c(void); - -/*We need a way to call a few encoder functions without introducing a link-time - dependency into the decoder, while still allowing the old alpha API which - does not distinguish between encoder and decoder objects to be used. - We do this by placing a function table at the start of the encoder object - which can dispatch into the encoder library. - We do a similar thing for the decoder in case we ever decide to split off a - common base library.*/ -typedef void (*oc_state_clear_func)(theora_state *_th); -typedef int (*oc_state_control_func)(theora_state *th,int _req, - void *_buf,size_t _buf_sz); -typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th, - ogg_int64_t _granulepos); -typedef double (*oc_state_granule_time_func)(theora_state *_th, - ogg_int64_t _granulepos); - - -struct oc_state_dispatch_vtable{ - oc_state_clear_func clear; - oc_state_control_func control; - oc_state_granule_frame_func granule_frame; - oc_state_granule_time_func granule_time; -}; - #endif diff --git a/thirdparty/libtheora/mathops.c b/thirdparty/libtheora/mathops.c index d3fb909194..23c8f6e1ba 100644 --- a/thirdparty/libtheora/mathops.c +++ b/thirdparty/libtheora/mathops.c @@ -1,10 +1,8 @@ +#include "internal.h" #include "mathops.h" -#include <limits.h> /*The fastest fallback strategy for platforms with fast multiplication appears to be based on de Bruijn sequences~\cite{LP98}. - Tests confirmed this to be true even on an ARM11, where it is actually faster - than using the native clz instruction. Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where multiplication or table lookups are too expensive. 
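/*For reference, a self-contained copy of the de Bruijn fallback discussed
  above (table and steps as in the surrounding code): the OR cascade smears
  ones below the highest set bit, (_v>>1)+1 isolates that bit as a power of
  two, and the de Bruijn multiply maps each power of two to a distinct 5-bit
  index. oc_ilog32_sketch(0)==0, (1)==1, (0x80000000U)==32.*/
static const unsigned char OC_DEBRUIJN_IDX32_SKETCH[32]={
   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
};

static int oc_ilog32_sketch(ogg_uint32_t _v){
  int ret;
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  ret=_v&1;
  _v=(_v>>1)+1;
  ret+=OC_DEBRUIJN_IDX32_SKETCH[_v*0x77CB531U>>27&0x1F];
  return ret;
}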
@@ -15,8 +13,7 @@ year=1998, note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}" }*/ -#if !defined(OC_ILOG_NODEBRUIJN)&& \ - !defined(OC_CLZ32)||!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL +#if !defined(OC_ILOG_NODEBRUIJN)&&!defined(OC_CLZ32) static const unsigned char OC_DEBRUIJN_IDX32[32]={ 0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8, 31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9 @@ -25,7 +22,7 @@ static const unsigned char OC_DEBRUIJN_IDX32[32]={ int oc_ilog32(ogg_uint32_t _v){ #if defined(OC_CLZ32) - return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v; + return OC_CLZ32_OFFS-OC_CLZ32(_v)&-!!_v; #else /*On a Pentium M, this branchless version tested as the fastest version without multiplications on 1,000,000,000 random 32-bit integers, edging out a @@ -51,12 +48,12 @@ int oc_ilog32(ogg_uint32_t _v){ /*This de Bruijn sequence version is faster if you have a fast multiplier.*/ # else int ret; - ret=_v>0; _v|=_v>>1; _v|=_v>>2; _v|=_v>>4; _v|=_v>>8; _v|=_v>>16; + ret=_v&1; _v=(_v>>1)+1; ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F]; return ret; @@ -66,16 +63,21 @@ int oc_ilog32(ogg_uint32_t _v){ int oc_ilog64(ogg_int64_t _v){ #if defined(OC_CLZ64) - return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v; + return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v; #else -# if defined(OC_ILOG_NODEBRUIJN) +/*If we don't have a fast 64-bit word implementation, split it into two 32-bit + halves.*/ +# if defined(OC_ILOG_NODEBRUIJN)|| \ + defined(OC_CLZ32)||LONG_MAX<9223372036854775807LL ogg_uint32_t v; int ret; int m; - ret=_v>0; m=(_v>0xFFFFFFFFU)<<5; v=(ogg_uint32_t)(_v>>m); - ret|=m; +# if defined(OC_CLZ32) + ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v; +# elif defined(OC_ILOG_NODEBRUIJN) + ret=v>0|m; m=(v>0xFFFFU)<<4; v>>=m; ret|=m; @@ -90,26 +92,19 @@ int oc_ilog64(ogg_int64_t _v){ ret|=m; ret+=v>1; return ret; -# else -/*If we don't have a 64-bit word, split it into two 32-bit halves.*/ -# if LONG_MAX<9223372036854775807LL - ogg_uint32_t v; - int ret; - int m; - ret=_v>0; - m=(_v>0xFFFFFFFFU)<<5; - v=(ogg_uint32_t)(_v>>m); - ret|=m; +# else v|=v>>1; v|=v>>2; v|=v>>4; v|=v>>8; v|=v>>16; + ret=v&1|m; v=(v>>1)+1; ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F]; +# endif return ret; -/*Otherwise do it in one 64-bit operation.*/ -# else +/*Otherwise do it in one 64-bit multiply.*/ +# else static const unsigned char OC_DEBRUIJN_IDX64[64]={ 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40, 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57, @@ -117,17 +112,16 @@ int oc_ilog64(ogg_int64_t _v){ 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58 }; int ret; - ret=_v>0; _v|=_v>>1; _v|=_v>>2; _v|=_v>>4; _v|=_v>>8; _v|=_v>>16; _v|=_v>>32; + ret=(int)_v&1; _v=(_v>>1)+1; ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F]; return ret; -# endif # endif #endif } @@ -294,3 +288,27 @@ ogg_int64_t oc_blog64(ogg_int64_t _w){ } return OC_Q57(ipart)+z; } + +/*Polynomial approximation of a binary exponential. + Q10 input, Q0 output.*/ +ogg_uint32_t oc_bexp32_q10(int _z){ + unsigned n; + int ipart; + ipart=_z>>10; + n=(_z&(1<<10)-1)<<4; + n=(n*((n*((n*((n*3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384; + return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14; +} + +/*Polynomial approximation of a binary logarithm. 
+ Q0 input, Q10 output.*/ +int oc_blog32_q10(ogg_uint32_t _w){ + int n; + int ipart; + int fpart; + if(_w<=0)return -1; + ipart=OC_ILOGNZ_32(_w); + n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384; + fpart=(n*((n*((n*((n*-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793; + return (ipart<<10)+(fpart>>4); +} diff --git a/thirdparty/libtheora/mathops.h b/thirdparty/libtheora/mathops.h index efbc5377b0..a1a4f9df0e 100644 --- a/thirdparty/libtheora/mathops.h +++ b/thirdparty/libtheora/mathops.h @@ -2,29 +2,27 @@ # define _mathops_H (1) # include <ogg/ogg.h> -# ifdef __GNUC_PREREQ -# if __GNUC_PREREQ(3,4) -# include <limits.h> +# if __GNUC_PREREQ(3,4) +# include <limits.h> /*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from "upgrading" the type of an entire expression to an (unsigned) size_t.*/ -# if INT_MAX>=2147483647 -# define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT) -# define OC_CLZ32(_x) (__builtin_clz(_x)) -# elif LONG_MAX>=2147483647L -# define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT) -# define OC_CLZ32(_x) (__builtin_clzl(_x)) -# endif -# if INT_MAX>=9223372036854775807LL -# define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT) -# define OC_CLZ64(_x) (__builtin_clz(_x)) -# elif LONG_MAX>=9223372036854775807LL -# define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT) -# define OC_CLZ64(_x) (__builtin_clzl(_x)) -# elif LLONG_MAX>=9223372036854775807LL|| \ - __LONG_LONG_MAX__>=9223372036854775807LL -# define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT) -# define OC_CLZ64(_x) (__builtin_clzll(_x)) -# endif +# if INT_MAX>=2147483647 +# define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT) +# define OC_CLZ32(_x) (__builtin_clz(_x)) +# elif LONG_MAX>=2147483647L +# define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT) +# define OC_CLZ32(_x) (__builtin_clzl(_x)) +# endif +# if INT_MAX>=9223372036854775807LL +# define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT) +# define OC_CLZ64(_x) (__builtin_clz(_x)) +# elif LONG_MAX>=9223372036854775807LL +# define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT) +# define OC_CLZ64(_x) (__builtin_clzl(_x)) +# elif LLONG_MAX>=9223372036854775807LL|| \ + __LONG_LONG_MAX__>=9223372036854775807LL +# define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT) +# define OC_CLZ64(_x) (__builtin_clzll(_x)) # endif # endif @@ -134,8 +132,12 @@ int oc_ilog64(ogg_int64_t _v); # define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v))) #define OC_Q57(_v) ((ogg_int64_t)(_v)<<57) +#define OC_Q10(_v) ((_v)<<10) ogg_int64_t oc_bexp64(ogg_int64_t _z); ogg_int64_t oc_blog64(ogg_int64_t _w); +ogg_uint32_t oc_bexp32_q10(int _z); +int oc_blog32_q10(ogg_uint32_t _w); + #endif diff --git a/thirdparty/libtheora/mcenc.c b/thirdparty/libtheora/mcenc.c index 797e81f4f9..82eb824a80 100644 --- a/thirdparty/libtheora/mcenc.c +++ b/thirdparty/libtheora/mcenc.c @@ -88,9 +88,11 @@ static const int OC_SQUARE_SITES[11][8]={ }; -static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, - int _accum[2],int _mbi,int _frame){ +static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, + oc_mv _accum,int _mbi,int _frame){ oc_mb_enc_info *embs; + int accum_x; + int accum_y; int a[3][2]; int ncandidates; unsigned nmbi; @@ -102,20 +104,24 @@ static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, /*Fill in the first part of set A: the vectors from adjacent blocks.*/ for(i=0;i<embs[_mbi].ncneighbors;i++){ nmbi=embs[_mbi].cneighbors[i]; - 
_mcenc->candidates[ncandidates][0]=embs[nmbi].analysis_mv[0][_frame][0]; - _mcenc->candidates[ncandidates][1]=embs[nmbi].analysis_mv[0][_frame][1]; + _mcenc->candidates[ncandidates][0]= + OC_MV_X(embs[nmbi].analysis_mv[0][_frame]); + _mcenc->candidates[ncandidates][1]= + OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]); ncandidates++; } } + accum_x=OC_MV_X(_accum); + accum_y=OC_MV_Y(_accum); /*Add a few additional vectors to set A: the vectors used in the previous frames and the (0,0) vector.*/ - _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,_accum[0],31); - _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,_accum[1],31); + _mcenc->candidates[ncandidates][0]=accum_x; + _mcenc->candidates[ncandidates][1]=accum_y; ncandidates++; _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, - embs[_mbi].analysis_mv[1][_frame][0]+_accum[0],31); + OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31); _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, - embs[_mbi].analysis_mv[1][_frame][1]+_accum[1],31); + OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31); ncandidates++; _mcenc->candidates[ncandidates][0]=0; _mcenc->candidates[ncandidates][1]=0; @@ -131,30 +137,33 @@ static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, OC_SORT2I(a[0][1],a[1][1]); _mcenc->candidates[0][0]=a[1][0]; _mcenc->candidates[0][1]=a[1][1]; - /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/ _mcenc->setb0=ncandidates; - /*The first time through the loop use the current macro block.*/ - nmbi=_mbi; - for(i=0;;i++){ - _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, - 2*embs[_mbi].analysis_mv[1][_frame][0] - -embs[_mbi].analysis_mv[2][_frame][0]+_accum[0],31); - _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, - 2*embs[_mbi].analysis_mv[1][_frame][1] - -embs[_mbi].analysis_mv[2][_frame][1]+_accum[1],31); - ncandidates++; - if(i>=embs[_mbi].npneighbors)break; - nmbi=embs[_mbi].pneighbors[i]; - } - /*Truncate to full-pel positions.*/ - for(i=0;i<ncandidates;i++){ - _mcenc->candidates[i][0]=OC_DIV2(_mcenc->candidates[i][0]); - _mcenc->candidates[i][1]=OC_DIV2(_mcenc->candidates[i][1]); - } +} + +static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, + oc_mv _accum,int _mbi,int _frame){ + oc_mb_enc_info *embs; + int accum_x; + int accum_y; + int ncandidates; + embs=_enc->mb_info; + accum_x=OC_MV_X(_accum); + accum_y=OC_MV_Y(_accum); + /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/ + ncandidates=_mcenc->setb0; + /*Use only the current block. 
Using more did not appear to be helpful + with the current selection logic due to escaping the local search too + quickly.*/ + _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, + 2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame]) + -OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31); + _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, + 2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame]) + -OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31); + ncandidates++; _mcenc->ncandidates=ncandidates; } -#if 0 static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc, const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], int _mvoffset0,int _mvoffset1,const unsigned char *_src, @@ -170,20 +179,21 @@ static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc, } return err; } -#endif static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc, const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], int _mvoffset0,int _mvoffset1,const unsigned char *_src, const unsigned char *_ref,int _ystride,unsigned _best_err){ unsigned err; + int dc; int bi; err=0; for(bi=0;bi<4;bi++){ ptrdiff_t frag_offs; frag_offs=_frag_buf_offs[_fragis[bi]]; - err+=oc_enc_frag_satd2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0, - _ref+frag_offs+_mvoffset1,_ystride,_best_err-err); + err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs, + _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride); + err+=abs(dc); } return err; } @@ -219,9 +229,17 @@ static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, err=0; for(bi=0;bi<4;bi++){ ptrdiff_t frag_offs; + int dc; frag_offs=_frag_buf_offs[_fragis[bi]]; - err+=oc_enc_frag_satd_thresh(_enc, - _src+frag_offs,_ref+frag_offs+mvoffset,_ystride,UINT_MAX); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + err+=oc_enc_frag_satd(_enc,&dc, + _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); + err+=abs(dc); + } + else{ + err+=oc_enc_frag_sad(_enc, + _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); + } } return err; } @@ -229,8 +247,11 @@ static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc, ptrdiff_t _frag_offs,int _dx,int _dy, const unsigned char *_src,const unsigned char *_ref,int _ystride){ - return oc_enc_frag_satd_thresh(_enc, - _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride,UINT_MAX); + unsigned err; + int dc; + err=oc_enc_frag_satd(_enc,&dc, + _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride); + return err+abs(dc); } /*Perform a motion vector search for this macro block against a single @@ -239,11 +260,14 @@ static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc, the work can be shared. The actual motion vector is stored in the appropriate place in the oc_mb_enc_info structure. - _mcenc: The motion compensation context. - _accum: Drop frame/golden MV accumulators. - _mbi: The macro block index. - _frame: The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.*/ -void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){ + _accum: Drop frame/golden MV accumulators. + _mbi: The macro block index. + _frame: The frame to use for SATD calculations and refinement, + either OC_FRAME_PREV or OC_FRAME_GOLD. 
+ _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV, + OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/ +void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame, + int _frame_full){ /*Note: Traditionally this search is done using a rate-distortion objective function of the form D+lambda*R. However, xiphmont tested this and found it produced a small degredation, @@ -264,6 +288,7 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){ const ptrdiff_t *fragis; const unsigned char *src; const unsigned char *ref; + const unsigned char *satd_ref; int ystride; oc_mb_enc_info *embs; ogg_int32_t hit_cache[31]; @@ -278,17 +303,18 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){ int bi; embs=_enc->mb_info; /*Find some candidate motion vectors.*/ - oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame); + oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame); /*Clear the cache of locations we've examined.*/ memset(hit_cache,0,sizeof(hit_cache)); /*Start with the median predictor.*/ - candx=mcenc.candidates[0][0]; - candy=mcenc.candidates[0][1]; + candx=OC_DIV2(mcenc.candidates[0][0]); + candy=OC_DIV2(mcenc.candidates[0][1]); hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15; frag_buf_offs=_enc->state.frag_buf_offs; fragis=_enc->state.mb_maps[_mbi][0]; src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]]; + ref=_enc->state.ref_frame_data[_frame_full]; + satd_ref=_enc->state.ref_frame_data[_frame]; ystride=_enc->state.ref_ystride[0]; /*TODO: customize error function for speed/(quality+size) tradeoff.*/ best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, @@ -317,8 +343,8 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){ t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET; /*Examine the candidates in set A.*/ for(ci=1;ci<mcenc.setb0;ci++){ - candx=mcenc.candidates[ci][0]; - candy=mcenc.candidates[ci][1]; + candx=OC_DIV2(mcenc.candidates[ci][0]); + candy=OC_DIV2(mcenc.candidates[ci][1]); /*If we've already examined this vector, then we would be using it if it was better than what we are using.*/ hitbit=(ogg_int32_t)1<<candx+15; @@ -340,10 +366,11 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){ } } if(best_err>t2){ + oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame); /*Examine the candidates in set B.*/ for(;ci<mcenc.ncandidates;ci++){ - candx=mcenc.candidates[ci][0]; - candy=mcenc.candidates[ci][1]; + candx=OC_DIV2(mcenc.candidates[ci][0]); + candy=OC_DIV2(mcenc.candidates[ci][1]); hitbit=(ogg_int32_t)1<<candx+15; if(hit_cache[candy+15]&hitbit)continue; hit_cache[candy+15]|=hitbit; @@ -475,58 +502,50 @@ void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){ candx=best_vec[0]; candy=best_vec[1]; embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc, - frag_buf_offs,fragis,candx,candy,src,ref,ystride); - embs[_mbi].analysis_mv[0][_frame][0]=(signed char)(candx<<1); - embs[_mbi].analysis_mv[0][_frame][1]=(signed char)(candy<<1); - if(_frame==OC_FRAME_PREV){ + frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride); + embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1); + if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ for(bi=0;bi<4;bi++){ candx=best_block_vec[bi][0]; candy=best_block_vec[bi][1]; embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc, - 
frag_buf_offs[fragis[bi]],candx,candy,src,ref,ystride); - embs[_mbi].block_mv[bi][0]=(signed char)(candx<<1); - embs[_mbi].block_mv[bi][1]=(signed char)(candy<<1); + frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride); + embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1); } } } void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){ - oc_mv2 *mvs; - int accum_p[2]; - int accum_g[2]; + oc_mv2 *mvs; + oc_mv accum_p; + oc_mv accum_g; + oc_mv mv2_p; mvs=_enc->mb_info[_mbi].analysis_mv; - if(_enc->prevframe_dropped){ - accum_p[0]=mvs[0][OC_FRAME_PREV][0]; - accum_p[1]=mvs[0][OC_FRAME_PREV][1]; - } - else accum_p[1]=accum_p[0]=0; - accum_g[0]=mvs[2][OC_FRAME_GOLD][0]; - accum_g[1]=mvs[2][OC_FRAME_GOLD][1]; - mvs[0][OC_FRAME_PREV][0]-=mvs[2][OC_FRAME_PREV][0]; - mvs[0][OC_FRAME_PREV][1]-=mvs[2][OC_FRAME_PREV][1]; + if(_enc->prevframe_dropped)accum_p=mvs[0][OC_FRAME_PREV]; + else accum_p=0; + accum_g=mvs[2][OC_FRAME_GOLD]; /*Move the motion vector predictors back a frame.*/ - memmove(mvs+1,mvs,2*sizeof(*mvs)); + mv2_p=mvs[2][OC_FRAME_PREV]; + mvs[2][OC_FRAME_GOLD]=mvs[1][OC_FRAME_GOLD]; + mvs[2][OC_FRAME_PREV]=mvs[1][OC_FRAME_PREV]; + mvs[1][OC_FRAME_GOLD]=mvs[0][OC_FRAME_GOLD]; + mvs[1][OC_FRAME_PREV]=OC_MV_SUB(mvs[0][OC_FRAME_PREV],mv2_p); /*Search the last frame.*/ - oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV); - mvs[2][OC_FRAME_PREV][0]=accum_p[0]; - mvs[2][OC_FRAME_PREV][1]=accum_p[1]; + oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV,OC_FRAME_PREV_ORIG); + mvs[2][OC_FRAME_PREV]=accum_p; /*GOLDEN MVs are different from PREV MVs in that they're each absolute offsets from some frame in the past rather than relative offsets from the frame before. For predictor calculation to make sense, we need them to be in the same form as PREV MVs.*/ - mvs[1][OC_FRAME_GOLD][0]-=mvs[2][OC_FRAME_GOLD][0]; - mvs[1][OC_FRAME_GOLD][1]-=mvs[2][OC_FRAME_GOLD][1]; - mvs[2][OC_FRAME_GOLD][0]-=accum_g[0]; - mvs[2][OC_FRAME_GOLD][1]-=accum_g[1]; + mvs[1][OC_FRAME_GOLD]=OC_MV_SUB(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]); + mvs[2][OC_FRAME_GOLD]=OC_MV_SUB(mvs[2][OC_FRAME_GOLD],accum_g); /*Search the golden frame.*/ - oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD); + oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG); /*Put GOLDEN MVs back into absolute offset form. 
The newest MV is already an absolute offset.*/ - mvs[2][OC_FRAME_GOLD][0]+=accum_g[0]; - mvs[2][OC_FRAME_GOLD][1]+=accum_g[1]; - mvs[1][OC_FRAME_GOLD][0]+=mvs[2][OC_FRAME_GOLD][0]; - mvs[1][OC_FRAME_GOLD][1]+=mvs[2][OC_FRAME_GOLD][1]; + mvs[2][OC_FRAME_GOLD]=OC_MV_ADD(mvs[2][OC_FRAME_GOLD],accum_g); + mvs[1][OC_FRAME_GOLD]=OC_MV_ADD(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]); } #if 0 @@ -543,7 +562,7 @@ static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi, int sitei; int err; src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_framei]]; + ref=_enc->state.ref_frame_data[_framei]; frag_buf_offs=_enc->state.frag_buf_offs; fragis=_enc->state.mb_maps[_mbi][0]; ystride=_enc->state.ref_ystride[0]; @@ -598,7 +617,7 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc, int sitei; int err; src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]]; + ref=_enc->state.ref_frame_data[_frame]; frag_buf_offs=_enc->state.frag_buf_offs; fragis=_enc->state.mb_maps[_mbi][0]; ystride=_enc->state.ref_ystride[0]; @@ -627,8 +646,14 @@ static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc, ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask); mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask); - err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis, - mvoffset0,mvoffset1,src,ref,ystride,_best_err); + if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ + err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis, + mvoffset0,mvoffset1,src,ref,ystride,_best_err); + } + else{ + err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis, + mvoffset0,mvoffset1,src,ref,ystride,_best_err); + } if(err<_best_err){ _best_err=err; best_site=site; @@ -643,12 +668,11 @@ void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){ oc_mb_enc_info *embs; int vec[2]; embs=_enc->mb_info; - vec[0]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][0]); - vec[1]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][1]); + vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].analysis_mv[0][_frame])); + vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].analysis_mv[0][_frame])); embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc, _mbi,vec,embs[_mbi].satd[_frame],_frame); - embs[_mbi].analysis_mv[0][_frame][0]=(signed char)vec[0]; - embs[_mbi].analysis_mv[0][_frame][1]=(signed char)vec[1]; + embs[_mbi].analysis_mv[0][_frame]=OC_MV(vec[0],vec[1]); } #if 0 @@ -704,6 +728,7 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc, best_site=4; for(sitei=0;sitei<8;sitei++){ unsigned err; + int dc; int site; int xmask; int ymask; @@ -723,8 +748,9 @@ static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc, ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask); mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask); - err=oc_enc_frag_satd2_thresh(_enc,_src, - _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err); + err=oc_enc_frag_satd2(_enc,&dc,_src, + _ref+mvoffset0,_ref+mvoffset1,_ystride); + err+=abs(dc); if(err<_best_err){ _best_err=err; best_site=site; @@ -748,7 +774,7 @@ void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){ frag_buf_offs=_enc->state.frag_buf_offs; fragis=_enc->state.mb_maps[_mbi][0]; src=_enc->state.ref_frame_data[OC_FRAME_IO]; - ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]]; + ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; offset_y[0]=offset_y[1]=offset_y[2]=-ystride; 
offset_y[3]=offset_y[5]=0; offset_y[6]=offset_y[7]=offset_y[8]=ystride; @@ -757,11 +783,10 @@ void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){ ptrdiff_t frag_offs; int vec[2]; frag_offs=frag_buf_offs[fragis[bi]]; - vec[0]=OC_DIV2(embs[_mbi].block_mv[bi][0]); - vec[1]=OC_DIV2(embs[_mbi].block_mv[bi][1]); + vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].block_mv[bi])); + vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].block_mv[bi])); embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec, src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]); - embs[_mbi].ref_mv[bi][0]=(signed char)vec[0]; - embs[_mbi].ref_mv[bi][1]=(signed char)vec[1]; + embs[_mbi].ref_mv[bi]=OC_MV(vec[0],vec[1]); } } diff --git a/thirdparty/libtheora/modedec.h b/thirdparty/libtheora/modedec.h index ea12c64afd..efe640e263 100644 --- a/thirdparty/libtheora/modedec.h +++ b/thirdparty/libtheora/modedec.h @@ -1,614 +1,91 @@ /*File generated by libtheora with OC_COLLECT_METRICS defined at compile time.*/ #if !defined(_modedec_H) # define _modedec_H (1) +# include "encint.h" -# if defined(OC_COLLECT_METRICS) -typedef struct oc_mode_metrics oc_mode_metrics; +/*The log of the average quantizer for each of the OC_MODE_RD table rows + (e.g., for the represented qi's, and each pli and qti), in Q10 format. + The actual statistics used by the encoder will be interpolated from + that table based on log_plq for the actual quantization matrix used.*/ +# if !defined(OC_COLLECT_METRICS) +static const # endif -typedef struct oc_mode_rd oc_mode_rd; - - - -/*The number of extra bits of precision at which to store rate metrics.*/ -# define OC_BIT_SCALE (6) -/*The number of extra bits of precision at which to store RMSE metrics. - This must be at least half OC_BIT_SCALE (rounded up).*/ -# define OC_RMSE_SCALE (5) -/*The number of bins to partition statistics into.*/ -# define OC_SAD_BINS (24) -/*The number of bits of precision to drop from SAD scores to assign them to a - bin.*/ -# define OC_SAD_SHIFT (9) - - - -# if defined(OC_COLLECT_METRICS) -struct oc_mode_metrics{ - double fragw; - double satd; - double rate; - double rmse; - double satd2; - double satdrate; - double rate2; - double satdrmse; - double rmse2; +ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={ + { {0x1F05,0x2101},{0x206E,0x2101},{0x206E,0x2101} }, + { {0x1C9A,0x1EAC},{0x1E0E,0x1EAC},{0x1E0E,0x1EAC} }, + { {0x1A31,0x1C48},{0x1B6F,0x1C48},{0x1B6F,0x1C48} }, + { {0x17B0,0x19E7},{0x1938,0x19E7},{0x1938,0x19E7} }, + { {0x152F,0x178F},{0x16AB,0x178F},{0x16AB,0x178F} }, + { {0x12F1,0x1534},{0x145D,0x1534},{0x145D,0x1534} }, + { {0x0FF3,0x1321},{0x11BE,0x1321},{0x11BE,0x1321} }, + { {0x0E1F,0x1073},{0x0E93,0x1073},{0x0E93,0x1073} } }; - -int oc_has_mode_metrics; -oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS]; -# endif - - - -struct oc_mode_rd{ - ogg_int16_t rate; - ogg_int16_t rmse; -}; - - # if !defined(OC_COLLECT_METRICS) static const # endif -oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ +oc_mode_rd OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={ { { /*Y' qi=0 INTRA*/ { - { 87, -66},{ 132, 1611},{ 197, 3474},{ 285, 5130}, - { 376, 6419},{ 450, 7545},{ 521, 8587},{ 600, 9587}, - { 689,10498},{ 790,11348},{ 899,12158},{ 1030,12855}, - { 1166,13459},{ 1276,14052},{ 1353,14732},{ 1444,15425}, - { 1535,16101},{ 1609,16856},{ 1697,17532},{ 1823,17995}, - { 1962,18426},{ 2085,18919},{ 2201,19503},{ 2304,20307} + { 57, 1550},{ 121, 2460},{ 185, 3901},{ 336, 5189}, + { 406, 6243},{ 501, 7329},{ 565, 8292},{ 674, 9257}, + { 746,10219},{ 843,11056},{ 961,11822},{ 1120,12512}, 
+ { 1208,13233},{ 1394,13600},{ 1409,14381},{ 1492,15129}, + { 1593,15804},{ 1639,16573},{ 1731,17161},{ 1844,17707}, + { 1949,18300},{ 2073,18654},{ 2140,19465},{ 2278,19794} }, /*Y' qi=0 INTER*/ { - { 32, -105},{ 40, 1268},{ 54, 2919},{ 91, 4559}, - { 118, 6244},{ 132, 7932},{ 142, 9514},{ 149,10989}, - { 155,12375},{ 161,13679},{ 168,14958},{ 176,16215}, - { 187,17431},{ 196,18623},{ 207,19790},{ 218,20941}, - { 230,22083},{ 246,23213},{ 265,24333},{ 292,25439}, - { 328,26512},{ 372,27538},{ 427,28522},{ 494,29479} + { -18, 1274},{ 23, 2505},{ 32, 3612},{ 57, 5153}, + { 79, 6636},{ 97, 8082},{ 109, 9505},{ 122,10924}, + { 134,12293},{ 145,13634},{ 158,14942},{ 172,16212}, + { 186,17422},{ 198,18604},{ 209,19757},{ 218,20875}, + { 235,21980},{ 253,23056},{ 276,24121},{ 305,25184}, + { 342,26202},{ 393,27140},{ 439,28140},{ 556,28659} } }, { /*Cb qi=0 INTRA*/ { - { 1, 6},{ 27, 368},{ 52, 738},{ 67, 1171}, - { 80, 1642},{ 99, 2134},{ 110, 2642},{ 112, 3144}, - { 126, 3578},{ 154, 3967},{ 167, 4387},{ 172, 4839}, - { 191, 5278},{ 208, 5666},{ 220, 6036},{ 223, 6398}, - { 227, 6814},{ 253, 7157},{ 284, 7403},{ 292, 7699}, - { 314, 7983},{ 339, 8203},{ 363, 8460},{ 399, 8919} + { 32, 1763},{ 56, 2150},{ 78, 2336},{ 88, 2608}, + { 105, 2975},{ 121, 3297},{ 113, 3460},{ 126, 3993}, + { 142, 4432},{ 177, 4733},{ 185, 5058},{ 194, 5447}, + { 220, 5812},{ 227, 6202},{ 246, 6415},{ 269, 6821}, + { 279, 7026},{ 313, 7313},{ 321, 7708},{ 316, 8021}, + { 370, 8203},{ 389, 8573},{ 410, 8607},{ 431, 8816} }, /*Cb qi=0 INTER*/ { - { 68, -55},{ 63, 275},{ 58, 602},{ 53, 936}, - { 50, 1290},{ 54, 1691},{ 58, 2116},{ 62, 2553}, - { 67, 2992},{ 72, 3422},{ 78, 3843},{ 84, 4253}, - { 89, 4658},{ 94, 5062},{ 98, 5455},{ 100, 5848}, - { 102, 6231},{ 104, 6604},{ 104, 6982},{ 105, 7359}, - { 105, 7733},{ 104, 8104},{ 105, 8465},{ 111, 8828} + { 3, 282},{ 3, 1200},{ 3, 1605},{ 6, 2190}, + { 15, 2519},{ 18, 2798},{ 21, 3115},{ 25, 3460}, + { 33, 3839},{ 40, 4217},{ 47, 4592},{ 51, 4958}, + { 56, 5326},{ 59, 5710},{ 63, 6066},{ 65, 6412}, + { 67, 6762},{ 68, 7104},{ 70, 7461},{ 72, 7829}, + { 77, 8200},{ 80, 8566},{ 86, 8906},{ 90, 9203} } }, { /*Cr qi=0 INTRA*/ { - { 1, 8},{ 23, 375},{ 47, 759},{ 63, 1220}, - { 71, 1693},{ 82, 2171},{ 94, 2652},{ 109, 3103}, - { 125, 3567},{ 133, 3995},{ 151, 4375},{ 168, 4819}, - { 174, 5244},{ 190, 5635},{ 215, 6005},{ 242, 6347}, - { 257, 6758},{ 280, 7068},{ 311, 7336},{ 326, 7652}, - { 346, 7968},{ 372, 8213},{ 388, 8515},{ 408, 9060} + { 27, 1720},{ 44, 1920},{ 66, 2255},{ 73, 2429}, + { 95, 2988},{ 103, 3279},{ 123, 3691},{ 129, 4012}, + { 151, 4415},{ 150, 4760},{ 183, 5008},{ 193, 5351}, + { 211, 5788},{ 235, 6134},{ 263, 6400},{ 276, 6711}, + { 291, 7100},{ 346, 7285},{ 329, 7616},{ 387, 7827}, + { 361, 8214},{ 430, 8534},{ 429, 8608},{ 450, 8823} }, /*Cr qi=0 INTER*/ { - { 69, 0},{ 60, 314},{ 49, 624},{ 45, 943}, - { 45, 1285},{ 49, 1691},{ 55, 2130},{ 62, 2560}, - { 71, 2973},{ 79, 3385},{ 85, 3800},{ 89, 4207}, - { 92, 4620},{ 95, 5037},{ 96, 5436},{ 97, 5839}, - { 98, 6252},{ 99, 6653},{ 99, 7038},{ 103, 7426}, - { 107, 7810},{ 108, 8178},{ 107, 8539},{ 106, 8937} - } - } - }, - { - { - /*Y' qi=1 INTRA*/ - { - { 81, -71},{ 133, 1610},{ 203, 3460},{ 296, 5083}, - { 392, 6342},{ 467, 7454},{ 541, 8486},{ 625, 9466}, - { 716,10352},{ 823,11181},{ 940,11961},{ 1074,12643}, - { 1211,13233},{ 1324,13807},{ 1408,14489},{ 1504,15167}, - { 1598,15824},{ 1679,16544},{ 1788,17161},{ 1928,17579}, - { 2070,17991},{ 2202,18456},{ 2324,19021},{ 2425,19894} - }, - /*Y' qi=1 INTER*/ 
- { - { 34, 4},{ 40, 1307},{ 55, 2914},{ 93, 4555}, - { 120, 6243},{ 134, 7912},{ 144, 9468},{ 152,10918}, - { 158,12275},{ 164,13569},{ 171,14846},{ 180,16098}, - { 191,17310},{ 204,18484},{ 216,19636},{ 228,20779}, - { 242,21912},{ 261,23036},{ 286,24146},{ 320,25221}, - { 363,26265},{ 418,27261},{ 485,28203},{ 551,29148} - } - }, - { - /*Cb qi=1 INTRA*/ - { - { 1, 6},{ 28, 367},{ 52, 738},{ 68, 1172}, - { 86, 1644},{ 106, 2135},{ 115, 2642},{ 119, 3141}, - { 132, 3569},{ 157, 3951},{ 172, 4366},{ 177, 4819}, - { 194, 5258},{ 211, 5638},{ 224, 6006},{ 233, 6367}, - { 236, 6784},{ 258, 7121},{ 299, 7357},{ 319, 7637}, - { 337, 7921},{ 358, 8141},{ 381, 8367},{ 401, 8768} - }, - /*Cb qi=1 INTER*/ - { - { 95, -31},{ 81, 295},{ 67, 614},{ 53, 953}, - { 48, 1305},{ 51, 1700},{ 56, 2125},{ 61, 2563}, - { 67, 3008},{ 73, 3435},{ 79, 3844},{ 85, 4251}, - { 90, 4663},{ 95, 5073},{ 98, 5458},{ 100, 5844}, - { 101, 6231},{ 102, 6606},{ 102, 6980},{ 103, 7347}, - { 104, 7726},{ 105, 8096},{ 105, 8453},{ 105, 8789} - } - }, - { - /*Cr qi=1 INTRA*/ - { - { 1, 8},{ 25, 375},{ 50, 759},{ 65, 1221}, - { 74, 1695},{ 86, 2172},{ 101, 2651},{ 117, 3101}, - { 129, 3561},{ 135, 3985},{ 153, 4368},{ 171, 4807}, - { 182, 5223},{ 202, 5608},{ 225, 5964},{ 251, 6300}, - { 271, 6697},{ 295, 6978},{ 324, 7235},{ 348, 7558}, - { 367, 7877},{ 394, 8101},{ 413, 8386},{ 409, 8945} - }, - /*Cr qi=1 INTER*/ - { - { 66, 11},{ 59, 323},{ 51, 631},{ 44, 949}, - { 44, 1292},{ 49, 1703},{ 56, 2140},{ 62, 2566}, - { 69, 2991},{ 77, 3397},{ 84, 3799},{ 89, 4211}, - { 93, 4634},{ 94, 5049},{ 95, 5444},{ 96, 5854}, - { 94, 6260},{ 95, 6640},{ 96, 7032},{ 101, 7423}, - { 104, 7790},{ 105, 8158},{ 109, 8527},{ 108, 8872} - } - } - }, - { - { - /*Y' qi=2 INTRA*/ - { - { 87, -72},{ 139, 1607},{ 213, 3426},{ 315, 4992}, - { 416, 6217},{ 495, 7315},{ 574, 8317},{ 666, 9265}, - { 763,10124},{ 875,10906},{ 1001,11654},{ 1147,12305}, - { 1289,12865},{ 1407,13424},{ 1503,14076},{ 1610,14724}, - { 1720,15342},{ 1815,16020},{ 1937,16579},{ 2084,16981}, - { 2236,17371},{ 2385,17779},{ 2536,18250},{ 2689,18931} - }, - /*Y' qi=2 INTER*/ - { - { 30, -2},{ 40, 1308},{ 57, 2921},{ 96, 4567}, - { 122, 6260},{ 136, 7902},{ 148, 9418},{ 156,10826}, - { 162,12157},{ 169,13448},{ 177,14709},{ 188,15938}, - { 200,17133},{ 213,18295},{ 228,19433},{ 245,20564}, - { 264,21685},{ 289,22790},{ 323,23876},{ 368,24916}, - { 427,25906},{ 499,26837},{ 585,27700},{ 680,28514} - } - }, - { - /*Cb qi=2 INTRA*/ - { - { 1, 6},{ 30, 367},{ 58, 738},{ 77, 1172}, - { 93, 1645},{ 111, 2137},{ 123, 2642},{ 126, 3133}, - { 136, 3553},{ 162, 3934},{ 178, 4352},{ 183, 4803}, - { 199, 5231},{ 220, 5596},{ 235, 5957},{ 245, 6314}, - { 256, 6718},{ 286, 7048},{ 320, 7285},{ 336, 7568}, - { 366, 7829},{ 387, 8045},{ 405, 8261},{ 445, 8550} - }, - /*Cb qi=2 INTER*/ - { - { 115, -61},{ 93, 277},{ 71, 609},{ 54, 963}, - { 49, 1329},{ 53, 1715},{ 58, 2138},{ 63, 2583}, - { 69, 3017},{ 75, 3442},{ 81, 3857},{ 88, 4263}, - { 93, 4667},{ 96, 5065},{ 101, 5451},{ 101, 5832}, - { 102, 6213},{ 103, 6593},{ 103, 6968},{ 104, 7336}, - { 104, 7710},{ 105, 8076},{ 106, 8440},{ 106, 8822} - } - }, - { - /*Cr qi=2 INTRA*/ - { - { 1, 8},{ 27, 375},{ 54, 759},{ 70, 1222}, - { 79, 1696},{ 89, 2173},{ 106, 2652},{ 123, 3098}, - { 135, 3553},{ 143, 3972},{ 161, 4348},{ 181, 4782}, - { 194, 5189},{ 213, 5565},{ 235, 5907},{ 266, 6229}, - { 286, 6618},{ 311, 6897},{ 339, 7152},{ 362, 7454}, - { 392, 7721},{ 416, 7946},{ 429, 8227},{ 458, 8540} - }, - /*Cr qi=2 INTER*/ - { - { 74, 20},{ 63, 330},{ 51, 
635},{ 44, 942}, - { 47, 1287},{ 54, 1710},{ 59, 2147},{ 65, 2571}, - { 72, 2996},{ 79, 3413},{ 86, 3820},{ 91, 4230}, - { 93, 4642},{ 95, 5046},{ 95, 5442},{ 95, 5839}, - { 96, 6243},{ 97, 6641},{ 99, 7021},{ 101, 7396}, - { 103, 7764},{ 106, 8138},{ 109, 8507},{ 114, 8851} - } - } - }, - { - { - /*Y' qi=3 INTRA*/ - { - { 91, -67},{ 141, 1606},{ 219, 3405},{ 328, 4929}, - { 433, 6122},{ 515, 7209},{ 598, 8204},{ 693, 9145}, - { 796, 9986},{ 912,10756},{ 1045,11471},{ 1200,12079}, - { 1345,12640},{ 1471,13179},{ 1571,13809},{ 1678,14450}, - { 1798,15047},{ 1905,15701},{ 2043,16205},{ 2202,16569}, - { 2351,16971},{ 2501,17393},{ 2660,17851},{ 2825,18455} - }, - /*Y' qi=3 INTER*/ - { - { 53, -164},{ 38, 1314},{ 59, 2917},{ 99, 4563}, - { 124, 6253},{ 139, 7882},{ 150, 9375},{ 159,10749}, - { 166,12059},{ 173,13349},{ 183,14608},{ 194,15826}, - { 208,17003},{ 223,18150},{ 240,19287},{ 259,20411}, - { 284,21508},{ 317,22593},{ 359,23656},{ 414,24671}, - { 483,25634},{ 569,26519},{ 670,27332},{ 786,28072} - } - }, - { - /*Cb qi=3 INTRA*/ - { - { 1, 5},{ 31, 367},{ 58, 739},{ 78, 1173}, - { 96, 1645},{ 113, 2134},{ 125, 2638},{ 133, 3127}, - { 148, 3542},{ 171, 3915},{ 184, 4328},{ 192, 4776}, - { 209, 5197},{ 230, 5556},{ 245, 5909},{ 252, 6261}, - { 272, 6641},{ 304, 6942},{ 330, 7184},{ 342, 7477}, - { 380, 7736},{ 404, 7962},{ 428, 8151},{ 469, 8430} - }, - /*Cb qi=3 INTER*/ - { - { 86, -29},{ 72, 296},{ 58, 618},{ 46, 964}, - { 47, 1338},{ 51, 1743},{ 56, 2158},{ 63, 2594}, - { 69, 3035},{ 77, 3455},{ 84, 3859},{ 89, 4266}, - { 94, 4673},{ 98, 5074},{ 101, 5460},{ 101, 5842}, - { 101, 6217},{ 101, 6593},{ 102, 6964},{ 104, 7325}, - { 103, 7696},{ 103, 8056},{ 104, 8430},{ 103, 8792} - } - }, - { - /*Cr qi=3 INTRA*/ - { - { 1, 8},{ 27, 374},{ 56, 759},{ 74, 1221}, - { 83, 1696},{ 96, 2173},{ 113, 2650},{ 127, 3091}, - { 140, 3542},{ 151, 3960},{ 164, 4334},{ 188, 4764}, - { 208, 5144},{ 224, 5493},{ 250, 5841},{ 278, 6162}, - { 298, 6548},{ 334, 6816},{ 365, 7045},{ 388, 7343}, - { 419, 7613},{ 443, 7836},{ 455, 8105},{ 484, 8445} - }, - /*Cr qi=3 INTER*/ - { - { 76, 26},{ 65, 332},{ 53, 638},{ 45, 945}, - { 45, 1304},{ 53, 1725},{ 60, 2153},{ 68, 2584}, - { 74, 3007},{ 81, 3425},{ 87, 3844},{ 91, 4253}, - { 94, 4657},{ 95, 5061},{ 94, 5462},{ 94, 5856}, - { 95, 6250},{ 96, 6635},{ 97, 7014},{ 101, 7393}, - { 104, 7761},{ 106, 8137},{ 109, 8506},{ 111, 8823} - } - } - }, - { - { - /*Y' qi=4 INTRA*/ - { - { 80, -67},{ 143, 1603},{ 227, 3378},{ 344, 4861}, - { 454, 6026},{ 537, 7104},{ 626, 8089},{ 725, 9006}, - { 830, 9827},{ 950,10581},{ 1089,11270},{ 1257,11826}, - { 1409,12366},{ 1535,12912},{ 1640,13528},{ 1753,14173}, - { 1884,14756},{ 2007,15368},{ 2148,15852},{ 2307,16212}, - { 2464,16591},{ 2614,17019},{ 2785,17455},{ 2970,17963} - }, - /*Y' qi=4 INTER*/ - { - { 50, -145},{ 38, 1324},{ 61, 2921},{ 102, 4566}, - { 127, 6248},{ 142, 7845},{ 154, 9300},{ 163,10656}, - { 169,11965},{ 177,13246},{ 188,14495},{ 202,15702}, - { 218,16864},{ 236,18003},{ 256,19124},{ 278,20233}, - { 307,21330},{ 347,22398},{ 398,23437},{ 463,24429}, - { 546,25343},{ 649,26170},{ 767,26935},{ 888,27674} - } - }, - { - /*Cb qi=4 INTRA*/ - { - { 1, 5},{ 33, 367},{ 61, 739},{ 80, 1173}, - { 98, 1646},{ 114, 2136},{ 126, 2639},{ 137, 3124}, - { 152, 3535},{ 176, 3903},{ 194, 4307},{ 206, 4753}, - { 222, 5165},{ 242, 5508},{ 260, 5857},{ 272, 6205}, - { 294, 6559},{ 332, 6848},{ 356, 7104},{ 364, 7389}, - { 396, 7637},{ 415, 7878},{ 446, 8064},{ 506, 8294} - }, - /*Cb qi=4 INTER*/ - { - { 86, -15},{ 73, 308},{ 60, 
627},{ 46, 967}, - { 47, 1343},{ 51, 1754},{ 56, 2183},{ 63, 2615}, - { 70, 3044},{ 79, 3459},{ 85, 3866},{ 90, 4276}, - { 94, 4686},{ 97, 5088},{ 100, 5467},{ 102, 5837}, - { 102, 6205},{ 101, 6569},{ 103, 6939},{ 104, 7317}, - { 105, 7690},{ 107, 8043},{ 107, 8394},{ 111, 8736} - } - }, - { - /*Cr qi=4 INTRA*/ - { - { 1, 7},{ 28, 375},{ 57, 759},{ 79, 1221}, - { 92, 1697},{ 105, 2174},{ 122, 2648},{ 135, 3085}, - { 146, 3530},{ 157, 3947},{ 171, 4316},{ 195, 4737}, - { 218, 5117},{ 239, 5445},{ 268, 5767},{ 295, 6074}, - { 315, 6460},{ 355, 6735},{ 392, 6933},{ 418, 7218}, - { 448, 7495},{ 471, 7688},{ 481, 7954},{ 504, 8313} - }, - /*Cr qi=4 INTER*/ - { - { 68, 28},{ 57, 334},{ 47, 639},{ 43, 953}, - { 48, 1314},{ 54, 1736},{ 59, 2169},{ 69, 2592}, - { 78, 3017},{ 84, 3434},{ 88, 3850},{ 92, 4260}, - { 95, 4663},{ 96, 5068},{ 95, 5455},{ 95, 5839}, - { 96, 6243},{ 97, 6626},{ 98, 7006},{ 101, 7390}, - { 104, 7755},{ 108, 8115},{ 111, 8471},{ 110, 8825} - } - } - }, - { - { - /*Y' qi=5 INTRA*/ - { - { 84, -69},{ 147, 1599},{ 237, 3350},{ 360, 4796}, - { 475, 5934},{ 562, 6992},{ 657, 7953},{ 765, 8837}, - { 874, 9641},{ 998,10384},{ 1146,11047},{ 1322,11572}, - { 1484,12076},{ 1617,12609},{ 1731,13203},{ 1856,13806}, - { 1995,14367},{ 2132,14936},{ 2289,15386},{ 2460,15721}, - { 2635,16066},{ 2802,16442},{ 2980,16805},{ 3177,17272} - }, - /*Y' qi=5 INTER*/ - { - { 38, -86},{ 37, 1349},{ 64, 2920},{ 105, 4563}, - { 129, 6236},{ 145, 7809},{ 158, 9236},{ 167,10572}, - { 174,11871},{ 182,13141},{ 195,14368},{ 212,15558}, - { 230,16706},{ 250,17828},{ 274,18944},{ 303,20041}, - { 342,21116},{ 394,22152},{ 460,23144},{ 543,24073}, - { 648,24919},{ 773,25673},{ 922,26323},{ 1084,26924} - } - }, - { - /*Cb qi=5 INTRA*/ - { - { 1, 5},{ 34, 367},{ 63, 739},{ 82, 1174}, - { 102, 1647},{ 119, 2137},{ 134, 2639},{ 145, 3121}, - { 161, 3529},{ 189, 3891},{ 207, 4290},{ 216, 4721}, - { 232, 5113},{ 258, 5455},{ 277, 5798},{ 294, 6124}, - { 322, 6427},{ 352, 6697},{ 370, 6982},{ 384, 7283}, - { 423, 7529},{ 448, 7766},{ 478, 7943},{ 527, 8151} - }, - /*Cb qi=5 INTER*/ - { - { 83, -49},{ 69, 284},{ 55, 611},{ 48, 961}, - { 49, 1355},{ 52, 1769},{ 58, 2191},{ 65, 2616}, - { 73, 3041},{ 80, 3460},{ 87, 3868},{ 92, 4276}, - { 95, 4682},{ 98, 5077},{ 100, 5459},{ 102, 5827}, - { 102, 6200},{ 102, 6568},{ 103, 6930},{ 103, 7303}, - { 104, 7672},{ 106, 8032},{ 106, 8391},{ 106, 8727} - } - }, - { - /*Cr qi=5 INTRA*/ - { - { 1, 8},{ 28, 375},{ 57, 760},{ 81, 1222}, - { 99, 1696},{ 111, 2175},{ 125, 2648},{ 140, 3079}, - { 152, 3520},{ 162, 3927},{ 179, 4294},{ 203, 4714}, - { 225, 5080},{ 254, 5389},{ 286, 5703},{ 318, 5997}, - { 342, 6364},{ 380, 6640},{ 416, 6837},{ 445, 7103}, - { 473, 7370},{ 497, 7562},{ 514, 7811},{ 549, 8148} - }, - /*Cr qi=5 INTER*/ - { - { 60, 6},{ 54, 323},{ 46, 638},{ 43, 958}, - { 45, 1329},{ 54, 1749},{ 61, 2175},{ 70, 2600}, - { 79, 3021},{ 85, 3437},{ 89, 3847},{ 93, 4254}, - { 95, 4660},{ 96, 5065},{ 95, 5456},{ 95, 5849}, - { 96, 6243},{ 96, 6621},{ 97, 6996},{ 101, 7366}, - { 104, 7722},{ 107, 8088},{ 111, 8448},{ 119, 8816} - } - } - }, - { - { - /*Y' qi=6 INTRA*/ - { - { 88, -69},{ 151, 1593},{ 251, 3294},{ 387, 4681}, - { 507, 5790},{ 601, 6837},{ 702, 7787},{ 813, 8648}, - { 927, 9427},{ 1059,10152},{ 1213,10787},{ 1399,11284}, - { 1568,11781},{ 1705,12312},{ 1823,12890},{ 1957,13482}, - { 2106,14036},{ 2249,14600},{ 2411,15042},{ 2588,15359}, - { 2772,15699},{ 2947,16062},{ 3127,16429},{ 3320,16849} - }, - /*Y' qi=6 INTER*/ - { - { 44, -80},{ 36, 1346},{ 69, 2919},{ 111, 
4563}, - { 136, 6216},{ 154, 7746},{ 168, 9139},{ 178,10461}, - { 185,11747},{ 195,13007},{ 211,14229},{ 230,15408}, - { 250,16547},{ 274,17663},{ 302,18769},{ 339,19851}, - { 386,20907},{ 446,21933},{ 527,22884},{ 631,23746}, - { 760,24512},{ 914,25178},{ 1087,25758},{ 1278,26262} - } - }, - { - /*Cb qi=6 INTRA*/ - { - { 1, 4},{ 36, 367},{ 66, 739},{ 84, 1174}, - { 105, 1648},{ 126, 2139},{ 140, 2639},{ 149, 3116}, - { 164, 3523},{ 194, 3880},{ 217, 4271},{ 226, 4694}, - { 243, 5077},{ 270, 5407},{ 291, 5742},{ 310, 6061}, - { 340, 6340},{ 373, 6609},{ 394, 6890},{ 409, 7189}, - { 444, 7434},{ 469, 7652},{ 499, 7853},{ 559, 8135} - }, - /*Cb qi=6 INTER*/ - { - { 68, -46},{ 60, 291},{ 50, 623},{ 49, 971}, - { 50, 1357},{ 55, 1781},{ 61, 2211},{ 69, 2634}, - { 78, 3052},{ 86, 3466},{ 91, 3882},{ 95, 4292}, - { 98, 4691},{ 101, 5080},{ 102, 5458},{ 103, 5830}, - { 103, 6192},{ 104, 6554},{ 104, 6916},{ 106, 7278}, - { 108, 7641},{ 110, 8004},{ 112, 8371},{ 112, 8758} - } - }, - { - /*Cr qi=6 INTRA*/ - { - { 1, 8},{ 29, 375},{ 59, 760},{ 84, 1223}, - { 99, 1698},{ 112, 2176},{ 129, 2647},{ 143, 3076}, - { 156, 3510},{ 168, 3906},{ 189, 4269},{ 220, 4682}, - { 241, 5047},{ 266, 5342},{ 299, 5649},{ 331, 5954}, - { 357, 6309},{ 393, 6579},{ 431, 6765},{ 467, 6997}, - { 501, 7276},{ 520, 7488},{ 525, 7749},{ 548, 8146} - }, - /*Cr qi=6 INTER*/ - { - { 94, 31},{ 69, 335},{ 47, 641},{ 43, 967}, - { 50, 1350},{ 57, 1772},{ 65, 2197},{ 74, 2625}, - { 83, 3043},{ 90, 3454},{ 94, 3867},{ 97, 4273}, - { 98, 4671},{ 99, 5068},{ 99, 5461},{ 98, 5857}, - { 98, 6245},{ 99, 6610},{ 103, 6975},{ 105, 7345}, - { 108, 7712},{ 111, 8073},{ 113, 8415},{ 119, 8768} - } - } - }, - { - { - /*Y' qi=7 INTRA*/ - { - { 92, -70},{ 156, 1590},{ 261, 3267},{ 403, 4618}, - { 529, 5704},{ 628, 6730},{ 736, 7657},{ 856, 8491}, - { 978, 9246},{ 1118, 9943},{ 1281,10550},{ 1472,11028}, - { 1645,11507},{ 1793,12008},{ 1924,12565},{ 2067,13130}, - { 2229,13638},{ 2388,14160},{ 2558,14584},{ 2744,14886}, - { 2932,15194},{ 3116,15531},{ 3311,15858},{ 3538,16197} - }, - /*Y' qi=7 INTER*/ - { - { 43, -8},{ 36, 1351},{ 71, 2923},{ 112, 4568}, - { 138, 6201},{ 157, 7705},{ 171, 9083},{ 181,10390}, - { 189,11664},{ 202,12910},{ 220,14121},{ 241,15281}, - { 266,16401},{ 295,17507},{ 328,18608},{ 371,19677}, - { 430,20701},{ 508,21676},{ 604,22588},{ 727,23397}, - { 878,24093},{ 1055,24690},{ 1263,25151},{ 1496,25504} - } - }, - { - /*Cb qi=7 INTRA*/ - { - { 1, 5},{ 40, 367},{ 72, 740},{ 89, 1175}, - { 108, 1649},{ 129, 2140},{ 143, 2637},{ 154, 3110}, - { 169, 3507},{ 198, 3860},{ 224, 4237},{ 235, 4652}, - { 253, 5037},{ 282, 5358},{ 307, 5674},{ 329, 5986}, - { 361, 6273},{ 393, 6527},{ 419, 6777},{ 435, 7078}, - { 467, 7342},{ 495, 7554},{ 529, 7757},{ 591, 8053} - }, - /*Cb qi=7 INTER*/ - { - { 79, -33},{ 68, 299},{ 56, 627},{ 50, 978}, - { 51, 1366},{ 55, 1786},{ 61, 2213},{ 70, 2642}, - { 80, 3062},{ 87, 3474},{ 92, 3886},{ 96, 4292}, - { 99, 4684},{ 102, 5072},{ 103, 5450},{ 104, 5814}, - { 104, 6176},{ 104, 6538},{ 107, 6905},{ 110, 7270}, - { 110, 7625},{ 110, 7978},{ 111, 8340},{ 117, 8674} - } - }, - { - /*Cr qi=7 INTRA*/ - { - { 2, 7},{ 31, 375},{ 62, 760},{ 87, 1223}, - { 103, 1698},{ 115, 2175},{ 131, 2644},{ 147, 3066}, - { 161, 3494},{ 175, 3889},{ 199, 4250},{ 229, 4653}, - { 250, 5001},{ 279, 5275},{ 311, 5577},{ 343, 5889}, - { 376, 6227},{ 417, 6486},{ 457, 6689},{ 484, 6925}, - { 518, 7174},{ 544, 7393},{ 549, 7662},{ 577, 8050} - }, - /*Cr qi=7 INTER*/ - { - { 89, 22},{ 62, 332},{ 45, 641},{ 47, 976}, - { 52, 
1363},{ 59, 1779},{ 67, 2203},{ 76, 2628}, - { 84, 3046},{ 90, 3460},{ 94, 3875},{ 98, 4272}, - { 99, 4666},{ 98, 5063},{ 98, 5459},{ 98, 5849}, - { 99, 6226},{ 101, 6594},{ 104, 6957},{ 109, 7324}, - { 109, 7686},{ 111, 8042},{ 115, 8379},{ 119, 8699} - } - } - }, - { - { - /*Y' qi=8 INTRA*/ - { - { 91, -69},{ 160, 1585},{ 274, 3226},{ 423, 4538}, - { 557, 5596},{ 664, 6595},{ 778, 7506},{ 905, 8319}, - { 1038, 9035},{ 1186, 9701},{ 1355,10292},{ 1554,10754}, - { 1739,11196},{ 1904,11639},{ 2047,12184},{ 2194,12763}, - { 2361,13256},{ 2529,13753},{ 2709,14155},{ 2902,14433}, - { 3100,14723},{ 3292,15026},{ 3489,15327},{ 3714,15705} - }, - /*Y' qi=8 INTER*/ - { - { 32, -157},{ 33, 1346},{ 74, 2914},{ 116, 4554}, - { 142, 6172},{ 162, 7648},{ 177, 9004},{ 186,10300}, - { 196,11570},{ 210,12808},{ 231,14001},{ 256,15150}, - { 285,16259},{ 319,17352},{ 359,18435},{ 415,19475}, - { 489,20470},{ 584,21400},{ 703,22246},{ 852,22968}, - { 1038,23556},{ 1253,24032},{ 1503,24367},{ 1778,24628} - } - }, - { - /*Cb qi=8 INTRA*/ - { - { 1, 4},{ 42, 367},{ 75, 740},{ 93, 1176}, - { 111, 1649},{ 128, 2139},{ 144, 2635},{ 157, 3103}, - { 174, 3494},{ 206, 3844},{ 233, 4207},{ 251, 4605}, - { 277, 4980},{ 304, 5284},{ 335, 5584},{ 359, 5888}, - { 393, 6152},{ 432, 6398},{ 455, 6656},{ 471, 6956}, - { 502, 7193},{ 528, 7405},{ 562, 7630},{ 603, 7922} - }, - /*Cb qi=8 INTER*/ - { - { 77, -37},{ 68, 299},{ 58, 632},{ 50, 991}, - { 50, 1382},{ 55, 1799},{ 62, 2226},{ 73, 2647}, - { 82, 3066},{ 90, 3480},{ 94, 3891},{ 96, 4296}, - { 98, 4687},{ 101, 5073},{ 103, 5456},{ 104, 5817}, - { 105, 6170},{ 106, 6523},{ 107, 6886},{ 108, 7250}, - { 109, 7600},{ 110, 7955},{ 111, 8305},{ 112, 8641} - } - }, - { - /*Cr qi=8 INTRA*/ - { - { 2, 7},{ 33, 375},{ 64, 760},{ 92, 1224}, - { 111, 1700},{ 122, 2173},{ 137, 2637},{ 156, 3055}, - { 172, 3476},{ 186, 3856},{ 211, 4211},{ 242, 4597}, - { 263, 4939},{ 292, 5214},{ 335, 5489},{ 376, 5772}, - { 406, 6099},{ 440, 6378},{ 483, 6578},{ 517, 6797}, - { 550, 7049},{ 571, 7283},{ 583, 7560},{ 618, 7967} - }, - /*Cr qi=8 INTER*/ - { - { 74, 25},{ 58, 328},{ 43, 637},{ 45, 980}, - { 51, 1371},{ 59, 1788},{ 69, 2207},{ 79, 2630}, - { 86, 3051},{ 91, 3470},{ 95, 3880},{ 97, 4280}, - { 98, 4680},{ 97, 5074},{ 96, 5456},{ 97, 5839}, - { 99, 6219},{ 101, 6583},{ 103, 6945},{ 106, 7312}, - { 110, 7671},{ 114, 8009},{ 115, 8345},{ 117, 8686} + { 4, 439},{ 2, 1131},{ 3, 1593},{ 6, 2130}, + { 14, 2535},{ 17, 2786},{ 21, 3128},{ 27, 3494}, + { 35, 3875},{ 42, 4256},{ 48, 4637},{ 53, 5019}, + { 57, 5395},{ 61, 5777},{ 64, 6156},{ 66, 6512}, + { 68, 6853},{ 71, 7183},{ 77, 7511},{ 81, 7841}, + { 83, 8192},{ 88, 8510},{ 93, 8834},{ 98, 9138} } } }, @@ -616,557 +93,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ { /*Y' qi=9 INTRA*/ { - { 104, -68},{ 164, 1580},{ 288, 3173},{ 448, 4439}, - { 587, 5485},{ 702, 6465},{ 824, 7351},{ 958, 8148}, - { 1096, 8845},{ 1253, 9480},{ 1432,10047},{ 1640,10494}, - { 1835,10926},{ 2015,11350},{ 2166,11871},{ 2321,12428}, - { 2508,12876},{ 2684,13345},{ 2866,13741},{ 3069,13991}, - { 3281,14243},{ 3487,14518},{ 3689,14813},{ 3911,15175} + { 76, 777},{ 178, 1995},{ 340, 3162},{ 591, 4097}, + { 746, 4973},{ 916, 5847},{ 1047, 6687},{ 1218, 7430}, + { 1385, 8079},{ 1566, 8685},{ 1755, 9167},{ 1992, 9572}, + { 2164,10023},{ 2395,10270},{ 2536,10755},{ 2694,11285}, + { 2895,11580},{ 3029,12143},{ 3182,12543},{ 3377,12800}, + { 3525,13228},{ 3718,13463},{ 3878,13852},{ 4077,14001} }, /*Y' qi=9 INTER*/ { - { 47, -140},{ 34, 1348},{ 77, 2915},{ 119, 4552}, - { 
145, 6150},{ 166, 7600},{ 182, 8936},{ 192,10221}, - { 203,11482},{ 220,12711},{ 244,13886},{ 274,15012}, - { 308,16111},{ 349,17190},{ 401,18244},{ 470,19257}, - { 561,20209},{ 680,21069},{ 830,21822},{ 1010,22463}, - { 1227,22971},{ 1482,23328},{ 1769,23544},{ 2077,23655} + { 10, 770},{ 45, 1845},{ 59, 3227},{ 99, 4708}, + { 135, 6092},{ 164, 7425},{ 190, 8729},{ 218, 9991}, + { 246,11234},{ 281,12427},{ 315,13573},{ 354,14678}, + { 402,15734},{ 467,16728},{ 543,17709},{ 639,18610}, + { 736,19503},{ 855,20312},{ 995,21033},{ 1151,21656}, + { 1341,22130},{ 1525,22582},{ 1735,22922},{ 1922,23102} } }, { /*Cb qi=9 INTRA*/ { - { 1, 5},{ 43, 367},{ 76, 740},{ 95, 1176}, - { 114, 1649},{ 135, 2138},{ 153, 2629},{ 165, 3091}, - { 184, 3481},{ 217, 3831},{ 244, 4187},{ 260, 4572}, - { 290, 4930},{ 320, 5231},{ 351, 5521},{ 379, 5812}, - { 414, 6055},{ 452, 6307},{ 483, 6564},{ 502, 6848}, - { 525, 7115},{ 554, 7321},{ 589, 7533},{ 626, 7833} + { 41, 1227},{ 70, 1452},{ 102, 1697},{ 110, 1967}, + { 134, 2326},{ 153, 2695},{ 160, 3007},{ 196, 3393}, + { 232, 3769},{ 266, 4067},{ 297, 4376},{ 326, 4728}, + { 351, 5040},{ 390, 5299},{ 398, 5538},{ 443, 5900}, + { 448, 6107},{ 506, 6370},{ 519, 6636},{ 525, 6953}, + { 567, 7177},{ 625, 7386},{ 622, 7613},{ 654, 7764} }, /*Cb qi=9 INTER*/ { - { 101, -43},{ 81, 298},{ 62, 637},{ 49, 989}, - { 51, 1381},{ 56, 1806},{ 65, 2231},{ 74, 2653}, - { 84, 3071},{ 91, 3482},{ 95, 3892},{ 97, 4293}, - { 99, 4684},{ 101, 5066},{ 103, 5437},{ 103, 5793}, - { 103, 6148},{ 104, 6511},{ 105, 6867},{ 107, 7221}, - { 110, 7572},{ 111, 7926},{ 112, 8283},{ 116, 8625} + { 7, 377},{ 2, 1102},{ 7, 1262},{ 19, 1693}, + { 22, 1957},{ 27, 2302},{ 35, 2654},{ 43, 3034}, + { 52, 3431},{ 58, 3826},{ 63, 4207},{ 67, 4570}, + { 71, 4927},{ 75, 5283},{ 79, 5624},{ 82, 5944}, + { 85, 6279},{ 88, 6616},{ 94, 6955},{ 102, 7284}, + { 108, 7622},{ 116, 7944},{ 124, 8293},{ 133, 8568} } }, { /*Cr qi=9 INTRA*/ { - { 2, 7},{ 35, 375},{ 66, 761},{ 93, 1224}, - { 112, 1700},{ 126, 2173},{ 144, 2633},{ 165, 3047}, - { 183, 3458},{ 199, 3835},{ 224, 4191},{ 257, 4558}, - { 283, 4887},{ 309, 5176},{ 351, 5446},{ 397, 5713}, - { 433, 6017},{ 469, 6283},{ 508, 6480},{ 546, 6687}, - { 579, 6945},{ 600, 7182},{ 610, 7434},{ 623, 7793} + { 38, 1217},{ 61, 1473},{ 88, 1650},{ 100, 1908}, + { 137, 2400},{ 147, 2777},{ 176, 3149},{ 205, 3433}, + { 227, 3772},{ 249, 4092},{ 286, 4370},{ 313, 4746}, + { 342, 5053},{ 368, 5261},{ 411, 5530},{ 442, 5859}, + { 494, 6061},{ 526, 6340},{ 532, 6646},{ 580, 6799}, + { 567, 7203},{ 649, 7357},{ 625, 7559},{ 660, 7709} }, /*Cr qi=9 INTER*/ { - { 77, 15},{ 57, 330},{ 45, 640},{ 48, 980}, - { 54, 1380},{ 61, 1802},{ 70, 2220},{ 80, 2639}, - { 87, 3057},{ 92, 3474},{ 94, 3882},{ 98, 4282}, - { 98, 4675},{ 97, 5062},{ 97, 5450},{ 98, 5829}, - { 100, 6197},{ 101, 6561},{ 104, 6927},{ 107, 7289}, - { 113, 7638},{ 117, 7978},{ 119, 8311},{ 117, 8629} - } - } - }, - { - { - /*Y' qi=10 INTRA*/ - { - { 101, -69},{ 168, 1574},{ 299, 3143},{ 465, 4386}, - { 610, 5410},{ 736, 6353},{ 866, 7207},{ 1006, 7982}, - { 1153, 8655},{ 1319, 9261},{ 1504, 9812},{ 1719,10248}, - { 1928,10653},{ 2116,11056},{ 2282,11550},{ 2458,12070}, - { 2654,12492},{ 2846,12923},{ 3043,13291},{ 3249,13537}, - { 3466,13764},{ 3682,13999},{ 3896,14268},{ 4145,14548} - }, - /*Y' qi=10 INTER*/ - { - { 48, -94},{ 34, 1355},{ 81, 2920},{ 124, 4545}, - { 151, 6113},{ 174, 7532},{ 190, 8850},{ 201,10125}, - { 214,11379},{ 235,12591},{ 264,13745},{ 299,14859}, - { 338,15948},{ 388,17008},{ 456,18029},{ 
546,18988}, - { 661,19877},{ 808,20666},{ 993,21321},{ 1218,21835}, - { 1481,22203},{ 1783,22420},{ 2117,22504},{ 2469,22481} - } - }, - { - /*Cb qi=10 INTRA*/ - { - { 2, 4},{ 44, 367},{ 79, 740},{ 99, 1178}, - { 117, 1652},{ 137, 2141},{ 156, 2630},{ 170, 3089}, - { 192, 3474},{ 227, 3813},{ 259, 4157},{ 282, 4526}, - { 310, 4860},{ 342, 5140},{ 377, 5425},{ 400, 5714}, - { 436, 5952},{ 475, 6194},{ 496, 6468},{ 522, 6748}, - { 559, 6996},{ 587, 7216},{ 617, 7433},{ 673, 7678} - }, - /*Cb qi=10 INTER*/ - { - { 87, -37},{ 72, 301},{ 58, 636},{ 49, 995}, - { 51, 1394},{ 57, 1819},{ 66, 2241},{ 78, 2660}, - { 87, 3074},{ 93, 3482},{ 97, 3891},{ 99, 4294}, - { 101, 4678},{ 103, 5050},{ 105, 5414},{ 106, 5773}, - { 107, 6134},{ 108, 6485},{ 110, 6832},{ 113, 7187}, - { 113, 7547},{ 114, 7887},{ 117, 8230},{ 112, 8590} - } - }, - { - /*Cr qi=10 INTRA*/ - { - { 2, 7},{ 38, 375},{ 69, 761},{ 96, 1224}, - { 116, 1701},{ 131, 2175},{ 148, 2634},{ 168, 3041}, - { 190, 3439},{ 211, 3802},{ 238, 4151},{ 271, 4506}, - { 297, 4824},{ 331, 5103},{ 373, 5360},{ 415, 5632}, - { 459, 5928},{ 500, 6176},{ 535, 6386},{ 573, 6586}, - { 608, 6834},{ 629, 7079},{ 642, 7337},{ 686, 7680} - }, - /*Cr qi=10 INTER*/ - { - { 81, 34},{ 63, 333},{ 50, 633},{ 48, 987}, - { 53, 1397},{ 61, 1820},{ 71, 2237},{ 83, 2651}, - { 91, 3065},{ 95, 3479},{ 98, 3882},{ 100, 4279}, - { 101, 4673},{ 101, 5054},{ 100, 5429},{ 101, 5801}, - { 102, 6173},{ 104, 6541},{ 108, 6904},{ 110, 7264}, - { 114, 7609},{ 119, 7945},{ 123, 8275},{ 128, 8615} - } - } - }, - { - { - /*Y' qi=11 INTRA*/ - { - { 110, -66},{ 176, 1564},{ 316, 3087},{ 492, 4296}, - { 645, 5299},{ 781, 6217},{ 924, 7039},{ 1075, 7776}, - { 1232, 8421},{ 1410, 9005},{ 1607, 9532},{ 1834, 9929}, - { 2053,10300},{ 2249,10697},{ 2427,11184},{ 2619,11682}, - { 2826,12083},{ 3019,12508},{ 3225,12869},{ 3452,13064}, - { 3670,13280},{ 3890,13519},{ 4123,13750},{ 4367,14059} - }, - /*Y' qi=11 INTER*/ - { - { 72, -115},{ 32, 1354},{ 83, 2911},{ 126, 4534}, - { 154, 6080},{ 178, 7475},{ 194, 8779},{ 205,10047}, - { 222,11290},{ 246,12488},{ 281,13621},{ 322,14714}, - { 372,15786},{ 436,16821},{ 519,17813},{ 628,18728}, - { 770,19549},{ 950,20254},{ 1175,20800},{ 1443,21197}, - { 1752,21446},{ 2095,21555},{ 2457,21553},{ 2808,21544} - } - }, - { - /*Cb qi=11 INTRA*/ - { - { 2, 4},{ 45, 367},{ 81, 740},{ 101, 1177}, - { 121, 1650},{ 142, 2136},{ 159, 2621},{ 174, 3075}, - { 199, 3451},{ 234, 3778},{ 265, 4117},{ 297, 4473}, - { 333, 4789},{ 367, 5054},{ 402, 5319},{ 427, 5613}, - { 462, 5871},{ 503, 6107},{ 532, 6336},{ 560, 6584}, - { 601, 6842},{ 631, 7092},{ 662, 7292},{ 721, 7497} - }, - /*Cb qi=11 INTER*/ - { - { 117, -24},{ 93, 308},{ 69, 638},{ 52, 993}, - { 52, 1395},{ 58, 1822},{ 68, 2246},{ 80, 2665}, - { 89, 3082},{ 94, 3492},{ 96, 3900},{ 98, 4299}, - { 101, 4679},{ 103, 5047},{ 104, 5405},{ 106, 5763}, - { 106, 6120},{ 107, 6474},{ 109, 6823},{ 112, 7163}, - { 115, 7516},{ 117, 7868},{ 118, 8213},{ 119, 8561} - } - }, - { - /*Cr qi=11 INTRA*/ - { - { 2, 7},{ 40, 375},{ 75, 761},{ 100, 1224}, - { 119, 1700},{ 137, 2169},{ 154, 2622},{ 178, 3025}, - { 198, 3416},{ 220, 3770},{ 255, 4114},{ 294, 4459}, - { 323, 4756},{ 359, 5028},{ 399, 5292},{ 438, 5556}, - { 483, 5827},{ 518, 6073},{ 551, 6298},{ 598, 6501}, - { 634, 6754},{ 652, 6997},{ 670, 7211},{ 689, 7560} - }, - /*Cr qi=11 INTER*/ - { - { 75, 30},{ 61, 334},{ 51, 639},{ 49, 995}, - { 53, 1403},{ 62, 1821},{ 73, 2237},{ 84, 2654}, - { 91, 3070},{ 95, 3485},{ 96, 3890},{ 98, 4287}, - { 98, 4672},{ 99, 5050},{ 99, 
5427},{ 100, 5798}, - { 103, 6169},{ 105, 6528},{ 107, 6881},{ 113, 7233}, - { 118, 7580},{ 121, 7916},{ 125, 8240},{ 130, 8551} - } - } - }, - { - { - /*Y' qi=12 INTRA*/ - { - { 104, -69},{ 182, 1557},{ 335, 3040},{ 521, 4205}, - { 684, 5178},{ 831, 6068},{ 986, 6854},{ 1151, 7559}, - { 1323, 8169},{ 1523, 8704},{ 1736, 9192},{ 1978, 9558}, - { 2213, 9908},{ 2421,10298},{ 2613,10757},{ 2822,11208}, - { 3042,11585},{ 3250,11991},{ 3474,12308},{ 3710,12480}, - { 3939,12687},{ 4174,12902},{ 4416,13102},{ 4672,13369} - }, - /*Y' qi=12 INTER*/ - { - { 52, -91},{ 34, 1355},{ 86, 2911},{ 129, 4518}, - { 159, 6037},{ 184, 7405},{ 200, 8694},{ 213, 9955}, - { 232,11185},{ 263,12360},{ 304,13479},{ 354,14555}, - { 415,15601},{ 495,16608},{ 601,17549},{ 738,18400}, - { 915,19136},{ 1139,19724},{ 1414,20150},{ 1731,20412}, - { 2090,20520},{ 2473,20509},{ 2851,20442},{ 3227,20328} - } - }, - { - /*Cb qi=12 INTRA*/ - { - { 1, 4},{ 46, 367},{ 85, 740},{ 109, 1178}, - { 126, 1650},{ 145, 2134},{ 165, 2617},{ 182, 3061}, - { 209, 3428},{ 245, 3749},{ 281, 4077},{ 316, 4417}, - { 354, 4718},{ 392, 4970},{ 430, 5217},{ 456, 5501}, - { 490, 5771},{ 534, 5996},{ 571, 6207},{ 600, 6458}, - { 644, 6697},{ 675, 6942},{ 707, 7151},{ 766, 7342} - }, - /*Cb qi=12 INTER*/ - { - { 84, -24},{ 73, 311},{ 60, 644},{ 52, 998}, - { 53, 1398},{ 60, 1825},{ 71, 2249},{ 83, 2665}, - { 90, 3081},{ 94, 3490},{ 97, 3893},{ 99, 4286}, - { 102, 4663},{ 104, 5032},{ 105, 5393},{ 106, 5751}, - { 107, 6102},{ 108, 6445},{ 111, 6788},{ 113, 7136}, - { 114, 7483},{ 117, 7828},{ 121, 8163},{ 122, 8496} - } - }, - { - /*Cr qi=12 INTRA*/ - { - { 3, 7},{ 41, 375},{ 78, 761},{ 106, 1225}, - { 124, 1700},{ 140, 2167},{ 163, 2616},{ 188, 3010}, - { 213, 3385},{ 240, 3718},{ 271, 4062},{ 309, 4406}, - { 345, 4691},{ 387, 4956},{ 430, 5212},{ 469, 5467}, - { 513, 5729},{ 554, 5970},{ 587, 6176},{ 633, 6395}, - { 673, 6659},{ 692, 6868},{ 712, 7061},{ 758, 7259} - }, - /*Cr qi=12 INTER*/ - { - { 73, 31},{ 59, 335},{ 48, 638},{ 50, 998}, - { 56, 1410},{ 65, 1827},{ 75, 2240},{ 85, 2657}, - { 92, 3073},{ 95, 3485},{ 97, 3888},{ 99, 4279}, - { 98, 4663},{ 99, 5042},{ 101, 5412},{ 102, 5779}, - { 105, 6142},{ 107, 6498},{ 108, 6848},{ 113, 7198}, - { 118, 7540},{ 121, 7867},{ 127, 8188},{ 132, 8508} - } - } - }, - { - { - /*Y' qi=13 INTRA*/ - { - { 109, -68},{ 187, 1551},{ 347, 3010},{ 541, 4153}, - { 709, 5107},{ 864, 5975},{ 1026, 6745},{ 1194, 7433}, - { 1375, 8021},{ 1581, 8550},{ 1803, 9026},{ 2054, 9371}, - { 2301, 9713},{ 2522,10082},{ 2728,10515},{ 2949,10956}, - { 3184,11297},{ 3408,11653},{ 3643,11946},{ 3886,12100}, - { 4124,12277},{ 4377,12459},{ 4632,12635},{ 4898,12861} - }, - /*Y' qi=13 INTER*/ - { - { 48, -78},{ 35, 1357},{ 89, 2914},{ 133, 4512}, - { 164, 6004},{ 190, 7348},{ 207, 8627},{ 222, 9881}, - { 247,11096},{ 284,12251},{ 333,13350},{ 392,14407}, - { 466,15426},{ 565,16391},{ 696,17279},{ 865,18058}, - { 1085,18689},{ 1358,19156},{ 1684,19456},{ 2050,19605}, - { 2447,19614},{ 2855,19524},{ 3243,19398},{ 3611,19201} - } - }, - { - /*Cb qi=13 INTRA*/ - { - { 2, 4},{ 47, 367},{ 86, 741},{ 108, 1179}, - { 127, 1651},{ 150, 2133},{ 173, 2611},{ 194, 3050}, - { 222, 3417},{ 262, 3733},{ 303, 4048},{ 337, 4375}, - { 378, 4657},{ 420, 4897},{ 456, 5148},{ 486, 5422}, - { 518, 5682},{ 558, 5903},{ 592, 6113},{ 623, 6372}, - { 662, 6628},{ 700, 6833},{ 751, 6989},{ 805, 7147} - }, - /*Cb qi=13 INTER*/ - { - { 94, -34},{ 78, 303},{ 60, 638},{ 51, 994}, - { 54, 1406},{ 61, 1836},{ 73, 2253},{ 84, 2668}, - { 92, 3082},{ 96, 3492},{ 99, 
3894},{ 101, 4284}, - { 103, 4659},{ 105, 5023},{ 106, 5376},{ 108, 5726}, - { 109, 6070},{ 110, 6418},{ 113, 6765},{ 117, 7105}, - { 119, 7448},{ 122, 7784},{ 126, 8119},{ 131, 8463} - } - }, - { - /*Cr qi=13 INTRA*/ - { - { 3, 7},{ 43, 375},{ 80, 762},{ 110, 1226}, - { 131, 1701},{ 149, 2166},{ 172, 2610},{ 196, 2999}, - { 221, 3359},{ 254, 3679},{ 292, 4005},{ 332, 4329}, - { 369, 4612},{ 408, 4880},{ 456, 5139},{ 500, 5388}, - { 544, 5631},{ 581, 5877},{ 615, 6101},{ 660, 6316}, - { 692, 6594},{ 714, 6795},{ 736, 6997},{ 789, 7290} - }, - /*Cr qi=13 INTER*/ - { - { 73, 28},{ 61, 336},{ 46, 642},{ 50, 1003}, - { 58, 1414},{ 67, 1832},{ 79, 2245},{ 87, 2660}, - { 93, 3075},{ 97, 3484},{ 99, 3888},{ 100, 4277}, - { 100, 4651},{ 100, 5027},{ 101, 5403},{ 102, 5765}, - { 105, 6116},{ 109, 6470},{ 113, 6825},{ 119, 7163}, - { 124, 7497},{ 127, 7827},{ 131, 8137},{ 135, 8437} - } - } - }, - { - { - /*Y' qi=14 INTRA*/ - { - { 113, -68},{ 191, 1545},{ 358, 2981},{ 559, 4104}, - { 733, 5044},{ 896, 5890},{ 1066, 6636},{ 1241, 7304}, - { 1428, 7886},{ 1642, 8402},{ 1872, 8871},{ 2128, 9219}, - { 2380, 9547},{ 2609, 9908},{ 2825,10321},{ 3055,10728}, - { 3294,11076},{ 3523,11425},{ 3766,11689},{ 4013,11845}, - { 4254,12022},{ 4506,12209},{ 4759,12383},{ 5013,12637} - }, - /*Y' qi=14 INTER*/ - { - { 58, -82},{ 38, 1362},{ 93, 2914},{ 138, 4492}, - { 171, 5962},{ 198, 7289},{ 216, 8559},{ 234, 9804}, - { 263,11005},{ 306,12143},{ 363,13222},{ 434,14259}, - { 523,15255},{ 639,16188},{ 794,17021},{ 1000,17717}, - { 1262,18260},{ 1575,18645},{ 1943,18841},{ 2356,18872}, - { 2782,18802},{ 3194,18682},{ 3576,18559},{ 3923,18447} - } - }, - { - /*Cb qi=14 INTRA*/ - { - { 2, 3},{ 50, 367},{ 91, 741},{ 114, 1180}, - { 134, 1651},{ 157, 2131},{ 181, 2601},{ 208, 3028}, - { 239, 3391},{ 279, 3706},{ 322, 4000},{ 361, 4309}, - { 406, 4587},{ 445, 4822},{ 482, 5067},{ 515, 5344}, - { 546, 5612},{ 589, 5821},{ 626, 6020},{ 655, 6276}, - { 701, 6523},{ 748, 6717},{ 796, 6876},{ 815, 7151} - }, - /*Cb qi=14 INTER*/ - { - { 80, -43},{ 68, 301},{ 56, 644},{ 50, 1004}, - { 54, 1412},{ 63, 1836},{ 75, 2253},{ 87, 2670}, - { 94, 3083},{ 98, 3487},{ 101, 3885},{ 103, 4271}, - { 106, 4645},{ 107, 5004},{ 108, 5358},{ 109, 5705}, - { 112, 6047},{ 115, 6388},{ 118, 6731},{ 121, 7081}, - { 126, 7421},{ 129, 7747},{ 132, 8076},{ 137, 8419} - } - }, - { - /*Cr qi=14 INTRA*/ - { - { 3, 6},{ 45, 375},{ 85, 762},{ 116, 1226}, - { 138, 1700},{ 158, 2163},{ 180, 2602},{ 206, 2985}, - { 236, 3333},{ 270, 3639},{ 310, 3956},{ 359, 4258}, - { 397, 4524},{ 430, 4802},{ 478, 5068},{ 527, 5316}, - { 572, 5560},{ 613, 5802},{ 654, 6012},{ 699, 6216}, - { 734, 6489},{ 755, 6707},{ 775, 6898},{ 841, 7111} - }, - /*Cr qi=14 INTER*/ - { - { 78, 0},{ 59, 322},{ 46, 649},{ 51, 1016}, - { 58, 1422},{ 68, 1839},{ 81, 2253},{ 90, 2666}, - { 95, 3080},{ 98, 3486},{ 101, 3881},{ 102, 4268}, - { 102, 4644},{ 103, 5017},{ 105, 5382},{ 106, 5743}, - { 108, 6093},{ 112, 6442},{ 118, 6791},{ 124, 7130}, - { 127, 7463},{ 133, 7784},{ 138, 8085},{ 142, 8395} - } - } - }, - { - { - /*Y' qi=15 INTRA*/ - { - { 111, -66},{ 197, 1538},{ 370, 2949},{ 579, 4050}, - { 762, 4968},{ 933, 5798},{ 1112, 6520},{ 1299, 7161}, - { 1497, 7725},{ 1723, 8219},{ 1967, 8654},{ 2234, 8990}, - { 2499, 9302},{ 2740, 9637},{ 2968,10039},{ 3215,10414}, - { 3473,10709},{ 3721,11015},{ 3971,11270},{ 4228,11402}, - { 4487,11543},{ 4752,11707},{ 5011,11871},{ 5290,12099} - }, - /*Y' qi=15 INTER*/ - { - { 59, -113},{ 37, 1349},{ 95, 2904},{ 139, 4478}, - { 174, 5929},{ 201, 7244},{ 
220, 8505},{ 241, 9736}, - { 275,10922},{ 327,12040},{ 395,13097},{ 477,14114}, - { 585,15071},{ 730,15947},{ 917,16714},{ 1162,17326}, - { 1468,17770},{ 1833,18029},{ 2251,18111},{ 2694,18068}, - { 3125,17968},{ 3529,17845},{ 3908,17713},{ 4260,17587} - } - }, - { - /*Cb qi=15 INTRA*/ - { - { 2, 3},{ 51, 367},{ 94, 741},{ 120, 1180}, - { 140, 1651},{ 160, 2129},{ 184, 2591},{ 213, 3010}, - { 246, 3371},{ 289, 3680},{ 335, 3969},{ 374, 4274}, - { 418, 4546},{ 460, 4783},{ 498, 5019},{ 532, 5280}, - { 565, 5553},{ 608, 5765},{ 647, 5958},{ 683, 6193}, - { 732, 6433},{ 782, 6620},{ 832, 6769},{ 848, 7027} - }, - /*Cb qi=15 INTER*/ - { - { 71, -52},{ 63, 296},{ 54, 644},{ 50, 1010}, - { 53, 1417},{ 64, 1837},{ 77, 2253},{ 88, 2666}, - { 95, 3079},{ 98, 3487},{ 100, 3882},{ 103, 4264}, - { 106, 4633},{ 108, 4991},{ 109, 5343},{ 109, 5693}, - { 112, 6038},{ 114, 6371},{ 119, 6709},{ 123, 7051}, - { 125, 7385},{ 130, 7716},{ 135, 8050},{ 140, 8374} - } - }, - { - /*Cr qi=15 INTRA*/ - { - { 2, 6},{ 47, 375},{ 87, 763},{ 119, 1225}, - { 143, 1699},{ 162, 2158},{ 185, 2595},{ 213, 2971}, - { 246, 3315},{ 279, 3618},{ 320, 3920},{ 372, 4210}, - { 409, 4480},{ 446, 4756},{ 496, 5017},{ 542, 5263}, - { 590, 5487},{ 639, 5721},{ 687, 5923},{ 724, 6132}, - { 753, 6417},{ 781, 6622},{ 805, 6806},{ 856, 6977} - }, - /*Cr qi=15 INTER*/ - { - { 71, 3},{ 61, 326},{ 52, 651},{ 50, 1017}, - { 58, 1422},{ 69, 1837},{ 82, 2251},{ 90, 2668}, - { 95, 3080},{ 98, 3484},{ 101, 3877},{ 102, 4257}, - { 102, 4632},{ 101, 5005},{ 103, 5370},{ 106, 5733}, - { 110, 6082},{ 116, 6424},{ 120, 6774},{ 124, 7106}, - { 130, 7427},{ 135, 7748},{ 141, 8052},{ 147, 8333} - } - } - }, - { - { - /*Y' qi=16 INTRA*/ - { - { 114, -63},{ 206, 1525},{ 396, 2887},{ 618, 3945}, - { 816, 4832},{ 1002, 5626},{ 1196, 6319},{ 1401, 6923}, - { 1616, 7458},{ 1857, 7928},{ 2121, 8334},{ 2405, 8645}, - { 2685, 8934},{ 2938, 9255},{ 3175, 9638},{ 3433, 9990}, - { 3707,10263},{ 3958,10577},{ 4218,10807},{ 4488,10906}, - { 4760,11028},{ 5037,11148},{ 5306,11286},{ 5625,11463} - }, - /*Y' qi=16 INTER*/ - { - { 69, -153},{ 39, 1348},{ 98, 2894},{ 144, 4448}, - { 181, 5872},{ 209, 7167},{ 228, 8422},{ 254, 9644}, - { 297,10810},{ 359,11908},{ 438,12944},{ 539,13930}, - { 672,14842},{ 850,15650},{ 1085,16318},{ 1391,16793}, - { 1769,17082},{ 2200,17198},{ 2659,17174},{ 3116,17072}, - { 3547,16948},{ 3943,16819},{ 4299,16701},{ 4611,16644} - } - }, - { - /*Cb qi=16 INTRA*/ - { - { 3, 4},{ 54, 367},{ 97, 742},{ 122, 1181}, - { 143, 1651},{ 168, 2123},{ 197, 2575},{ 226, 2985}, - { 263, 3338},{ 314, 3631},{ 367, 3903},{ 409, 4200}, - { 453, 4468},{ 491, 4703},{ 528, 4932},{ 566, 5188}, - { 601, 5459},{ 647, 5672},{ 693, 5844},{ 734, 6058}, - { 784, 6305},{ 836, 6460},{ 882, 6602},{ 905, 6891} - }, - /*Cb qi=16 INTER*/ - { - { 75, -64},{ 67, 292},{ 56, 645},{ 51, 1016}, - { 54, 1421},{ 66, 1842},{ 79, 2257},{ 89, 2670}, - { 95, 3082},{ 98, 3488},{ 101, 3879},{ 104, 4258}, - { 106, 4623},{ 108, 4974},{ 109, 5321},{ 113, 5664}, - { 116, 6001},{ 117, 6341},{ 123, 6677},{ 128, 7004}, - { 130, 7336},{ 136, 7671},{ 143, 7996},{ 148, 8310} - } - }, - { - /*Cr qi=16 INTRA*/ - { - { 4, 7},{ 50, 375},{ 90, 763},{ 124, 1225}, - { 148, 1698},{ 168, 2154},{ 195, 2582},{ 227, 2948}, - { 263, 3279},{ 302, 3575},{ 343, 3865},{ 394, 4137}, - { 439, 4402},{ 482, 4672},{ 533, 4925},{ 579, 5165}, - { 626, 5382},{ 675, 5616},{ 725, 5812},{ 769, 5991}, - { 810, 6242},{ 848, 6430},{ 868, 6615},{ 944, 6732} - }, - /*Cr qi=16 INTER*/ - { - { 78, 11},{ 62, 327},{ 49, 650},{ 50, 
1025}, - { 59, 1431},{ 72, 1841},{ 83, 2253},{ 90, 2671}, - { 95, 3084},{ 98, 3487},{ 100, 3879},{ 101, 4254}, - { 102, 4625},{ 103, 4994},{ 106, 5355},{ 108, 5708}, - { 111, 6058},{ 115, 6400},{ 121, 6733},{ 128, 7058}, - { 134, 7374},{ 140, 7691},{ 146, 7993},{ 146, 8317} - } - } - }, - { - { - /*Y' qi=17 INTRA*/ - { - { 112, -59},{ 210, 1515},{ 409, 2850},{ 640, 3882}, - { 844, 4748},{ 1038, 5529},{ 1240, 6206},{ 1452, 6803}, - { 1676, 7330},{ 1925, 7792},{ 2194, 8201},{ 2483, 8512}, - { 2766, 8801},{ 3027, 9121},{ 3279, 9482},{ 3548, 9810}, - { 3825,10069},{ 4088,10345},{ 4362,10544},{ 4638,10644}, - { 4915,10744},{ 5196,10850},{ 5471,10981},{ 5802,11136} - }, - /*Y' qi=17 INTER*/ - { - { 70, -147},{ 45, 1349},{ 106, 2894},{ 155, 4425}, - { 195, 5818},{ 225, 7099},{ 247, 8348},{ 278, 9565}, - { 328,10717},{ 399,11794},{ 491,12807},{ 609,13760}, - { 766,14623},{ 984,15349},{ 1274,15902},{ 1642,16256}, - { 2082,16411},{ 2563,16409},{ 3048,16315},{ 3508,16194}, - { 3924,16064},{ 4306,15938},{ 4656,15828},{ 4966,15733} - } - }, - { - /*Cb qi=17 INTRA*/ - { - { 3, 4},{ 57, 367},{ 101, 742},{ 126, 1182}, - { 148, 1650},{ 175, 2118},{ 207, 2565},{ 241, 2966}, - { 279, 3307},{ 331, 3588},{ 389, 3845},{ 435, 4132}, - { 474, 4408},{ 517, 4641},{ 560, 4869},{ 602, 5122}, - { 638, 5389},{ 672, 5610},{ 716, 5787},{ 758, 6002}, - { 817, 6226},{ 869, 6393},{ 916, 6530},{ 950, 6799} - }, - /*Cb qi=17 INTER*/ - { - { 105, -65},{ 86, 288},{ 66, 638},{ 54, 1014}, - { 59, 1427},{ 71, 1844},{ 86, 2257},{ 95, 2668}, - { 100, 3075},{ 103, 3476},{ 106, 3867},{ 110, 4241}, - { 112, 4598},{ 114, 4948},{ 117, 5294},{ 121, 5633}, - { 123, 5968},{ 126, 6301},{ 131, 6637},{ 136, 6968}, - { 144, 7287},{ 152, 7606},{ 158, 7931},{ 162, 8262} - } - }, - { - /*Cr qi=17 INTRA*/ - { - { 4, 6},{ 55, 376},{ 97, 765},{ 128, 1226}, - { 152, 1696},{ 175, 2144},{ 204, 2568},{ 241, 2928}, - { 282, 3250},{ 323, 3530},{ 368, 3811},{ 420, 4089}, - { 463, 4347},{ 505, 4609},{ 562, 4860},{ 609, 5094}, - { 655, 5303},{ 709, 5535},{ 759, 5740},{ 803, 5913}, - { 844, 6153},{ 879, 6350},{ 905, 6527},{ 972, 6637} - }, - /*Cr qi=17 INTER*/ - { - { 88, 8},{ 68, 330},{ 51, 653},{ 54, 1028}, - { 65, 1433},{ 77, 1845},{ 89, 2257},{ 96, 2669}, - { 100, 3081},{ 102, 3481},{ 105, 3867},{ 106, 4245}, - { 108, 4613},{ 110, 4971},{ 112, 5328},{ 115, 5679}, - { 120, 6019},{ 127, 6355},{ 133, 6686},{ 140, 7007}, - { 149, 7316},{ 158, 7618},{ 166, 7924},{ 170, 8232} + { 5, 408},{ 3, 1197},{ 7, 1275},{ 16, 1695}, + { 22, 1979},{ 30, 2324},{ 38, 2691},{ 47, 3071}, + { 53, 3462},{ 59, 3857},{ 64, 4255},{ 69, 4612}, + { 74, 4975},{ 76, 5347},{ 81, 5694},{ 86, 6020}, + { 91, 6357},{ 96, 6687},{ 102, 7020},{ 108, 7351}, + { 115, 7663},{ 122, 7979},{ 125, 8298},{ 136, 8576} } } }, @@ -1174,557 +155,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ { /*Y' qi=18 INTRA*/ { - { 122, -58},{ 216, 1506},{ 425, 2815},{ 665, 3822}, - { 882, 4666},{ 1088, 5425},{ 1301, 6084},{ 1529, 6653}, - { 1766, 7162},{ 2026, 7611},{ 2312, 7987},{ 2612, 8278}, - { 2913, 8551},{ 3196, 8840},{ 3454, 9184},{ 3734, 9490}, - { 4030, 9725},{ 4305, 9973},{ 4585,10162},{ 4864,10251}, - { 5150,10324},{ 5443,10420},{ 5727,10536},{ 6053,10682} + { 83, 534},{ 261, 1697},{ 507, 2691},{ 852, 3418}, + { 1127, 4094},{ 1378, 4775},{ 1626, 5442},{ 1905, 5975}, + { 2164, 6468},{ 2445, 6913},{ 2704, 7301},{ 3001, 7631}, + { 3285, 7934},{ 3536, 8217},{ 3837, 8489},{ 4076, 8814}, + { 4325, 9046},{ 4590, 9313},{ 4794, 9546},{ 5062, 9751}, + { 5285, 9963},{ 5578,10079},{ 5777,10302},{ 6054,10296} }, 
/*Y' qi=18 INTER*/ { - { 66, -143},{ 47, 1351},{ 108, 2886},{ 158, 4401}, - { 200, 5775},{ 232, 7044},{ 256, 8288},{ 292, 9493}, - { 351,10625},{ 434,11679},{ 541,12665},{ 681,13578}, - { 875,14379},{ 1136,15025},{ 1483,15475},{ 1914,15709}, - { 2399,15767},{ 2907,15699},{ 3400,15579},{ 3852,15453}, - { 4259,15332},{ 4630,15221},{ 4976,15121},{ 5294,15061} + { 33, 490},{ 62, 1599},{ 96, 3015},{ 164, 4378}, + { 225, 5633},{ 285, 6831},{ 351, 7999},{ 427, 9133}, + { 526,10181},{ 652,11141},{ 829,11991},{ 1049,12732}, + { 1310,13367},{ 1592,13896},{ 1881,14350},{ 2207,14667}, + { 2529,14877},{ 2873,14980},{ 3231,14949},{ 3571,14926}, + { 3922,14816},{ 4246,14715},{ 4559,14579},{ 4778,14590} } }, { /*Cb qi=18 INTRA*/ { - { 2, 3},{ 61, 367},{ 107, 743},{ 131, 1182}, - { 155, 1648},{ 183, 2110},{ 220, 2542},{ 260, 2927}, - { 303, 3265},{ 359, 3540},{ 416, 3785},{ 462, 4063}, - { 506, 4334},{ 553, 4567},{ 595, 4797},{ 636, 5049}, - { 676, 5304},{ 717, 5516},{ 759, 5698},{ 801, 5904}, - { 861, 6133},{ 911, 6311},{ 962, 6443},{ 1021, 6645} + { 55, 825},{ 95, 1021},{ 131, 1276},{ 150, 1618}, + { 180, 1958},{ 220, 2306},{ 256, 2608},{ 322, 2939}, + { 385, 3239},{ 436, 3530},{ 475, 3771},{ 518, 4078}, + { 557, 4348},{ 604, 4592},{ 620, 4851},{ 676, 5083}, + { 704, 5363},{ 739, 5582},{ 788, 5782},{ 819, 6000}, + { 893, 6158},{ 940, 6418},{ 984, 6499},{ 1035, 6596} }, /*Cb qi=18 INTER*/ { - { 126, 5},{ 95, 326},{ 66, 643},{ 55, 1015}, - { 60, 1427},{ 73, 1843},{ 87, 2256},{ 96, 2667}, - { 101, 3073},{ 104, 3470},{ 108, 3853},{ 111, 4226}, - { 114, 4584},{ 117, 4928},{ 119, 5274},{ 122, 5612}, - { 126, 5942},{ 130, 6271},{ 136, 6606},{ 141, 6931}, - { 148, 7247},{ 156, 7568},{ 164, 7891},{ 173, 8211} + { -2, 642},{ 12, 771},{ 20, 1054},{ 29, 1394}, + { 35, 1721},{ 45, 2080},{ 53, 2450},{ 63, 2835}, + { 73, 3225},{ 81, 3596},{ 87, 3952},{ 95, 4300}, + { 102, 4634},{ 109, 4959},{ 115, 5283},{ 120, 5608}, + { 130, 5931},{ 139, 6254},{ 152, 6571},{ 163, 6887}, + { 179, 7204},{ 191, 7508},{ 198, 7834},{ 224, 8066} } }, { /*Cr qi=18 INTRA*/ { - { 4, 6},{ 59, 376},{ 104, 765},{ 133, 1226}, - { 156, 1692},{ 184, 2136},{ 218, 2548},{ 260, 2893}, - { 308, 3204},{ 348, 3481},{ 397, 3751},{ 448, 4024}, - { 490, 4281},{ 541, 4523},{ 593, 4776},{ 634, 5022}, - { 685, 5236},{ 748, 5455},{ 812, 5638},{ 856, 5818}, - { 891, 6048},{ 928, 6230},{ 961, 6405},{ 1055, 6449} + { 49, 780},{ 86, 986},{ 120, 1261},{ 137, 1588}, + { 183, 1998},{ 228, 2339},{ 291, 2670},{ 334, 2938}, + { 376, 3239},{ 412, 3522},{ 459, 3783},{ 490, 4113}, + { 547, 4321},{ 593, 4571},{ 640, 4828},{ 675, 5137}, + { 730, 5254},{ 774, 5524},{ 821, 5754},{ 859, 5911}, + { 887, 6178},{ 982, 6266},{ 941, 6536},{ 996, 6630} }, /*Cr qi=18 INTER*/ { - { 81, 34},{ 68, 342},{ 57, 652},{ 59, 1027}, - { 67, 1439},{ 80, 1848},{ 91, 2257},{ 97, 2670}, - { 100, 3076},{ 103, 3473},{ 106, 3857},{ 108, 4231}, - { 109, 4599},{ 110, 4958},{ 113, 5307},{ 119, 5650}, - { 125, 5991},{ 130, 6325},{ 138, 6651},{ 147, 6971}, - { 153, 7278},{ 162, 7578},{ 172, 7874},{ 177, 8156} - } - } - }, - { - { - /*Y' qi=19 INTRA*/ - { - { 128, -55},{ 228, 1495},{ 448, 2775},{ 699, 3758}, - { 931, 4571},{ 1154, 5296},{ 1386, 5914},{ 1636, 6450}, - { 1894, 6930},{ 2177, 7342},{ 2479, 7698},{ 2792, 7976}, - { 3099, 8235},{ 3392, 8517},{ 3658, 8853},{ 3938, 9155}, - { 4242, 9371},{ 4527, 9605},{ 4810, 9781},{ 5089, 9853}, - { 5378, 9920},{ 5674,10009},{ 5972,10110},{ 6336,10196} - }, - /*Y' qi=19 INTER*/ - { - { 69, -147},{ 49, 1353},{ 111, 2883},{ 162, 4381}, - { 205, 5737},{ 237, 6996},{ 
264, 8232},{ 307, 9421}, - { 376,10534},{ 472,11567},{ 596,12525},{ 761,13395}, - { 990,14130},{ 1298,14694},{ 1695,15053},{ 2172,15195}, - { 2696,15173},{ 3213,15075},{ 3696,14948},{ 4141,14829}, - { 4541,14721},{ 4910,14609},{ 5245,14506},{ 5536,14399} - } - }, - { - /*Cb qi=19 INTRA*/ - { - { 3, 3},{ 61, 367},{ 109, 743},{ 135, 1182}, - { 161, 1646},{ 191, 2101},{ 229, 2524},{ 273, 2898}, - { 318, 3221},{ 376, 3490},{ 436, 3731},{ 487, 3994}, - { 539, 4251},{ 584, 4485},{ 621, 4721},{ 664, 4967}, - { 709, 5225},{ 752, 5431},{ 801, 5595},{ 846, 5796}, - { 912, 6011},{ 959, 6193},{ 1015, 6321},{ 1121, 6504} - }, - /*Cb qi=19 INTER*/ - { - { 126, 4},{ 97, 329},{ 69, 649},{ 56, 1017}, - { 61, 1432},{ 74, 1846},{ 88, 2255},{ 98, 2663}, - { 103, 3065},{ 106, 3460},{ 110, 3844},{ 114, 4211}, - { 117, 4564},{ 120, 4911},{ 122, 5253},{ 125, 5588}, - { 129, 5916},{ 135, 6241},{ 142, 6567},{ 149, 6885}, - { 155, 7206},{ 163, 7527},{ 174, 7843},{ 188, 8145} - } - }, - { - /*Cr qi=19 INTRA*/ - { - { 5, 6},{ 61, 376},{ 106, 765},{ 135, 1225}, - { 160, 1689},{ 192, 2126},{ 229, 2531},{ 271, 2869}, - { 321, 3168},{ 370, 3433},{ 421, 3704},{ 476, 3965}, - { 520, 4212},{ 572, 4452},{ 629, 4691},{ 671, 4939}, - { 724, 5152},{ 792, 5347},{ 858, 5510},{ 895, 5696}, - { 939, 5905},{ 991, 6056},{ 1027, 6244},{ 1127, 6333} - }, - /*Cr qi=19 INTER*/ - { - { 80, 45},{ 66, 344},{ 55, 654},{ 56, 1030}, - { 66, 1440},{ 80, 1850},{ 91, 2259},{ 98, 2668}, - { 102, 3072},{ 104, 3466},{ 107, 3845},{ 109, 4215}, - { 110, 4578},{ 112, 4933},{ 116, 5283},{ 122, 5625}, - { 129, 5963},{ 136, 6287},{ 143, 6611},{ 151, 6927}, - { 160, 7229},{ 170, 7528},{ 181, 7818},{ 191, 8092} - } - } - }, - { - { - /*Y' qi=20 INTRA*/ - { - { 129, -50},{ 238, 1481},{ 469, 2728},{ 730, 3684}, - { 974, 4473},{ 1213, 5171},{ 1463, 5763},{ 1729, 6281}, - { 2002, 6744},{ 2299, 7146},{ 2613, 7492},{ 2940, 7746}, - { 3265, 7978},{ 3571, 8228},{ 3853, 8543},{ 4156, 8815}, - { 4476, 9001},{ 4775, 9218},{ 5070, 9373},{ 5352, 9446}, - { 5649, 9510},{ 5956, 9580},{ 6268, 9660},{ 6647, 9705} - }, - /*Y' qi=20 INTER*/ - { - { 64, -93},{ 52, 1340},{ 116, 2862},{ 170, 4344}, - { 216, 5678},{ 249, 6928},{ 281, 8155},{ 333, 9326}, - { 418,10410},{ 533,11411},{ 683,12329},{ 890,13127}, - { 1183,13750},{ 1579,14162},{ 2066,14357},{ 2611,14370}, - { 3159,14284},{ 3675,14167},{ 4142,14053},{ 4568,13953}, - { 4961,13852},{ 5320,13755},{ 5649,13675},{ 5933,13610} - } - }, - { - /*Cb qi=20 INTRA*/ - { - { 3, 3},{ 62, 367},{ 112, 743},{ 140, 1183}, - { 165, 1646},{ 196, 2099},{ 235, 2517},{ 284, 2883}, - { 334, 3198},{ 393, 3460},{ 457, 3690},{ 509, 3945}, - { 560, 4198},{ 605, 4435},{ 647, 4658},{ 699, 4888}, - { 742, 5155},{ 788, 5350},{ 835, 5517},{ 880, 5730}, - { 956, 5914},{ 1007, 6060},{ 1053, 6199},{ 1158, 6358} - }, - /*Cb qi=20 INTER*/ - { - { 128, -6},{ 96, 322},{ 66, 653},{ 54, 1025}, - { 63, 1431},{ 79, 1844},{ 91, 2256},{ 99, 2665}, - { 104, 3065},{ 107, 3455},{ 111, 3831},{ 115, 4189}, - { 120, 4539},{ 123, 4885},{ 126, 5219},{ 130, 5548}, - { 135, 5876},{ 141, 6199},{ 149, 6519},{ 156, 6837}, - { 166, 7153},{ 179, 7468},{ 189, 7784},{ 194, 8102} - } - }, - { - /*Cr qi=20 INTRA*/ - { - { 4, 6},{ 63, 376},{ 109, 765},{ 139, 1225}, - { 165, 1689},{ 199, 2124},{ 239, 2523},{ 285, 2852}, - { 340, 3140},{ 388, 3398},{ 438, 3662},{ 499, 3914}, - { 547, 4155},{ 596, 4392},{ 652, 4634},{ 699, 4877}, - { 759, 5074},{ 824, 5257},{ 883, 5428},{ 936, 5589}, - { 986, 5790},{ 1030, 5960},{ 1074, 6119},{ 1172, 6191} - }, - /*Cr qi=20 INTER*/ - { - { 92, 40},{ 70, 
345},{ 55, 658},{ 57, 1034}, - { 69, 1441},{ 84, 1852},{ 94, 2261},{ 98, 2669}, - { 102, 3074},{ 105, 3465},{ 107, 3841},{ 110, 4206}, - { 112, 4562},{ 116, 4915},{ 121, 5260},{ 127, 5591}, - { 134, 5920},{ 142, 6246},{ 153, 6562},{ 163, 6870}, - { 173, 7170},{ 186, 7463},{ 198, 7746},{ 199, 8030} - } - } - }, - { - { - /*Y' qi=21 INTRA*/ - { - { 130, -51},{ 244, 1476},{ 483, 2705},{ 756, 3635}, - { 1013, 4396},{ 1266, 5070},{ 1530, 5647},{ 1806, 6153}, - { 2093, 6600},{ 2411, 6976},{ 2739, 7299},{ 3079, 7534}, - { 3422, 7744},{ 3738, 7987},{ 4032, 8274},{ 4348, 8533}, - { 4675, 8721},{ 4989, 8909},{ 5291, 9051},{ 5577, 9111}, - { 5879, 9163},{ 6190, 9228},{ 6506, 9286},{ 6899, 9295} - }, - /*Y' qi=21 INTER*/ - { - { 64, -56},{ 55, 1341},{ 119, 2859},{ 174, 4324}, - { 223, 5640},{ 258, 6880},{ 295, 8096},{ 359, 9246}, - { 460,10302},{ 595,11268},{ 778,12131},{ 1032,12857}, - { 1387,13385},{ 1850,13683},{ 2399,13774},{ 2976,13729}, - { 3527,13619},{ 4034,13504},{ 4492,13401},{ 4912,13291}, - { 5298,13209},{ 5648,13137},{ 5974,13046},{ 6308,12977} - } - }, - { - /*Cb qi=21 INTRA*/ - { - { 4, 3},{ 64, 367},{ 114, 743},{ 141, 1183}, - { 166, 1645},{ 201, 2092},{ 247, 2502},{ 299, 2856}, - { 352, 3158},{ 413, 3412},{ 480, 3642},{ 536, 3893}, - { 588, 4137},{ 637, 4367},{ 678, 4598},{ 725, 4834}, - { 774, 5083},{ 827, 5269},{ 883, 5420},{ 930, 5633}, - { 999, 5829},{ 1057, 5959},{ 1113, 6082},{ 1200, 6265} - }, - /*Cb qi=21 INTER*/ - { - { 109, -8},{ 84, 321},{ 62, 654},{ 54, 1028}, - { 64, 1434},{ 80, 1847},{ 92, 2259},{ 100, 2664}, - { 105, 3060},{ 109, 3445},{ 114, 3815},{ 118, 4172}, - { 122, 4519},{ 126, 4861},{ 128, 5194},{ 133, 5520}, - { 139, 5847},{ 146, 6169},{ 155, 6487},{ 166, 6801}, - { 177, 7114},{ 189, 7423},{ 201, 7729},{ 208, 8035} - } - }, - { - /*Cr qi=21 INTRA*/ - { - { 4, 6},{ 64, 377},{ 111, 766},{ 144, 1225}, - { 174, 1683},{ 206, 2114},{ 248, 2506},{ 302, 2824}, - { 357, 3099},{ 404, 3357},{ 455, 3622},{ 519, 3867}, - { 573, 4098},{ 625, 4331},{ 683, 4571},{ 733, 4802}, - { 793, 4994},{ 863, 5173},{ 926, 5337},{ 978, 5492}, - { 1030, 5685},{ 1079, 5856},{ 1126, 6027},{ 1217, 6159} - }, - /*Cr qi=21 INTER*/ - { - { 82, 29},{ 67, 341},{ 55, 660},{ 58, 1038}, - { 71, 1443},{ 85, 1851},{ 95, 2258},{ 99, 2666}, - { 103, 3069},{ 107, 3456},{ 110, 3826},{ 112, 4188}, - { 114, 4544},{ 118, 4891},{ 124, 5231},{ 132, 5567}, - { 139, 5894},{ 148, 6210},{ 159, 6520},{ 171, 6822}, - { 185, 7111},{ 196, 7403},{ 209, 7691},{ 225, 7945} - } - } - }, - { - { - /*Y' qi=22 INTRA*/ - { - { 128, -45},{ 254, 1463},{ 507, 2662},{ 794, 3562}, - { 1070, 4292},{ 1340, 4941},{ 1622, 5492},{ 1920, 5968}, - { 2229, 6387},{ 2565, 6742},{ 2911, 7047},{ 3263, 7264}, - { 3615, 7464},{ 3944, 7689},{ 4258, 7950},{ 4591, 8183}, - { 4934, 8347},{ 5259, 8517},{ 5573, 8634},{ 5870, 8683}, - { 6186, 8723},{ 6508, 8762},{ 6831, 8801},{ 7232, 8830} - }, - /*Y' qi=22 INTER*/ - { - { 77, -48},{ 57, 1343},{ 122, 2853},{ 180, 4299}, - { 231, 5597},{ 269, 6826},{ 314, 8025},{ 393, 9150}, - { 512,10179},{ 673,11103},{ 894,11908},{ 1207,12542}, - { 1635,12956},{ 2166,13148},{ 2755,13167},{ 3345,13088}, - { 3895,12966},{ 4386,12848},{ 4832,12746},{ 5252,12647}, - { 5634,12563},{ 5978,12497},{ 6299,12412},{ 6633,12338} - } - }, - { - /*Cb qi=22 INTRA*/ - { - { 4, 3},{ 66, 367},{ 122, 744},{ 153, 1182}, - { 177, 1640},{ 213, 2080},{ 263, 2475},{ 323, 2811}, - { 382, 3103},{ 451, 3346},{ 522, 3568},{ 581, 3814}, - { 633, 4054},{ 674, 4288},{ 719, 4523},{ 768, 4756}, - { 823, 4979},{ 883, 5162},{ 937, 5325},{ 996, 5510}, - { 
1070, 5687},{ 1129, 5807},{ 1193, 5929},{ 1311, 6099} - }, - /*Cb qi=22 INTER*/ - { - { 107, -5},{ 83, 322},{ 61, 653},{ 55, 1030}, - { 66, 1436},{ 81, 1845},{ 94, 2253},{ 102, 2656}, - { 107, 3050},{ 111, 3435},{ 115, 3804},{ 119, 4158}, - { 124, 4501},{ 128, 4835},{ 132, 5164},{ 138, 5490}, - { 146, 5812},{ 154, 6128},{ 163, 6442},{ 174, 6754}, - { 188, 7060},{ 205, 7361},{ 219, 7662},{ 233, 7953} - } - }, - { - /*Cr qi=22 INTRA*/ - { - { 4, 6},{ 67, 378},{ 118, 767},{ 151, 1222}, - { 182, 1675},{ 221, 2097},{ 269, 2476},{ 329, 2774}, - { 389, 3039},{ 444, 3292},{ 500, 3545},{ 560, 3788}, - { 615, 4020},{ 671, 4251},{ 734, 4484},{ 781, 4712}, - { 850, 4887},{ 925, 5060},{ 981, 5229},{ 1031, 5369}, - { 1092, 5549},{ 1148, 5715},{ 1200, 5861},{ 1291, 5943} - }, - /*Cr qi=22 INTER*/ - { - { 88, 34},{ 69, 340},{ 57, 657},{ 60, 1039}, - { 73, 1445},{ 87, 1851},{ 96, 2257},{ 100, 2662}, - { 103, 3058},{ 107, 3442},{ 111, 3812},{ 115, 4172}, - { 118, 4524},{ 123, 4864},{ 129, 5199},{ 136, 5531}, - { 145, 5855},{ 156, 6168},{ 170, 6468},{ 184, 6765}, - { 193, 7066},{ 207, 7353},{ 222, 7628},{ 230, 7900} - } - } - }, - { - { - /*Y' qi=23 INTRA*/ - { - { 126, -40},{ 257, 1458},{ 521, 2636},{ 825, 3501}, - { 1111, 4207},{ 1391, 4842},{ 1684, 5385},{ 1992, 5858}, - { 2311, 6277},{ 2653, 6626},{ 3005, 6929},{ 3366, 7134}, - { 3729, 7311},{ 4071, 7526},{ 4396, 7770},{ 4734, 7986}, - { 5086, 8131},{ 5421, 8286},{ 5735, 8404},{ 6033, 8456}, - { 6357, 8486},{ 6682, 8525},{ 7003, 8573},{ 7387, 8604} - }, - /*Y' qi=23 INTER*/ - { - { 64, -57},{ 60, 1345},{ 124, 2853},{ 185, 4284}, - { 239, 5565},{ 282, 6783},{ 336, 7967},{ 429, 9069}, - { 568,10063},{ 758,10943},{ 1028,11679},{ 1407,12216}, - { 1909,12520},{ 2502,12616},{ 3126,12573},{ 3722,12461}, - { 4258,12344},{ 4742,12236},{ 5185,12136},{ 5590,12052}, - { 5970,11980},{ 6315,11901},{ 6631,11826},{ 6954,11769} - } - }, - { - /*Cb qi=23 INTRA*/ - { - { 3, 3},{ 70, 367},{ 124, 744},{ 151, 1182}, - { 181, 1637},{ 222, 2071},{ 276, 2460},{ 343, 2785}, - { 403, 3072},{ 468, 3317},{ 542, 3534},{ 605, 3773}, - { 659, 4009},{ 703, 4243},{ 747, 4479},{ 795, 4707}, - { 852, 4923},{ 908, 5105},{ 972, 5254},{ 1043, 5423}, - { 1118, 5594},{ 1172, 5731},{ 1240, 5853},{ 1365, 6005} - }, - /*Cb qi=23 INTER*/ - { - { 109, -10},{ 87, 325},{ 63, 650},{ 57, 1031}, - { 67, 1439},{ 83, 1847},{ 96, 2253},{ 103, 2652}, - { 109, 3041},{ 114, 3421},{ 117, 3789},{ 122, 4141}, - { 128, 4480},{ 134, 4811},{ 139, 5138},{ 144, 5463}, - { 152, 5781},{ 161, 6096},{ 174, 6404},{ 185, 6714}, - { 198, 7023},{ 216, 7320},{ 233, 7621},{ 245, 7935} - } - }, - { - /*Cr qi=23 INTRA*/ - { - { 5, 6},{ 70, 379},{ 122, 768},{ 155, 1222}, - { 187, 1671},{ 231, 2088},{ 283, 2459},{ 346, 2750}, - { 411, 3009},{ 465, 3261},{ 523, 3509},{ 585, 3746}, - { 639, 3980},{ 695, 4219},{ 754, 4449},{ 803, 4671}, - { 873, 4840},{ 953, 5001},{ 1015, 5156},{ 1071, 5286}, - { 1137, 5464},{ 1191, 5629},{ 1249, 5782},{ 1359, 5885} - }, - /*Cr qi=23 INTER*/ - { - { 84, 29},{ 69, 343},{ 58, 660},{ 62, 1041}, - { 75, 1448},{ 88, 1853},{ 97, 2258},{ 102, 2659}, - { 105, 3050},{ 108, 3430},{ 113, 3799},{ 116, 4155}, - { 121, 4505},{ 126, 4845},{ 132, 5176},{ 142, 5504}, - { 153, 5826},{ 165, 6133},{ 180, 6432},{ 197, 6722}, - { 212, 7005},{ 226, 7287},{ 244, 7555},{ 258, 7828} - } - } - }, - { - { - /*Y' qi=24 INTRA*/ - { - { 125, -34},{ 268, 1444},{ 547, 2590},{ 866, 3422}, - { 1172, 4098},{ 1476, 4702},{ 1790, 5222},{ 2117, 5678}, - { 2453, 6080},{ 2811, 6418},{ 3178, 6700},{ 3552, 6895}, - { 3928, 7055},{ 4286, 
7243},{ 4627, 7477},{ 4981, 7674}, - { 5344, 7802},{ 5683, 7944},{ 6009, 8043},{ 6313, 8082}, - { 6633, 8111},{ 6959, 8151},{ 7280, 8197},{ 7660, 8221} - }, - /*Y' qi=24 INTER*/ - { - { 62, -63},{ 68, 1345},{ 134, 2840},{ 199, 4245}, - { 256, 5508},{ 304, 6715},{ 371, 7880},{ 484, 8950}, - { 652, 9899},{ 892,10709},{ 1238,11334},{ 1722,11722}, - { 2326,11875},{ 2983,11864},{ 3616,11783},{ 4189,11678}, - { 4707,11570},{ 5178,11476},{ 5617,11395},{ 6017,11319}, - { 6380,11252},{ 6720,11185},{ 7044,11126},{ 7377,11118} - } - }, - { - /*Cb qi=24 INTRA*/ - { - { 4, 3},{ 75, 367},{ 132, 745},{ 159, 1182}, - { 187, 1634},{ 230, 2061},{ 289, 2439},{ 361, 2753}, - { 425, 3034},{ 492, 3278},{ 566, 3490},{ 630, 3720}, - { 686, 3956},{ 732, 4190},{ 777, 4420},{ 829, 4637}, - { 894, 4840},{ 958, 5012},{ 1023, 5155},{ 1090, 5326}, - { 1165, 5502},{ 1226, 5622},{ 1299, 5717},{ 1408, 5887} - }, - /*Cb qi=24 INTER*/ - { - { 110, 35},{ 92, 337},{ 70, 651},{ 63, 1033}, - { 74, 1440},{ 91, 1846},{ 102, 2248},{ 109, 2644}, - { 114, 3031},{ 120, 3404},{ 127, 3762},{ 133, 4109}, - { 138, 4445},{ 144, 4772},{ 151, 5094},{ 159, 5411}, - { 168, 5728},{ 180, 6037},{ 195, 6338},{ 210, 6640}, - { 227, 6944},{ 249, 7236},{ 272, 7528},{ 299, 7809} - } - }, - { - /*Cr qi=24 INTRA*/ - { - { 5, 6},{ 72, 380},{ 124, 770},{ 158, 1222}, - { 195, 1668},{ 240, 2079},{ 297, 2438},{ 367, 2715}, - { 433, 2966},{ 488, 3218},{ 549, 3467},{ 609, 3701}, - { 664, 3935},{ 728, 4165},{ 792, 4379},{ 845, 4586}, - { 917, 4744},{ 995, 4898},{ 1063, 5049},{ 1120, 5187}, - { 1190, 5359},{ 1249, 5522},{ 1304, 5672},{ 1397, 5806} - }, - /*Cr qi=24 INTER*/ - { - { 91, 56},{ 73, 353},{ 61, 664},{ 66, 1045}, - { 80, 1449},{ 95, 1851},{ 103, 2250},{ 107, 2648}, - { 111, 3038},{ 116, 3413},{ 120, 3774},{ 124, 4128}, - { 130, 4471},{ 138, 4802},{ 145, 5130},{ 156, 5453}, - { 171, 5764},{ 187, 6061},{ 204, 6355},{ 220, 6643}, - { 238, 6923},{ 254, 7204},{ 275, 7475},{ 289, 7752} - } - } - }, - { - { - /*Y' qi=25 INTRA*/ - { - { 125, -28},{ 285, 1426},{ 582, 2540},{ 917, 3351}, - { 1244, 3997},{ 1569, 4570},{ 1903, 5071},{ 2258, 5498}, - { 2626, 5866},{ 3002, 6182},{ 3382, 6448},{ 3770, 6623}, - { 4162, 6760},{ 4528, 6934},{ 4882, 7144},{ 5249, 7328}, - { 5610, 7453},{ 5958, 7578},{ 6291, 7672},{ 6597, 7708}, - { 6928, 7715},{ 7258, 7737},{ 7575, 7781},{ 7950, 7829} - }, - /*Y' qi=25 INTER*/ - { - { 64, -16},{ 72, 1348},{ 139, 2832},{ 206, 4218}, - { 268, 5465},{ 322, 6659},{ 403, 7803},{ 540, 8838}, - { 747, 9734},{ 1044,10465},{ 1473,10981},{ 2048,11249}, - { 2717,11311},{ 3397,11257},{ 4025,11161},{ 4589,11052}, - { 5099,10947},{ 5560,10859},{ 5989,10786},{ 6389,10717}, - { 6753,10652},{ 7078,10592},{ 7389,10535},{ 7697,10460} - } - }, - { - /*Cb qi=25 INTRA*/ - { - { 3, 3},{ 78, 368},{ 133, 745},{ 159, 1180}, - { 193, 1627},{ 242, 2046},{ 304, 2411},{ 381, 2714}, - { 456, 2983},{ 527, 3224},{ 598, 3437},{ 667, 3655}, - { 726, 3888},{ 776, 4117},{ 826, 4333},{ 883, 4543}, - { 954, 4727},{ 1019, 4878},{ 1095, 5014},{ 1171, 5187}, - { 1255, 5342},{ 1319, 5458},{ 1396, 5546},{ 1536, 5678} - }, - /*Cb qi=25 INTER*/ - { - { 117, 32},{ 89, 342},{ 67, 660},{ 64, 1037}, - { 77, 1441},{ 93, 1845},{ 105, 2243},{ 113, 2633}, - { 120, 3016},{ 125, 3387},{ 131, 3739},{ 137, 4080}, - { 144, 4416},{ 152, 4741},{ 160, 5057},{ 169, 5369}, - { 180, 5680},{ 193, 5990},{ 209, 6294},{ 227, 6594}, - { 249, 6888},{ 269, 7180},{ 294, 7467},{ 317, 7768} - } - }, - { - /*Cr qi=25 INTRA*/ - { - { 6, 6},{ 74, 380},{ 129, 770},{ 165, 1220}, - { 201, 1658},{ 253, 2061},{ 315, 
2410},{ 388, 2676}, - { 462, 2920},{ 523, 3166},{ 584, 3404},{ 647, 3637}, - { 701, 3870},{ 769, 4086},{ 838, 4296},{ 898, 4491}, - { 980, 4627},{ 1065, 4759},{ 1126, 4920},{ 1187, 5058}, - { 1283, 5180},{ 1347, 5332},{ 1404, 5475},{ 1527, 5534} - }, - /*Cr qi=25 INTER*/ - { - { 92, 41},{ 75, 347},{ 64, 664},{ 70, 1045}, - { 85, 1448},{ 98, 1849},{ 105, 2245},{ 110, 2637}, - { 115, 3023},{ 120, 3395},{ 126, 3753},{ 131, 4102}, - { 136, 4439},{ 145, 4768},{ 156, 5094},{ 168, 5410}, - { 184, 5717},{ 203, 6010},{ 221, 6300},{ 239, 6577}, - { 262, 6847},{ 282, 7123},{ 303, 7390},{ 322, 7665} - } - } - }, - { - { - /*Y' qi=26 INTRA*/ - { - { 130, -24},{ 292, 1423},{ 594, 2525},{ 943, 3307}, - { 1289, 3921},{ 1633, 4467},{ 1991, 4943},{ 2368, 5348}, - { 2753, 5696},{ 3148, 5991},{ 3545, 6247},{ 3942, 6415}, - { 4342, 6535},{ 4726, 6690},{ 5093, 6883},{ 5466, 7047}, - { 5840, 7159},{ 6202, 7274},{ 6545, 7351},{ 6855, 7375}, - { 7186, 7384},{ 7517, 7416},{ 7840, 7447},{ 8238, 7450} - }, - /*Y' qi=26 INTER*/ - { - { 52, 16},{ 75, 1336},{ 143, 2815},{ 213, 4191}, - { 278, 5427},{ 339, 6611},{ 436, 7734},{ 600, 8732}, - { 843, 9579},{ 1195,10243},{ 1702,10660},{ 2355,10825}, - { 3070,10820},{ 3755,10743},{ 4372,10643},{ 4925,10538}, - { 5426,10440},{ 5882,10354},{ 6296,10290},{ 6686,10224}, - { 7049,10163},{ 7380,10113},{ 7672,10062},{ 7937,10021} - } - }, - { - /*Cb qi=26 INTRA*/ - { - { 4, 3},{ 79, 368},{ 138, 745},{ 167, 1180}, - { 200, 1623},{ 252, 2034},{ 322, 2389},{ 403, 2682}, - { 480, 2941},{ 558, 3176},{ 631, 3393},{ 700, 3608}, - { 766, 3825},{ 819, 4046},{ 868, 4265},{ 926, 4472}, - { 1002, 4645},{ 1070, 4800},{ 1151, 4924},{ 1242, 5063}, - { 1325, 5221},{ 1393, 5338},{ 1464, 5431},{ 1595, 5559} - }, - /*Cb qi=26 INTER*/ - { - { 98, 33},{ 83, 343},{ 65, 662},{ 65, 1037}, - { 80, 1437},{ 96, 1839},{ 107, 2238},{ 115, 2628}, - { 122, 3007},{ 128, 3373},{ 134, 3722},{ 142, 4060}, - { 149, 4390},{ 158, 4713},{ 167, 5029},{ 178, 5341}, - { 191, 5647},{ 208, 5948},{ 227, 6244},{ 247, 6539}, - { 269, 6833},{ 295, 7114},{ 328, 7388},{ 369, 7658} - } - }, - { - /*Cr qi=26 INTRA*/ - { - { 5, 6},{ 75, 380},{ 133, 769},{ 172, 1217}, - { 212, 1652},{ 266, 2048},{ 333, 2384},{ 412, 2643}, - { 490, 2880},{ 552, 3124},{ 616, 3365},{ 681, 3594}, - { 739, 3816},{ 810, 4024},{ 880, 4224},{ 945, 4405}, - { 1029, 4538},{ 1114, 4674},{ 1183, 4822},{ 1254, 4946}, - { 1346, 5063},{ 1417, 5201},{ 1478, 5345},{ 1597, 5411} - }, - /*Cr qi=26 INTER*/ - { - { 97, 29},{ 75, 342},{ 62, 667},{ 70, 1047}, - { 87, 1447},{ 100, 1846},{ 107, 2242},{ 113, 2633}, - { 118, 3016},{ 123, 3382},{ 128, 3737},{ 135, 4082}, - { 142, 4417},{ 151, 4746},{ 162, 5066},{ 176, 5377}, - { 194, 5679},{ 217, 5963},{ 239, 6244},{ 260, 6522}, - { 284, 6789},{ 309, 7052},{ 335, 7313},{ 355, 7582} + { 0, 741},{ 9, 743},{ 16, 1034},{ 26, 1385}, + { 39, 1741},{ 48, 2090},{ 56, 2459},{ 64, 2850}, + { 72, 3242},{ 81, 3622},{ 89, 3980},{ 98, 4323}, + { 104, 4667},{ 110, 5005},{ 118, 5337},{ 126, 5675}, + { 137, 5998},{ 146, 6311},{ 156, 6621},{ 170, 6914}, + { 181, 7205},{ 196, 7490},{ 203, 7779},{ 232, 8012} } } }, @@ -1732,557 +217,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ { /*Y' qi=27 INTRA*/ { - { 118, -10},{ 308, 1404},{ 630, 2473},{ 997, 3227}, - { 1360, 3819},{ 1719, 4354},{ 2086, 4829},{ 2470, 5233}, - { 2863, 5576},{ 3267, 5870},{ 3677, 6117},{ 4085, 6268}, - { 4499, 6376},{ 4888, 6521},{ 5257, 6705},{ 5638, 6865}, - { 6020, 6962},{ 6394, 7056},{ 6744, 7130},{ 7051, 7158}, - { 7386, 7164},{ 7717, 7185},{ 8042, 7209},{ 8444, 7206} 
+ { 121, 378},{ 379, 1464},{ 810, 2335},{ 1447, 2725}, + { 1851, 3194},{ 2311, 3655},{ 2747, 4081},{ 3211, 4393}, + { 3640, 4672},{ 4056, 4933},{ 4427, 5150},{ 4842, 5259}, + { 5220, 5381},{ 5584, 5443},{ 5925, 5648},{ 6233, 5783}, + { 6547, 5944},{ 6905, 6056},{ 7203, 6181},{ 7526, 6207}, + { 7800, 6330},{ 8175, 6312},{ 8415, 6437},{ 8705, 6459} }, /*Y' qi=27 INTER*/ { - { 54, 19},{ 77, 1333},{ 147, 2806},{ 221, 4166}, - { 290, 5390},{ 360, 6564},{ 474, 7665},{ 664, 8630}, - { 949, 9423},{ 1370,10002},{ 1958,10323},{ 2670,10414}, - { 3406,10375},{ 4086,10285},{ 4691,10182},{ 5233,10085}, - { 5724, 9994},{ 6169, 9918},{ 6582, 9863},{ 6962, 9813}, - { 7316, 9759},{ 7645, 9707},{ 7948, 9660},{ 8262, 9623} + { 48, 199},{ 90, 1458},{ 167, 2824},{ 291, 4050}, + { 434, 5144},{ 638, 6133},{ 901, 7011},{ 1249, 7743}, + { 1726, 8280},{ 2317, 8616},{ 2957, 8789},{ 3561, 8896}, + { 4126, 8936},{ 4646, 8933},{ 5115, 8931},{ 5579, 8890}, + { 6008, 8804},{ 6411, 8744},{ 6774, 8646},{ 7153, 8549}, + { 7475, 8462},{ 7790, 8372},{ 8069, 8280},{ 8299, 8278} } }, { /*Cb qi=27 INTRA*/ { - { 4, 3},{ 79, 368},{ 137, 745},{ 166, 1180}, - { 200, 1622},{ 253, 2030},{ 324, 2381},{ 407, 2671}, - { 487, 2925},{ 567, 3156},{ 640, 3372},{ 712, 3580}, - { 782, 3792},{ 833, 4015},{ 887, 4227},{ 954, 4422}, - { 1031, 4592},{ 1103, 4738},{ 1187, 4856},{ 1280, 4990}, - { 1371, 5135},{ 1442, 5244},{ 1520, 5321},{ 1684, 5398} + { 75, 612},{ 117, 751},{ 160, 1068},{ 195, 1406}, + { 240, 1741},{ 305, 2066},{ 364, 2359},{ 454, 2639}, + { 538, 2899},{ 609, 3149},{ 664, 3384},{ 730, 3625}, + { 785, 3860},{ 836, 4094},{ 872, 4312},{ 948, 4507}, + { 1023, 4677},{ 1081, 4843},{ 1165, 4985},{ 1238, 5092}, + { 1316, 5235},{ 1418, 5345},{ 1430, 5478},{ 1505, 5538} }, /*Cb qi=27 INTER*/ { - { 113, 20},{ 90, 338},{ 66, 661},{ 67, 1034}, - { 82, 1438},{ 97, 1842},{ 108, 2238},{ 115, 2624}, - { 123, 3000},{ 130, 3361},{ 138, 3708},{ 146, 4040}, - { 155, 4367},{ 164, 4688},{ 174, 4999},{ 186, 5306}, - { 203, 5609},{ 222, 5908},{ 243, 6202},{ 268, 6494}, - { 295, 6781},{ 326, 7058},{ 367, 7319},{ 420, 7551} + { 16, 637},{ 13, 634},{ 32, 869},{ 46, 1230}, + { 55, 1583},{ 67, 1950},{ 79, 2320},{ 93, 2690}, + { 107, 3052},{ 120, 3399},{ 133, 3733},{ 146, 4054}, + { 162, 4367},{ 175, 4679},{ 191, 4984},{ 211, 5285}, + { 232, 5581},{ 252, 5875},{ 276, 6155},{ 305, 6433}, + { 333, 6706},{ 364, 6967},{ 398, 7244},{ 474, 7394} } }, { /*Cr qi=27 INTRA*/ { - { 5, 6},{ 75, 380},{ 133, 770},{ 173, 1217}, - { 214, 1650},{ 268, 2040},{ 337, 2375},{ 418, 2631}, - { 496, 2862},{ 558, 3104},{ 625, 3346},{ 692, 3571}, - { 753, 3786},{ 825, 3989},{ 896, 4182},{ 969, 4352}, - { 1059, 4479},{ 1144, 4614},{ 1212, 4757},{ 1284, 4871}, - { 1380, 4982},{ 1457, 5125},{ 1528, 5267},{ 1651, 5346} + { 64, 632},{ 107, 763},{ 147, 1054},{ 176, 1411}, + { 255, 1770},{ 324, 2079},{ 411, 2359},{ 475, 2621}, + { 545, 2880},{ 590, 3158},{ 647, 3425},{ 709, 3648}, + { 766, 3878},{ 831, 4082},{ 911, 4260},{ 960, 4493}, + { 1042, 4558},{ 1115, 4760},{ 1200, 4852},{ 1280, 4950}, + { 1327, 5186},{ 1445, 5157},{ 1443, 5431},{ 1518, 5493} }, /*Cr qi=27 INTER*/ { - { 92, 24},{ 74, 341},{ 61, 669},{ 71, 1049}, - { 88, 1448},{ 100, 1849},{ 107, 2243},{ 113, 2631}, - { 119, 3010},{ 125, 3373},{ 131, 3723},{ 137, 4064}, - { 146, 4396},{ 159, 4720},{ 172, 5033},{ 189, 5340}, - { 210, 5636},{ 233, 5920},{ 256, 6197},{ 282, 6465}, - { 310, 6730},{ 332, 7000},{ 359, 7259},{ 385, 7515} - } - } - }, - { - { - /*Y' qi=28 INTRA*/ - { - { 116, -8},{ 314, 1400},{ 640, 2458},{ 1013, 3197}, - { 1386, 
3768},{ 1762, 4279},{ 2151, 4733},{ 2558, 5117}, - { 2970, 5442},{ 3393, 5714},{ 3820, 5935},{ 4243, 6069}, - { 4671, 6161},{ 5074, 6289},{ 5456, 6457},{ 5849, 6598}, - { 6244, 6689},{ 6632, 6777},{ 6984, 6833},{ 7294, 6855}, - { 7625, 6862},{ 7961, 6875},{ 8302, 6890},{ 8720, 6883} - }, - /*Y' qi=28 INTER*/ - { - { 54, 8},{ 81, 1333},{ 154, 2793},{ 231, 4138}, - { 304, 5352},{ 384, 6512},{ 519, 7585},{ 743, 8508}, - { 1082, 9236},{ 1587, 9717},{ 2267, 9928},{ 3034, 9944}, - { 3775, 9878},{ 4438, 9786},{ 5031, 9686},{ 5563, 9601}, - { 6042, 9523},{ 6481, 9456},{ 6890, 9405},{ 7266, 9356}, - { 7614, 9313},{ 7933, 9265},{ 8238, 9220},{ 8545, 9193} - } - }, - { - /*Cb qi=28 INTRA*/ - { - { 3, 3},{ 80, 368},{ 138, 746},{ 168, 1179}, - { 208, 1615},{ 268, 2014},{ 345, 2354},{ 432, 2637}, - { 515, 2884},{ 595, 3108},{ 669, 3323},{ 745, 3533}, - { 818, 3740},{ 876, 3953},{ 932, 4160},{ 1003, 4349}, - { 1088, 4501},{ 1154, 4648},{ 1241, 4768},{ 1349, 4889}, - { 1441, 5023},{ 1524, 5113},{ 1611, 5187},{ 1783, 5283} - }, - /*Cb qi=28 INTER*/ - { - { 117, 29},{ 91, 341},{ 65, 663},{ 68, 1038}, - { 85, 1440},{ 100, 1841},{ 110, 2234},{ 119, 2616}, - { 127, 2985},{ 135, 3342},{ 142, 3685},{ 151, 4015}, - { 162, 4337},{ 174, 4652},{ 186, 4960},{ 201, 5264}, - { 218, 5567},{ 239, 5863},{ 266, 6149},{ 295, 6434}, - { 328, 6715},{ 371, 6976},{ 409, 7239},{ 460, 7477} - } - }, - { - /*Cr qi=28 INTRA*/ - { - { 6, 7},{ 79, 381},{ 138, 771},{ 178, 1215}, - { 222, 1644},{ 285, 2026},{ 359, 2347},{ 441, 2597}, - { 521, 2827},{ 588, 3066},{ 655, 3303},{ 725, 3523}, - { 791, 3728},{ 870, 3920},{ 950, 4103},{ 1030, 4265}, - { 1121, 4388},{ 1198, 4520},{ 1266, 4659},{ 1356, 4759}, - { 1461, 4865},{ 1540, 4993},{ 1619, 5115},{ 1786, 5160} - }, - /*Cr qi=28 INTER*/ - { - { 96, 18},{ 78, 340},{ 66, 672},{ 74, 1051}, - { 90, 1450},{ 103, 1845},{ 110, 2235},{ 116, 2619}, - { 122, 2995},{ 129, 3356},{ 137, 3702},{ 146, 4038}, - { 156, 4365},{ 168, 4684},{ 182, 4995},{ 203, 5297}, - { 227, 5588},{ 253, 5866},{ 282, 6131},{ 311, 6394}, - { 339, 6664},{ 366, 6918},{ 400, 7171},{ 424, 7450} - } - } - }, - { - { - /*Y' qi=29 INTRA*/ - { - { 112, 7},{ 334, 1382},{ 681, 2410},{ 1081, 3112}, - { 1484, 3650},{ 1894, 4128},{ 2316, 4547},{ 2749, 4905}, - { 3188, 5208},{ 3634, 5458},{ 4079, 5666},{ 4517, 5791}, - { 4952, 5870},{ 5359, 5983},{ 5754, 6137},{ 6165, 6268}, - { 6568, 6351},{ 6958, 6423},{ 7320, 6471},{ 7638, 6490}, - { 7979, 6490},{ 8313, 6499},{ 8651, 6517},{ 9085, 6499} - }, - /*Y' qi=29 INTER*/ - { - { 55, 15},{ 85, 1336},{ 160, 2780},{ 242, 4104}, - { 323, 5302},{ 418, 6443},{ 586, 7480},{ 859, 8342}, - { 1278, 8982},{ 1888, 9347},{ 2658, 9457},{ 3457, 9425}, - { 4192, 9343},{ 4842, 9247},{ 5417, 9162},{ 5935, 9086}, - { 6404, 9011},{ 6841, 8952},{ 7241, 8907},{ 7609, 8867}, - { 7953, 8832},{ 8267, 8792},{ 8562, 8740},{ 8836, 8701} - } - }, - { - /*Cb qi=29 INTRA*/ - { - { 5, 3},{ 84, 368},{ 144, 746},{ 176, 1175}, - { 219, 1604},{ 285, 1991},{ 372, 2318},{ 462, 2591}, - { 546, 2833},{ 628, 3058},{ 704, 3274},{ 788, 3473}, - { 870, 3664},{ 935, 3865},{ 995, 4059},{ 1072, 4239}, - { 1167, 4388},{ 1248, 4518},{ 1334, 4634},{ 1429, 4765}, - { 1536, 4884},{ 1628, 4964},{ 1716, 5038},{ 1885, 5128} - }, - /*Cb qi=29 INTER*/ - { - { 126, 25},{ 95, 340},{ 69, 662},{ 71, 1039}, - { 88, 1440},{ 102, 1839},{ 113, 2227},{ 122, 2604}, - { 132, 2969},{ 141, 3320},{ 151, 3659},{ 161, 3985}, - { 172, 4301},{ 186, 4612},{ 200, 4917},{ 219, 5213}, - { 241, 5509},{ 265, 5800},{ 296, 6081},{ 329, 6360}, - { 369, 6633},{ 414, 6899},{ 465, 
7148},{ 520, 7387} - } - }, - { - /*Cr qi=29 INTRA*/ - { - { 6, 7},{ 82, 382},{ 142, 772},{ 185, 1211}, - { 233, 1632},{ 303, 2000},{ 388, 2306},{ 475, 2550}, - { 556, 2779},{ 627, 3007},{ 707, 3237},{ 778, 3459}, - { 843, 3654},{ 927, 3834},{ 1012, 4012},{ 1101, 4152}, - { 1197, 4262},{ 1275, 4399},{ 1359, 4511},{ 1455, 4596}, - { 1562, 4708},{ 1644, 4833},{ 1719, 4954},{ 1888, 4988} - }, - /*Cr qi=29 INTER*/ - { - { 101, 28},{ 81, 343},{ 67, 673},{ 75, 1053}, - { 93, 1450},{ 106, 1844},{ 113, 2230},{ 119, 2610}, - { 127, 2980},{ 135, 3334},{ 143, 3676},{ 153, 4007}, - { 165, 4330},{ 180, 4645},{ 201, 4951},{ 224, 5243}, - { 253, 5522},{ 284, 5794},{ 314, 6060},{ 345, 6322}, - { 381, 6578},{ 419, 6828},{ 455, 7073},{ 495, 7316} - } - } - }, - { - { - /*Y' qi=30 INTRA*/ - { - { 112, 8},{ 335, 1380},{ 682, 2401},{ 1083, 3093}, - { 1489, 3619},{ 1902, 4092},{ 2332, 4511},{ 2777, 4865}, - { 3231, 5156},{ 3693, 5394},{ 4153, 5585},{ 4605, 5689}, - { 5049, 5764},{ 5468, 5871},{ 5875, 6004},{ 6295, 6120}, - { 6706, 6201},{ 7099, 6273},{ 7461, 6311},{ 7785, 6320}, - { 8128, 6322},{ 8469, 6331},{ 8806, 6342},{ 9220, 6338} - }, - /*Y' qi=30 INTER*/ - { - { 58, 8},{ 90, 1340},{ 169, 2771},{ 257, 4079}, - { 345, 5266},{ 459, 6387},{ 660, 7383},{ 990, 8180}, - { 1496, 8726},{ 2203, 8992},{ 3029, 9038},{ 3833, 8984}, - { 4549, 8900},{ 5183, 8813},{ 5745, 8735},{ 6250, 8674}, - { 6715, 8619},{ 7138, 8565},{ 7529, 8528},{ 7899, 8495}, - { 8234, 8465},{ 8550, 8429},{ 8856, 8395},{ 9160, 8374} - } - }, - { - /*Cb qi=30 INTRA*/ - { - { 7, 3},{ 88, 369},{ 149, 747},{ 185, 1175}, - { 232, 1599},{ 304, 1976},{ 392, 2293},{ 486, 2557}, - { 573, 2797},{ 656, 3027},{ 735, 3243},{ 819, 3442}, - { 903, 3629},{ 966, 3828},{ 1025, 4027},{ 1105, 4204}, - { 1201, 4343},{ 1282, 4469},{ 1379, 4575},{ 1486, 4689}, - { 1588, 4813},{ 1678, 4900},{ 1767, 4969},{ 1911, 5080} - }, - /*Cb qi=30 INTER*/ - { - { 120, 23},{ 96, 336},{ 72, 661},{ 75, 1043}, - { 91, 1441},{ 105, 1837},{ 117, 2221},{ 127, 2592}, - { 137, 2953},{ 148, 3301},{ 159, 3635},{ 170, 3959}, - { 184, 4271},{ 199, 4578},{ 216, 4879},{ 238, 5175}, - { 262, 5466},{ 294, 5750},{ 332, 6027},{ 373, 6298}, - { 421, 6559},{ 473, 6805},{ 526, 7053},{ 587, 7298} - } - }, - { - /*Cr qi=30 INTRA*/ - { - { 10, 7},{ 89, 384},{ 147, 773},{ 192, 1211}, - { 245, 1627},{ 322, 1984},{ 412, 2280},{ 501, 2520}, - { 583, 2750},{ 654, 2982},{ 736, 3207},{ 810, 3419}, - { 873, 3614},{ 957, 3794},{ 1048, 3965},{ 1139, 4102}, - { 1237, 4208},{ 1327, 4328},{ 1408, 4448},{ 1496, 4545}, - { 1604, 4652},{ 1699, 4760},{ 1780, 4877},{ 1937, 4942} - }, - /*Cr qi=30 INTER*/ - { - { 115, 26},{ 89, 342},{ 70, 672},{ 79, 1055}, - { 96, 1451},{ 108, 1841},{ 116, 2222},{ 124, 2599}, - { 132, 2965},{ 141, 3316},{ 151, 3655},{ 163, 3984}, - { 178, 4301},{ 197, 4609},{ 219, 4909},{ 247, 5195}, - { 280, 5469},{ 317, 5734},{ 351, 5991},{ 383, 6248}, - { 423, 6500},{ 467, 6744},{ 502, 6995},{ 558, 7226} - } - } - }, - { - { - /*Y' qi=31 INTRA*/ - { - { 116, 20},{ 359, 1361},{ 732, 2350},{ 1162, 3010}, - { 1597, 3507},{ 2042, 3950},{ 2503, 4339},{ 2974, 4670}, - { 3446, 4951},{ 3922, 5179},{ 4394, 5357},{ 4858, 5454}, - { 5313, 5519},{ 5734, 5626},{ 6154, 5755},{ 6585, 5859}, - { 7004, 5928},{ 7408, 5998},{ 7775, 6039},{ 8102, 6048}, - { 8442, 6051},{ 8790, 6054},{ 9136, 6057},{ 9554, 6041} - }, - /*Y' qi=31 INTER*/ - { - { 53, 12},{ 90, 1340},{ 169, 2765},{ 259, 4062}, - { 353, 5236},{ 483, 6340},{ 713, 7305},{ 1086, 8059}, - { 1651, 8548},{ 2423, 8751},{ 3288, 8754},{ 4106, 8674}, - { 4827, 8572},{ 
5451, 8482},{ 6007, 8407},{ 6514, 8344}, - { 6970, 8282},{ 7397, 8225},{ 7795, 8193},{ 8159, 8161}, - { 8498, 8120},{ 8814, 8093},{ 9127, 8066},{ 9432, 8040} - } - }, - { - /*Cb qi=31 INTRA*/ - { - { 7, 3},{ 88, 369},{ 149, 746},{ 185, 1173}, - { 234, 1595},{ 308, 1967},{ 399, 2278},{ 494, 2537}, - { 583, 2774},{ 669, 2997},{ 755, 3204},{ 847, 3390}, - { 936, 3569},{ 1008, 3759},{ 1078, 3942},{ 1162, 4104}, - { 1262, 4238},{ 1352, 4364},{ 1442, 4470},{ 1557, 4567}, - { 1676, 4674},{ 1759, 4781},{ 1850, 4853},{ 2043, 4897} - }, - /*Cb qi=31 INTER*/ - { - { 121, 23},{ 96, 335},{ 72, 660},{ 74, 1043}, - { 90, 1440},{ 105, 1834},{ 116, 2217},{ 127, 2586}, - { 138, 2945},{ 148, 3293},{ 159, 3626},{ 172, 3945}, - { 185, 4256},{ 202, 4559},{ 223, 4856},{ 245, 5150}, - { 272, 5440},{ 306, 5719},{ 346, 5989},{ 391, 6253}, - { 443, 6511},{ 510, 6743},{ 583, 6965},{ 651, 7182} - } - }, - { - /*Cr qi=31 INTRA*/ - { - { 10, 7},{ 88, 384},{ 147, 773},{ 192, 1209}, - { 247, 1622},{ 326, 1974},{ 417, 2262},{ 509, 2500}, - { 596, 2726},{ 670, 2949},{ 754, 3170},{ 836, 3370}, - { 912, 3548},{ 999, 3724},{ 1093, 3888},{ 1198, 4000}, - { 1304, 4095},{ 1384, 4230},{ 1470, 4347},{ 1577, 4422}, - { 1696, 4513},{ 1798, 4620},{ 1869, 4746},{ 1991, 4798} - }, - /*Cr qi=31 INTER*/ - { - { 113, 32},{ 88, 345},{ 69, 674},{ 79, 1055}, - { 96, 1451},{ 108, 1839},{ 115, 2218},{ 123, 2592}, - { 132, 2957},{ 141, 3308},{ 151, 3643},{ 163, 3968}, - { 179, 4285},{ 200, 4590},{ 225, 4886},{ 254, 5169}, - { 291, 5436},{ 330, 5696},{ 368, 5951},{ 409, 6200}, - { 452, 6448},{ 493, 6695},{ 536, 6940},{ 571, 7204} - } - } - }, - { - { - /*Y' qi=32 INTRA*/ - { - { 123, 26},{ 370, 1356},{ 756, 2321},{ 1211, 2944}, - { 1674, 3408},{ 2148, 3826},{ 2639, 4193},{ 3138, 4504}, - { 3634, 4765},{ 4133, 4973},{ 4625, 5137},{ 5101, 5225}, - { 5567, 5274},{ 6002, 5363},{ 6437, 5482},{ 6885, 5566}, - { 7312, 5625},{ 7723, 5686},{ 8101, 5721},{ 8429, 5732}, - { 8769, 5728},{ 9120, 5726},{ 9472, 5723},{ 9918, 5700} - }, - /*Y' qi=32 INTER*/ - { - { 54, -3},{ 95, 1343},{ 179, 2750},{ 276, 4027}, - { 382, 5185},{ 543, 6256},{ 830, 7161},{ 1301, 7815}, - { 2003, 8172},{ 2883, 8266},{ 3779, 8217},{ 4578, 8127}, - { 5274, 8035},{ 5886, 7952},{ 6430, 7887},{ 6929, 7835}, - { 7380, 7779},{ 7796, 7737},{ 8190, 7705},{ 8552, 7672}, - { 8896, 7640},{ 9210, 7612},{ 9510, 7589},{ 9746, 7552} - } - }, - { - /*Cb qi=32 INTRA*/ - { - { 6, 3},{ 89, 369},{ 153, 746},{ 193, 1167}, - { 247, 1577},{ 330, 1935},{ 429, 2236},{ 528, 2494}, - { 620, 2732},{ 712, 2948},{ 801, 3146},{ 898, 3325}, - { 999, 3489},{ 1078, 3664},{ 1155, 3832},{ 1251, 3985}, - { 1360, 4115},{ 1451, 4236},{ 1549, 4338},{ 1667, 4433}, - { 1797, 4522},{ 1891, 4613},{ 1989, 4687},{ 2162, 4776} - }, - /*Cb qi=32 INTER*/ - { - { 116, -1},{ 98, 321},{ 80, 656},{ 80, 1042}, - { 96, 1438},{ 110, 1827},{ 122, 2205},{ 133, 2570}, - { 144, 2925},{ 157, 3268},{ 170, 3597},{ 185, 3911}, - { 202, 4216},{ 221, 4516},{ 244, 4809},{ 273, 5096}, - { 308, 5376},{ 350, 5644},{ 401, 5907},{ 459, 6160}, - { 520, 6401},{ 592, 6630},{ 676, 6837},{ 758, 7050} - } - }, - { - /*Cr qi=32 INTRA*/ - { - { 12, 7},{ 91, 386},{ 152, 773},{ 201, 1202}, - { 261, 1603},{ 347, 1942},{ 447, 2223},{ 540, 2460}, - { 626, 2684},{ 711, 2901},{ 801, 3115},{ 887, 3312}, - { 969, 3480},{ 1068, 3633},{ 1176, 3779},{ 1283, 3885}, - { 1392, 3969},{ 1485, 4090},{ 1573, 4206},{ 1686, 4274}, - { 1813, 4354},{ 1911, 4459},{ 2004, 4563},{ 2162, 4590} - }, - /*Cr qi=32 INTER*/ - { - { 129, 5},{ 98, 334},{ 75, 673},{ 84, 1055}, - { 101, 1448},{ 
113, 1832},{ 121, 2206},{ 129, 2577}, - { 140, 2937},{ 151, 3282},{ 163, 3614},{ 179, 3932}, - { 198, 4240},{ 221, 4542},{ 252, 4830},{ 290, 5102}, - { 329, 5364},{ 373, 5618},{ 420, 5864},{ 468, 6105}, - { 513, 6351},{ 564, 6587},{ 624, 6810},{ 697, 7017} - } - } - }, - { - { - /*Y' qi=33 INTRA*/ - { - { 115, 36},{ 388, 1338},{ 791, 2289},{ 1258, 2899}, - { 1732, 3352},{ 2220, 3760},{ 2730, 4117},{ 3244, 4415}, - { 3751, 4662},{ 4261, 4858},{ 4766, 5012},{ 5249, 5094}, - { 5719, 5141},{ 6159, 5225},{ 6597, 5333},{ 7044, 5416}, - { 7474, 5472},{ 7893, 5531},{ 8268, 5570},{ 8591, 5580}, - { 8931, 5578},{ 9283, 5579},{ 9634, 5582},{10067, 5560} - }, - /*Y' qi=33 INTER*/ - { - { 65, -14},{ 102, 1345},{ 190, 2736},{ 294, 3999}, - { 411, 5146},{ 597, 6192},{ 934, 7045},{ 1488, 7622}, - { 2281, 7895},{ 3213, 7937},{ 4108, 7871},{ 4883, 7784}, - { 5556, 7709},{ 6150, 7643},{ 6685, 7585},{ 7176, 7539}, - { 7620, 7502},{ 8034, 7466},{ 8427, 7435},{ 8793, 7409}, - { 9136, 7386},{ 9446, 7364},{ 9743, 7339},{10025, 7303} - } - }, - { - /*Cb qi=33 INTRA*/ - { - { 5, 3},{ 92, 369},{ 159, 746},{ 203, 1163}, - { 263, 1564},{ 353, 1911},{ 458, 2204},{ 557, 2460}, - { 650, 2697},{ 744, 2913},{ 836, 3110},{ 934, 3292}, - { 1036, 3454},{ 1125, 3616},{ 1204, 3781},{ 1298, 3932}, - { 1410, 4058},{ 1507, 4170},{ 1606, 4265},{ 1725, 4358}, - { 1853, 4445},{ 1955, 4535},{ 2067, 4597},{ 2258, 4663} - }, - /*Cb qi=33 INTER*/ - { - { 109, 37},{ 94, 343},{ 81, 662},{ 85, 1042}, - { 102, 1436},{ 116, 1823},{ 128, 2195},{ 141, 2554}, - { 154, 2906},{ 167, 3246},{ 183, 3570},{ 202, 3881}, - { 220, 4185},{ 241, 4482},{ 268, 4772},{ 302, 5053}, - { 341, 5328},{ 388, 5592},{ 446, 5846},{ 507, 6096}, - { 581, 6328},{ 670, 6534},{ 762, 6731},{ 842, 6922} - } - }, - { - /*Cr qi=33 INTRA*/ - { - { 11, 7},{ 93, 387},{ 158, 774},{ 211, 1197}, - { 278, 1589},{ 372, 1917},{ 475, 2191},{ 569, 2429}, - { 658, 2655},{ 744, 2868},{ 835, 3083},{ 926, 3271}, - { 1010, 3430},{ 1110, 3586},{ 1224, 3724},{ 1336, 3826}, - { 1449, 3908},{ 1547, 4021},{ 1636, 4136},{ 1751, 4200}, - { 1886, 4277},{ 1977, 4384},{ 2070, 4474},{ 2232, 4510} - }, - /*Cr qi=33 INTER*/ - { - { 77, 9},{ 90, 347},{ 80, 674},{ 91, 1053}, - { 107, 1444},{ 119, 1825},{ 127, 2196},{ 137, 2563}, - { 149, 2919},{ 161, 3259},{ 176, 3588},{ 194, 3905}, - { 217, 4209},{ 246, 4504},{ 280, 4786},{ 320, 5055}, - { 364, 5316},{ 409, 5565},{ 460, 5804},{ 517, 6039}, - { 578, 6264},{ 640, 6489},{ 701, 6721},{ 772, 6948} - } - } - }, - { - { - /*Y' qi=34 INTRA*/ - { - { 124, 40},{ 401, 1333},{ 823, 2262},{ 1318, 2842}, - { 1823, 3265},{ 2339, 3650},{ 2872, 3991},{ 3405, 4274}, - { 3926, 4513},{ 4448, 4704},{ 4961, 4845},{ 5450, 4921}, - { 5925, 4971},{ 6372, 5053},{ 6813, 5160},{ 7264, 5242}, - { 7704, 5291},{ 8124, 5346},{ 8500, 5382},{ 8831, 5384}, - { 9178, 5380},{ 9525, 5387},{ 9869, 5389},{10310, 5356} - }, - /*Y' qi=34 INTER*/ - { - { 64, -17},{ 101, 1344},{ 190, 2730},{ 299, 3981}, - { 430, 5110},{ 648, 6127},{ 1036, 6933},{ 1664, 7445}, - { 2535, 7652},{ 3504, 7653},{ 4402, 7572},{ 5173, 7479}, - { 5843, 7400},{ 6441, 7334},{ 6976, 7280},{ 7464, 7231}, - { 7910, 7189},{ 8332, 7157},{ 8730, 7125},{ 9091, 7103}, - { 9422, 7086},{ 9753, 7061},{10067, 7036},{10316, 7029} - } - }, - { - /*Cb qi=34 INTRA*/ - { - { 5, 3},{ 91, 369},{ 158, 746},{ 204, 1162}, - { 266, 1561},{ 358, 1903},{ 466, 2189},{ 570, 2439}, - { 665, 2671},{ 765, 2880},{ 864, 3069},{ 970, 3238}, - { 1079, 3392},{ 1174, 3545},{ 1265, 3693},{ 1360, 3841}, - { 1471, 3968},{ 1572, 4083},{ 1675, 4181},{ 1804, 4255}, - 
{ 1939, 4332},{ 2048, 4411},{ 2155, 4484},{ 2339, 4584} - }, - /*Cb qi=34 INTER*/ - { - { 99, 44},{ 92, 345},{ 82, 661},{ 86, 1043}, - { 101, 1436},{ 116, 1821},{ 128, 2191},{ 140, 2549}, - { 154, 2898},{ 168, 3235},{ 185, 3556},{ 203, 3865}, - { 224, 4166},{ 248, 4457},{ 278, 4741},{ 315, 5021}, - { 361, 5289},{ 416, 5546},{ 483, 5792},{ 559, 6025}, - { 651, 6237},{ 752, 6432},{ 849, 6626},{ 967, 6790} - } - }, - { - /*Cr qi=34 INTRA*/ - { - { 11, 7},{ 93, 387},{ 158, 773},{ 212, 1195}, - { 282, 1584},{ 378, 1909},{ 483, 2179},{ 578, 2414}, - { 671, 2633},{ 766, 2837},{ 866, 3038},{ 960, 3223}, - { 1049, 3376},{ 1158, 3520},{ 1285, 3644},{ 1400, 3740}, - { 1505, 3828},{ 1616, 3928},{ 1713, 4030},{ 1820, 4104}, - { 1957, 4185},{ 2063, 4280},{ 2160, 4355},{ 2320, 4341} - }, - /*Cr qi=34 INTER*/ - { - { 78, 11},{ 89, 347},{ 79, 674},{ 90, 1053}, - { 106, 1444},{ 117, 1823},{ 127, 2192},{ 137, 2558}, - { 149, 2912},{ 163, 3249},{ 178, 3574},{ 197, 3888}, - { 222, 4189},{ 252, 4481},{ 293, 4755},{ 341, 5013}, - { 386, 5268},{ 436, 5512},{ 498, 5743},{ 563, 5970}, - { 622, 6200},{ 694, 6415},{ 776, 6622},{ 871, 6818} - } - } - }, - { - { - /*Y' qi=35 INTRA*/ - { - { 116, 51},{ 433, 1312},{ 881, 2221},{ 1406, 2771}, - { 1948, 3156},{ 2511, 3501},{ 3085, 3811},{ 3654, 4066}, - { 4212, 4273},{ 4763, 4444},{ 5298, 4572},{ 5799, 4638}, - { 6285, 4678},{ 6747, 4746},{ 7203, 4838},{ 7673, 4905}, - { 8124, 4950},{ 8552, 5003},{ 8938, 5027},{ 9275, 5026}, - { 9628, 5019},{ 9981, 5024},{10331, 5030},{10795, 5000} - }, - /*Y' qi=35 INTER*/ - { - { 71, -10},{ 108, 1348},{ 203, 2710},{ 325, 3938}, - { 485, 5040},{ 766, 6000},{ 1267, 6706},{ 2048, 7089}, - { 3037, 7191},{ 4032, 7146},{ 4903, 7061},{ 5648, 6977}, - { 6301, 6912},{ 6884, 6857},{ 7413, 6812},{ 7898, 6775}, - { 8342, 6739},{ 8764, 6710},{ 9160, 6688},{ 9519, 6668}, - { 9859, 6646},{10190, 6625},{10492, 6612},{10755, 6595} - } - }, - { - /*Cb qi=35 INTRA*/ - { - { 6, 3},{ 95, 369},{ 164, 746},{ 214, 1156}, - { 287, 1542},{ 390, 1869},{ 504, 2143},{ 611, 2388}, - { 712, 2613},{ 822, 2811},{ 937, 2987},{ 1055, 3147}, - { 1174, 3285},{ 1286, 3420},{ 1386, 3560},{ 1488, 3698}, - { 1604, 3814},{ 1714, 3916},{ 1825, 4008},{ 1958, 4088}, - { 2101, 4159},{ 2224, 4226},{ 2339, 4292},{ 2538, 4383} - }, - /*Cb qi=35 INTER*/ - { - { 98, 41},{ 90, 348},{ 86, 665},{ 92, 1042}, - { 108, 1432},{ 122, 1812},{ 136, 2175},{ 151, 2528}, - { 165, 2872},{ 182, 3202},{ 202, 3516},{ 225, 3819}, - { 251, 4112},{ 281, 4398},{ 320, 4675},{ 367, 4944}, - { 421, 5204},{ 493, 5450},{ 579, 5679},{ 672, 5892}, - { 785, 6082},{ 906, 6258},{ 1026, 6432},{ 1153, 6592} - } - }, - { - /*Cr qi=35 INTRA*/ - { - { 12, 7},{ 98, 388},{ 166, 773},{ 226, 1187}, - { 306, 1563},{ 411, 1874},{ 524, 2134},{ 622, 2365}, - { 721, 2577},{ 826, 2768},{ 947, 2946},{ 1066, 3106}, - { 1163, 3250},{ 1274, 3395},{ 1417, 3508},{ 1539, 3590}, - { 1639, 3671},{ 1754, 3765},{ 1865, 3855},{ 1979, 3921}, - { 2127, 3998},{ 2249, 4085},{ 2346, 4172},{ 2473, 4210} - }, - /*Cr qi=35 INTER*/ - { - { 86, 12},{ 94, 354},{ 85, 677},{ 96, 1052}, - { 113, 1439},{ 125, 1811},{ 135, 2177},{ 147, 2537}, - { 160, 2884},{ 177, 3215},{ 195, 3535},{ 219, 3842}, - { 252, 4133},{ 292, 4413},{ 339, 4680},{ 396, 4928}, - { 455, 5169},{ 514, 5408},{ 588, 5626},{ 672, 5835}, - { 750, 6051},{ 837, 6257},{ 943, 6442},{ 1073, 6595} + { 12, 688},{ 11, 660},{ 28, 869},{ 46, 1227}, + { 60, 1598},{ 68, 1954},{ 79, 2318},{ 93, 2693}, + { 108, 3054},{ 123, 3406},{ 138, 3748},{ 151, 4078}, + { 165, 4400},{ 180, 4716},{ 197, 5024},{ 217, 
5314}, + { 243, 5599},{ 275, 5866},{ 301, 6128},{ 327, 6394}, + { 352, 6644},{ 375, 6894},{ 376, 7180},{ 458, 7334} } } }, @@ -2290,557 +279,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ { /*Y' qi=36 INTRA*/ { - { 116, 52},{ 432, 1312},{ 881, 2215},{ 1407, 2759}, - { 1948, 3140},{ 2511, 3484},{ 3090, 3789},{ 3672, 4036}, - { 4243, 4236},{ 4803, 4397},{ 5346, 4517},{ 5856, 4581}, - { 6350, 4614},{ 6821, 4675},{ 7286, 4763},{ 7754, 4832}, - { 8201, 4875},{ 8631, 4922},{ 9015, 4948},{ 9351, 4945}, - { 9706, 4941},{10061, 4948},{10408, 4949},{10878, 4923} + { 156, 263},{ 484, 1370},{ 1174, 2110},{ 1914, 2456}, + { 2601, 2695},{ 3221, 2984},{ 3865, 3284},{ 4450, 3530}, + { 4979, 3739},{ 5470, 3928},{ 5905, 4080},{ 6375, 4200}, + { 6761, 4373},{ 7175, 4429},{ 7615, 4616},{ 8069, 4687}, + { 8417, 4820},{ 8813, 4908},{ 9211, 5001},{ 9508, 5073}, + { 9888, 5133},{10209, 5140},{10529, 5196},{10830, 5173} }, /*Y' qi=36 INTER*/ { - { 63, -16},{ 114, 1332},{ 216, 2690},{ 343, 3914}, - { 515, 5009},{ 829, 5939},{ 1399, 6586},{ 2263, 6901}, - { 3290, 6967},{ 4272, 6920},{ 5115, 6847},{ 5839, 6779}, - { 6478, 6726},{ 7051, 6685},{ 7571, 6649},{ 8050, 6614}, - { 8495, 6587},{ 8908, 6567},{ 9298, 6550},{ 9673, 6530}, - {10005, 6512},{10324, 6499},{10640, 6483},{10936, 6487} + { 68, 151},{ 107, 1413},{ 262, 2665},{ 542, 3715}, + { 946, 4584},{ 1508, 5279},{ 2167, 5829},{ 2968, 6179}, + { 3758, 6392},{ 4481, 6517},{ 5139, 6577},{ 5706, 6636}, + { 6271, 6612},{ 6746, 6585},{ 7216, 6533},{ 7622, 6496}, + { 8045, 6403},{ 8393, 6389},{ 8799, 6272},{ 9062, 6281}, + { 9436, 6184},{ 9637, 6238},{ 9864, 6215},{10147, 6215} } }, { /*Cb qi=36 INTRA*/ { - { 6, 3},{ 98, 370},{ 170, 746},{ 225, 1150}, - { 306, 1527},{ 416, 1845},{ 534, 2116},{ 642, 2363}, - { 743, 2591},{ 851, 2794},{ 964, 2972},{ 1081, 3133}, - { 1198, 3275},{ 1311, 3410},{ 1411, 3547},{ 1519, 3680}, - { 1642, 3789},{ 1750, 3892},{ 1860, 3982},{ 1998, 4054}, - { 2141, 4129},{ 2256, 4204},{ 2372, 4278},{ 2567, 4356} + { 91, 385},{ 138, 613},{ 205, 932},{ 265, 1239}, + { 353, 1549},{ 443, 1839},{ 518, 2104},{ 655, 2341}, + { 764, 2559},{ 876, 2756},{ 967, 2950},{ 1088, 3107}, + { 1184, 3266},{ 1295, 3396},{ 1375, 3548},{ 1502, 3664}, + { 1610, 3764},{ 1731, 3844},{ 1839, 3938},{ 1954, 4016}, + { 2069, 4100},{ 2207, 4167},{ 2274, 4253},{ 2374, 4289} }, /*Cb qi=36 INTER*/ { - { 107, 30},{ 96, 346},{ 88, 667},{ 100, 1039}, - { 115, 1426},{ 128, 1804},{ 142, 2164},{ 158, 2512}, - { 176, 2851},{ 195, 3178},{ 218, 3491},{ 243, 3791}, - { 270, 4084},{ 307, 4365},{ 348, 4638},{ 397, 4908}, - { 464, 5157},{ 545, 5392},{ 635, 5620},{ 734, 5831}, - { 854, 6015},{ 993, 6170},{ 1124, 6327},{ 1234, 6502} + { 59, 18},{ 56, 463},{ 50, 790},{ 76, 1155}, + { 90, 1515},{ 108, 1877},{ 125, 2226},{ 150, 2562}, + { 177, 2890},{ 203, 3203},{ 231, 3501},{ 259, 3789}, + { 289, 4074},{ 325, 4348},{ 367, 4608},{ 418, 4857}, + { 486, 5093},{ 574, 5307},{ 677, 5494},{ 784, 5688}, + { 914, 5844},{ 1033, 6004},{ 1142, 6179},{ 1307, 6220} } }, { /*Cr qi=36 INTRA*/ { - { 12, 7},{ 102, 388},{ 172, 773},{ 239, 1182}, - { 328, 1546},{ 439, 1848},{ 554, 2106},{ 651, 2341}, - { 747, 2561},{ 850, 2757},{ 972, 2934},{ 1086, 3097}, - { 1182, 3245},{ 1302, 3382},{ 1447, 3491},{ 1572, 3567}, - { 1677, 3641},{ 1793, 3733},{ 1899, 3828},{ 2013, 3894}, - { 2163, 3967},{ 2283, 4059},{ 2387, 4142},{ 2559, 4145} + { 87, 376},{ 132, 616},{ 190, 931},{ 268, 1260}, + { 358, 1550},{ 457, 1833},{ 592, 2082},{ 685, 2318}, + { 781, 2548},{ 867, 2757},{ 968, 2953},{ 1080, 3124}, + { 1173, 3255},{ 1282, 
3390},{ 1410, 3477},{ 1528, 3593}, + { 1645, 3612},{ 1766, 3739},{ 1885, 3789},{ 1954, 3892}, + { 2115, 3987},{ 2202, 4052},{ 2280, 4172},{ 2379, 4213} }, /*Cr qi=36 INTER*/ { - { 98, -10},{ 96, 347},{ 89, 676},{ 102, 1048}, - { 118, 1433},{ 130, 1804},{ 141, 2167},{ 154, 2523}, - { 171, 2866},{ 190, 3194},{ 212, 3508},{ 240, 3809}, - { 276, 4099},{ 320, 4377},{ 372, 4638},{ 428, 4887}, - { 492, 5122},{ 560, 5353},{ 638, 5572},{ 725, 5779}, - { 814, 5985},{ 902, 6192},{ 1013, 6377},{ 1155, 6527} - } - } - }, - { - { - /*Y' qi=37 INTRA*/ - { - { 109, 58},{ 445, 1302},{ 927, 2177},{ 1489, 2689}, - { 2053, 3052},{ 2632, 3387},{ 3230, 3683},{ 3830, 3922}, - { 4417, 4114},{ 4992, 4266},{ 5546, 4375},{ 6067, 4430}, - { 6571, 4459},{ 7046, 4516},{ 7513, 4599},{ 7991, 4663}, - { 8445, 4706},{ 8883, 4749},{ 9273, 4771},{ 9612, 4770}, - { 9970, 4765},{10325, 4773},{10672, 4778},{11106, 4758} - }, - /*Y' qi=37 INTER*/ - { - { 56, -14},{ 114, 1333},{ 218, 2683},{ 354, 3894}, - { 550, 4966},{ 916, 5854},{ 1569, 6437},{ 2520, 6685}, - { 3596, 6704},{ 4585, 6635},{ 5424, 6556},{ 6147, 6489}, - { 6787, 6437},{ 7358, 6395},{ 7876, 6358},{ 8361, 6325}, - { 8807, 6294},{ 9229, 6271},{ 9631, 6253},{10002, 6238}, - {10356, 6228},{10678, 6212},{10975, 6197},{11274, 6185} - } - }, - { - /*Cb qi=37 INTRA*/ - { - { 6, 3},{ 99, 370},{ 171, 746},{ 227, 1149}, - { 309, 1522},{ 421, 1836},{ 541, 2104},{ 652, 2347}, - { 757, 2572},{ 871, 2768},{ 989, 2936},{ 1111, 3087}, - { 1238, 3223},{ 1357, 3352},{ 1465, 3486},{ 1576, 3612}, - { 1709, 3705},{ 1828, 3801},{ 1937, 3895},{ 2076, 3967}, - { 2220, 4035},{ 2345, 4104},{ 2466, 4173},{ 2680, 4265} - }, - /*Cb qi=37 INTER*/ - { - { 111, 27},{ 97, 344},{ 87, 667},{ 99, 1038}, - { 115, 1425},{ 128, 1802},{ 143, 2160},{ 159, 2506}, - { 176, 2843},{ 198, 3167},{ 220, 3477},{ 247, 3774}, - { 280, 4061},{ 321, 4338},{ 368, 4608},{ 427, 4867}, - { 501, 5109},{ 595, 5332},{ 701, 5544},{ 818, 5738}, - { 956, 5905},{ 1105, 6066},{ 1248, 6217},{ 1381, 6353} - } - }, - { - /*Cr qi=37 INTRA*/ - { - { 12, 7},{ 102, 388},{ 173, 773},{ 242, 1180}, - { 331, 1541},{ 444, 1839},{ 562, 2095},{ 662, 2326}, - { 763, 2540},{ 871, 2728},{ 1003, 2892},{ 1130, 3045}, - { 1230, 3188},{ 1350, 3321},{ 1503, 3418},{ 1634, 3492}, - { 1737, 3568},{ 1856, 3653},{ 1970, 3744},{ 2091, 3802}, - { 2247, 3871},{ 2371, 3962},{ 2477, 4041},{ 2655, 4052} - }, - /*Cr qi=37 INTER*/ - { - { 89, -9},{ 97, 347},{ 88, 677},{ 102, 1048}, - { 118, 1432},{ 130, 1802},{ 141, 2163},{ 154, 2517}, - { 172, 2857},{ 192, 3181},{ 216, 3494},{ 246, 3793}, - { 286, 4074},{ 337, 4343},{ 395, 4600},{ 464, 4837}, - { 534, 5066},{ 608, 5289},{ 694, 5501},{ 788, 5704}, - { 893, 5901},{ 1010, 6088},{ 1151, 6249},{ 1331, 6374} - } - } - }, - { - { - /*Y' qi=38 INTRA*/ - { - { 107, 65},{ 476, 1286},{ 968, 2148},{ 1548, 2641}, - { 2141, 2979},{ 2757, 3289},{ 3390, 3564},{ 4020, 3784}, - { 4632, 3957},{ 5224, 4097},{ 5794, 4201},{ 6326, 4250}, - { 6828, 4274},{ 7309, 4322},{ 7790, 4401},{ 8271, 4463}, - { 8729, 4498},{ 9165, 4540},{ 9552, 4566},{ 9901, 4560}, - {10266, 4552},{10617, 4563},{10964, 4572},{11393, 4567} - }, - /*Y' qi=38 INTER*/ - { - { 57, -13},{ 118, 1332},{ 233, 2665},{ 386, 3856}, - { 620, 4899},{ 1070, 5722},{ 1849, 6211},{ 2898, 6384}, - { 3989, 6376},{ 4947, 6311},{ 5754, 6249},{ 6454, 6199}, - { 7077, 6161},{ 7640, 6132},{ 8159, 6101},{ 8639, 6076}, - { 9081, 6054},{ 9502, 6037},{ 9900, 6027},{10274, 6012}, - {10621, 5999},{10938, 5991},{11237, 5977},{11557, 5966} - } - }, - { - /*Cb qi=38 INTRA*/ - { - { 8, 3},{ 104, 
370},{ 179, 744},{ 243, 1139}, - { 338, 1498},{ 458, 1801},{ 584, 2060},{ 700, 2297}, - { 812, 2514},{ 935, 2699},{ 1061, 2858},{ 1189, 3007}, - { 1321, 3141},{ 1446, 3266},{ 1563, 3388},{ 1684, 3512}, - { 1816, 3614},{ 1942, 3702},{ 2055, 3793},{ 2201, 3857}, - { 2357, 3923},{ 2477, 3994},{ 2593, 4061},{ 2768, 4178} - }, - /*Cb qi=38 INTER*/ - { - { 118, 24},{ 102, 342},{ 91, 663},{ 101, 1040}, - { 116, 1427},{ 131, 1799},{ 147, 2152},{ 168, 2491}, - { 191, 2822},{ 215, 3139},{ 244, 3441},{ 276, 3731}, - { 316, 4013},{ 363, 4286},{ 423, 4546},{ 495, 4795}, - { 584, 5028},{ 691, 5242},{ 814, 5439},{ 959, 5608}, - { 1119, 5759},{ 1277, 5906},{ 1449, 6035},{ 1655, 6144} - } - }, - { - /*Cr qi=38 INTRA*/ - { - { 12, 6},{ 106, 387},{ 182, 771},{ 261, 1168}, - { 364, 1514},{ 483, 1802},{ 603, 2053},{ 707, 2282}, - { 817, 2489},{ 933, 2670},{ 1074, 2825},{ 1210, 2967}, - { 1320, 3104},{ 1444, 3229},{ 1599, 3324},{ 1735, 3396}, - { 1846, 3464},{ 1971, 3547},{ 2086, 3646},{ 2206, 3711}, - { 2366, 3773},{ 2499, 3859},{ 2603, 3945},{ 2766, 3952} - }, - /*Cr qi=38 INTER*/ - { - { 86, -9},{ 91, 352},{ 85, 680},{ 102, 1053}, - { 119, 1435},{ 132, 1799},{ 146, 2153},{ 162, 2501}, - { 183, 2835},{ 209, 3154},{ 240, 3458},{ 278, 3751}, - { 327, 4025},{ 388, 4284},{ 455, 4532},{ 529, 4766}, - { 616, 4980},{ 711, 5188},{ 815, 5386},{ 920, 5583}, - { 1042, 5770},{ 1186, 5936},{ 1348, 6080},{ 1542, 6196} - } - } - }, - { - { - /*Y' qi=39 INTRA*/ - { - { 103, 66},{ 479, 1283},{ 998, 2125},{ 1610, 2591}, - { 2223, 2913},{ 2855, 3214},{ 3501, 3482},{ 4146, 3698}, - { 4772, 3868},{ 5376, 3999},{ 5956, 4095},{ 6496, 4140}, - { 7008, 4162},{ 7499, 4209},{ 7987, 4282},{ 8478, 4338}, - { 8947, 4374},{ 9385, 4417},{ 9783, 4437},{10143, 4433}, - {10504, 4424},{10866, 4435},{11225, 4444},{11665, 4430} - }, - /*Y' qi=39 INTER*/ - { - { 56, 2},{ 118, 1332},{ 235, 2660},{ 395, 3843}, - { 653, 4867},{ 1153, 5652},{ 2003, 6089},{ 3113, 6214}, - { 4228, 6178},{ 5189, 6102},{ 6002, 6031},{ 6707, 5976}, - { 7336, 5936},{ 7901, 5900},{ 8424, 5870},{ 8915, 5844}, - { 9361, 5822},{ 9784, 5807},{10187, 5794},{10571, 5778}, - {10931, 5763},{11264, 5751},{11582, 5742},{11916, 5730} - } - }, - { - /*Cb qi=39 INTRA*/ - { - { 8, 3},{ 104, 370},{ 179, 744},{ 244, 1138}, - { 340, 1496},{ 461, 1796},{ 588, 2053},{ 705, 2288}, - { 820, 2503},{ 945, 2684},{ 1073, 2840},{ 1210, 2981}, - { 1352, 3106},{ 1480, 3225},{ 1603, 3342},{ 1728, 3464}, - { 1865, 3559},{ 1990, 3645},{ 2106, 3734},{ 2258, 3796}, - { 2413, 3856},{ 2540, 3920},{ 2667, 3986},{ 2887, 4060} - }, - /*Cb qi=39 INTER*/ - { - { 119, 19},{ 103, 340},{ 90, 664},{ 100, 1040}, - { 115, 1426},{ 131, 1797},{ 148, 2148},{ 169, 2486}, - { 192, 2816},{ 217, 3131},{ 247, 3432},{ 282, 3721}, - { 324, 3999},{ 374, 4268},{ 435, 4526},{ 520, 4766}, - { 621, 4990},{ 738, 5194},{ 878, 5376},{ 1035, 5543}, - { 1202, 5686},{ 1374, 5819},{ 1545, 5950},{ 1729, 6064} - } - }, - { - /*Cr qi=39 INTRA*/ - { - { 12, 6},{ 106, 387},{ 182, 771},{ 262, 1167}, - { 365, 1512},{ 486, 1798},{ 608, 2047},{ 713, 2274}, - { 824, 2479},{ 945, 2655},{ 1091, 2804},{ 1231, 2941}, - { 1346, 3073},{ 1475, 3194},{ 1633, 3282},{ 1778, 3345}, - { 1891, 3414},{ 2013, 3501},{ 2138, 3584},{ 2266, 3640}, - { 2428, 3701},{ 2568, 3782},{ 2674, 3863},{ 2816, 3894} - }, - /*Cr qi=39 INTER*/ - { - { 88, -7},{ 92, 352},{ 85, 680},{ 102, 1053}, - { 119, 1434},{ 132, 1797},{ 146, 2151},{ 163, 2498}, - { 185, 2830},{ 211, 3147},{ 243, 3451},{ 285, 3735}, - { 337, 4005},{ 401, 4260},{ 477, 4499},{ 565, 4721}, - { 655, 4937},{ 749, 
5148},{ 858, 5344},{ 979, 5529}, - { 1110, 5710},{ 1264, 5871},{ 1460, 5990},{ 1677, 6086} - } - } - }, - { - { - /*Y' qi=40 INTRA*/ - { - { 98, 71},{ 491, 1274},{ 1023, 2103},{ 1641, 2559}, - { 2257, 2877},{ 2898, 3171},{ 3566, 3429},{ 4233, 3629}, - { 4881, 3784},{ 5499, 3906},{ 6088, 3997},{ 6631, 4040}, - { 7145, 4060},{ 7640, 4107},{ 8128, 4178},{ 8618, 4233}, - { 9077, 4267},{ 9514, 4304},{ 9919, 4324},{10277, 4317}, - {10635, 4312},{10985, 4324},{11338, 4331},{11792, 4334} - }, - /*Y' qi=40 INTER*/ - { - { 63, -26},{ 125, 1331},{ 256, 2640},{ 439, 3801}, - { 757, 4782},{ 1391, 5474},{ 2399, 5805},{ 3582, 5870}, - { 4678, 5824},{ 5600, 5763},{ 6386, 5710},{ 7076, 5667}, - { 7693, 5637},{ 8252, 5610},{ 8775, 5586},{ 9255, 5571}, - { 9694, 5556},{10115, 5541},{10530, 5530},{10903, 5522}, - {11242, 5515},{11596, 5501},{11904, 5482},{12205, 5475} - } - }, - { - /*Cb qi=40 INTRA*/ - { - { 8, 3},{ 108, 371},{ 189, 743},{ 265, 1128}, - { 371, 1475},{ 499, 1767},{ 628, 2022},{ 746, 2256}, - { 864, 2467},{ 991, 2647},{ 1124, 2801},{ 1270, 2933}, - { 1412, 3054},{ 1547, 3165},{ 1677, 3277},{ 1804, 3393}, - { 1946, 3483},{ 2078, 3569},{ 2201, 3651},{ 2352, 3711}, - { 2513, 3766},{ 2643, 3826},{ 2775, 3880},{ 3025, 3919} - }, - /*Cb qi=40 INTER*/ - { - { 114, 35},{ 104, 349},{ 96, 667},{ 106, 1040}, - { 121, 1423},{ 138, 1789},{ 158, 2132},{ 184, 2464}, - { 212, 2787},{ 242, 3095},{ 279, 3389},{ 321, 3671}, - { 374, 3941},{ 438, 4199},{ 517, 4446},{ 617, 4673}, - { 740, 4881},{ 891, 5064},{ 1058, 5225},{ 1239, 5372}, - { 1441, 5499},{ 1638, 5610},{ 1840, 5719},{ 2076, 5814} - } - }, - { - /*Cr qi=40 INTRA*/ - { - { 14, 7},{ 114, 389},{ 193, 771},{ 283, 1156}, - { 399, 1488},{ 523, 1768},{ 643, 2018},{ 752, 2245}, - { 865, 2450},{ 984, 2626},{ 1139, 2763},{ 1290, 2887}, - { 1413, 3014},{ 1550, 3128},{ 1711, 3211},{ 1865, 3268}, - { 1981, 3334},{ 2103, 3415},{ 2237, 3486},{ 2365, 3543}, - { 2529, 3610},{ 2666, 3700},{ 2775, 3779},{ 2929, 3803} - }, - /*Cr qi=40 INTER*/ - { - { 89, -8},{ 95, 353},{ 90, 681},{ 107, 1053}, - { 124, 1430},{ 139, 1787},{ 156, 2136},{ 177, 2477}, - { 203, 2803},{ 237, 3112},{ 276, 3406},{ 329, 3683}, - { 395, 3942},{ 475, 4182},{ 567, 4407},{ 665, 4624}, - { 767, 4834},{ 879, 5032},{ 1011, 5213},{ 1169, 5375}, - { 1348, 5525},{ 1547, 5654},{ 1785, 5743},{ 2066, 5787} - } - } - }, - { - { - /*Y' qi=41 INTRA*/ - { - { 98, 71},{ 495, 1272},{ 1040, 2090},{ 1675, 2533}, - { 2302, 2842},{ 2953, 3132},{ 3631, 3381},{ 4309, 3574}, - { 4966, 3726},{ 5593, 3846},{ 6189, 3934},{ 6738, 3972}, - { 7256, 3991},{ 7754, 4036},{ 8250, 4099},{ 8747, 4150}, - { 9207, 4185},{ 9650, 4222},{10057, 4242},{10411, 4237}, - {10771, 4230},{11127, 4244},{11486, 4254},{11933, 4252} - }, - /*Y' qi=41 INTER*/ - { - { 65, -25},{ 125, 1331},{ 260, 2633},{ 457, 3782}, - { 807, 4740},{ 1499, 5397},{ 2562, 5693},{ 3766, 5743}, - { 4859, 5695},{ 5776, 5638},{ 6556, 5590},{ 7243, 5554}, - { 7859, 5529},{ 8417, 5506},{ 8935, 5486},{ 9419, 5473}, - { 9869, 5460},{10296, 5446},{10711, 5436},{11089, 5430}, - {11445, 5421},{11802, 5412},{12129, 5404},{12465, 5393} - } - }, - { - /*Cb qi=41 INTRA*/ - { - { 8, 3},{ 108, 371},{ 189, 743},{ 267, 1126}, - { 374, 1471},{ 504, 1760},{ 635, 2011},{ 758, 2241}, - { 881, 2447},{ 1013, 2621},{ 1147, 2773},{ 1293, 2906}, - { 1441, 3023},{ 1580, 3131},{ 1712, 3243},{ 1844, 3360}, - { 1985, 3451},{ 2114, 3532},{ 2240, 3613},{ 2390, 3680}, - { 2550, 3740},{ 2687, 3800},{ 2825, 3862},{ 3052, 3944} - }, - /*Cb qi=41 INTER*/ - { - { 104, 39},{ 100, 350},{ 95, 667},{ 105, 1040}, - 
{ 121, 1422},{ 137, 1787},{ 159, 2129},{ 185, 2459}, - { 216, 2778},{ 249, 3083},{ 287, 3374},{ 335, 3653}, - { 393, 3920},{ 462, 4175},{ 549, 4414},{ 660, 4636}, - { 791, 4839},{ 952, 5014},{ 1135, 5166},{ 1337, 5297}, - { 1552, 5411},{ 1752, 5530},{ 1972, 5634},{ 2224, 5724} - } - }, - { - /*Cr qi=41 INTRA*/ - { - { 15, 7},{ 115, 389},{ 193, 770},{ 284, 1154}, - { 401, 1484},{ 528, 1761},{ 652, 2005},{ 764, 2228}, - { 882, 2427},{ 1008, 2599},{ 1167, 2734},{ 1320, 2859}, - { 1443, 2990},{ 1580, 3103},{ 1743, 3181},{ 1894, 3241}, - { 2012, 3309},{ 2141, 3385},{ 2272, 3459},{ 2398, 3519}, - { 2566, 3584},{ 2707, 3680},{ 2816, 3762},{ 2991, 3770} - }, - /*Cr qi=41 INTER*/ - { - { 92, -9},{ 98, 354},{ 90, 682},{ 107, 1052}, - { 124, 1429},{ 139, 1786},{ 156, 2132},{ 178, 2471}, - { 207, 2794},{ 241, 3100},{ 285, 3391},{ 345, 3662}, - { 417, 3915},{ 503, 4151},{ 600, 4375},{ 703, 4589}, - { 815, 4791},{ 942, 4981},{ 1088, 5155},{ 1250, 5316}, - { 1432, 5462},{ 1653, 5575},{ 1930, 5639},{ 2250, 5655} - } - } - }, - { - { - /*Y' qi=42 INTRA*/ - { - { 109, 75},{ 534, 1257},{ 1114, 2047},{ 1793, 2456}, - { 2461, 2735},{ 3157, 2994},{ 3879, 3221},{ 4595, 3396}, - { 5282, 3531},{ 5931, 3638},{ 6546, 3714},{ 7105, 3749}, - { 7633, 3766},{ 8147, 3803},{ 8652, 3865},{ 9148, 3915}, - { 9613, 3946},{10075, 3976},{10489, 3997},{10835, 3994}, - {11195, 3985},{11553, 3997},{11909, 4004},{12369, 3990} - }, - /*Y' qi=42 INTER*/ - { - { 69, -23},{ 134, 1332},{ 287, 2611},{ 521, 3730}, - { 970, 4624},{ 1827, 5176},{ 3028, 5382},{ 4262, 5389}, - { 5325, 5338},{ 6214, 5291},{ 6976, 5255},{ 7651, 5228}, - { 8260, 5206},{ 8821, 5190},{ 9343, 5177},{ 9823, 5165}, - {10273, 5152},{10709, 5143},{11121, 5136},{11502, 5129}, - {11857, 5125},{12193, 5115},{12520, 5107},{12802, 5097} - } - }, - { - /*Cb qi=42 INTRA*/ - { - { 9, 3},{ 113, 371},{ 199, 743},{ 279, 1123}, - { 390, 1462},{ 525, 1743},{ 662, 1986},{ 789, 2208}, - { 916, 2406},{ 1057, 2571},{ 1204, 2712},{ 1362, 2835}, - { 1524, 2943},{ 1676, 3040},{ 1815, 3145},{ 1959, 3249}, - { 2117, 3325},{ 2249, 3406},{ 2377, 3488},{ 2537, 3547}, - { 2706, 3597},{ 2854, 3646},{ 2999, 3705},{ 3236, 3759} - }, - /*Cb qi=42 INTER*/ - { - { 114, 44},{ 107, 353},{ 101, 670},{ 111, 1041}, - { 129, 1418},{ 148, 1775},{ 174, 2110},{ 208, 2432}, - { 244, 2746},{ 283, 3046},{ 330, 3330},{ 388, 3602}, - { 460, 3858},{ 546, 4101},{ 655, 4326},{ 793, 4530}, - { 966, 4703},{ 1165, 4851},{ 1388, 4980},{ 1630, 5088}, - { 1869, 5189},{ 2122, 5268},{ 2403, 5328},{ 2667, 5417} - } - }, - { - /*Cr qi=42 INTRA*/ - { - { 15, 7},{ 120, 390},{ 202, 771},{ 298, 1150}, - { 421, 1473},{ 553, 1743},{ 681, 1982},{ 796, 2199}, - { 923, 2388},{ 1062, 2547},{ 1225, 2678},{ 1392, 2792}, - { 1531, 2907},{ 1682, 3007},{ 1856, 3074},{ 2009, 3134}, - { 2138, 3192},{ 2274, 3257},{ 2407, 3333},{ 2536, 3393}, - { 2711, 3455},{ 2875, 3531},{ 3000, 3598},{ 3186, 3599} - }, - /*Cr qi=42 INTER*/ - { - { 87, -4},{ 95, 358},{ 97, 683},{ 113, 1052}, - { 131, 1423},{ 148, 1774},{ 170, 2116},{ 198, 2448}, - { 234, 2762},{ 276, 3062},{ 331, 3343},{ 404, 3603}, - { 494, 3844},{ 598, 4067},{ 715, 4276},{ 842, 4471}, - { 977, 4661},{ 1128, 4840},{ 1311, 4991},{ 1516, 5127}, - { 1759, 5233},{ 2050, 5300},{ 2377, 5323},{ 2710, 5304} - } - } - }, - { - { - /*Y' qi=43 INTRA*/ - { - { 99, 79},{ 557, 1244},{ 1175, 2016},{ 1882, 2408}, - { 2570, 2677},{ 3288, 2926},{ 4030, 3141},{ 4760, 3307}, - { 5458, 3435},{ 6115, 3537},{ 6743, 3608},{ 7312, 3636}, - { 7841, 3652},{ 8357, 3687},{ 8870, 3742},{ 9376, 3788}, - { 9850, 
3821},{10315, 3853},{10734, 3873},{11084, 3870}, - {11442, 3862},{11800, 3874},{12160, 3879},{12618, 3876} - }, - /*Y' qi=43 INTER*/ - { - { 69, -22},{ 134, 1331},{ 294, 2601},{ 551, 3703}, - { 1056, 4563},{ 2003, 5061},{ 3276, 5215},{ 4534, 5194}, - { 5599, 5133},{ 6488, 5083},{ 7257, 5044},{ 7938, 5014}, - { 8556, 4992},{ 9124, 4975},{ 9648, 4960},{10138, 4948}, - {10594, 4939},{11039, 4926},{11462, 4919},{11847, 4912}, - {12216, 4904},{12570, 4896},{12883, 4889},{13189, 4879} - } - }, - { - /*Cb qi=43 INTRA*/ - { - { 9, 3},{ 114, 371},{ 202, 740},{ 294, 1110}, - { 417, 1440},{ 558, 1716},{ 700, 1956},{ 833, 2172}, - { 966, 2365},{ 1116, 2524},{ 1269, 2661},{ 1431, 2781}, - { 1599, 2885},{ 1756, 2980},{ 1902, 3082},{ 2051, 3185}, - { 2209, 3261},{ 2337, 3342},{ 2464, 3420},{ 2633, 3475}, - { 2809, 3525},{ 2948, 3579},{ 3094, 3633},{ 3347, 3678} - }, - /*Cb qi=43 INTER*/ - { - { 111, 44},{ 106, 353},{ 102, 670},{ 112, 1040}, - { 128, 1416},{ 148, 1771},{ 176, 2104},{ 211, 2424}, - { 250, 2734},{ 293, 3030},{ 347, 3309},{ 411, 3575}, - { 490, 3828},{ 589, 4064},{ 716, 4278},{ 869, 4472}, - { 1050, 4640},{ 1264, 4781},{ 1512, 4895},{ 1775, 4991}, - { 2042, 5069},{ 2310, 5141},{ 2593, 5207},{ 2912, 5239} - } - }, - { - /*Cr qi=43 INTRA*/ - { - { 15, 7},{ 121, 390},{ 208, 767},{ 315, 1135}, - { 449, 1449},{ 586, 1715},{ 718, 1950},{ 843, 2158}, - { 977, 2342},{ 1120, 2501},{ 1290, 2632},{ 1466, 2739}, - { 1613, 2845},{ 1763, 2945},{ 1937, 3015},{ 2093, 3070}, - { 2225, 3126},{ 2366, 3194},{ 2501, 3267},{ 2634, 3324}, - { 2815, 3385},{ 2964, 3466},{ 3087, 3538},{ 3263, 3555} - }, - /*Cr qi=43 INTER*/ - { - { 84, -4},{ 93, 358},{ 95, 683},{ 113, 1052}, - { 131, 1421},{ 148, 1770},{ 171, 2110},{ 201, 2439}, - { 240, 2750},{ 287, 3046},{ 348, 3322},{ 429, 3576}, - { 527, 3811},{ 641, 4029},{ 767, 4230},{ 904, 4422}, - { 1053, 4603},{ 1225, 4765},{ 1433, 4903},{ 1661, 5030}, - { 1928, 5121},{ 2252, 5160},{ 2604, 5164},{ 2979, 5125} - } - } - }, - { - { - /*Y' qi=44 INTRA*/ - { - { 103, 80},{ 560, 1244},{ 1183, 2009},{ 1891, 2391}, - { 2586, 2649},{ 3324, 2884},{ 4093, 3089},{ 4850, 3243}, - { 5575, 3358},{ 6252, 3452},{ 6886, 3518},{ 7459, 3546}, - { 7993, 3562},{ 8515, 3594},{ 9030, 3645},{ 9534, 3691}, - {10004, 3723},{10469, 3750},{10887, 3765},{11236, 3766}, - {11596, 3762},{11960, 3775},{12317, 3784},{12766, 3789} - }, - /*Y' qi=44 INTER*/ - { - { 77, -24},{ 145, 1332},{ 332, 2580},{ 642, 3649}, - { 1270, 4438},{ 2360, 4860},{ 3685, 4982},{ 4910, 4966}, - { 5929, 4928},{ 6785, 4900},{ 7529, 4880},{ 8198, 4863}, - { 8804, 4850},{ 9361, 4842},{ 9882, 4836},{10371, 4830}, - {10827, 4822},{11262, 4816},{11672, 4811},{12052, 4807}, - {12431, 4806},{12780, 4798},{13095, 4792},{13401, 4791} - } - }, - { - /*Cb qi=44 INTRA*/ - { - { 9, 2},{ 122, 371},{ 214, 741},{ 307, 1109}, - { 433, 1432},{ 576, 1704},{ 718, 1939},{ 855, 2152}, - { 991, 2340},{ 1141, 2497},{ 1298, 2632},{ 1463, 2749}, - { 1636, 2851},{ 1796, 2944},{ 1947, 3041},{ 2101, 3140}, - { 2260, 3219},{ 2392, 3297},{ 2527, 3366},{ 2693, 3424}, - { 2872, 3477},{ 3025, 3525},{ 3175, 3584},{ 3451, 3626} - }, - /*Cb qi=44 INTER*/ - { - { 111, 14},{ 110, 339},{ 109, 671},{ 120, 1040}, - { 139, 1410},{ 162, 1758},{ 197, 2084},{ 243, 2397}, - { 291, 2702},{ 342, 2992},{ 405, 3265},{ 484, 3521}, - { 584, 3760},{ 705, 3983},{ 855, 4185},{ 1048, 4356}, - { 1274, 4500},{ 1531, 4617},{ 1816, 4707},{ 2111, 4783}, - { 2409, 4846},{ 2720, 4901},{ 3044, 4957},{ 3391, 4985} - } - }, - { - /*Cr qi=44 INTRA*/ - { - { 17, 7},{ 128, 392},{ 219, 770},{ 329, 
1135}, - { 465, 1442},{ 601, 1703},{ 734, 1935},{ 862, 2142}, - { 998, 2325},{ 1147, 2482},{ 1321, 2606},{ 1496, 2710}, - { 1649, 2813},{ 1809, 2908},{ 1984, 2977},{ 2143, 3032}, - { 2279, 3087},{ 2423, 3152},{ 2559, 3225},{ 2684, 3288}, - { 2866, 3351},{ 3025, 3426},{ 3161, 3492},{ 3372, 3500} - }, - /*Cr qi=44 INTER*/ - { - { 89, 0},{ 101, 352},{ 104, 683},{ 121, 1051}, - { 141, 1414},{ 163, 1757},{ 192, 2092},{ 231, 2415}, - { 278, 2720},{ 336, 3007},{ 412, 3273},{ 510, 3516}, - { 633, 3733},{ 769, 3936},{ 914, 4130},{ 1076, 4307}, - { 1256, 4472},{ 1469, 4617},{ 1723, 4732},{ 2012, 4822}, - { 2347, 4871},{ 2716, 4875},{ 3082, 4866},{ 3422, 4826} + { 53, 45},{ 50, 467},{ 45, 789},{ 76, 1150}, + { 92, 1531},{ 107, 1877},{ 125, 2219},{ 147, 2561}, + { 176, 2893},{ 206, 3209},{ 231, 3514},{ 260, 3808}, + { 298, 4085},{ 350, 4344},{ 411, 4587},{ 475, 4814}, + { 532, 5037},{ 587, 5261},{ 647, 5480},{ 707, 5694}, + { 793, 5900},{ 891, 6093},{ 1017, 6292},{ 1205, 6307} } } }, @@ -2848,557 +341,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ { /*Y' qi=45 INTRA*/ { - { 119, 78},{ 610, 1226},{ 1271, 1965},{ 2026, 2319}, - { 2768, 2550},{ 3556, 2757},{ 4369, 2938},{ 5157, 3076}, - { 5901, 3182},{ 6598, 3268},{ 7253, 3326},{ 7844, 3343}, - { 8392, 3356},{ 8922, 3386},{ 9453, 3433},{ 9973, 3474}, - {10457, 3503},{10929, 3530},{11351, 3543},{11709, 3541}, - {12068, 3537},{12434, 3547},{12805, 3555},{13268, 3563} + { 47, 170},{ 955, 1217},{ 1713, 2014},{ 3050, 2094}, + { 3954, 2179},{ 4801, 2357},{ 5629, 2494},{ 6313, 2614}, + { 6962, 2716},{ 7566, 2820},{ 8138, 2886},{ 8613, 2949}, + { 9097, 3031},{ 9574, 3044},{10053, 3142},{10514, 3134}, + {10897, 3241},{11397, 3275},{11775, 3297},{12200, 3350}, + {12527, 3350},{12959, 3393},{13246, 3401},{13573, 3397} }, /*Y' qi=45 INTER*/ { - { 77, -20},{ 146, 1330},{ 342, 2566},{ 699, 3604}, - { 1439, 4332},{ 2669, 4672},{ 4075, 4727},{ 5318, 4679}, - { 6345, 4630},{ 7209, 4595},{ 7963, 4570},{ 8644, 4551}, - { 9262, 4535},{ 9831, 4525},{10370, 4515},{10872, 4506}, - {11334, 4500},{11783, 4492},{12219, 4489},{12617, 4483}, - {12995, 4477},{13350, 4472},{13674, 4466},{13968, 4468} + { 53, 73},{ 175, 1343},{ 649, 2439},{ 1339, 3250}, + { 2297, 3837},{ 3395, 4203},{ 4438, 4400},{ 5401, 4529}, + { 6222, 4588},{ 7018, 4564},{ 7713, 4532},{ 8378, 4464}, + { 8959, 4414},{ 9464, 4364},{ 9980, 4315},{10401, 4291}, + {10805, 4260},{11172, 4260},{11501, 4231},{11798, 4248}, + {12082, 4254},{12381, 4262},{12572, 4285},{12877, 4289} } }, { /*Cb qi=45 INTRA*/ { - { 9, 2},{ 122, 370},{ 219, 735},{ 324, 1096}, - { 465, 1414},{ 619, 1679},{ 771, 1905},{ 920, 2103}, - { 1070, 2276},{ 1236, 2419},{ 1410, 2539},{ 1595, 2644}, - { 1784, 2736},{ 1949, 2831},{ 2104, 2931},{ 2275, 3021}, - { 2443, 3092},{ 2586, 3166},{ 2735, 3234},{ 2904, 3288}, - { 3093, 3338},{ 3262, 3382},{ 3419, 3427},{ 3708, 3456} + { 112, -14},{ 173, 495},{ 260, 827},{ 355, 1122}, + { 451, 1420},{ 579, 1695},{ 697, 1934},{ 917, 2101}, + { 1104, 2244},{ 1266, 2381},{ 1417, 2520},{ 1609, 2611}, + { 1801, 2689},{ 1973, 2764},{ 2108, 2864},{ 2298, 2948}, + { 2452, 3008},{ 2588, 3080},{ 2732, 3161},{ 2888, 3203}, + { 3052, 3266},{ 3240, 3294},{ 3342, 3351},{ 3467, 3373} }, /*Cb qi=45 INTER*/ { - { 103, 0},{ 109, 339},{ 109, 670},{ 119, 1039}, - { 137, 1408},{ 162, 1754},{ 199, 2076},{ 248, 2386}, - { 301, 2684},{ 360, 2967},{ 433, 3234},{ 525, 3481}, - { 640, 3713},{ 780, 3924},{ 956, 4110},{ 1176, 4266}, - { 1438, 4390},{ 1736, 4481},{ 2057, 4553},{ 2385, 4613}, - { 2718, 4656},{ 3056, 4698},{ 3416, 4733},{ 
3799, 4755} + { 41, -49},{ 52, 385},{ 87, 743},{ 110, 1102}, + { 135, 1453},{ 162, 1788},{ 207, 2096},{ 272, 2391}, + { 330, 2677},{ 392, 2950},{ 464, 3205},{ 556, 3442}, + { 674, 3656},{ 827, 3847},{ 1030, 4006},{ 1275, 4132}, + { 1544, 4234},{ 1809, 4317},{ 2089, 4408},{ 2377, 4456}, + { 2647, 4532},{ 2919, 4595},{ 3256, 4659},{ 3465, 4657} } }, { /*Cr qi=45 INTRA*/ { - { 16, 7},{ 128, 391},{ 225, 763},{ 350, 1120}, - { 500, 1420},{ 649, 1673},{ 792, 1893},{ 929, 2089}, - { 1084, 2257},{ 1250, 2401},{ 1440, 2518},{ 1633, 2614}, - { 1799, 2708},{ 1968, 2798},{ 2151, 2863},{ 2314, 2914}, - { 2453, 2968},{ 2611, 3025},{ 2759, 3095},{ 2887, 3160}, - { 3082, 3210},{ 3259, 3278},{ 3403, 3342},{ 3593, 3354} + { 99, -14},{ 164, 493},{ 247, 832},{ 358, 1123}, + { 468, 1416},{ 599, 1680},{ 795, 1886},{ 958, 2063}, + { 1133, 2211},{ 1300, 2345},{ 1480, 2461},{ 1664, 2554}, + { 1807, 2656},{ 1995, 2742},{ 2146, 2799},{ 2331, 2856}, + { 2440, 2894},{ 2592, 2996},{ 2751, 3033},{ 2865, 3112}, + { 3073, 3162},{ 3210, 3208},{ 3330, 3306},{ 3454, 3332} }, /*Cr qi=45 INTER*/ { - { 92, 0},{ 101, 352},{ 103, 682},{ 120, 1049}, - { 140, 1412},{ 163, 1752},{ 193, 2083},{ 234, 2402}, - { 287, 2702},{ 353, 2983},{ 442, 3240},{ 557, 3471}, - { 694, 3680},{ 846, 3873},{ 1014, 4056},{ 1200, 4224}, - { 1414, 4369},{ 1664, 4495},{ 1946, 4595},{ 2278, 4654}, - { 2654, 4673},{ 3047, 4658},{ 3438, 4627},{ 3825, 4585} - } - } - }, - { - { - /*Y' qi=46 INTRA*/ - { - { 119, 78},{ 610, 1227},{ 1277, 1960},{ 2043, 2309}, - { 2805, 2529},{ 3618, 2719},{ 4452, 2887},{ 5257, 3016}, - { 6017, 3115},{ 6727, 3195},{ 7392, 3248},{ 7984, 3267}, - { 8528, 3281},{ 9059, 3310},{ 9593, 3354},{10119, 3395}, - {10599, 3425},{11064, 3450},{11493, 3464},{11850, 3466}, - {12207, 3462},{12578, 3471},{12948, 3480},{13407, 3487} - }, - /*Y' qi=46 INTER*/ - { - { 74, -14},{ 149, 1326},{ 382, 2538},{ 807, 3541}, - { 1670, 4211},{ 3000, 4499},{ 4416, 4533},{ 5628, 4490}, - { 6628, 4453},{ 7479, 4425},{ 8228, 4406},{ 8902, 4393}, - { 9521, 4380},{10090, 4371},{10623, 4364},{11124, 4356}, - {11586, 4351},{12043, 4344},{12476, 4341},{12863, 4340}, - {13244, 4337},{13610, 4329},{13936, 4324},{14246, 4329} - } - }, - { - /*Cb qi=46 INTRA*/ - { - { 11, 2},{ 132, 371},{ 234, 737},{ 340, 1094}, - { 481, 1405},{ 637, 1667},{ 791, 1891},{ 944, 2084}, - { 1099, 2253},{ 1268, 2392},{ 1444, 2507},{ 1633, 2610}, - { 1825, 2700},{ 1990, 2794},{ 2147, 2895},{ 2321, 2984}, - { 2493, 3053},{ 2640, 3126},{ 2787, 3198},{ 2954, 3253}, - { 3146, 3297},{ 3313, 3344},{ 3473, 3393},{ 3757, 3434} - }, - /*Cb qi=46 INTER*/ - { - { 97, 0},{ 109, 339},{ 108, 669},{ 120, 1035}, - { 142, 1398},{ 173, 1737},{ 221, 2052},{ 281, 2353}, - { 345, 2646},{ 415, 2924},{ 504, 3183},{ 616, 3421}, - { 749, 3643},{ 914, 3842},{ 1123, 4012},{ 1379, 4150}, - { 1685, 4250},{ 2014, 4327},{ 2366, 4382},{ 2731, 4426}, - { 3083, 4470},{ 3445, 4490},{ 3805, 4511},{ 4146, 4539} - } - }, - { - /*Cr qi=46 INTRA*/ - { - { 19, 7},{ 137, 393},{ 237, 765},{ 364, 1116}, - { 516, 1411},{ 665, 1662},{ 809, 1880},{ 951, 2072}, - { 1109, 2236},{ 1278, 2378},{ 1474, 2491},{ 1669, 2584}, - { 1835, 2678},{ 2014, 2766},{ 2203, 2828},{ 2366, 2880}, - { 2506, 2933},{ 2661, 2988},{ 2810, 3053},{ 2941, 3116}, - { 3131, 3175},{ 3310, 3243},{ 3461, 3303},{ 3656, 3321} - }, - /*Cr qi=46 INTER*/ - { - { 91, 1},{ 103, 351},{ 104, 681},{ 121, 1046}, - { 144, 1401},{ 173, 1736},{ 213, 2060},{ 265, 2373}, - { 330, 2666},{ 410, 2938},{ 517, 3185},{ 655, 3404}, - { 815, 3601},{ 989, 3784},{ 1183, 3951},{ 1400, 4104}, - { 1649, 
4241},{ 1933, 4352},{ 2261, 4427},{ 2646, 4458}, - { 3057, 4446},{ 3453, 4418},{ 3820, 4385},{ 4171, 4352} - } - } - }, - { - { - /*Y' qi=47 INTRA*/ - { - { 117, 83},{ 670, 1205},{ 1408, 1904},{ 2239, 2219}, - { 3049, 2414},{ 3905, 2584},{ 4775, 2734},{ 5610, 2852}, - { 6393, 2944},{ 7121, 3017},{ 7804, 3066},{ 8407, 3081}, - { 8957, 3093},{ 9498, 3119},{10043, 3160},{10582, 3199}, - {11083, 3226},{11561, 3250},{11993, 3263},{12352, 3264}, - {12711, 3259},{13092, 3266},{13463, 3271},{13918, 3275} - }, - /*Y' qi=47 INTER*/ - { - { 74, -11},{ 148, 1325},{ 404, 2518},{ 910, 3478}, - { 1916, 4080},{ 3369, 4298},{ 4823, 4292},{ 6035, 4238}, - { 7037, 4197},{ 7894, 4168},{ 8650, 4146},{ 9337, 4129}, - { 9968, 4116},{10549, 4105},{11096, 4096},{11605, 4089}, - {12081, 4083},{12547, 4076},{12990, 4070},{13399, 4070}, - {13776, 4065},{14133, 4059},{14486, 4057},{14842, 4053} - } - }, - { - /*Cb qi=47 INTRA*/ - { - { 11, 2},{ 133, 370},{ 242, 731},{ 367, 1077}, - { 524, 1378},{ 692, 1630},{ 860, 1844},{ 1028, 2024}, - { 1203, 2178},{ 1393, 2305},{ 1582, 2413},{ 1787, 2507}, - { 1992, 2590},{ 2175, 2676},{ 2351, 2767},{ 2534, 2851}, - { 2707, 2923},{ 2862, 2994},{ 3021, 3060},{ 3193, 3111}, - { 3396, 3147},{ 3573, 3184},{ 3752, 3220},{ 4038, 3255} - }, - /*Cb qi=47 INTER*/ - { - { 101, 0},{ 107, 339},{ 108, 667},{ 120, 1033}, - { 142, 1394},{ 175, 1729},{ 227, 2040},{ 295, 2335}, - { 369, 2619},{ 452, 2888},{ 556, 3138},{ 686, 3368}, - { 850, 3574},{ 1050, 3758},{ 1299, 3910},{ 1605, 4024}, - { 1950, 4104},{ 2317, 4163},{ 2689, 4210},{ 3077, 4239}, - { 3466, 4258},{ 3840, 4278},{ 4205, 4298},{ 4515, 4340} - } - }, - { - /*Cr qi=47 INTRA*/ - { - { 19, 7},{ 138, 392},{ 248, 758},{ 396, 1094}, - { 563, 1378},{ 723, 1621},{ 881, 1829},{ 1037, 2011}, - { 1214, 2165},{ 1410, 2290},{ 1623, 2393},{ 1834, 2480}, - { 2016, 2564},{ 2203, 2647},{ 2405, 2707},{ 2569, 2757}, - { 2709, 2810},{ 2871, 2860},{ 3027, 2924},{ 3178, 2980}, - { 3375, 3034},{ 3563, 3097},{ 3724, 3151},{ 3952, 3153} - }, - /*Cr qi=47 INTER*/ - { - { 91, 1},{ 100, 351},{ 102, 681},{ 120, 1043}, - { 144, 1397},{ 175, 1729},{ 219, 2049},{ 277, 2356}, - { 353, 2640},{ 451, 2902},{ 579, 3136},{ 739, 3342}, - { 926, 3525},{ 1125, 3698},{ 1343, 3859},{ 1595, 3998}, - { 1881, 4113},{ 2208, 4205},{ 2589, 4253},{ 3014, 4250}, - { 3444, 4220},{ 3838, 4183},{ 4196, 4147},{ 4521, 4116} - } - } - }, - { - { - /*Y' qi=48 INTRA*/ - { - { 107, 87},{ 681, 1200},{ 1456, 1883},{ 2306, 2193}, - { 3122, 2386},{ 3984, 2548},{ 4862, 2693},{ 5704, 2808}, - { 6495, 2899},{ 7232, 2970},{ 7915, 3018},{ 8524, 3034}, - { 9085, 3043},{ 9635, 3068},{10192, 3108},{10735, 3145}, - {11237, 3171},{11719, 3194},{12153, 3207},{12516, 3206}, - {12888, 3202},{13266, 3210},{13637, 3218},{14101, 3219} - }, - /*Y' qi=48 INTER*/ - { - { 83, -18},{ 147, 1328},{ 398, 2519},{ 923, 3468}, - { 1979, 4047},{ 3472, 4246},{ 4936, 4232},{ 6148, 4178}, - { 7150, 4139},{ 8007, 4111},{ 8765, 4091},{ 9458, 4076}, - {10090, 4063},{10676, 4054},{11226, 4045},{11742, 4038}, - {12223, 4033},{12686, 4029},{13127, 4022},{13527, 4015}, - {13915, 4012},{14277, 4007},{14619, 4004},{14966, 4001} - } - }, - { - /*Cb qi=48 INTRA*/ - { - { 11, 2},{ 134, 369},{ 245, 730},{ 373, 1075}, - { 531, 1374},{ 698, 1625},{ 865, 1839},{ 1033, 2019}, - { 1207, 2173},{ 1397, 2300},{ 1588, 2408},{ 1795, 2501}, - { 2003, 2581},{ 2187, 2666},{ 2362, 2757},{ 2548, 2841}, - { 2719, 2912},{ 2876, 2983},{ 3034, 3047},{ 3209, 3097}, - { 3409, 3137},{ 3589, 3178},{ 3762, 3216},{ 4004, 3252} - }, - /*Cb qi=48 INTER*/ - { - { 113, 
26},{ 112, 344},{ 111, 668},{ 120, 1032}, - { 141, 1392},{ 173, 1727},{ 224, 2036},{ 290, 2330}, - { 363, 2612},{ 447, 2880},{ 551, 3130},{ 685, 3358}, - { 852, 3563},{ 1061, 3742},{ 1332, 3884},{ 1654, 3993}, - { 2011, 4068},{ 2394, 4120},{ 2782, 4160},{ 3172, 4186}, - { 3557, 4209},{ 3932, 4228},{ 4306, 4237},{ 4675, 4236} - } - }, - { - /*Cr qi=48 INTRA*/ - { - { 18, 7},{ 139, 389},{ 252, 755},{ 404, 1090}, - { 573, 1372},{ 732, 1615},{ 889, 1823},{ 1045, 2005}, - { 1222, 2159},{ 1417, 2285},{ 1631, 2387},{ 1843, 2474}, - { 2027, 2558},{ 2212, 2639},{ 2413, 2697},{ 2578, 2746}, - { 2720, 2798},{ 2887, 2852},{ 3040, 2913},{ 3181, 2970}, - { 3381, 3024},{ 3581, 3081},{ 3743, 3130},{ 3948, 3133} - }, - /*Cr qi=48 INTER*/ - { - { 89, 0},{ 106, 352},{ 105, 682},{ 120, 1044}, - { 144, 1395},{ 174, 1724},{ 215, 2044},{ 270, 2350}, - { 343, 2635},{ 441, 2895},{ 571, 3129},{ 735, 3334}, - { 926, 3518},{ 1139, 3684},{ 1371, 3836},{ 1628, 3977}, - { 1933, 4089},{ 2279, 4164},{ 2672, 4204},{ 3105, 4205}, - { 3533, 4176},{ 3931, 4135},{ 4290, 4089},{ 4624, 4057} - } - } - }, - { - { - /*Y' qi=49 INTRA*/ - { - { 120, 85},{ 706, 1194},{ 1485, 1875},{ 2348, 2187}, - { 3190, 2372},{ 4076, 2521},{ 4967, 2658},{ 5819, 2771}, - { 6611, 2861},{ 7345, 2936},{ 8026, 2990},{ 8626, 3013}, - { 9182, 3030},{ 9723, 3059},{10266, 3100},{10802, 3143}, - {11293, 3179},{11768, 3206},{12201, 3221},{12556, 3225}, - {12914, 3226},{13281, 3237},{13639, 3247},{14089, 3257} - }, - /*Y' qi=49 INTER*/ - { - { 72, -11},{ 155, 1320},{ 458, 2485},{ 1090, 3386}, - { 2284, 3907},{ 3835, 4075},{ 5272, 4064},{ 6449, 4026}, - { 7426, 4003},{ 8267, 3987},{ 9017, 3976},{ 9698, 3967}, - {10328, 3962},{10913, 3959},{11452, 3954},{11961, 3950}, - {12442, 3947},{12904, 3946},{13347, 3945},{13749, 3943}, - {14123, 3941},{14490, 3941},{14826, 3939},{15153, 3937} - } - }, - { - /*Cb qi=49 INTRA*/ - { - { 11, 2},{ 145, 369},{ 262, 729},{ 393, 1070}, - { 557, 1363},{ 731, 1607},{ 907, 1811},{ 1085, 1983}, - { 1268, 2130},{ 1465, 2251},{ 1658, 2359},{ 1868, 2454}, - { 2079, 2534},{ 2264, 2621},{ 2440, 2717},{ 2625, 2802}, - { 2792, 2878},{ 2945, 2954},{ 3106, 3021},{ 3277, 3075}, - { 3466, 3119},{ 3638, 3170},{ 3824, 3213},{ 4100, 3243} - }, - /*Cb qi=49 INTER*/ - { - { 98, -6},{ 113, 343},{ 110, 669},{ 122, 1029}, - { 149, 1380},{ 192, 1706},{ 258, 2007},{ 340, 2293}, - { 426, 2569},{ 525, 2831},{ 653, 3071},{ 814, 3287}, - { 1013, 3478},{ 1262, 3637},{ 1575, 3761},{ 1936, 3851}, - { 2328, 3910},{ 2741, 3949},{ 3163, 3970},{ 3559, 3994}, - { 3936, 4025},{ 4300, 4050},{ 4655, 4060},{ 4962, 4062} - } - }, - { - /*Cr qi=49 INTRA*/ - { - { 19, 7},{ 151, 389},{ 270, 753},{ 427, 1084}, - { 602, 1360},{ 767, 1595},{ 933, 1794},{ 1098, 1968}, - { 1285, 2115},{ 1489, 2237},{ 1699, 2342},{ 1912, 2435}, - { 2101, 2519},{ 2288, 2601},{ 2486, 2663},{ 2651, 2715}, - { 2799, 2769},{ 2958, 2825},{ 3106, 2890},{ 3257, 2948}, - { 3452, 3007},{ 3634, 3075},{ 3786, 3136},{ 3959, 3164} - }, - /*Cr qi=49 INTER*/ - { - { 85, 1},{ 103, 352},{ 104, 681},{ 121, 1039}, - { 152, 1382},{ 195, 1702},{ 248, 2015},{ 316, 2316}, - { 403, 2595},{ 520, 2847},{ 676, 3068},{ 870, 3258}, - { 1091, 3429},{ 1329, 3585},{ 1597, 3725},{ 1894, 3849}, - { 2242, 3940},{ 2656, 3984},{ 3098, 3992},{ 3531, 3981}, - { 3936, 3950},{ 4304, 3915},{ 4646, 3879},{ 4915, 3861} - } - } - }, - { - { - /*Y' qi=50 INTRA*/ - { - { 122, 89},{ 798, 1170},{ 1682, 1812},{ 2613, 2096}, - { 3501, 2260},{ 4430, 2388},{ 5352, 2510},{ 6228, 2613}, - { 7043, 2698},{ 7793, 2770},{ 8486, 2823},{ 9092, 2846}, - { 
9652, 2865},{10210, 2895},{10773, 2936},{11315, 2979}, - {11817, 3014},{12297, 3041},{12734, 3057},{13097, 3064}, - {13443, 3067},{13813, 3078},{14190, 3088},{14646, 3103} - }, - /*Y' qi=50 INTER*/ - { - { 73, -11},{ 154, 1318},{ 501, 2457},{ 1281, 3291}, - { 2685, 3719},{ 4356, 3810},{ 5811, 3769},{ 6988, 3726}, - { 7976, 3700},{ 8835, 3682},{ 9606, 3669},{10307, 3659}, - {10953, 3652},{11556, 3645},{12115, 3643},{12641, 3640}, - {13138, 3636},{13613, 3634},{14068, 3629},{14488, 3627}, - {14876, 3625},{15237, 3621},{15585, 3623},{15922, 3629} - } - }, - { - /*Cb qi=50 INTRA*/ - { - { 11, 2},{ 148, 368},{ 278, 724},{ 431, 1052}, - { 613, 1334},{ 806, 1567},{ 1004, 1756},{ 1203, 1915}, - { 1405, 2051},{ 1621, 2163},{ 1833, 2262},{ 2059, 2347}, - { 2280, 2424},{ 2476, 2512},{ 2670, 2598},{ 2864, 2679}, - { 3037, 2754},{ 3201, 2826},{ 3376, 2887},{ 3562, 2936}, - { 3756, 2976},{ 3932, 3022},{ 4117, 3065},{ 4385, 3094} - }, - /*Cb qi=50 INTER*/ - { - { 92, -3},{ 112, 343},{ 109, 669},{ 121, 1027}, - { 149, 1375},{ 196, 1697},{ 270, 1992},{ 366, 2267}, - { 471, 2532},{ 594, 2782},{ 747, 3011},{ 942, 3212}, - { 1189, 3384},{ 1497, 3521},{ 1875, 3613},{ 2297, 3673}, - { 2739, 3710},{ 3195, 3725},{ 3644, 3737},{ 4057, 3751}, - { 4445, 3763},{ 4841, 3769},{ 5211, 3779},{ 5568, 3769} - } - }, - { - /*Cr qi=50 INTRA*/ - { - { 19, 7},{ 155, 388},{ 290, 744},{ 474, 1060}, - { 666, 1324},{ 847, 1549},{ 1033, 1737},{ 1219, 1898}, - { 1428, 2034},{ 1653, 2147},{ 1885, 2245},{ 2115, 2329}, - { 2316, 2410},{ 2517, 2486},{ 2730, 2539},{ 2901, 2586}, - { 3042, 2638},{ 3199, 2693},{ 3366, 2755},{ 3534, 2805}, - { 3738, 2858},{ 3934, 2916},{ 4079, 2975},{ 4257, 2992} - }, - /*Cr qi=50 INTER*/ - { - { 87, 1},{ 102, 353},{ 103, 680},{ 121, 1036}, - { 153, 1377},{ 199, 1694},{ 260, 1999},{ 339, 2291}, - { 446, 2559},{ 590, 2797},{ 780, 3003},{ 1010, 3176}, - { 1267, 3331},{ 1547, 3474},{ 1874, 3594},{ 2245, 3688}, - { 2666, 3742},{ 3130, 3758},{ 3594, 3748},{ 4028, 3711}, - { 4415, 3674},{ 4771, 3641},{ 5122, 3605},{ 5482, 3569} - } - } - }, - { - { - /*Y' qi=51 INTRA*/ - { - { 115, 93},{ 819, 1164},{ 1739, 1806},{ 2695, 2101}, - { 3612, 2257},{ 4552, 2374},{ 5479, 2490},{ 6352, 2593}, - { 7158, 2683},{ 7898, 2761},{ 8580, 2823},{ 9177, 2854}, - { 9728, 2880},{10268, 2917},{10816, 2966},{11350, 3016}, - {11834, 3058},{12311, 3089},{12741, 3109},{13092, 3119}, - {13434, 3126},{13791, 3142},{14156, 3155},{14590, 3171} - }, - /*Y' qi=51 INTER*/ - { - { 58, 0},{ 171, 1307},{ 610, 2407},{ 1563, 3175}, - { 3116, 3545},{ 4789, 3624},{ 6185, 3602},{ 7320, 3583}, - { 8282, 3574},{ 9124, 3569},{ 9878, 3567},{10569, 3565}, - {11207, 3563},{11801, 3564},{12359, 3566},{12884, 3567}, - {13373, 3568},{13841, 3567},{14289, 3566},{14699, 3568}, - {15086, 3568},{15446, 3566},{15788, 3564},{16103, 3568} - } - }, - { - /*Cb qi=51 INTRA*/ - { - { 14, 3},{ 161, 369},{ 297, 722},{ 454, 1047}, - { 639, 1325},{ 833, 1554},{ 1033, 1742},{ 1236, 1897}, - { 1440, 2032},{ 1653, 2148},{ 1860, 2253},{ 2077, 2347}, - { 2288, 2432},{ 2476, 2525},{ 2661, 2621},{ 2841, 2714}, - { 3010, 2797},{ 3170, 2876},{ 3333, 2945},{ 3510, 3000}, - { 3696, 3054},{ 3865, 3114},{ 4046, 3164},{ 4317, 3200} - }, - /*Cb qi=51 INTER*/ - { - { 88, -11},{ 109, 341},{ 109, 668},{ 126, 1019}, - { 168, 1358},{ 233, 1670},{ 329, 1955},{ 451, 2219}, - { 584, 2472},{ 736, 2711},{ 931, 2923},{ 1179, 3104}, - { 1480, 3254},{ 1846, 3368},{ 2265, 3448},{ 2714, 3501}, - { 3180, 3524},{ 3638, 3529},{ 4074, 3543},{ 4485, 3560}, - { 4868, 3571},{ 5238, 3581},{ 5597, 3594},{ 5953, 
3591} - } - }, - { - /*Cr qi=51 INTRA*/ - { - { 24, 7},{ 168, 388},{ 309, 742},{ 496, 1054}, - { 688, 1316},{ 873, 1538},{ 1063, 1723},{ 1252, 1882}, - { 1460, 2018},{ 1682, 2134},{ 1907, 2238},{ 2125, 2332}, - { 2317, 2422},{ 2507, 2510},{ 2705, 2575},{ 2869, 2630}, - { 3015, 2684},{ 3178, 2744},{ 3329, 2815},{ 3477, 2878}, - { 3667, 2945},{ 3848, 3016},{ 3997, 3082},{ 4174, 3121} - }, - /*Cr qi=51 INTER*/ - { - { 83, -2},{ 102, 351},{ 102, 680},{ 126, 1029}, - { 172, 1359},{ 238, 1665},{ 321, 1962},{ 422, 2246}, - { 552, 2505},{ 733, 2728},{ 970, 2912},{ 1247, 3069}, - { 1552, 3209},{ 1876, 3338},{ 2251, 3440},{ 2692, 3502}, - { 3161, 3529},{ 3637, 3525},{ 4084, 3509},{ 4487, 3479}, - { 4850, 3444},{ 5181, 3419},{ 5507, 3406},{ 5786, 3398} - } - } - }, - { - { - /*Y' qi=52 INTRA*/ - { - { 117, 93},{ 814, 1168},{ 1729, 1822},{ 2706, 2119}, - { 3655, 2262},{ 4604, 2374},{ 5528, 2490},{ 6394, 2596}, - { 7189, 2691},{ 7921, 2777},{ 8596, 2846},{ 9184, 2885}, - { 9728, 2918},{10260, 2961},{10796, 3014},{11316, 3069}, - {11793, 3115},{12267, 3150},{12692, 3172},{13037, 3185}, - {13367, 3196},{13717, 3214},{14087, 3227},{14521, 3249} - }, - /*Y' qi=52 INTER*/ - { - { 52, 0},{ 169, 1308},{ 668, 2382},{ 1735, 3112}, - { 3384, 3451},{ 5077, 3519},{ 6461, 3506},{ 7587, 3496}, - { 8545, 3494},{ 9384, 3494},{10142, 3498},{10838, 3501}, - {11475, 3503},{12078, 3508},{12640, 3511},{13162, 3513}, - {13654, 3517},{14130, 3521},{14576, 3522},{14980, 3523}, - {15369, 3523},{15737, 3522},{16071, 3521},{16382, 3516} - } - }, - { - /*Cb qi=52 INTRA*/ - { - { 14, 3},{ 163, 369},{ 299, 722},{ 457, 1044}, - { 645, 1319},{ 843, 1545},{ 1050, 1728},{ 1261, 1879}, - { 1468, 2013},{ 1678, 2132},{ 1883, 2240},{ 2093, 2338}, - { 2301, 2428},{ 2488, 2523},{ 2667, 2619},{ 2843, 2718}, - { 3010, 2805},{ 3163, 2887},{ 3323, 2963},{ 3490, 3028}, - { 3665, 3087},{ 3841, 3145},{ 4011, 3197},{ 4289, 3230} - }, - /*Cb qi=52 INTER*/ - { - { 98, -7},{ 109, 342},{ 109, 668},{ 126, 1018}, - { 170, 1355},{ 242, 1663},{ 352, 1941},{ 490, 2195}, - { 642, 2439},{ 823, 2666},{ 1052, 2868},{ 1333, 3039}, - { 1670, 3178},{ 2074, 3280},{ 2524, 3348},{ 2996, 3390}, - { 3469, 3410},{ 3923, 3420},{ 4355, 3434},{ 4771, 3451}, - { 5166, 3468},{ 5532, 3483},{ 5885, 3499},{ 6263, 3501} - } - }, - { - /*Cr qi=52 INTRA*/ - { - { 25, 7},{ 170, 388},{ 312, 741},{ 500, 1051}, - { 694, 1310},{ 883, 1529},{ 1082, 1709},{ 1280, 1864}, - { 1491, 1998},{ 1710, 2117},{ 1932, 2225},{ 2143, 2324}, - { 2328, 2418},{ 2516, 2506},{ 2708, 2578},{ 2870, 2637}, - { 3017, 2693},{ 3170, 2758},{ 3312, 2835},{ 3455, 2901}, - { 3644, 2972},{ 3827, 3049},{ 3968, 3121},{ 4115, 3166} - }, - /*Cr qi=52 INTER*/ - { - { 86, -2},{ 101, 352},{ 100, 680},{ 126, 1028}, - { 175, 1356},{ 247, 1657},{ 341, 1948},{ 458, 2224}, - { 615, 2471},{ 828, 2681},{ 1091, 2857},{ 1395, 3008}, - { 1732, 3140},{ 2095, 3257},{ 2502, 3348},{ 2968, 3402}, - { 3457, 3420},{ 3926, 3413},{ 4360, 3388},{ 4759, 3357}, - { 5128, 3329},{ 5449, 3306},{ 5741, 3295},{ 6071, 3296} - } - } - }, - { - { - /*Y' qi=53 INTRA*/ - { - { 138, 93},{ 850, 1161},{ 1773, 1810},{ 2763, 2103}, - { 3722, 2245},{ 4675, 2360},{ 5600, 2483},{ 6464, 2597}, - { 7255, 2700},{ 7982, 2792},{ 8652, 2867},{ 9237, 2913}, - { 9775, 2950},{10302, 2998},{10834, 3058},{11347, 3121}, - {11826, 3169},{12299, 3207},{12713, 3235},{13054, 3250}, - {13387, 3265},{13744, 3286},{14110, 3302},{14515, 3323} - }, - /*Y' qi=53 INTER*/ - { - { 52, 2},{ 169, 1308},{ 680, 2377},{ 1763, 3103}, - { 3410, 3450},{ 5094, 3531},{ 6469, 3526},{ 7590, 3525}, - 
{ 8547, 3530},{ 9385, 3534},{10139, 3540},{10835, 3548}, - {11479, 3553},{12075, 3559},{12634, 3565},{13159, 3570}, - {13650, 3573},{14124, 3576},{14575, 3580},{14993, 3583}, - {15375, 3584},{15744, 3584},{16091, 3583},{16421, 3586} - } - }, - { - /*Cb qi=53 INTRA*/ - { - { 14, 3},{ 167, 367},{ 317, 717},{ 492, 1033}, - { 687, 1306},{ 887, 1531},{ 1095, 1715},{ 1309, 1866}, - { 1517, 2000},{ 1729, 2119},{ 1932, 2227},{ 2146, 2325}, - { 2358, 2414},{ 2544, 2511},{ 2724, 2611},{ 2902, 2711}, - { 3070, 2800},{ 3227, 2878},{ 3381, 2954},{ 3548, 3021}, - { 3724, 3077},{ 3888, 3140},{ 4065, 3196},{ 4359, 3225} - }, - /*Cb qi=53 INTER*/ - { - { 93, -8},{ 110, 342},{ 108, 668},{ 125, 1018}, - { 170, 1355},{ 242, 1663},{ 353, 1939},{ 494, 2192}, - { 651, 2433},{ 838, 2658},{ 1076, 2856},{ 1368, 3022}, - { 1716, 3158},{ 2123, 3260},{ 2575, 3330},{ 3042, 3373}, - { 3507, 3396},{ 3962, 3413},{ 4394, 3430},{ 4797, 3452}, - { 5169, 3476},{ 5547, 3496},{ 5914, 3510},{ 6235, 3525} - } - }, - { - /*Cr qi=53 INTRA*/ - { - { 25, 7},{ 175, 386},{ 335, 734},{ 541, 1037}, - { 737, 1296},{ 926, 1516},{ 1125, 1696},{ 1324, 1851}, - { 1540, 1984},{ 1763, 2102},{ 1989, 2210},{ 2202, 2310}, - { 2386, 2404},{ 2572, 2495},{ 2768, 2569},{ 2929, 2627}, - { 3071, 2684},{ 3231, 2749},{ 3374, 2825},{ 3514, 2894}, - { 3703, 2963},{ 3882, 3040},{ 4024, 3111},{ 4190, 3150} - }, - /*Cr qi=53 INTER*/ - { - { 87, -1},{ 99, 352},{ 100, 680},{ 125, 1027}, - { 175, 1355},{ 249, 1657},{ 343, 1946},{ 462, 2220}, - { 624, 2465},{ 844, 2671},{ 1122, 2841},{ 1435, 2989}, - { 1768, 3125},{ 2134, 3243},{ 2545, 3334},{ 3002, 3393}, - { 3490, 3412},{ 3965, 3405},{ 4401, 3384},{ 4797, 3359}, - { 5156, 3328},{ 5482, 3297},{ 5800, 3292},{ 6135, 3293} + { 39, -33},{ 48, 403},{ 86, 744},{ 110, 1101}, + { 134, 1461},{ 165, 1779},{ 205, 2095},{ 259, 2401}, + { 318, 2686},{ 386, 2958},{ 481, 3204},{ 610, 3415}, + { 753, 3603},{ 908, 3780},{ 1055, 3959},{ 1220, 4132}, + { 1422, 4281},{ 1656, 4419},{ 1939, 4512},{ 2259, 4574}, + { 2593, 4593},{ 2950, 4569},{ 3339, 4505},{ 3542, 4497} } } }, @@ -3406,557 +403,563 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ { /*Y' qi=54 INTRA*/ { - { 184, 94},{ 902, 1151},{ 1876, 1776},{ 2881, 2057}, - { 3832, 2200},{ 4785, 2315},{ 5709, 2442},{ 6570, 2562}, - { 7362, 2672},{ 8092, 2771},{ 8760, 2852},{ 9337, 2901}, - { 9874, 2943},{10402, 2995},{10928, 3059},{11443, 3126}, - {11926, 3178},{12396, 3220},{12805, 3251},{13139, 3266}, - {13466, 3280},{13822, 3304},{14184, 3322},{14585, 3342} + { 339, 30},{ 785, 1251},{ 2395, 1971},{ 4075, 2063}, + { 4924, 2135},{ 5806, 2270},{ 6604, 2372},{ 7224, 2497}, + { 7879, 2608},{ 8400, 2729},{ 8951, 2829},{ 9379, 2864}, + { 9782, 2955},{10230, 3020},{10704, 3132},{11264, 3272}, + {11618, 3284},{12034, 3394},{12500, 3482},{12767, 3484}, + {13162, 3580},{13552, 3565},{13997, 3732},{14320, 3715} }, /*Y' qi=54 INTER*/ { - { 60, 5},{ 169, 1308},{ 683, 2375},{ 1791, 3090}, - { 3478, 3412},{ 5184, 3470},{ 6568, 3455},{ 7697, 3446}, - { 8659, 3446},{ 9503, 3447},{10266, 3450},{10971, 3454}, - {11619, 3458},{12223, 3462},{12789, 3467},{13315, 3471}, - {13811, 3475},{14291, 3479},{14743, 3479},{15148, 3481}, - {15535, 3483},{15913, 3481},{16252, 3479},{16569, 3472} + { 65, 95},{ 269, 1312},{ 1152, 2242},{ 2336, 2863}, + { 3728, 3239},{ 4944, 3439},{ 6034, 3543},{ 7064, 3580}, + { 7991, 3586},{ 8849, 3568},{ 9605, 3561},{10306, 3550}, + {10919, 3544},{11466, 3530},{11972, 3528},{12401, 3536}, + {12818, 3511},{13185, 3522},{13523, 3505},{13827, 3505}, + {14114, 3522},{14395, 
3521},{14625, 3533},{14909, 3532} } }, { /*Cb qi=54 INTRA*/ { - { 13, 2},{ 165, 367},{ 318, 715},{ 498, 1030}, - { 698, 1301},{ 906, 1523},{ 1121, 1703},{ 1336, 1853}, - { 1549, 1984},{ 1765, 2100},{ 1974, 2207},{ 2192, 2306}, - { 2402, 2396},{ 2587, 2493},{ 2773, 2591},{ 2953, 2691}, - { 3119, 2778},{ 3277, 2858},{ 3430, 2940},{ 3603, 3004}, - { 3788, 3059},{ 3950, 3121},{ 4128, 3173},{ 4398, 3215} + { 148, -3},{ 218, 480},{ 351, 787},{ 437, 1069}, + { 550, 1350},{ 730, 1592},{ 931, 1784},{ 1243, 1884}, + { 1499, 1984},{ 1680, 2115},{ 1864, 2244},{ 2062, 2334}, + { 2278, 2407},{ 2442, 2496},{ 2602, 2603},{ 2783, 2686}, + { 2928, 2771},{ 3073, 2856},{ 3207, 2938},{ 3368, 2998}, + { 3516, 3077},{ 3699, 3122},{ 3818, 3202},{ 3939, 3230} }, /*Cb qi=54 INTER*/ { - { 100, -3},{ 109, 343},{ 107, 668},{ 125, 1018}, - { 169, 1354},{ 241, 1662},{ 353, 1938},{ 496, 2190}, - { 655, 2431},{ 843, 2655},{ 1082, 2851},{ 1381, 3015}, - { 1739, 3146},{ 2154, 3243},{ 2610, 3310},{ 3094, 3344}, - { 3581, 3358},{ 4034, 3371},{ 4457, 3384},{ 4867, 3399}, - { 5255, 3413},{ 5630, 3425},{ 6003, 3440},{ 6346, 3440} + { 48, -11},{ 54, 407},{ 86, 743},{ 122, 1083}, + { 176, 1400},{ 241, 1699},{ 347, 1968},{ 496, 2208}, + { 664, 2431},{ 863, 2637},{ 1120, 2816},{ 1442, 2961}, + { 1835, 3066},{ 2261, 3140},{ 2676, 3203},{ 3092, 3245}, + { 3480, 3266},{ 3862, 3286},{ 4254, 3305},{ 4604, 3316}, + { 4989, 3335},{ 5306, 3351},{ 5654, 3339},{ 5855, 3345} } }, { /*Cr qi=54 INTRA*/ { - { 23, 7},{ 174, 386},{ 338, 732},{ 549, 1034}, - { 751, 1289},{ 947, 1506},{ 1150, 1685},{ 1353, 1837}, - { 1572, 1969},{ 1800, 2087},{ 2031, 2192},{ 2248, 2291}, - { 2434, 2387},{ 2622, 2477},{ 2815, 2549},{ 2976, 2607}, - { 3126, 2663},{ 3286, 2727},{ 3427, 2807},{ 3569, 2877}, - { 3761, 2941},{ 3942, 3016},{ 4084, 3093},{ 4226, 3131} + { 137, 10},{ 212, 492},{ 315, 795},{ 470, 1061}, + { 612, 1333},{ 821, 1539},{ 1105, 1680},{ 1335, 1811}, + { 1566, 1927},{ 1773, 2038},{ 1973, 2153},{ 2148, 2259}, + { 2311, 2352},{ 2474, 2460},{ 2647, 2516},{ 2810, 2607}, + { 2928, 2638},{ 3085, 2742},{ 3232, 2815},{ 3348, 2899}, + { 3533, 2993},{ 3679, 3029},{ 3803, 3138},{ 3925, 3170} }, /*Cr qi=54 INTER*/ { - { 88, -2},{ 99, 351},{ 100, 680},{ 125, 1027}, - { 175, 1354},{ 248, 1656},{ 343, 1945},{ 463, 2219}, - { 626, 2463},{ 850, 2668},{ 1128, 2837},{ 1445, 2983}, - { 1791, 3111},{ 2168, 3224},{ 2597, 3309},{ 3075, 3351}, - { 3560, 3364},{ 4029, 3356},{ 4464, 3335},{ 4858, 3307}, - { 5218, 3275},{ 5547, 3256},{ 5850, 3247},{ 6171, 3214} + { 46, 2},{ 47, 419},{ 87, 746},{ 125, 1083}, + { 177, 1401},{ 249, 1687},{ 342, 1964},{ 453, 2226}, + { 627, 2454},{ 869, 2641},{ 1152, 2800},{ 1455, 2942}, + { 1776, 3077},{ 2135, 3187},{ 2524, 3287},{ 2984, 3325}, + { 3425, 3344},{ 3881, 3328},{ 4313, 3274},{ 4701, 3218}, + { 5027, 3171},{ 5299, 3130},{ 5597, 3107},{ 5791, 3120} } } }, { { - /*Y' qi=55 INTRA*/ - { - { 178, 95},{ 968, 1137},{ 2000, 1747},{ 3013, 2027}, - { 3966, 2173},{ 4920, 2294},{ 5842, 2427},{ 6702, 2553}, - { 7489, 2668},{ 8213, 2773},{ 8875, 2858},{ 9452, 2913}, - { 9986, 2959},{10504, 3016},{11023, 3085},{11530, 3157}, - {12011, 3213},{12480, 3257},{12882, 3291},{13214, 3310}, - {13542, 3325},{13890, 3350},{14248, 3371},{14671, 3398} + /*Y' qi=63 INTRA*/ + { + { -86, 167},{ 2070, 1104},{ 5138, 1428},{ 7014, 1535}, + { 8430, 1629},{ 9663, 1690},{10576, 1745},{11277, 1809}, + {12003, 1869},{12663, 1925},{13258, 1983},{13701, 2016}, + {14228, 2073},{14756, 2088},{15203, 2164},{15993, 2175}, + {16378, 2256},{16917, 2240},{17361, 2332},{17782, 
2312}, + {18376, 2381},{18728, 2362},{19224, 2408},{19705, 2392} }, - /*Y' qi=55 INTER*/ - { - { 59, 5},{ 170, 1307},{ 725, 2358},{ 1886, 3058}, - { 3589, 3385},{ 5284, 3459},{ 6654, 3458},{ 7771, 3461}, - { 8727, 3470},{ 9564, 3478},{10322, 3488},{11019, 3497}, - {11658, 3505},{12258, 3513},{12819, 3520},{13344, 3527}, - {13840, 3533},{14314, 3537},{14755, 3541},{15161, 3544}, - {15552, 3548},{15916, 3548},{16257, 3548},{16576, 3540} + /*Y' qi=63 INTER*/ + { + { -529, 154},{ 967, 1233},{ 4201, 1610},{ 6285, 1800}, + { 8058, 1908},{ 9439, 1968},{10737, 1987},{11999, 1979}, + {13003, 1972},{13854, 1963},{14584, 1965},{15217, 1955}, + {15773, 1956},{16229, 1949},{16735, 1952},{17085, 1956}, + {17508, 1956},{17821, 1961},{18191, 1961},{18465, 1982}, + {18792, 1975},{19158, 1995},{19378, 2010},{19817, 2021} } }, { - /*Cb qi=55 INTRA*/ - { - { 13, 2},{ 167, 366},{ 322, 714},{ 508, 1026}, - { 716, 1292},{ 930, 1511},{ 1148, 1690},{ 1366, 1839}, - { 1578, 1972},{ 1793, 2090},{ 2001, 2199},{ 2217, 2300}, - { 2427, 2393},{ 2609, 2495},{ 2784, 2600},{ 2961, 2704}, - { 3121, 2797},{ 3268, 2884},{ 3423, 2965},{ 3590, 3032}, - { 3764, 3096},{ 3926, 3165},{ 4101, 3223},{ 4405, 3258} + /*Cb qi=63 INTRA*/ + { + { 136, 4},{ 338, 438},{ 593, 730},{ 835, 974}, + { 1168, 1188},{ 1602, 1345},{ 2004, 1467},{ 2465, 1505}, + { 2799, 1574},{ 3091, 1669},{ 3384, 1758},{ 3673, 1817}, + { 3950, 1861},{ 4190, 1924},{ 4444, 1993},{ 4701, 2051}, + { 4915, 2123},{ 5119, 2166},{ 5329, 2231},{ 5576, 2259}, + { 5793, 2310},{ 6001, 2334},{ 6198, 2384},{ 6344, 2401} }, - /*Cb qi=55 INTER*/ - { - { 90, -4},{ 109, 344},{ 107, 668},{ 126, 1017}, - { 172, 1351},{ 249, 1657},{ 370, 1928},{ 527, 2174}, - { 702, 2407},{ 909, 2624},{ 1170, 2814},{ 1493, 2970}, - { 1869, 3097},{ 2292, 3192},{ 2752, 3258},{ 3232, 3295}, - { 3709, 3314},{ 4156, 3335},{ 4592, 3355},{ 5004, 3373}, - { 5377, 3389},{ 5737, 3411},{ 6092, 3432},{ 6473, 3423} + /*Cb qi=63 INTER*/ + { + { 49, 4},{ 51, 403},{ 98, 729},{ 185, 1034}, + { 352, 1304},{ 622, 1533},{ 1068, 1696},{ 1604, 1821}, + { 2203, 1924},{ 2890, 1988},{ 3622, 2017},{ 4359, 2019}, + { 5025, 2005},{ 5586, 2002},{ 6090, 1989},{ 6519, 1977}, + { 6927, 1977},{ 7305, 1968},{ 7730, 1984},{ 8087, 1981}, + { 8435, 1991},{ 8822, 1987},{ 9155, 2008},{ 9392, 2011} } }, { - /*Cr qi=55 INTRA*/ - { - { 23, 7},{ 175, 385},{ 342, 730},{ 561, 1028}, - { 771, 1279},{ 973, 1493},{ 1181, 1669},{ 1384, 1822}, - { 1602, 1956},{ 1830, 2076},{ 2057, 2184},{ 2270, 2288}, - { 2452, 2389},{ 2637, 2484},{ 2823, 2559},{ 2983, 2621}, - { 3129, 2682},{ 3280, 2753},{ 3417, 2833},{ 3554, 2904}, - { 3743, 2977},{ 3921, 3060},{ 4055, 3137},{ 4185, 3186} + /*Cr qi=63 INTRA*/ + { + { 131, 11},{ 334, 448},{ 569, 739},{ 929, 946}, + { 1285, 1145},{ 1718, 1274},{ 2176, 1343},{ 2531, 1424}, + { 2866, 1504},{ 3176, 1580},{ 3475, 1657},{ 3736, 1728}, + { 3962, 1807},{ 4232, 1872},{ 4425, 1921},{ 4657, 1976}, + { 4817, 2009},{ 5063, 2082},{ 5281, 2129},{ 5480, 2199}, + { 5743, 2258},{ 5887, 2283},{ 6124, 2358},{ 6273, 2378} }, - /*Cr qi=55 INTER*/ - { - { 85, 0},{ 99, 352},{ 100, 679},{ 126, 1025}, - { 178, 1351},{ 256, 1650},{ 359, 1935},{ 493, 2202}, - { 675, 2439},{ 921, 2636},{ 1220, 2799},{ 1552, 2941}, - { 1910, 3068},{ 2303, 3177},{ 2735, 3262},{ 3206, 3311}, - { 3689, 3333},{ 4152, 3327},{ 4588, 3299},{ 4978, 3272}, - { 5325, 3243},{ 5651, 3221},{ 5969, 3210},{ 6218, 3185} + /*Cr qi=63 INTER*/ + { + { 47, 15},{ 40, 405},{ 100, 730},{ 189, 1037}, + { 351, 1303},{ 625, 1526},{ 984, 1719},{ 1512, 1862}, + { 2189, 1947},{ 2895, 2003},{ 
3576, 2046},{ 4249, 2072}, + { 4901, 2068},{ 5514, 2043},{ 6079, 2009},{ 6528, 1977}, + { 6927, 1940},{ 7274, 1915},{ 7580, 1894},{ 7910, 1910}, + { 8211, 1902},{ 8472, 1920},{ 8742, 1926},{ 8981, 1930} } } - }, + } +}; + +# if !defined(OC_COLLECT_METRICS) +static const +# endif +oc_mode_rd OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={ { { - /*Y' qi=56 INTRA*/ - { - { 137, 104},{ 1048, 1128},{ 2147, 1760},{ 3261, 2029}, - { 4319, 2131},{ 5310, 2234},{ 6245, 2351},{ 7101, 2464}, - { 7886, 2572},{ 8610, 2675},{ 9270, 2762},{ 9840, 2818}, - {10365, 2869},{10875, 2928},{11393, 2997},{11900, 3071}, - {12371, 3128},{12834, 3172},{13233, 3208},{13562, 3228}, - {13878, 3245},{14221, 3271},{14584, 3292},{15008, 3320} + /*Y' qi=0 INTRA*/ + { + { 33, 122},{ 57, 1297},{ 13, 2226},{ 157, 3890}, + { 227, 3682},{ 169, 3084},{ 197, 2700},{ 227, 3238}, + { 290, 4294},{ 354, 5230},{ 406, 5615},{ 417, 5322}, + { 452, 5462},{ 455, 5683},{ 493, 5938},{ 553, 6374}, + { 558, 6464},{ 606, 6493},{ 616, 6417},{ 643, 6557}, + { 641, 6664},{ 716, 7285},{ 748, 7518},{ 747, 7502} }, - /*Y' qi=56 INTER*/ - { - { 19, 21},{ 207, 1292},{ 1031, 2252},{ 2553, 2846}, - { 4463, 3085},{ 6137, 3131},{ 7441, 3151},{ 8526, 3172}, - { 9468, 3193},{10301, 3209},{11059, 3224},{11760, 3237}, - {12405, 3249},{13008, 3261},{13570, 3270},{14100, 3278}, - {14597, 3284},{15074, 3289},{15524, 3297},{15929, 3302}, - {16314, 3306},{16675, 3307},{17004, 3305},{17288, 3301} + /*Y' qi=0 INTER*/ + { + { 16, 205},{ 5, 1338},{ 16, 2554},{ 6, 3809}, + { 9, 5188},{ 58, 6446},{ 76, 7561},{ 95, 8648}, + { 124, 9713},{ 158,10787},{ 193,11887},{ 233,12991}, + { 270,14116},{ 307,15236},{ 341,16346},{ 372,17426}, + { 398,18499},{ 422,19594},{ 448,20669},{ 479,21732}, + { 526,22720},{ 583,23572},{ 655,24516},{ 758,24647} } }, { - /*Cb qi=56 INTRA*/ - { - { 16, 3},{ 188, 367},{ 353, 712},{ 546, 1017}, - { 765, 1275},{ 989, 1484},{ 1221, 1653},{ 1459, 1791}, - { 1681, 1920},{ 1893, 2046},{ 2102, 2160},{ 2323, 2257}, - { 2534, 2347},{ 2720, 2447},{ 2902, 2549},{ 3075, 2654}, - { 3239, 2749},{ 3392, 2835},{ 3544, 2920},{ 3712, 2988}, - { 3882, 3052},{ 4052, 3123},{ 4227, 3181},{ 4483, 3213} + /*Cb qi=0 INTRA*/ + { + { 26, 40},{ 23, 589},{ 27, 784},{ 27, 1079}, + { 24, 1186},{ 25, 1641},{ 25, 1915},{ 29, 2207}, + { 39, 2361},{ 39, 2746},{ 32, 3020},{ 16, 3387}, + { 31, 3604},{ 36, 4076},{ 69, 4426},{ 102, 4724}, + { 139, 4923},{ 196, 5061},{ 211, 5103},{ 214, 5063}, + { 161, 4466},{ 208, 4793},{ 218, 4537},{ 219, 4539} }, - /*Cb qi=56 INTER*/ - { - { 92, -1},{ 111, 343},{ 114, 665},{ 148, 1003}, - { 224, 1321},{ 345, 1609},{ 526, 1858},{ 754, 2077}, - { 1009, 2281},{ 1319, 2464},{ 1702, 2614},{ 2145, 2732}, - { 2625, 2824},{ 3123, 2890},{ 3634, 2933},{ 4137, 2954}, - { 4614, 2965},{ 5052, 2988},{ 5468, 3015},{ 5852, 3035}, - { 6213, 3060},{ 6557, 3081},{ 6906, 3094},{ 7243, 3112} + /*Cb qi=0 INTER*/ + { + { 3, 164},{ 1, 535},{ 1, 779},{ 2, 1048}, + { 3, 1267},{ 1, 1625},{ 2, 1921},{ 5, 2224}, + { 8, 2481},{ 8, 2813},{ 4, 3089},{ -2, 3386}, + { -9, 3642},{ -14, 3993},{ -11, 4300},{ -6, 4628}, + { 4, 4929},{ 25, 5299},{ 44, 5623},{ 83, 5915}, + { 93, 6186},{ 91, 6483},{ 90, 6775},{ 95, 6952} } }, { - /*Cr qi=56 INTRA*/ - { - { 28, 8},{ 195, 385},{ 373, 727},{ 598, 1019}, - { 816, 1263},{ 1033, 1465},{ 1260, 1630},{ 1482, 1773}, - { 1717, 1900},{ 1949, 2018},{ 2178, 2128},{ 2393, 2233}, - { 2570, 2338},{ 2749, 2435},{ 2937, 2514},{ 3097, 2577}, - { 3240, 2638},{ 3398, 2709},{ 3540, 2791},{ 3673, 2865}, - { 3869, 2938},{ 4049, 3019},{ 4179, 3095},{ 4330, 3137} 
+ /*Cr qi=0 INTRA*/ + { + { 22, 49},{ 26, 579},{ 23, 762},{ 15, 1050}, + { 20, 1191},{ 24, 1608},{ 26, 1875},{ 35, 2173}, + { 39, 2359},{ 30, 2736},{ 16, 2987},{ 0, 3334}, + { 14, 3625},{ 11, 4095},{ 57, 4512},{ 95, 4793}, + { 141, 4949},{ 206, 5242},{ 230, 5191},{ 242, 5177}, + { 178, 4775},{ 237, 5010},{ 223, 4656},{ 224, 4657} }, - /*Cr qi=56 INTER*/ - { - { 83, 0},{ 99, 353},{ 103, 676},{ 146, 1010}, - { 232, 1320},{ 355, 1601},{ 512, 1866},{ 713, 2109}, - { 988, 2312},{ 1344, 2471},{ 1750, 2602},{ 2180, 2719}, - { 2642, 2819},{ 3141, 2892},{ 3653, 2939},{ 4159, 2961}, - { 4636, 2961},{ 5072, 2945},{ 5464, 2917},{ 5813, 2895}, - { 6134, 2890},{ 6458, 2883},{ 6735, 2881},{ 6953, 2902} + /*Cr qi=0 INTER*/ + { + { 3, 163},{ 1, 536},{ 1, 773},{ 3, 1023}, + { 2, 1225},{ 1, 1607},{ 1, 1900},{ 5, 2204}, + { 9, 2453},{ 8, 2781},{ 3, 3049},{ -5, 3338}, + { -13, 3570},{ -17, 3950},{ -13, 4255},{ -6, 4596}, + { 7, 4893},{ 33, 5300},{ 53, 5632},{ 97, 5942}, + { 103, 6216},{ 96, 6522},{ 91, 6849},{ 98, 6995} } } }, { { - /*Y' qi=57 INTRA*/ - { - { 170, 106},{ 1106, 1120},{ 2246, 1740},{ 3399, 1993}, - { 4482, 2077},{ 5492, 2167},{ 6446, 2273},{ 7324, 2379}, - { 8130, 2482},{ 8866, 2578},{ 9537, 2661},{10119, 2715}, - {10646, 2762},{11161, 2820},{11694, 2886},{12214, 2957}, - {12693, 3013},{13166, 3053},{13569, 3087},{13897, 3106}, - {14224, 3122},{14568, 3148},{14931, 3167},{15390, 3192} + /*Y' qi=9 INTRA*/ + { + { 47, 152},{ 50, 1213},{ 144, 2543},{ 242, 2332}, + { 210, 1894},{ 250, 2386},{ 328, 3094},{ 407, 3419}, + { 464, 3507},{ 522, 3770},{ 613, 4194},{ 657, 4618}, + { 753, 5137},{ 796, 5248},{ 842, 5110},{ 927, 5330}, + { 994, 5487},{ 1008, 5463},{ 1101, 5794},{ 1169, 5966}, + { 1208, 6121},{ 1331, 6447},{ 1445, 6618},{ 1449, 6616} }, - /*Y' qi=57 INTER*/ - { - { 19, 20},{ 205, 1292},{ 1096, 2229},{ 2775, 2766}, - { 4811, 2943},{ 6512, 2964},{ 7832, 2976},{ 8940, 2990}, - { 9903, 3004},{10755, 3017},{11532, 3029},{12243, 3039}, - {12891, 3047},{13502, 3058},{14073, 3065},{14603, 3071}, - {15097, 3078},{15581, 3083},{16036, 3086},{16452, 3090}, - {16855, 3093},{17222, 3094},{17552, 3092},{17851, 3098} + /*Y' qi=9 INTER*/ + { + { 4, 218},{ 16, 1314},{ 4, 2563},{ 37, 3882}, + { 83, 5058},{ 109, 6184},{ 161, 7292},{ 224, 8389}, + { 287, 9485},{ 349,10565},{ 411,11608},{ 464,12648}, + { 518,13664},{ 575,14650},{ 649,15585},{ 742,16451}, + { 862,17214},{ 1003,17860},{ 1179,18325},{ 1372,18648}, + { 1576,18878},{ 1795,18903},{ 2040,18880},{ 2116,18759} } }, { - /*Cb qi=57 INTRA*/ - { - { 16, 3},{ 197, 365},{ 384, 704},{ 603, 1001}, - { 837, 1252},{ 1077, 1455},{ 1326, 1618},{ 1581, 1748}, - { 1819, 1871},{ 2042, 1993},{ 2264, 2104},{ 2500, 2196}, - { 2722, 2280},{ 2916, 2375},{ 3103, 2473},{ 3290, 2575}, - { 3456, 2667},{ 3612, 2748},{ 3775, 2829},{ 3958, 2896}, - { 4145, 2947},{ 4307, 3012},{ 4476, 3070},{ 4733, 3110} + /*Cb qi=9 INTRA*/ + { + { 27, 42},{ 23, 587},{ 34, 782},{ 37, 1079}, + { 34, 1204},{ 42, 1630},{ 37, 1887},{ 25, 2210}, + { 40, 2455},{ 71, 2880},{ 112, 3193},{ 156, 3427}, + { 168, 3403},{ 217, 3488},{ 203, 3335},{ 224, 3200}, + { 191, 2742},{ 195, 2810},{ 207, 2665},{ 201, 2661}, + { 169, 2078},{ 211, 2720},{ 226, 2813},{ 228, 2824} }, - /*Cb qi=57 INTER*/ - { - { 94, -1},{ 111, 344},{ 112, 665},{ 147, 1002}, - { 227, 1319},{ 353, 1604},{ 543, 1849},{ 785, 2062}, - { 1066, 2257},{ 1408, 2430},{ 1827, 2568},{ 2320, 2670}, - { 2848, 2743},{ 3386, 2791},{ 3934, 2812},{ 4453, 2820}, - { 4929, 2830},{ 5368, 2842},{ 5787, 2856},{ 6190, 2875}, - { 6554, 2896},{ 6895, 2913},{ 7229, 
2927},{ 7572, 2932} + /*Cb qi=9 INTER*/ + { + { 4, 158},{ 2, 537},{ 3, 779},{ 2, 1045}, + { 3, 1284},{ 7, 1629},{ 7, 1917},{ 1, 2218}, + { -4, 2497},{ -3, 2845},{ 6, 3162},{ 23, 3482}, + { 42, 3788},{ 62, 4116},{ 76, 4416},{ 84, 4700}, + { 91, 4975},{ 95, 5259},{ 97, 5518},{ 94, 5790}, + { 99, 6052},{ 111, 6311},{ 126, 6601},{ 136, 6719} } }, { - /*Cr qi=57 INTRA*/ - { - { 28, 8},{ 207, 383},{ 413, 716},{ 661, 999}, - { 889, 1237},{ 1123, 1433},{ 1365, 1592},{ 1603, 1731}, - { 1853, 1852},{ 2103, 1965},{ 2345, 2072},{ 2571, 2173}, - { 2763, 2271},{ 2949, 2364},{ 3146, 2438},{ 3315, 2497}, - { 3459, 2552},{ 3618, 2616},{ 3767, 2697},{ 3906, 2773}, - { 4099, 2841},{ 4281, 2916},{ 4429, 2987},{ 4569, 3030} + /*Cr qi=9 INTRA*/ + { + { 25, 50},{ 32, 576},{ 32, 762},{ 21, 1049}, + { 28, 1207},{ 41, 1603},{ 36, 1839},{ 26, 2170}, + { 34, 2462},{ 59, 2872},{ 109, 3176},{ 157, 3364}, + { 188, 3397},{ 231, 3418},{ 250, 3341},{ 261, 3228}, + { 222, 2814},{ 258, 3091},{ 234, 2915},{ 228, 3042}, + { 210, 2610},{ 273, 3210},{ 274, 3231},{ 276, 3239} }, - /*Cr qi=57 INTER*/ - { - { 85, 0},{ 99, 352},{ 102, 675},{ 147, 1008}, - { 235, 1317},{ 363, 1597},{ 529, 1858},{ 748, 2094}, - { 1050, 2287},{ 1439, 2436},{ 1877, 2557},{ 2352, 2660}, - { 2869, 2740},{ 3413, 2791},{ 3962, 2815},{ 4485, 2819}, - { 4955, 2816},{ 5382, 2800},{ 5769, 2772},{ 6107, 2748}, - { 6443, 2740},{ 6754, 2739},{ 7029, 2737},{ 7284, 2745} + /*Cr qi=9 INTER*/ + { + { 4, 156},{ 2, 538},{ 3, 772},{ 2, 1028}, + { 3, 1254},{ 7, 1613},{ 7, 1893},{ 0, 2191}, + { -8, 2454},{ -4, 2811},{ 7, 3121},{ 27, 3442}, + { 48, 3749},{ 72, 4101},{ 88, 4410},{ 91, 4698}, + { 99, 4988},{ 99, 5279},{ 101, 5542},{ 95, 5813}, + { 99, 6088},{ 114, 6367},{ 125, 6683},{ 137, 6761} } } }, { { - /*Y' qi=58 INTRA*/ - { - { 164, 109},{ 1198, 1111},{ 2396, 1737},{ 3606, 1978}, - { 4727, 2048},{ 5749, 2138},{ 6708, 2243},{ 7584, 2347}, - { 8388, 2449},{ 9122, 2549},{ 9784, 2635},{10354, 2691}, - {10876, 2740},{11385, 2800},{11912, 2869},{12429, 2941}, - {12902, 2997},{13375, 3040},{13779, 3075},{14103, 3096}, - {14435, 3112},{14783, 3140},{15141, 3160},{15599, 3186} + /*Y' qi=18 INTRA*/ + { + { 51, 88},{ 88, 1344},{ 258, 1643},{ 228, 1325}, + { 372, 2208},{ 443, 2371},{ 520, 2382},{ 584, 2477}, + { 739, 2906},{ 859, 3348},{ 1008, 3697},{ 1131, 3884}, + { 1278, 4110},{ 1349, 4229},{ 1431, 4329},{ 1544, 4395}, + { 1602, 4439},{ 1669, 4535},{ 1814, 4656},{ 1883, 4716}, + { 1957, 4940},{ 2101, 5019},{ 2259, 5249},{ 2265, 5246} }, - /*Y' qi=58 INTER*/ - { - { 14, 23},{ 210, 1290},{ 1277, 2178},{ 3118, 2677}, - { 5207, 2834},{ 6902, 2857},{ 8218, 2878},{ 9323, 2900}, - {10285, 2919},{11132, 2934},{11899, 2949},{12599, 2961}, - {13235, 2971},{13835, 2982},{14394, 2991},{14917, 2997}, - {15412, 3005},{15882, 3009},{16325, 3013},{16735, 3016}, - {17131, 3018},{17501, 3021},{17824, 3021},{18125, 3016} + /*Y' qi=18 INTER*/ + { + { 26, 195},{ 1, 1317},{ 45, 2595},{ 103, 3750}, + { 168, 4903},{ 281, 6007},{ 397, 7062},{ 513, 8064}, + { 630, 9010},{ 758, 9902},{ 906,10732},{ 1095,11463}, + { 1338,12060},{ 1629,12490},{ 1969,12724},{ 2313,12842}, + { 2666,12828},{ 2993,12747},{ 3294,12670},{ 3558,12553}, + { 3813,12440},{ 3990,12379},{ 4177,12291},{ 4226,12265} } }, { - /*Cb qi=58 INTRA*/ - { - { 17, 3},{ 200, 365},{ 389, 703},{ 613, 996}, - { 853, 1243},{ 1095, 1445},{ 1349, 1604},{ 1613, 1731}, - { 1853, 1853},{ 2074, 1978},{ 2292, 2091},{ 2526, 2184}, - { 2750, 2266},{ 2945, 2360},{ 3134, 2458},{ 3320, 2561}, - { 3482, 2654},{ 3641, 2737},{ 3804, 2818},{ 3985, 2881}, - { 4168, 
2935},{ 4331, 3003},{ 4499, 3060},{ 4751, 3100} + /*Cb qi=18 INTRA*/ + { + { 31, 43},{ 33, 585},{ 40, 781},{ 58, 1077}, + { 45, 1189},{ 58, 1655},{ 66, 1983},{ 123, 2221}, + { 168, 2193},{ 227, 2321},{ 241, 2246},{ 250, 2208}, + { 221, 1786},{ 250, 2087},{ 247, 2036},{ 250, 2164}, + { 241, 2054},{ 287, 2453},{ 302, 2551},{ 335, 2758}, + { 279, 2511},{ 379, 2973},{ 404, 3028},{ 406, 3029} }, - /*Cb qi=58 INTER*/ - { - { 94, -1},{ 112, 345},{ 112, 665},{ 152, 998}, - { 247, 1307},{ 406, 1580},{ 644, 1810},{ 938, 2007}, - { 1271, 2189},{ 1668, 2348},{ 2151, 2470},{ 2691, 2558}, - { 3249, 2619},{ 3798, 2659},{ 4334, 2682},{ 4849, 2692}, - { 5314, 2700},{ 5747, 2721},{ 6167, 2742},{ 6547, 2765}, - { 6902, 2790},{ 7251, 2804},{ 7583, 2819},{ 7924, 2833} + /*Cb qi=18 INTER*/ + { + { 7, 153},{ 4, 537},{ 3, 777},{ 9, 1034}, + { 6, 1282},{ 0, 1630},{ 0, 1943},{ 21, 2252}, + { 48, 2567},{ 67, 2881},{ 83, 3178},{ 89, 3463}, + { 92, 3738},{ 99, 4024},{ 114, 4289},{ 131, 4552}, + { 153, 4814},{ 179, 5081},{ 207, 5333},{ 241, 5581}, + { 273, 5822},{ 303, 6068},{ 335, 6368},{ 353, 6432} } }, { - /*Cr qi=58 INTRA*/ - { - { 29, 8},{ 210, 382},{ 419, 714},{ 671, 993}, - { 903, 1229},{ 1141, 1422},{ 1390, 1578},{ 1635, 1713}, - { 1889, 1833},{ 2140, 1946},{ 2379, 2055},{ 2604, 2157}, - { 2794, 2256},{ 2977, 2349},{ 3174, 2422},{ 3339, 2482}, - { 3483, 2537},{ 3643, 2604},{ 3790, 2684},{ 3927, 2757}, - { 4112, 2826},{ 4294, 2900},{ 4451, 2975},{ 4600, 3011} + /*Cr qi=18 INTRA*/ + { + { 31, 49},{ 42, 575},{ 42, 763},{ 38, 1045}, + { 41, 1184},{ 56, 1631},{ 87, 1968},{ 163, 2177}, + { 191, 2188},{ 236, 2264},{ 240, 2101},{ 234, 2047}, + { 206, 1651},{ 222, 1966},{ 238, 2013},{ 240, 2176}, + { 229, 2098},{ 321, 2592},{ 341, 2748},{ 378, 3025}, + { 367, 2849},{ 442, 3283},{ 453, 3315},{ 455, 3313} }, - /*Cr qi=58 INTER*/ - { - { 86, 0},{ 99, 352},{ 103, 675},{ 151, 1004}, - { 256, 1306},{ 417, 1573},{ 628, 1819},{ 901, 2040}, - { 1262, 2217},{ 1705, 2353},{ 2191, 2466},{ 2713, 2556}, - { 3268, 2622},{ 3831, 2664},{ 4374, 2682},{ 4881, 2686}, - { 5339, 2685},{ 5747, 2668},{ 6123, 2646},{ 6465, 2630}, - { 6783, 2618},{ 7082, 2623},{ 7366, 2632},{ 7673, 2654} + /*Cr qi=18 INTER*/ + { + { 6, 151},{ 3, 539},{ 3, 775},{ 8, 1027}, + { 6, 1260},{ -3, 1619},{ 0, 1927},{ 24, 2238}, + { 58, 2558},{ 76, 2871},{ 92, 3173},{ 96, 3461}, + { 98, 3742},{ 104, 4032},{ 116, 4306},{ 136, 4578}, + { 158, 4839},{ 185, 5123},{ 217, 5383},{ 250, 5642}, + { 279, 5910},{ 306, 6169},{ 333, 6502},{ 350, 6522} } } }, { { - /*Y' qi=59 INTRA*/ - { - { 142, 112},{ 1259, 1100},{ 2552, 1711},{ 3815, 1933}, - { 4955, 1987},{ 5983, 2068},{ 6949, 2165},{ 7832, 2263}, - { 8645, 2359},{ 9392, 2454},{10066, 2536},{10643, 2589}, - {11174, 2636},{11696, 2693},{12230, 2758},{12752, 2826}, - {13239, 2883},{13721, 2926},{14139, 2959},{14479, 2978}, - {14811, 2993},{15166, 3020},{15532, 3039},{16000, 3062} + /*Y' qi=27 INTRA*/ + { + { 10, 85},{ 280, 1349},{ 278, 815},{ 497, 1699}, + { 600, 1569},{ 744, 1944},{ 894, 2114},{ 1040, 2292}, + { 1216, 2484},{ 1485, 2816},{ 1778, 3065},{ 1990, 3243}, + { 2199, 3381},{ 2326, 3515},{ 2370, 3422},{ 2512, 3581}, + { 2548, 3526},{ 2656, 3615},{ 2803, 3679},{ 2946, 3766}, + { 3023, 3824},{ 3179, 3908},{ 3374, 4035},{ 3377, 4030} }, - /*Y' qi=59 INTER*/ - { - { 8, 25},{ 211, 1289},{ 1394, 2144},{ 3421, 2580}, - { 5611, 2689},{ 7316, 2701},{ 8643, 2717},{ 9762, 2734}, - {10735, 2750},{11587, 2763},{12353, 2775},{13056, 2785}, - {13693, 2793},{14288, 2805},{14843, 2814},{15361, 2821}, - {15857, 2827},{16328, 2831},{16763, 
2834},{17171, 2838}, - {17568, 2840},{17941, 2842},{18285, 2843},{18586, 2839} + /*Y' qi=27 INTER*/ + { + { -2, 172},{ 31, 1347},{ 117, 2488},{ 245, 3651}, + { 448, 4719},{ 668, 5679},{ 918, 6524},{ 1204, 7255}, + { 1557, 7848},{ 1998, 8281},{ 2511, 8531},{ 3055, 8642}, + { 3582, 8648},{ 4062, 8611},{ 4482, 8582},{ 4845, 8560}, + { 5140, 8560},{ 5423, 8581},{ 5645, 8596},{ 5855, 8586}, + { 6061, 8608},{ 6211, 8558},{ 6402, 8583},{ 6472, 8575} } }, { - /*Cb qi=59 INTRA*/ - { - { 17, 3},{ 224, 363},{ 441, 696},{ 689, 982}, - { 945, 1222},{ 1204, 1416},{ 1474, 1571},{ 1751, 1695}, - { 2001, 1816},{ 2228, 1941},{ 2453, 2055},{ 2693, 2147}, - { 2924, 2227},{ 3125, 2321},{ 3321, 2416},{ 3510, 2520}, - { 3676, 2616},{ 3839, 2699},{ 4008, 2778},{ 4193, 2842}, - { 4371, 2898},{ 4535, 2965},{ 4710, 3023},{ 4921, 3068} + /*Cb qi=27 INTRA*/ + { + { 47, 49},{ 35, 580},{ 64, 778},{ 69, 1071}, + { 98, 1289},{ 186, 1556},{ 177, 1654},{ 197, 1736}, + { 211, 1373},{ 284, 1742},{ 321, 1840},{ 344, 2024}, + { 321, 1969},{ 386, 2254},{ 397, 2281},{ 425, 2320}, + { 396, 2088},{ 448, 2284},{ 462, 2213},{ 482, 2274}, + { 410, 1894},{ 513, 2310},{ 546, 2332},{ 549, 2334} }, - /*Cb qi=59 INTER*/ - { - { 95, -5},{ 111, 343},{ 112, 664},{ 157, 995}, - { 258, 1302},{ 429, 1569},{ 691, 1790},{ 1017, 1977}, - { 1387, 2148},{ 1832, 2294},{ 2368, 2401},{ 2961, 2472}, - { 3553, 2518},{ 4133, 2545},{ 4688, 2557},{ 5198, 2563}, - { 5663, 2574},{ 6100, 2590},{ 6511, 2608},{ 6898, 2621}, - { 7274, 2634},{ 7631, 2655},{ 7984, 2669},{ 8361, 2669} + /*Cb qi=27 INTER*/ + { + { 11, 145},{ 5, 539},{ 11, 771},{ 0, 1033}, + { 9, 1334},{ 44, 1644},{ 70, 1934},{ 87, 2227}, + { 96, 2508},{ 113, 2812},{ 139, 3085},{ 174, 3352}, + { 216, 3614},{ 261, 3873},{ 305, 4123},{ 349, 4372}, + { 396, 4611},{ 442, 4853},{ 493, 5088},{ 543, 5313}, + { 600, 5537},{ 662, 5752},{ 737, 6018},{ 775, 6037} } }, { - /*Cr qi=59 INTRA*/ - { - { 31, 8},{ 240, 379},{ 480, 706},{ 748, 978}, - { 993, 1208},{ 1250, 1394},{ 1519, 1543},{ 1779, 1674}, - { 2047, 1792},{ 2307, 1904},{ 2552, 2013},{ 2780, 2116}, - { 2973, 2216},{ 3165, 2309},{ 3362, 2383},{ 3528, 2444}, - { 3677, 2499},{ 3841, 2566},{ 3995, 2646},{ 4139, 2720}, - { 4324, 2793},{ 4504, 2867},{ 4658, 2939},{ 4806, 2975} + /*Cr qi=27 INTRA*/ + { + { 49, 52},{ 57, 570},{ 61, 762},{ 44, 1048}, + { 80, 1291},{ 196, 1513},{ 224, 1522},{ 242, 1532}, + { 213, 1293},{ 260, 1639},{ 253, 1691},{ 291, 1915}, + { 294, 1897},{ 367, 2178},{ 395, 2258},{ 432, 2310}, + { 407, 2105},{ 503, 2369},{ 492, 2293},{ 552, 2421}, + { 496, 2099},{ 598, 2549},{ 624, 2531},{ 627, 2532} }, - /*Cr qi=59 INTER*/ - { - { 89, -3},{ 98, 352},{ 103, 674},{ 156, 1002}, - { 268, 1300},{ 441, 1562},{ 673, 1801},{ 980, 2010}, - { 1385, 2175},{ 1868, 2301},{ 2401, 2402},{ 2984, 2474}, - { 3591, 2520},{ 4179, 2545},{ 4729, 2555},{ 5232, 2553}, - { 5679, 2545},{ 6081, 2530},{ 6447, 2510},{ 6791, 2496}, - { 7101, 2487},{ 7393, 2489},{ 7684, 2499},{ 7950, 2501} + /*Cr qi=27 INTER*/ + { + { 10, 147},{ 4, 538},{ 11, 769},{ 0, 1022}, + { 9, 1318},{ 51, 1635},{ 80, 1925},{ 97, 2214}, + { 101, 2493},{ 115, 2805},{ 143, 3083},{ 182, 3361}, + { 226, 3625},{ 270, 3898},{ 319, 4157},{ 366, 4405}, + { 418, 4649},{ 467, 4904},{ 509, 5157},{ 548, 5412}, + { 589, 5659},{ 636, 5909},{ 683, 6208},{ 710, 6190} } } }, { { - /*Y' qi=60 INTRA*/ - { - { 92, 116},{ 1361, 1085},{ 2746, 1686},{ 4050, 1895}, - { 5209, 1939},{ 6244, 2012},{ 7213, 2103},{ 8105, 2197}, - { 8928, 2290},{ 9685, 2381},{10371, 2460},{10952, 2511}, - {11487, 2556},{12026, 2611},{12574, 
2674},{13102, 2739}, - {13597, 2793},{14092, 2831},{14523, 2862},{14862, 2881}, - {15198, 2897},{15568, 2923},{15949, 2941},{16416, 2964} + /*Y' qi=36 INTRA*/ + { + { 86, 252},{ 345, 662},{ 476, 1143},{ 698, 1169}, + { 894, 1457},{ 1218, 1728},{ 1465, 1849},{ 1731, 2019}, + { 2183, 2298},{ 2666, 2511},{ 3116, 2731},{ 3371, 2813}, + { 3621, 2923},{ 3675, 2949},{ 3710, 2921},{ 3740, 2896}, + { 3746, 2895},{ 3886, 2978},{ 4069, 2991},{ 4229, 3016}, + { 4338, 3102},{ 4530, 3124},{ 4751, 3248},{ 4753, 3244} }, - /*Y' qi=60 INTER*/ - { - { 4, 30},{ 215, 1287},{ 1547, 2104},{ 3729, 2491}, - { 5973, 2568},{ 7672, 2577},{ 9001, 2591},{10123, 2606}, - {11094, 2620},{11943, 2632},{12709, 2643},{13409, 2652}, - {14044, 2660},{14641, 2669},{15193, 2677},{15709, 2684}, - {16201, 2689},{16675, 2693},{17118, 2696},{17522, 2701}, - {17920, 2704},{18293, 2706},{18620, 2702},{18923, 2700} + /*Y' qi=36 INTER*/ + { + { 0, 208},{ 73, 1293},{ 248, 2449},{ 616, 3461}, + { 1061, 4329},{ 1601, 4986},{ 2189, 5447},{ 2875, 5723}, + { 3620, 5844},{ 4328, 5879},{ 4954, 5880},{ 5490, 5890}, + { 5934, 5901},{ 6353, 5926},{ 6706, 5924},{ 7036, 5930}, + { 7338, 5938},{ 7600, 5930},{ 7870, 5939},{ 8065, 5921}, + { 8318, 5914},{ 8451, 5912},{ 8648, 5923},{ 8734, 5926} } }, { - /*Cb qi=60 INTRA*/ - { - { 18, 3},{ 227, 362},{ 447, 694},{ 708, 974}, - { 981, 1207},{ 1252, 1397},{ 1532, 1547},{ 1822, 1663}, - { 2082, 1780},{ 2316, 1903},{ 2548, 2013},{ 2794, 2101}, - { 3029, 2178},{ 3242, 2266},{ 3445, 2360},{ 3638, 2459}, - { 3816, 2547},{ 3980, 2628},{ 4146, 2708},{ 4344, 2766}, - { 4546, 2812},{ 4725, 2872},{ 4880, 2930},{ 5054, 2966} + /*Cb qi=36 INTRA*/ + { + { 52, 54},{ 52, 575},{ 103, 776},{ 185, 1072}, + { 172, 1069},{ 211, 1302},{ 217, 1413},{ 285, 1586}, + { 330, 1463},{ 453, 1694},{ 500, 1741},{ 545, 1852}, + { 501, 1650},{ 584, 1874},{ 587, 1856},{ 638, 1919}, + { 581, 1742},{ 670, 1953},{ 688, 1934},{ 731, 2030}, + { 637, 1794},{ 806, 2123},{ 840, 2091},{ 843, 2091} }, - /*Cb qi=60 INTER*/ - { - { 97, -4},{ 112, 343},{ 114, 664},{ 162, 993}, - { 273, 1294},{ 472, 1553},{ 774, 1762},{ 1138, 1939}, - { 1543, 2102},{ 2034, 2236},{ 2620, 2329},{ 3244, 2389}, - { 3860, 2423},{ 4443, 2440},{ 4997, 2449},{ 5502, 2455}, - { 5962, 2458},{ 6413, 2466},{ 6836, 2485},{ 7217, 2506}, - { 7592, 2518},{ 7957, 2533},{ 8291, 2543},{ 8574, 2545} + /*Cb qi=36 INTER*/ + { + { 19, 142},{ 17, 534},{ 6, 772},{ 44, 1023}, + { 82, 1296},{ 94, 1614},{ 117, 1903},{ 158, 2187}, + { 218, 2450},{ 285, 2703},{ 352, 2943},{ 421, 3181}, + { 489, 3415},{ 564, 3644},{ 647, 3861},{ 748, 4060}, + { 861, 4246},{ 993, 4419},{ 1132, 4576},{ 1282, 4744}, + { 1445, 4894},{ 1600, 5034},{ 1782, 5211},{ 1837, 5200} } }, { - /*Cr qi=60 INTRA*/ - { - { 32, 8},{ 243, 379},{ 488, 702},{ 771, 968}, - { 1030, 1192},{ 1300, 1373},{ 1581, 1517},{ 1854, 1643}, - { 2127, 1757},{ 2393, 1864},{ 2645, 1968},{ 2879, 2068}, - { 3078, 2166},{ 3277, 2256},{ 3484, 2325},{ 3660, 2381}, - { 3808, 2433},{ 3970, 2496},{ 4138, 2571},{ 4288, 2643}, - { 4475, 2710},{ 4655, 2778},{ 4810, 2843},{ 4959, 2879} + /*Cr qi=36 INTRA*/ + { + { 62, 55},{ 90, 561},{ 56, 767},{ 148, 1014}, + { 207, 981},{ 258, 1216},{ 273, 1253},{ 326, 1392}, + { 338, 1383},{ 417, 1613},{ 443, 1629},{ 497, 1734}, + { 466, 1525},{ 561, 1778},{ 577, 1787},{ 631, 1892}, + { 591, 1706},{ 715, 1980},{ 730, 1958},{ 822, 2113}, + { 755, 1935},{ 928, 2228},{ 935, 2205},{ 938, 2205} }, - /*Cr qi=60 INTER*/ - { - { 86, -2},{ 99, 352},{ 103, 673},{ 160, 998}, - { 284, 1292},{ 484, 1546},{ 753, 1774},{ 1100, 1973}, - { 1546, 
2129},{ 2072, 2246},{ 2652, 2334},{ 3279, 2392}, - { 3911, 2425},{ 4504, 2440},{ 5044, 2443},{ 5536, 2440}, - { 5979, 2430},{ 6381, 2413},{ 6735, 2397},{ 7062, 2382}, - { 7383, 2376},{ 7680, 2375},{ 7962, 2373},{ 8203, 2379} + /*Cr qi=36 INTER*/ + { + { 14, 145},{ 16, 535},{ 5, 772},{ 44, 1017}, + { 91, 1296},{ 100, 1605},{ 122, 1891},{ 163, 2174}, + { 225, 2443},{ 294, 2707},{ 362, 2962},{ 436, 3210}, + { 518, 3437},{ 607, 3664},{ 702, 3876},{ 795, 4094}, + { 886, 4310},{ 980, 4538},{ 1089, 4749},{ 1216, 4927}, + { 1357, 5116},{ 1506, 5247},{ 1758, 5338},{ 1787, 5306} } } }, { { - /*Y' qi=61 INTRA*/ - { - { 54, 121},{ 1477, 1069},{ 3061, 1638},{ 4465, 1808}, - { 5649, 1827},{ 6710, 1884},{ 7716, 1958},{ 8648, 2037}, - { 9514, 2116},{10311, 2192},{11033, 2261},{11641, 2305}, - {12202, 2342},{12771, 2387},{13356, 2440},{13924, 2493}, - {14444, 2541},{14951, 2576},{15409, 2600},{15779, 2615}, - {16131, 2626},{16521, 2648},{16921, 2663},{17409, 2694} + /*Y' qi=45 INTRA*/ + { + { 185, 246},{ 513, 647},{ 883, 891},{ 1313, 1142}, + { 1760, 1351},{ 2368, 1595},{ 2828, 1718},{ 3097, 1780}, + { 3762, 1951},{ 4454, 2121},{ 4986, 2227},{ 5281, 2281}, + { 5477, 2299},{ 5431, 2288},{ 5425, 2283},{ 5439, 2290}, + { 5324, 2249},{ 5509, 2279},{ 5703, 2321},{ 5896, 2348}, + { 6049, 2370},{ 6253, 2425},{ 6415, 2432},{ 6419, 2430} }, - /*Y' qi=61 INTER*/ - { - { -1, 32},{ 216, 1286},{ 1806, 2036},{ 4279, 2327}, - { 6629, 2352},{ 8347, 2352},{ 9707, 2357},{10860, 2364}, - {11857, 2372},{12726, 2377},{13508, 2382},{14225, 2387}, - {14877, 2392},{15484, 2398},{16048, 2401},{16581, 2405}, - {17092, 2409},{17573, 2409},{18016, 2410},{18427, 2413}, - {18829, 2415},{19221, 2415},{19578, 2415},{19980, 2413} + /*Y' qi=45 INTER*/ + { + { 6, 215},{ 152, 1261},{ 691, 2314},{ 1538, 3095}, + { 2505, 3632},{ 3475, 3935},{ 4355, 4084},{ 5209, 4139}, + { 5985, 4162},{ 6644, 4185},{ 7235, 4190},{ 7768, 4196}, + { 8266, 4200},{ 8736, 4210},{ 9143, 4207},{ 9511, 4215}, + { 9828, 4209},{10112, 4224},{10374, 4226},{10642, 4232}, + {10842, 4219},{10971, 4208},{11200, 4211},{11299, 4216} } }, { - /*Cb qi=61 INTRA*/ - { - { 19, 3},{ 231, 362},{ 456, 693},{ 733, 965}, - { 1032, 1188},{ 1330, 1369},{ 1637, 1508},{ 1956, 1612}, - { 2241, 1718},{ 2496, 1832},{ 2750, 1932},{ 3019, 2007}, - { 3274, 2074},{ 3505, 2154},{ 3725, 2236},{ 3943, 2323}, - { 4138, 2403},{ 4323, 2476},{ 4505, 2543},{ 4706, 2592}, - { 4909, 2630},{ 5109, 2675},{ 5292, 2724},{ 5495, 2768} + /*Cb qi=45 INTRA*/ + { + { 58, 71},{ 66, 548},{ 155, 762},{ 213, 944}, + { 192, 731},{ 324, 1147},{ 401, 1366},{ 481, 1480}, + { 508, 1238},{ 657, 1522},{ 727, 1563},{ 794, 1611}, + { 761, 1470},{ 885, 1710},{ 893, 1700},{ 958, 1760}, + { 893, 1543},{ 985, 1719},{ 1014, 1732},{ 1082, 1784}, + { 963, 1519},{ 1152, 1800},{ 1221, 1830},{ 1226, 1830} }, - /*Cb qi=61 INTER*/ - { - { 91, -2},{ 111, 344},{ 114, 663},{ 166, 989}, - { 291, 1285},{ 522, 1534},{ 875, 1729},{ 1302, 1889}, - { 1786, 2031},{ 2368, 2141},{ 3042, 2207},{ 3734, 2243}, - { 4388, 2259},{ 4982, 2264},{ 5533, 2265},{ 6043, 2262}, - { 6524, 2264},{ 6982, 2274},{ 7422, 2283},{ 7831, 2295}, - { 8198, 2308},{ 8593, 2319},{ 8965, 2329},{ 9258, 2340} + /*Cb qi=45 INTER*/ + { + { 35, 135},{ 12, 532},{ 54, 769},{ 106, 1007}, + { 127, 1258},{ 198, 1565},{ 289, 1832},{ 398, 2082}, + { 520, 2302},{ 653, 2511},{ 800, 2705},{ 956, 2897}, + { 1143, 3064},{ 1358, 3220},{ 1623, 3335},{ 1913, 3444}, + { 2198, 3534},{ 2502, 3626},{ 2787, 3711},{ 3114, 3783}, + { 3454, 3831},{ 3711, 3871},{ 4163, 3901},{ 4221, 3890} } }, { - /*Cr qi=61 
INTRA*/ - { - { 33, 9},{ 245, 378},{ 497, 699},{ 801, 958}, - { 1087, 1171},{ 1384, 1342},{ 1692, 1474},{ 1992, 1589}, - { 2290, 1692},{ 2576, 1789},{ 2852, 1884},{ 3109, 1973}, - { 3324, 2061},{ 3544, 2142},{ 3763, 2199},{ 3945, 2244}, - { 4103, 2292},{ 4283, 2349},{ 4469, 2413},{ 4635, 2476}, - { 4836, 2534},{ 5038, 2592},{ 5210, 2649},{ 5358, 2682} + /*Cr qi=45 INTRA*/ + { + { 93, 68},{ 72, 541},{ 154, 769},{ 239, 848}, + { 214, 623},{ 377, 1060},{ 437, 1200},{ 514, 1280}, + { 512, 1160},{ 625, 1453},{ 657, 1470},{ 718, 1516}, + { 692, 1331},{ 831, 1617},{ 875, 1609},{ 944, 1678}, + { 886, 1469},{ 1061, 1699},{ 1082, 1714},{ 1226, 1823}, + { 1113, 1581},{ 1324, 1872},{ 1370, 1925},{ 1374, 1924} }, - /*Cr qi=61 INTER*/ - { - { 82, 0},{ 97, 353},{ 104, 672},{ 165, 995}, - { 303, 1284},{ 532, 1529},{ 852, 1742},{ 1273, 1921}, - { 1798, 2057},{ 2409, 2154},{ 3090, 2212},{ 3794, 2240}, - { 4460, 2251},{ 5057, 2249},{ 5596, 2249},{ 6085, 2245}, - { 6519, 2234},{ 6908, 2220},{ 7269, 2203},{ 7618, 2196}, - { 7949, 2198},{ 8269, 2195},{ 8554, 2196},{ 8928, 2217} + /*Cr qi=45 INTER*/ + { + { 31, 140},{ 13, 533},{ 52, 770},{ 109, 1000}, + { 134, 1253},{ 201, 1555},{ 298, 1821},{ 411, 2076}, + { 525, 2314},{ 659, 2545},{ 828, 2747},{ 1019, 2918}, + { 1205, 3082},{ 1405, 3266},{ 1609, 3443},{ 1847, 3606}, + { 2085, 3730},{ 2404, 3835},{ 2709, 3876},{ 3049, 3886}, + { 3381, 3821},{ 3708, 3780},{ 4026, 3663},{ 4043, 3646} } } }, { { - /*Y' qi=62 INTRA*/ - { - { 29, 124},{ 1527, 1067},{ 3221, 1618},{ 4703, 1751}, - { 5909, 1744},{ 7001, 1779},{ 8057, 1829},{ 9049, 1885}, - { 9968, 1943},{10813, 1999},{11572, 2050},{12206, 2082}, - {12801, 2107},{13402, 2140},{14020, 2180},{14625, 2223}, - {15179, 2260},{15718, 2288},{16196, 2305},{16581, 2313}, - {16963, 2324},{17382, 2341},{17800, 2351},{18318, 2376} + /*Y' qi=54 INTRA*/ + { + { 316, 203},{ 720, 585},{ 1596, 1077},{ 2316, 1289}, + { 2687, 1439},{ 3133, 1593},{ 3495, 1706},{ 3836, 1775}, + { 4249, 1892},{ 4804, 2031},{ 5320, 2139},{ 5617, 2203}, + { 5726, 2199},{ 5726, 2176},{ 5682, 2146},{ 5677, 2127}, + { 5717, 2124},{ 5707, 2129},{ 5853, 2148},{ 6110, 2180}, + { 6454, 2247},{ 6714, 2287},{ 6845, 2304},{ 6854, 2303} }, - /*Y' qi=62 INTER*/ - { - { -8, 36},{ 218, 1284},{ 2073, 1965},{ 4814, 2159}, - { 7237, 2138},{ 8979, 2124},{10378, 2115},{11570, 2109}, - {12601, 2106},{13503, 2103},{14320, 2103},{15064, 2103}, - {15746, 2103},{16384, 2104},{16975, 2105},{17534, 2105}, - {18062, 2106},{18564, 2107},{19035, 2106},{19471, 2107}, - {19890, 2107},{20288, 2107},{20651, 2107},{21012, 2108} + /*Y' qi=54 INTER*/ + { + { -48, 217},{ 314, 1261},{ 1450, 2126},{ 2761, 2728}, + { 4275, 3012},{ 5408, 3167},{ 6305, 3245},{ 7165, 3290}, + { 7966, 3325},{ 8698, 3359},{ 9352, 3377},{ 9907, 3391}, + {10389, 3390},{10856, 3395},{11170, 3385},{11530, 3385}, + {11780, 3362},{12018, 3362},{12266, 3361},{12443, 3339}, + {12683, 3342},{12713, 3317},{12967, 3325},{13082, 3332} } }, { - /*Cb qi=62 INTRA*/ - { - { 21, 3},{ 283, 360},{ 565, 683},{ 907, 938}, - { 1269, 1143},{ 1611, 1311},{ 1949, 1441},{ 2290, 1535}, - { 2596, 1632},{ 2877, 1738},{ 3162, 1828},{ 3458, 1893}, - { 3745, 1948},{ 4011, 2016},{ 4253, 2089},{ 4506, 2164}, - { 4734, 2233},{ 4943, 2294},{ 5162, 2353},{ 5381, 2393}, - { 5593, 2420},{ 5807, 2454},{ 6003, 2496},{ 6210, 2543} + /*Cb qi=54 INTRA*/ + { + { 94, 73},{ 83, 557},{ 152, 818},{ 304, 919}, + { 341, 819},{ 506, 1128},{ 593, 1281},{ 700, 1389}, + { 714, 1225},{ 907, 1502},{ 981, 1549},{ 1062, 1641}, + { 1032, 1523},{ 1170, 1710},{ 1217, 1727},{ 1258, 
1714}, + { 1216, 1575},{ 1309, 1682},{ 1331, 1656},{ 1393, 1712}, + { 1247, 1456},{ 1469, 1728},{ 1530, 1711},{ 1532, 1711} }, - /*Cb qi=62 INTER*/ - { - { 91, -1},{ 110, 344},{ 113, 663},{ 169, 987}, - { 306, 1279},{ 562, 1519},{ 961, 1701},{ 1450, 1845}, - { 2013, 1967},{ 2686, 2053},{ 3437, 2095},{ 4171, 2109}, - { 4841, 2109},{ 5441, 2105},{ 6002, 2097},{ 6542, 2089}, - { 7028, 2087},{ 7491, 2088},{ 7949, 2090},{ 8377, 2089}, - { 8789, 2095},{ 9195, 2103},{ 9569, 2104},{ 9937, 2102} + /*Cb qi=54 INTER*/ + { + { 33, 133},{ 12, 532},{ 70, 770},{ 171, 996}, + { 279, 1233},{ 427, 1503},{ 600, 1736},{ 824, 1939}, + { 1101, 2097},{ 1411, 2237},{ 1735, 2374},{ 2097, 2493}, + { 2486, 2606},{ 2916, 2691},{ 3297, 2771},{ 3715, 2826}, + { 4088, 2855},{ 4460, 2886},{ 4849, 2911},{ 5198, 2932}, + { 5489, 2940},{ 5875, 2981},{ 6208, 3017},{ 6270, 3012} } }, { - /*Cr qi=62 INTRA*/ - { - { 38, 8},{ 308, 374},{ 619, 685},{ 984, 925}, - { 1326, 1126},{ 1662, 1285},{ 1999, 1407},{ 2328, 1512}, - { 2659, 1604},{ 2976, 1691},{ 3285, 1774},{ 3570, 1853}, - { 3815, 1931},{ 4068, 1998},{ 4304, 2044},{ 4491, 2082}, - { 4666, 2124},{ 4870, 2174},{ 5078, 2231},{ 5262, 2285}, - { 5480, 2335},{ 5703, 2378},{ 5905, 2423},{ 6075, 2454} + /*Cr qi=54 INTRA*/ + { + { 103, 63},{ 83, 580},{ 258, 796},{ 301, 802}, + { 361, 675},{ 538, 1001},{ 625, 1097},{ 713, 1171}, + { 699, 1103},{ 868, 1380},{ 915, 1400},{ 970, 1491}, + { 923, 1365},{ 1070, 1603},{ 1154, 1655},{ 1206, 1677}, + { 1157, 1541},{ 1366, 1736},{ 1391, 1723},{ 1506, 1797}, + { 1388, 1556},{ 1616, 1828},{ 1655, 1797},{ 1658, 1796} }, - /*Cr qi=62 INTER*/ - { - { 79, 1},{ 95, 353},{ 102, 671},{ 169, 992}, - { 318, 1277},{ 569, 1515},{ 936, 1716},{ 1428, 1876}, - { 2034, 1993},{ 2738, 2067},{ 3511, 2095},{ 4268, 2094}, - { 4943, 2087},{ 5543, 2079},{ 6074, 2074},{ 6552, 2069}, - { 6985, 2057},{ 7366, 2043},{ 7728, 2030},{ 8086, 2021}, - { 8423, 2017},{ 8752, 2016},{ 9057, 2014},{ 9376, 2008} + /*Cr qi=54 INTER*/ + { + { 30, 138},{ 14, 532},{ 63, 771},{ 176, 990}, + { 299, 1226},{ 438, 1496},{ 606, 1735},{ 814, 1950}, + { 1089, 2127},{ 1417, 2281},{ 1761, 2421},{ 2104, 2571}, + { 2467, 2701},{ 2881, 2827},{ 3303, 2900},{ 3735, 2917}, + { 4183, 2913},{ 4529, 2882},{ 4915, 2844},{ 5168, 2796}, + { 5410, 2763},{ 5562, 2753},{ 5815, 2764},{ 5832, 2755} } } }, @@ -3964,61 +967,61 @@ oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={ { /*Y' qi=63 INTRA*/ { - { -59, 134},{ 1734, 1036},{ 3743, 1521},{ 5309, 1618}, - { 6520, 1597},{ 7664, 1609},{ 8809, 1630},{ 9894, 1657}, - {10907, 1687},{11838, 1717},{12673, 1744},{13379, 1758}, - {14038, 1767},{14698, 1784},{15379, 1806},{16062, 1831}, - {16694, 1852},{17300, 1867},{17827, 1878},{18250, 1881}, - {18702, 1884},{19199, 1892},{19665, 1896},{20273, 1908} + { 421, 194},{ 1272, 564},{ 3016, 943},{ 3831, 1079}, + { 4282, 1174},{ 4799, 1290},{ 5166, 1348},{ 5259, 1350}, + { 5720, 1426},{ 6501, 1539},{ 7048, 1606},{ 7328, 1642}, + { 7374, 1622},{ 7349, 1612},{ 7192, 1578},{ 7207, 1571}, + { 7161, 1555},{ 7259, 1573},{ 7432, 1592},{ 7710, 1613}, + { 8167, 1672},{ 8425, 1697},{ 8597, 1710},{ 8602, 1710} }, /*Y' qi=63 INTER*/ { - { -7, 33},{ 209, 1285},{ 2309, 1904},{ 5274, 2025}, - { 7801, 1966},{ 9637, 1924},{11126, 1892},{12403, 1868}, - {13515, 1849},{14491, 1834},{15380, 1822},{16197, 1814}, - {16944, 1806},{17645, 1799},{18303, 1794},{18916, 1789}, - {19494, 1785},{20056, 1782},{20568, 1779},{21047, 1776}, - {21508, 1775},{21925, 1772},{22327, 1770},{22678, 1771} + { -584, 286},{ 1231, 1186},{ 3939, 1663},{ 6096, 1865}, + { 
7849, 1929},{ 8934, 1995},{ 9962, 2039},{11038, 2078}, + {12016, 2092},{12889, 2100},{13617, 2096},{14221, 2089}, + {14743, 2083},{15240, 2081},{15619, 2074},{15992, 2065}, + {16314, 2065},{16529, 2059},{16822, 2056},{17041, 2049}, + {17321, 2052},{17408, 2043},{17670, 2051},{17801, 2053} } }, { /*Cb qi=63 INTRA*/ { - { 20, 3},{ 294, 357},{ 608, 673},{ 1047, 908}, - { 1501, 1090},{ 1898, 1240},{ 2275, 1353},{ 2654, 1427}, - { 3014, 1502},{ 3366, 1579},{ 3726, 1637},{ 4084, 1674}, - { 4425, 1703},{ 4752, 1743},{ 5058, 1791},{ 5377, 1838}, - { 5676, 1877},{ 5946, 1912},{ 6213, 1945},{ 6458, 1969}, - { 6704, 1982},{ 6969, 1997},{ 7210, 2017},{ 7439, 2037} + { 154, 55},{ 280, 582},{ 507, 731},{ 788, 853}, + { 763, 738},{ 1141, 1008},{ 1323, 1090},{ 1540, 1220}, + { 1487, 1089},{ 1861, 1322},{ 1983, 1347},{ 2145, 1425}, + { 2047, 1317},{ 2334, 1475},{ 2352, 1413},{ 2458, 1467}, + { 2243, 1270},{ 2464, 1413},{ 2423, 1335},{ 2506, 1385}, + { 2182, 1180},{ 2565, 1376},{ 2555, 1321},{ 2557, 1321} }, /*Cb qi=63 INTER*/ { - { 86, 1},{ 108, 345},{ 111, 663},{ 168, 985}, - { 307, 1276},{ 577, 1513},{ 1007, 1688},{ 1550, 1819}, - { 2189, 1921},{ 2938, 1981},{ 3744, 2002},{ 4512, 2002}, - { 5199, 1996},{ 5824, 1986},{ 6419, 1971},{ 6978, 1954}, - { 7507, 1940},{ 8015, 1932},{ 8502, 1928},{ 8978, 1920}, - { 9410, 1915},{ 9842, 1910},{10262, 1901},{10634, 1896} + { 34, 133},{ 6, 531},{ 139, 767},{ 344, 975}, + { 608, 1180},{ 1048, 1367},{ 1651, 1495},{ 2376, 1572}, + { 3103, 1609},{ 3752, 1646},{ 4373, 1680},{ 4980, 1718}, + { 5540, 1744},{ 6023, 1764},{ 6431, 1766},{ 6800, 1769}, + { 7149, 1775},{ 7529, 1777},{ 7920, 1817},{ 8198, 1808}, + { 8691, 1848},{ 8965, 1845},{ 9372, 1865},{ 9459, 1863} } }, { /*Cr qi=63 INTRA*/ { - { 38, 7},{ 324, 367},{ 677, 670},{ 1136, 892}, - { 1562, 1070},{ 1951, 1209},{ 2326, 1313},{ 2694, 1399}, - { 3074, 1471},{ 3460, 1531},{ 3850, 1575},{ 4214, 1622}, - { 4522, 1679},{ 4819, 1723},{ 5089, 1749},{ 5315, 1769}, - { 5530, 1792},{ 5756, 1825},{ 6006, 1860},{ 6244, 1889}, - { 6514, 1924},{ 6792, 1946},{ 7026, 1962},{ 7191, 1971} + { 121, 59},{ 392, 570},{ 609, 654},{ 800, 760}, + { 720, 598},{ 1192, 892},{ 1298, 897},{ 1470, 1027}, + { 1411, 962},{ 1761, 1184},{ 1826, 1197},{ 1981, 1308}, + { 1854, 1198},{ 2229, 1427},{ 2269, 1365},{ 2428, 1453}, + { 2217, 1265},{ 2558, 1435},{ 2541, 1356},{ 2660, 1417}, + { 2337, 1199},{ 2688, 1382},{ 2603, 1301},{ 2605, 1300} }, /*Cr qi=63 INTER*/ { - { 80, 2},{ 95, 354},{ 101, 671},{ 167, 990}, - { 321, 1274},{ 585, 1509},{ 984, 1702},{ 1534, 1849}, - { 2217, 1947},{ 3005, 1995},{ 3839, 1999},{ 4619, 1986}, - { 5310, 1973},{ 5933, 1961},{ 6486, 1952},{ 6988, 1942}, - { 7435, 1927},{ 7817, 1911},{ 8198, 1900},{ 8552, 1895}, - { 8881, 1890},{ 9253, 1883},{ 9598, 1876},{ 9923, 1859} + { 31, 137},{ 10, 531},{ 136, 768},{ 360, 971}, + { 638, 1166},{ 1029, 1373},{ 1604, 1519},{ 2351, 1595}, + { 3129, 1640},{ 3861, 1691},{ 4491, 1751},{ 5101, 1783}, + { 5635, 1784},{ 6136, 1779},{ 6550, 1763},{ 6905, 1746}, + { 7172, 1726},{ 7495, 1732},{ 7738, 1735},{ 7949, 1735}, + { 8211, 1744},{ 8424, 1740},{ 8779, 1764},{ 8812, 1760} } } } diff --git a/thirdparty/libtheora/ocintrin.h b/thirdparty/libtheora/ocintrin.h index d49ebb2159..b200ceafce 100644 --- a/thirdparty/libtheora/ocintrin.h +++ b/thirdparty/libtheora/ocintrin.h @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: ocintrin.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ 
********************************************************************/ diff --git a/thirdparty/libtheora/patches/theora.git-0ae66d565e6bead8604d312bc1a4e9dccf245c88.patch b/thirdparty/libtheora/patches/theora.git-0ae66d565e6bead8604d312bc1a4e9dccf245c88.patch deleted file mode 100644 index 1b9c8e20be..0000000000 --- a/thirdparty/libtheora/patches/theora.git-0ae66d565e6bead8604d312bc1a4e9dccf245c88.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 0ae66d565e6bead8604d312bc1a4e9dccf245c88 Mon Sep 17 00:00:00 2001 -From: Tim Terriberry <tterribe@xiph.org> -Date: Tue, 8 May 2012 02:51:57 +0000 -Subject: [PATCH] Fix pp_sharp_mod calculation. - -This was broken when the dequant_tables indexing changed in commit - r16102, but it only affected post-processing quality, so we never - noticed. -With gcc 4.8.0, this can now trigger a segfault during decoder - initialization. - -svn path=/trunk/theora/; revision=18268 ---- - decode.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/decode.c b/decode.c -index b803505..9f2516a 100644 ---- a/decode.c -+++ b/decode.c -@@ -400,10 +400,10 @@ static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info, - int qsum; - qsum=0; - for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){ -- qsum+=_dec->state.dequant_tables[qti][pli][qi][12]+ -- _dec->state.dequant_tables[qti][pli][qi][17]+ -- _dec->state.dequant_tables[qti][pli][qi][18]+ -- _dec->state.dequant_tables[qti][pli][qi][24]<<(pli==0); -+ qsum+=_dec->state.dequant_tables[qi][pli][qti][12]+ -+ _dec->state.dequant_tables[qi][pli][qti][17]+ -+ _dec->state.dequant_tables[qi][pli][qti][18]+ -+ _dec->state.dequant_tables[qi][pli][qti][24]<<(pli==0); - } - _dec->pp_sharp_mod[qi]=-(qsum>>11); - } --- -2.11.0 - diff --git a/thirdparty/libtheora/quant.c b/thirdparty/libtheora/quant.c index 8359f5abea..e206202844 100644 --- a/thirdparty/libtheora/quant.c +++ b/thirdparty/libtheora/quant.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: quant.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -21,6 +21,14 @@ #include "quant.h" #include "decint.h" +/*The maximum output of the DCT with +/- 255 inputs is +/- 8157. + These minimum quantizers ensure the result after quantization (and after + prediction for DC) will be no more than +/- 510. + The tokenization system can handle values up to +/- 580, so there is no need + to do any coefficient clamping. 
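As a quick sanity check on the bound claimed above: with the minimum intra DC quantizer of 4<<2 = 16, round-to-nearest quantization of the worst-case DCT output indeed stays within the tokenizer's range. A minimal sketch; the rounding mode is an assumption, and the constants are taken from the comment and the tables here:

    #include <stdio.h>

    int main(void){
      const int dct_max=8157;      /*Worst-case DCT output for +/-255 inputs.*/
      const int dc_quant_min=4<<2; /*OC_DC_QUANT_MIN[0], the intra minimum.*/
      /*Round-to-nearest quantization of the largest possible DC coefficient.*/
      int qmax=(dct_max+dc_quant_min/2)/dc_quant_min;
      printf("%d\n",qmax); /*510, comfortably under the +/-580 limit.*/
      return 0;
    }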
+ I would rather have allowed smaller quantizers and had to clamp, but these + minimums were required when constructing the original VP3 matrices and have + been formalized in the spec.*/ static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2}; static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2}; diff --git a/thirdparty/libtheora/quant.h b/thirdparty/libtheora/quant.h index 49ce13a65c..247210eaae 100644 --- a/thirdparty/libtheora/quant.h +++ b/thirdparty/libtheora/quant.h @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: quant.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ diff --git a/thirdparty/libtheora/rate.c b/thirdparty/libtheora/rate.c index 4f43bb2e5f..bf2b1396a1 100644 --- a/thirdparty/libtheora/rate.c +++ b/thirdparty/libtheora/rate.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: rate.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include <stdlib.h> @@ -190,7 +190,8 @@ void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _qti){ This may need to be revised if the R-D cost estimation or qii flag optimization strategies change.*/ nqis=1; - if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible){ + if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible&& + _enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0, lq+(OC_Q57(7)+5)/10); if(qi1!=qi)_enc->state.qis[nqis++]=qi1; @@ -761,6 +762,7 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc, _enc->rc.cur_metrics.log_scale=oc_q57_to_q24(log_scale); _enc->rc.cur_metrics.dup_count=_enc->dup_count; _enc->rc.cur_metrics.frame_type=_enc->state.frame_type; + _enc->rc.cur_metrics.activity_avg=_enc->activity_avg; _enc->rc.twopass_buffer_bytes=0; }break; case 2:{ @@ -863,9 +865,9 @@ int oc_enc_update_rc_state(oc_enc_ctx *_enc, return dropped; } -#define OC_RC_2PASS_VERSION (1) +#define OC_RC_2PASS_VERSION (2) #define OC_RC_2PASS_HDR_SZ (38) -#define OC_RC_2PASS_PACKET_SZ (8) +#define OC_RC_2PASS_PACKET_SZ (12) static void oc_rc_buffer_val(oc_rc_state *_rc,ogg_int64_t _val,int _bytes){ while(_bytes-->0){ @@ -900,6 +902,7 @@ int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf){ oc_rc_buffer_val(&_enc->rc, _enc->rc.cur_metrics.dup_count|_enc->rc.cur_metrics.frame_type<<31,4); oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.log_scale,4); + oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.activity_avg,4); } } else if(_enc->packet_state==OC_PACKET_DONE&& @@ -1050,16 +1053,19 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){ if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){ ogg_uint32_t dup_count; ogg_int32_t log_scale; + unsigned activity; int qti; int arg; /*Read the metrics for the next frame.*/ dup_count=oc_rc_unbuffer_val(&_enc->rc,4); log_scale=oc_rc_unbuffer_val(&_enc->rc,4); + activity=oc_rc_unbuffer_val(&_enc->rc,4); _enc->rc.cur_metrics.log_scale=log_scale; qti=(dup_count&0x80000000)>>31; _enc->rc.cur_metrics.dup_count=dup_count&0x7FFFFFFF; _enc->rc.cur_metrics.frame_type=qti; _enc->rc.twopass_force_kf=qti==OC_INTRA_FRAME; + _enc->activity_avg=_enc->rc.cur_metrics.activity_avg=activity; /*"Helpfully" set the dup count back to what it was in pass 1.*/ arg=_enc->rc.cur_metrics.dup_count; th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg)); @@ -1070,8 +1076,8 @@ int oc_enc_rc_2pass_in(oc_enc_ctx 
*_enc,unsigned char *_buf,size_t _bytes){ else{ int frames_needed; /*We're using a finite buffer:*/ - frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay - -(_enc->rc.scale_window_end-_enc->rc.scale_window0), + frames_needed=OC_MINI(_enc->rc.buf_delay-OC_MINI(_enc->rc.buf_delay, + _enc->rc.scale_window_end-_enc->rc.scale_window0), _enc->rc.frames_left[0]+_enc->rc.frames_left[1] -_enc->rc.nframes[0]-_enc->rc.nframes[1]); while(frames_needed>0){ @@ -1087,9 +1093,11 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){ ogg_uint32_t dup_count; ogg_int32_t log_scale; int qti; + unsigned activity; /*Read the metrics for the next frame.*/ dup_count=oc_rc_unbuffer_val(&_enc->rc,4); log_scale=oc_rc_unbuffer_val(&_enc->rc,4); + activity=oc_rc_unbuffer_val(&_enc->rc,4); /*Add the to the circular buffer.*/ fmi=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics++; if(fmi>=_enc->rc.cframe_metrics)fmi-=_enc->rc.cframe_metrics; @@ -1098,6 +1106,7 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){ qti=(dup_count&0x80000000)>>31; m->dup_count=dup_count&0x7FFFFFFF; m->frame_type=qti; + m->activity_avg=activity; /*And accumulate the statistics over the window.*/ _enc->rc.nframes[qti]++; _enc->rc.nframes[2]+=m->dup_count; @@ -1105,8 +1114,8 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){ _enc->rc.scale_window_end+=m->dup_count+1; /*Compute an upper bound on the number of remaining packets needed for the current window.*/ - frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay - -(_enc->rc.scale_window_end-_enc->rc.scale_window0), + frames_needed=OC_MINI(_enc->rc.buf_delay-OC_MINI(_enc->rc.buf_delay, + _enc->rc.scale_window_end-_enc->rc.scale_window0), _enc->rc.frames_left[0]+_enc->rc.frames_left[1] -_enc->rc.nframes[0]-_enc->rc.nframes[1]); /*Clear the buffer for the next frame.*/ @@ -1124,6 +1133,7 @@ int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){ *(_enc->rc.frame_metrics+_enc->rc.frame_metrics_head); _enc->rc.twopass_force_kf= _enc->rc.cur_metrics.frame_type==OC_INTRA_FRAME; + _enc->activity_avg=_enc->rc.cur_metrics.activity_avg; /*"Helpfully" set the dup count back to what it was in pass 1.*/ arg=_enc->rc.cur_metrics.dup_count; th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg)); diff --git a/thirdparty/libtheora/state.c b/thirdparty/libtheora/state.c index 42ed33a9a3..f4c6240387 100644 --- a/thirdparty/libtheora/state.c +++ b/thirdparty/libtheora/state.c @@ -11,25 +11,93 @@ ******************************************************************** function: - last mod: $Id: state.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include <stdlib.h> #include <string.h> -#include "internal.h" -#if defined(OC_X86_ASM) -#if defined(_MSC_VER) -# include "x86_vc/x86int.h" -#else -# include "x86/x86int.h" -#endif -#endif +#include "state.h" #if defined(OC_DUMP_IMAGES) # include <stdio.h> # include "png.h" +# include "zlib.h" #endif +/*The function used to fill in the chroma plane motion vectors for a macro + block when 4 different motion vectors are specified in the luma plane. + This version is for use with chroma decimated in the X and Y directions + (4:2:0). + _cbmvs: The chroma block-level motion vectors to fill in. 
+ _lbmvs: The luma block-level motion vectors.*/ +static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ + int dx; + int dy; + dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]) + +OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]); + dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]) + +OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]); + _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,2,2),OC_DIV_ROUND_POW2(dy,2,2)); +} + +/*The function used to fill in the chroma plane motion vectors for a macro + block when 4 different motion vectors are specified in the luma plane. + This version is for use with chroma decimated in the Y direction. + _cbmvs: The chroma block-level motion vectors to fill in. + _lbmvs: The luma block-level motion vectors.*/ +static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ + int dx; + int dy; + dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[2]); + dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[2]); + _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); + dx=OC_MV_X(_lbmvs[1])+OC_MV_X(_lbmvs[3]); + dy=OC_MV_Y(_lbmvs[1])+OC_MV_Y(_lbmvs[3]); + _cbmvs[1]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); +} + +/*The function used to fill in the chroma plane motion vectors for a macro + block when 4 different motion vectors are specified in the luma plane. + This version is for use with chroma decimated in the X direction (4:2:2). + _cbmvs: The chroma block-level motion vectors to fill in. + _lbmvs: The luma block-level motion vectors.*/ +static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ + int dx; + int dy; + dx=OC_MV_X(_lbmvs[0])+OC_MV_X(_lbmvs[1]); + dy=OC_MV_Y(_lbmvs[0])+OC_MV_Y(_lbmvs[1]); + _cbmvs[0]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); + dx=OC_MV_X(_lbmvs[2])+OC_MV_X(_lbmvs[3]); + dy=OC_MV_Y(_lbmvs[2])+OC_MV_Y(_lbmvs[3]); + _cbmvs[2]=OC_MV(OC_DIV_ROUND_POW2(dx,1,1),OC_DIV_ROUND_POW2(dy,1,1)); +} + +/*The function used to fill in the chroma plane motion vectors for a macro + block when 4 different motion vectors are specified in the luma plane. + This version is for use with no chroma decimation (4:4:4). + _cbmvs: The chroma block-level motion vectors to fill in. + _lmbmv: The luma macro-block level motion vector to fill in for use in + prediction. + _lbmvs: The luma block-level motion vectors.*/ +static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){ + _cbmvs[0]=_lbmvs[0]; + _cbmvs[1]=_lbmvs[1]; + _cbmvs[2]=_lbmvs[2]; + _cbmvs[3]=_lbmvs[3]; +} + +/*A table of functions used to fill in the chroma plane motion vectors for a + macro block when 4 different motion vectors are specified in the luma + plane.*/ +const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={ + (oc_set_chroma_mvs_func)oc_set_chroma_mvs00, + (oc_set_chroma_mvs_func)oc_set_chroma_mvs01, + (oc_set_chroma_mvs_func)oc_set_chroma_mvs10, + (oc_set_chroma_mvs_func)oc_set_chroma_mvs11 +}; + + + /*Returns the fragment index of the top-left block in a macro block. This can be used to test whether or not the whole macro block is valid. _sb_map: The super block map. 
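The helpers above reduce to a rounded average of the luma vectors covering each chroma block. The following standalone sketch shows the 4:2:0 case; div_round_pow2, the sample vectors, and the plain int pairs are illustrative stand-ins for OC_DIV_ROUND_POW2 and the packed oc_mv type, whose exact tie-breaking is defined in ocintrin.h:

    #include <stdio.h>

    /*Divide by 2^_shift, rounding to nearest (ties toward +infinity here;
       the library's OC_DIV_ROUND_POW2 may break ties differently).
      Assumes an arithmetic right shift of negative values, as libtheora
       itself does.*/
    static int div_round_pow2(int _x,int _shift,int _half){
      return _x+_half>>_shift;
    }

    int main(void){
      /*Four luma block MVs as (x,y) pairs in half-pel units.*/
      int lx[4]={3,5,4,4};
      int ly[4]={-1,-2,-1,-2};
      int dx=0;
      int dy=0;
      int i;
      for(i=0;i<4;i++){
        dx+=lx[i];
        dy+=ly[i];
      }
      /*With chroma decimated in both directions (4:2:0), the one chroma
         block spanning the macro block gets the rounded average of all four
         vectors, matching oc_set_chroma_mvs00() above.*/
      printf("chroma mv=(%d,%d)\n",
       div_round_pow2(dx,2,2),div_round_pow2(dy,2,2));
      return 0;
    }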
@@ -92,7 +160,7 @@ static void oc_sb_create_plane_mapping(oc_sb_map _sb_maps[], if(jmax>4)jmax=4; else if(jmax<=0)break; /*By default, set all fragment indices to -1.*/ - memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi])); + memset(_sb_maps[sbi],0xFF,sizeof(_sb_maps[sbi])); /*Fill in the fragment map for this super block.*/ xfrag=yfrag+x; for(i=0;i<imax;i++){ @@ -186,10 +254,14 @@ static void oc_mb_fill_cmapping10(oc_mb_map_plane _mb_map[3], This version is for use with no chroma decimation (4:4:4). This uses the already filled-in luma plane values. _mb_map: The macro block map to fill. - _fplanes: The descriptions of the fragment planes.*/ + _fplanes: The descriptions of the fragment planes. + _xfrag0: The X location of the upper-left hand fragment in the luma plane. + _yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/ static void oc_mb_fill_cmapping11(oc_mb_map_plane _mb_map[3], - const oc_fragment_plane _fplanes[3]){ + const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){ int k; + (void)_xfrag0; + (void)_yfrag0; for(k=0;k<4;k++){ _mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset; _mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset; @@ -211,7 +283,7 @@ static const oc_mb_fill_cmapping_func OC_MB_FILL_CMAPPING_TABLE[4]={ oc_mb_fill_cmapping00, oc_mb_fill_cmapping01, oc_mb_fill_cmapping10, - (oc_mb_fill_cmapping_func)oc_mb_fill_cmapping11 + oc_mb_fill_cmapping11 }; /*Fills in the mapping from macro blocks to their corresponding fragment @@ -469,7 +541,7 @@ static void oc_state_frarray_clear(oc_theora_state *_state){ unrestricted motion vectors without special casing the boundary. If chroma is decimated in either direction, the padding is reduced by a factor of 2 on the appropriate sides. - _nrefs: The number of reference buffers to init; must be 3 or 4.*/ + _nrefs: The number of reference buffers to init; must be in the range 3...6.*/ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){ th_info *info; unsigned char *ref_frame_data; @@ -481,6 +553,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){ int yheight; int chstride; int cheight; + ptrdiff_t align; ptrdiff_t yoffset; ptrdiff_t coffset; ptrdiff_t *frag_buf_offs; @@ -489,33 +562,38 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){ int vdec; int rfi; int pli; - if(_nrefs<3||_nrefs>4)return TH_EINVAL; + if(_nrefs<3||_nrefs>6)return TH_EINVAL; info=&_state->info; /*Compute the image buffer parameters for each plane.*/ hdec=!(info->pixel_fmt&1); vdec=!(info->pixel_fmt&2); yhstride=info->frame_width+2*OC_UMV_PADDING; yheight=info->frame_height+2*OC_UMV_PADDING; - chstride=yhstride>>hdec; + /*Require 16-byte aligned rows in the chroma planes.*/ + chstride=(yhstride>>hdec)+15&~15; cheight=yheight>>vdec; yplane_sz=yhstride*(size_t)yheight; cplane_sz=chstride*(size_t)cheight; yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride; coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride; - ref_frame_sz=yplane_sz+2*cplane_sz; + /*Although we guarantee the rows of the chroma planes are a multiple of 16 + bytes, the initial padding on the first row may only be 8 bytes. + Compute the offset needed to the actual image data to a multiple of 16.*/ + align=-coffset&15; + ref_frame_sz=yplane_sz+2*cplane_sz+16; ref_frame_data_sz=_nrefs*ref_frame_sz; /*Check for overflow. 
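The align=-coffset&15 idiom a few lines up computes how many bytes are needed to round coffset up to the next multiple of 16. A one-line check with a hypothetical offset:

    int coffset=40;
    int align=-coffset&15; /*8: 40+8=48, the next multiple of 16.*/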
The same caveats apply as for oc_state_frarray_init().*/ - if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz|| + if(yplane_sz/yhstride!=(size_t)yheight||2*cplane_sz+16<cplane_sz|| ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){ return TH_EIMPL; } - ref_frame_data=_ogg_malloc(ref_frame_data_sz); + ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16); frag_buf_offs=_state->frag_buf_offs= _ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs)); if(ref_frame_data==NULL||frag_buf_offs==NULL){ _ogg_free(frag_buf_offs); - _ogg_free(ref_frame_data); + oc_aligned_free(ref_frame_data); return TH_EFAULT; } /*Set up the width, height and stride for the image buffers.*/ @@ -532,15 +610,15 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){ memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0], sizeof(_state->ref_frame_bufs[0])); } + _state->ref_frame_handle=ref_frame_data; /*Set up the data pointers for the image buffers.*/ for(rfi=0;rfi<_nrefs;rfi++){ - _state->ref_frame_data[rfi]=ref_frame_data; _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset; - ref_frame_data+=yplane_sz; + ref_frame_data+=yplane_sz+align; _state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset; ref_frame_data+=cplane_sz; _state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset; - ref_frame_data+=cplane_sz; + ref_frame_data+=cplane_sz+(16-align); /*Flip the buffer upside down. This allows us to decode Theora's bottom-up frames in their natural order, yet return a top-down buffer with a positive stride to the user.*/ @@ -550,7 +628,7 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){ _state->ref_ystride[0]=-yhstride; _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride; /*Initialize the fragment buffer offsets.*/ - ref_frame_data=_state->ref_frame_data[0]; + ref_frame_data=_state->ref_frame_bufs[0][0].data; fragi=0; for(pli=0;pli<3;pli++){ th_img_plane *iplane; @@ -576,41 +654,44 @@ static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){ vpix+=stride<<3; } } - /*Initialize the reference frame indices.*/ + /*Initialize the reference frame pointers and indices.*/ _state->ref_frame_idx[OC_FRAME_GOLD]= _state->ref_frame_idx[OC_FRAME_PREV]= - _state->ref_frame_idx[OC_FRAME_SELF]=-1; - _state->ref_frame_idx[OC_FRAME_IO]=_nrefs>3?3:-1; + _state->ref_frame_idx[OC_FRAME_GOLD_ORIG]= + _state->ref_frame_idx[OC_FRAME_PREV_ORIG]= + _state->ref_frame_idx[OC_FRAME_SELF]= + _state->ref_frame_idx[OC_FRAME_IO]=-1; + _state->ref_frame_data[OC_FRAME_GOLD]= + _state->ref_frame_data[OC_FRAME_PREV]= + _state->ref_frame_data[OC_FRAME_GOLD_ORIG]= + _state->ref_frame_data[OC_FRAME_PREV_ORIG]= + _state->ref_frame_data[OC_FRAME_SELF]= + _state->ref_frame_data[OC_FRAME_IO]=NULL; return 0; } static void oc_state_ref_bufs_clear(oc_theora_state *_state){ _ogg_free(_state->frag_buf_offs); - _ogg_free(_state->ref_frame_data[0]); + oc_aligned_free(_state->ref_frame_handle); } -void oc_state_vtable_init_c(oc_theora_state *_state){ +void oc_state_accel_init_c(oc_theora_state *_state){ + _state->cpu_flags=0; +#if defined(OC_STATE_USE_VTABLE) _state->opt_vtable.frag_copy=oc_frag_copy_c; + _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c; _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c; _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c; _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c; _state->opt_vtable.idct8x8=oc_idct8x8_c; _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c; - 
_state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c; + _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c; _state->opt_vtable.state_loop_filter_frag_rows= oc_state_loop_filter_frag_rows_c; _state->opt_vtable.restore_fpu=oc_restore_fpu_c; - _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG; -} - -/*Initialize the accelerated function pointers.*/ -void oc_state_vtable_init(oc_theora_state *_state){ -#if defined(OC_X86_ASM) - oc_state_vtable_init_x86(_state); -#else - oc_state_vtable_init_c(_state); #endif + _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG; } @@ -626,7 +707,8 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){ how it is specified in the bitstream, because the Y axis is flipped in the bitstream. The displayable frame must fit inside the encoded frame. - The color space must be one known by the encoder.*/ + The color space must be one known by the encoder. + The framerate ratio must not contain a zero value.*/ if((_info->frame_width&0xF)||(_info->frame_height&0xF)|| _info->frame_width<=0||_info->frame_width>=0x100000|| _info->frame_height<=0||_info->frame_height>=0x100000|| @@ -639,7 +721,8 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){ but there are a number of compilers which will mis-optimize this. It's better to live with the spurious warnings.*/ _info->colorspace<0||_info->colorspace>=TH_CS_NSPACES|| - _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){ + _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS|| + _info->fps_numerator<1||_info->fps_denominator<1){ return TH_EINVAL; } memset(_state,0,sizeof(*_state)); @@ -648,7 +731,7 @@ int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){ system.*/ _state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y; _state->frame_type=OC_UNKWN_FRAME; - oc_state_vtable_init(_state); + oc_state_accel_init(_state); ret=oc_state_frarray_init(_state); if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs); if(ret<0){ @@ -758,11 +841,10 @@ void oc_state_borders_fill(oc_theora_state *_state,int _refi){ _offsets[1] is set if the motion vector has non-zero fractional components. _pli: The color plane index. - _dx: The X component of the motion vector. - _dy: The Y component of the motion vector. + _mv: The motion vector. 
Return: The number of offsets returned: 1 or 2.*/ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], - int _pli,int _dx,int _dy){ + int _pli,oc_mv _mv){ /*Here is a brief description of how Theora handles motion vectors: Motion vector components are specified to half-pixel accuracy in undecimated directions of each plane, and quarter-pixel accuracy in @@ -785,21 +867,25 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], int xfrac; int yfrac; int offs; + int dx; + int dy; ystride=_state->ref_ystride[_pli]; /*These two variables decide whether we are in half- or quarter-pixel precision in each component.*/ xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1)); yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2)); + dx=OC_MV_X(_mv); + dy=OC_MV_Y(_mv); /*These two variables are either 0 if all the fractional bits are zero or -1 if any of them are non-zero.*/ - xfrac=OC_SIGNMASK(-(_dx&(xprec|1))); - yfrac=OC_SIGNMASK(-(_dy&(yprec|1))); - offs=(_dx>>xprec)+(_dy>>yprec)*ystride; + xfrac=OC_SIGNMASK(-(dx&(xprec|1))); + yfrac=OC_SIGNMASK(-(dy&(yprec|1))); + offs=(dx>>xprec)+(dy>>yprec)*ystride; if(xfrac||yfrac){ int xmask; int ymask; - xmask=OC_SIGNMASK(_dx); - ymask=OC_SIGNMASK(_dy); + xmask=OC_SIGNMASK(dx); + ymask=OC_SIGNMASK(dy); yfrac&=ystride; _offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask); _offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask); @@ -848,13 +934,17 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], int mx2; int my2; int offs; + int dx; + int dy; ystride=_state->ref_ystride[_pli]; qpy=_pli!=0&&!(_state->info.pixel_fmt&2); - my=OC_MVMAP[qpy][_dy+31]; - my2=OC_MVMAP2[qpy][_dy+31]; + dx=OC_MV_X(_mv); + dy=OC_MV_Y(_mv); + my=OC_MVMAP[qpy][dy+31]; + my2=OC_MVMAP2[qpy][dy+31]; qpx=_pli!=0&&!(_state->info.pixel_fmt&1); - mx=OC_MVMAP[qpx][_dx+31]; - mx2=OC_MVMAP2[qpx][_dx+31]; + mx=OC_MVMAP[qpx][dx+31]; + mx2=OC_MVMAP2[qpx][dx+31]; offs=my*ystride+mx; if(mx2||my2){ _offsets[1]=offs+my2*ystride+mx2; @@ -866,18 +956,12 @@ int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], #endif } -void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){ - _state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs, - _last_zzi,_dc_quant); -} - void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){ + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ unsigned char *dst; ptrdiff_t frag_buf_off; int ystride; - int mb_mode; + int refi; /*Apply the inverse transform.*/ /*Special case only having a DC component.*/ if(_last_zzi<2){ @@ -887,69 +971,35 @@ void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi, no iDCT rounding.*/ p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); /*LOOP VECTORIZES.*/ - for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p; + for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p; } else{ /*First, dequantize the DC coefficient.*/ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8(_state,_dct_coeffs,_last_zzi); + oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); } /*Fill in the target buffer.*/ frag_buf_off=_state->frag_buf_offs[_fragi]; - mb_mode=_state->frags[_fragi].mb_mode; + refi=_state->frags[_fragi].refi; ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off; - 
if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs); + dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; + if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64); else{ const unsigned char *ref; int mvoffsets[2]; - ref= - _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]] - +frag_buf_off; + ref=_state->ref_frame_data[refi]+frag_buf_off; if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){ + _state->frag_mvs[_fragi])>1){ oc_frag_recon_inter2(_state, - dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs); + dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64); + } + else{ + oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); } - else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs); - } -} - -/*Copies the fragments specified by the lists of fragment indices from one - frame to another. - _fragis: A pointer to a list of fragment indices. - _nfragis: The number of fragment indices to copy. - _dst_frame: The reference frame to copy to. - _src_frame: The reference frame to copy from. - _pli: The color plane the fragments lie in.*/ -void oc_state_frag_copy_list(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli){ - _state->opt_vtable.state_frag_copy_list(_state,_fragis,_nfragis,_dst_frame, - _src_frame,_pli); -} - -void oc_state_frag_copy_list_c(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli){ - const ptrdiff_t *frag_buf_offs; - const unsigned char *src_frame_data; - unsigned char *dst_frame_data; - ptrdiff_t fragii; - int ystride; - dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]]; - src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]]; - ystride=_state->ref_ystride[_pli]; - frag_buf_offs=_state->frag_buf_offs; - for(fragii=0;fragii<_nfragis;fragii++){ - ptrdiff_t frag_buf_off; - frag_buf_off=frag_buf_offs[_fragis[fragii]]; - oc_frag_copy(_state,dst_frame_data+frag_buf_off, - src_frame_data+frag_buf_off,ystride); } } -static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){ +static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){ int y; _pix-=2; for(y=0;y<8;y++){ @@ -965,7 +1015,7 @@ static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){ } } -static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){ +static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){ int x; _pix-=_ystride*2; for(x=0;x<8;x++){ @@ -982,20 +1032,16 @@ static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){ /*Initialize the bounding values array used by the loop filter. _bv: Storage for the array. 
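The bounding-values array that replaces the old int version is a piecewise-linear "tent": zero at d=0 and for |d|>=2*_flimit, ramping to +/-_flimit in between. A standalone copy of the initializer below (renamed for a self-contained build, with _flimit=4 chosen for the printout) makes the shape visible:

    #include <stdio.h>
    #include <string.h>

    /*A copy of oc_loop_filter_init_c() below, renamed for a standalone
       build.*/
    static void loop_filter_init(signed char _bv[256],int _flimit){
      int i;
      memset(_bv,0,sizeof(_bv[0])*256);
      for(i=0;i<_flimit;i++){
        if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit);
        _bv[127-i]=(signed char)(-i);
        _bv[127+i]=(signed char)(i);
        if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i);
      }
    }

    int main(void){
      signed char bv[256];
      int d;
      loop_filter_init(bv,4);
      /*The biased index 127+d maps a raw filter delta d to the correction
         actually applied: zero outside |d|<2*_flimit, peaking at
         +/-_flimit.*/
      for(d=-8;d<=8;d++)printf("%3d -> %d\n",d,bv[127+d]);
      return 0;
    }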
- Return: 0 on success, or a non-zero value if no filtering need be applied.*/ -int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){ - int flimit; + _flimit: The filter limit as defined in Section 7.10 of the spec.*/ +void oc_loop_filter_init_c(signed char _bv[256],int _flimit){ int i; - flimit=_state->loop_filter_limits[_state->qis[0]]; - if(flimit==0)return 1; memset(_bv,0,sizeof(_bv[0])*256); - for(i=0;i<flimit;i++){ - if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit; - _bv[127-i]=-i; - _bv[127+i]=i; - if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i; + for(i=0;i<_flimit;i++){ + if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit); + _bv[127-i]=(signed char)(-i); + _bv[127+i]=(signed char)(i); + if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i); } - return 0; } /*Apply the loop filter to a given set of fragment rows in the given plane. @@ -1006,14 +1052,8 @@ int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){ _pli: The color plane to filter. _fragy0: The Y coordinate of the first fragment row to filter. _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ -void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int _bv[256], - int _refi,int _pli,int _fragy0,int _fragy_end){ - _state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli, - _fragy0,_fragy_end); -} - -void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv, - int _refi,int _pli,int _fragy0,int _fragy_end){ +void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state, + signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){ const oc_fragment_plane *fplane; const oc_fragment *frags; const ptrdiff_t *frag_buf_offs; @@ -1030,7 +1070,7 @@ void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv, fragi_top=fplane->froffset; fragi_bot=fragi_top+fplane->nfrags; fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; - fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; + fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; ystride=_state->ref_ystride[_pli]; frags=_state->frags; frag_buf_offs=_state->frag_buf_offs; diff --git a/thirdparty/libtheora/state.h b/thirdparty/libtheora/state.h new file mode 100644 index 0000000000..f176a53ce9 --- /dev/null +++ b/thirdparty/libtheora/state.h @@ -0,0 +1,552 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: internal.h 17337 2010-07-19 16:08:54Z tterribe $ + + ********************************************************************/ +#if !defined(_state_H) +# define _state_H (1) +# include "internal.h" +# include "huffman.h" +# include "quant.h" + + + +/*A single quadrant of the map from a super block to fragment numbers.*/ +typedef ptrdiff_t oc_sb_map_quad[4]; +/*A map from a super block to fragment numbers.*/ +typedef oc_sb_map_quad oc_sb_map[4]; +/*A single plane of the map from a macro block to fragment numbers.*/ +typedef ptrdiff_t oc_mb_map_plane[4]; +/*A map from a macro block to fragment numbers.*/ +typedef oc_mb_map_plane oc_mb_map[3]; +/*A motion vector.*/ +typedef ogg_int16_t oc_mv; + +typedef struct oc_sb_flags oc_sb_flags; +typedef struct oc_border_info oc_border_info; +typedef struct oc_fragment oc_fragment; +typedef struct oc_fragment_plane oc_fragment_plane; +typedef struct oc_base_opt_vtable oc_base_opt_vtable; +typedef struct oc_base_opt_data oc_base_opt_data; +typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable; +typedef struct oc_theora_state oc_theora_state; + + + +/*Shared accelerated functions.*/ +# if defined(OC_X86_ASM) +# if defined(_MSC_VER) +# include "x86_vc/x86int.h" +# else +# include "x86/x86int.h" +# endif +# endif +# if defined(OC_ARM_ASM) +# include "arm/armint.h" +# endif +# if defined(OC_C64X_ASM) +# include "c64x/c64xint.h" +# endif + +# if !defined(oc_state_accel_init) +# define oc_state_accel_init oc_state_accel_init_c +# endif +# if defined(OC_STATE_USE_VTABLE) +# if !defined(oc_frag_copy) +# define oc_frag_copy(_state,_dst,_src,_ystride) \ + ((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride)) +# endif +# if !defined(oc_frag_copy_list) +# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs) \ + ((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs)) +# endif +# if !defined(oc_frag_recon_intra) +# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \ + ((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue)) +# endif +# if !defined(oc_frag_recon_inter) +# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \ + ((*(_state)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue)) +# endif +# if !defined(oc_frag_recon_inter2) +# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \ + ((*(_state)->opt_vtable.frag_recon_inter2)(_dst, \ + _src1,_src2,_ystride,_residue)) +# endif +# if !defined(oc_idct8x8) +# define oc_idct8x8(_state,_y,_x,_last_zzi) \ + ((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi)) +# endif +# if !defined(oc_state_frag_recon) +# define oc_state_frag_recon(_state,_fragi, \ + _pli,_dct_coeffs,_last_zzi,_dc_quant) \ + ((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \ + _pli,_dct_coeffs,_last_zzi,_dc_quant)) +# endif +# if !defined(oc_loop_filter_init) +# define oc_loop_filter_init(_state,_bv,_flimit) \ + ((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit)) +# endif +# if !defined(oc_state_loop_filter_frag_rows) +# define oc_state_loop_filter_frag_rows(_state, \ + _bv,_refi,_pli,_fragy0,_fragy_end) \ + ((*(_state)->opt_vtable.state_loop_filter_frag_rows)(_state, \ + _bv,_refi,_pli,_fragy0,_fragy_end)) +# endif +# if 
!defined(oc_restore_fpu) +# define oc_restore_fpu(_state) \ + ((*(_state)->opt_vtable.restore_fpu)()) +# endif +# else +# if !defined(oc_frag_copy) +# define oc_frag_copy(_state,_dst,_src,_ystride) \ + oc_frag_copy_c(_dst,_src,_ystride) +# endif +# if !defined(oc_frag_copy_list) +# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs) \ + oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs) +# endif +# if !defined(oc_frag_recon_intra) +# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \ + oc_frag_recon_intra_c(_dst,_dst_ystride,_residue) +# endif +# if !defined(oc_frag_recon_inter) +# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \ + oc_frag_recon_inter_c(_dst,_src,_ystride,_residue) +# endif +# if !defined(oc_frag_recon_inter2) +# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \ + oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue) +# endif +# if !defined(oc_idct8x8) +# define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi) +# endif +# if !defined(oc_state_frag_recon) +# define oc_state_frag_recon oc_state_frag_recon_c +# endif +# if !defined(oc_loop_filter_init) +# define oc_loop_filter_init(_state,_bv,_flimit) \ + oc_loop_filter_init_c(_bv,_flimit) +# endif +# if !defined(oc_state_loop_filter_frag_rows) +# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c +# endif +# if !defined(oc_restore_fpu) +# define oc_restore_fpu(_state) do{}while(0) +# endif +# endif + + + +/*A keyframe.*/ +# define OC_INTRA_FRAME (0) +/*A predicted frame.*/ +# define OC_INTER_FRAME (1) +/*A frame of unknown type (frame type decision has not yet been made).*/ +# define OC_UNKWN_FRAME (-1) + +/*The amount of padding to add to the reconstructed frame buffers on all + sides. + This is used to allow unrestricted motion vectors without special casing. + This must be a multiple of 2.*/ +# define OC_UMV_PADDING (16) + +/*Frame classification indices.*/ +/*The previous golden frame.*/ +# define OC_FRAME_GOLD (0) +/*The previous frame.*/ +# define OC_FRAME_PREV (1) +/*The current frame.*/ +# define OC_FRAME_SELF (2) +/*Used to mark uncoded fragments (for DC prediction).*/ +# define OC_FRAME_NONE (3) + +/*The input or output buffer.*/ +# define OC_FRAME_IO (3) +/*Uncompressed prev golden frame.*/ +# define OC_FRAME_GOLD_ORIG (4) +/*Uncompressed previous frame. 
*/ +# define OC_FRAME_PREV_ORIG (5) + +/*Macroblock modes.*/ +/*Macro block is invalid: It is never coded.*/ +# define OC_MODE_INVALID (-1) +/*Encoded difference from the same macro block in the previous frame.*/ +# define OC_MODE_INTER_NOMV (0) +/*Encoded with no motion compensated prediction.*/ +# define OC_MODE_INTRA (1) +/*Encoded difference from the previous frame offset by the given motion + vector.*/ +# define OC_MODE_INTER_MV (2) +/*Encoded difference from the previous frame offset by the last coded motion + vector.*/ +# define OC_MODE_INTER_MV_LAST (3) +/*Encoded difference from the previous frame offset by the second to last + coded motion vector.*/ +# define OC_MODE_INTER_MV_LAST2 (4) +/*Encoded difference from the same macro block in the previous golden + frame.*/ +# define OC_MODE_GOLDEN_NOMV (5) +/*Encoded difference from the previous golden frame offset by the given motion + vector.*/ +# define OC_MODE_GOLDEN_MV (6) +/*Encoded difference from the previous frame offset by the individual motion + vectors given for each block.*/ +# define OC_MODE_INTER_MV_FOUR (7) +/*The number of (coded) modes.*/ +# define OC_NMODES (8) + +/*Determines the reference frame used for a given MB mode.*/ +# define OC_FRAME_FOR_MODE(_x) \ + OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \ + OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x)) + +/*Constants for the packet state machine common between encoder and decoder.*/ + +/*Next packet to emit/read: Codec info header.*/ +# define OC_PACKET_INFO_HDR (-3) +/*Next packet to emit/read: Comment header.*/ +# define OC_PACKET_COMMENT_HDR (-2) +/*Next packet to emit/read: Codec setup header.*/ +# define OC_PACKET_SETUP_HDR (-1) +/*No more packets to emit/read.*/ +# define OC_PACKET_DONE (INT_MAX) + + + +#define OC_MV(_x,_y) ((oc_mv)((_x)&0xFF|(_y)<<8)) +#define OC_MV_X(_mv) ((signed char)(_mv)) +#define OC_MV_Y(_mv) ((_mv)>>8) +#define OC_MV_ADD(_mv1,_mv2) \ + OC_MV(OC_MV_X(_mv1)+OC_MV_X(_mv2), \ + OC_MV_Y(_mv1)+OC_MV_Y(_mv2)) +#define OC_MV_SUB(_mv1,_mv2) \ + OC_MV(OC_MV_X(_mv1)-OC_MV_X(_mv2), \ + OC_MV_Y(_mv1)-OC_MV_Y(_mv2)) + + + +/*Super blocks are 32x32 segments of pixels in a single color plane indexed + in image order. + Internally, super blocks are broken up into four quadrants, each of which + contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels. + Quadrants, and the blocks within them, are indexed in a special order called + a "Hilbert curve" within the super block. + + In order to differentiate between the Hilbert-curve indexing strategy and + the regular image order indexing strategy, blocks indexed in image order + are called "fragments". + Fragments are indexed in image order, left to right, then bottom to top, + from Y' plane to Cb plane to Cr plane. + + The co-located fragments in all image planes corresponding to the location + of a single quadrant of a luma plane super block form a macro block. + Thus there is only a single set of macro blocks for all planes, each of which + contains between 6 and 12 fragments, depending on the pixel format. + Therefore macro block information is kept in a separate set of arrays from + super blocks to avoid unused space in the other planes. + The lists are indexed in super block order. + That is, the macro block corresponding to the macro block mbi in (luma plane) + super block sbi is at index (sbi<<2|mbi). 
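(A worked instance of that indexing rule, added for illustration: the macro block in quadrant mbi==3 of luma-plane super block sbi==5 has index (5<<2|3)==23; conversely, 23>>2 recovers the super block and 23&3 the quadrant.)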
+ Thus the number of macro blocks in each dimension is always twice the number + of super blocks, even when only an odd number fall inside the coded frame. + These "extra" macro blocks are just an artifact of our internal data layout, + and not part of the coded stream; they are flagged with a negative MB mode.*/ + + + +/*Super block information.*/ +struct oc_sb_flags{ + unsigned char coded_fully:1; + unsigned char coded_partially:1; + unsigned char quad_valid:4; +}; + + + +/*Information about a fragment which intersects the border of the displayable + region. + This marks which pixels belong to the displayable region.*/ +struct oc_border_info{ + /*A bit mask marking which pixels are in the displayable region. + Pixel (x,y) corresponds to bit (y<<3|x).*/ + ogg_int64_t mask; + /*The number of pixels in the displayable region. + This is always positive, and always less than 64.*/ + int npixels; +}; + + + +/*Fragment information.*/ +struct oc_fragment{ + /*A flag indicating whether or not this fragment is coded.*/ + unsigned coded:1; + /*A flag indicating that this entire fragment lies outside the displayable + region of the frame. + Note the contrast with an invalid macro block, which is outside the coded + frame, not just the displayable one. + There are no fragments outside the coded frame by construction.*/ + unsigned invalid:1; + /*The index of the quality index used for this fragment's AC coefficients.*/ + unsigned qii:4; + /*The index of the reference frame this fragment is predicted from.*/ + unsigned refi:2; + /*The mode of the macroblock this fragment belongs to.*/ + unsigned mb_mode:3; + /*The index of the associated border information for fragments which lie + partially outside the displayable region. + For fragments completely inside or outside this region, this is -1. + Note that the C standard requires an explicit signed keyword for bitfield + types, since some compilers may treat them as unsigned without it.*/ + signed int borderi:5; + /*The prediction-corrected DC component. 
+ Note that the C standard requires an explicit signed keyword for bitfield + types, since some compilers may treat them as unsigned without it.*/ + signed int dc:16; +}; + + + +/*A description of each fragment plane.*/ +struct oc_fragment_plane{ + /*The number of fragments in the horizontal direction.*/ + int nhfrags; + /*The number of fragments in the vertical direction.*/ + int nvfrags; + /*The offset of the first fragment in the plane.*/ + ptrdiff_t froffset; + /*The total number of fragments in the plane.*/ + ptrdiff_t nfrags; + /*The number of super blocks in the horizontal direction.*/ + unsigned nhsbs; + /*The number of super blocks in the vertical direction.*/ + unsigned nvsbs; + /*The offset of the first super block in the plane.*/ + unsigned sboffset; + /*The total number of super blocks in the plane.*/ + unsigned nsbs; +}; + + +typedef void (*oc_state_loop_filter_frag_rows_func)( + const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli, + int _fragy0,int _fragy_end); + +/*The shared (encoder and decoder) functions that have accelerated variants.*/ +struct oc_base_opt_vtable{ + void (*frag_copy)(unsigned char *_dst, + const unsigned char *_src,int _ystride); + void (*frag_copy_list)(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); + void (*frag_recon_intra)(unsigned char *_dst,int _ystride, + const ogg_int16_t _residue[64]); + void (*frag_recon_inter)(unsigned char *_dst, + const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]); + void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]); + void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); + void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); + void (*loop_filter_init)(signed char _bv[256],int _flimit); + oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows; + void (*restore_fpu)(void); +}; + +/*The shared (encoder and decoder) tables that vary according to which variants + of the above functions are used.*/ +struct oc_base_opt_data{ + const unsigned char *dct_fzig_zag; +}; + + +/*State information common to both the encoder and decoder.*/ +struct oc_theora_state{ + /*The stream information.*/ + th_info info; +# if defined(OC_STATE_USE_VTABLE) + /*Table for shared accelerated functions.*/ + oc_base_opt_vtable opt_vtable; +# endif + /*Table for shared data used by accelerated functions.*/ + oc_base_opt_data opt_data; + /*CPU flags to detect the presence of extended instruction sets.*/ + ogg_uint32_t cpu_flags; + /*The fragment plane descriptions.*/ + oc_fragment_plane fplanes[3]; + /*The list of fragments, indexed in image order.*/ + oc_fragment *frags; + /*The offset into the reference frame buffer to the upper-left pixel of + each fragment.*/ + ptrdiff_t *frag_buf_offs; + /*The motion vector for each fragment.*/ + oc_mv *frag_mvs; + /*The total number of fragments in a single frame.*/ + ptrdiff_t nfrags; + /*The list of super block maps, indexed in image order.*/ + oc_sb_map *sb_maps; + /*The list of super block flags, indexed in image order.*/ + oc_sb_flags *sb_flags; + /*The total number of super blocks in a single frame.*/ + unsigned nsbs; + /*The fragments from each color plane that belong to each macro block. 
+ Fragments are stored in image order (left to right then top to bottom). + When chroma components are decimated, the extra fragments have an index of + -1.*/ + oc_mb_map *mb_maps; + /*The list of macro block modes. + A negative number indicates the macro block lies entirely outside the + coded frame.*/ + signed char *mb_modes; + /*The number of macro blocks in the X direction.*/ + unsigned nhmbs; + /*The number of macro blocks in the Y direction.*/ + unsigned nvmbs; + /*The total number of macro blocks.*/ + size_t nmbs; + /*The list of coded fragments, in coded order. + Uncoded fragments are stored in reverse order from the end of the list.*/ + ptrdiff_t *coded_fragis; + /*The number of coded fragments in each plane.*/ + ptrdiff_t ncoded_fragis[3]; + /*The total number of coded fragments.*/ + ptrdiff_t ntotal_coded_fragis; + /*The actual buffers used for the reference frames.*/ + th_ycbcr_buffer ref_frame_bufs[6]; + /*The index of the buffers being used for each OC_FRAME_* reference frame.*/ + int ref_frame_idx[6]; + /*The storage for the reference frame buffers. + This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here + for faster look-up.*/ + unsigned char *ref_frame_data[6]; + /*The handle used to allocate the reference frame buffers.*/ + unsigned char *ref_frame_handle; + /*The strides for each plane in the reference frames.*/ + int ref_ystride[3]; + /*The number of unique border patterns.*/ + int nborders; + /*The unique border patterns for all border fragments. + The borderi field of fragments which straddle the border indexes this + list.*/ + oc_border_info borders[16]; + /*The frame number of the last keyframe.*/ + ogg_int64_t keyframe_num; + /*The frame number of the current frame.*/ + ogg_int64_t curframe_num; + /*The granpos of the current frame.*/ + ogg_int64_t granpos; + /*The type of the current frame.*/ + signed char frame_type; + /*The bias to add to the frame count when computing granule positions.*/ + unsigned char granpos_bias; + /*The number of quality indices used in the current frame.*/ + unsigned char nqis; + /*The quality indices of the current frame.*/ + unsigned char qis[3]; + /*The dequantization tables, stored in zig-zag order, and indexed by + qi, pli, qti, and zzi.*/ + ogg_uint16_t *dequant_tables[64][3][2]; + OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]); + /*Loop filter strength parameters.*/ + unsigned char loop_filter_limits[64]; +}; + + + +/*The function type used to fill in the chroma plane motion vectors for a + macro block when 4 different motion vectors are specified in the luma + plane. + _cbmvs: The chroma block-level motion vectors to fill in. + _lmbmv: The luma macro-block level motion vector to fill in for use in + prediction. 
+ _lbmvs: The luma block-level motion vectors.*/ +typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]); + + + +/*A table of functions used to fill in the Cb,Cr plane motion vectors for a + macro block when 4 different motion vectors are specified in the luma + plane.*/ +extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]; + + + +int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs); +void oc_state_clear(oc_theora_state *_state); +void oc_state_accel_init_c(oc_theora_state *_state); +void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli, + int _y0,int _yend); +void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli); +void oc_state_borders_fill(oc_theora_state *_state,int _refi); +void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx, + th_ycbcr_buffer _img); +int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby); +int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2], + int _pli,oc_mv _mv); + +void oc_loop_filter_init_c(signed char _bv[256],int _flimit); +void oc_state_loop_filter(oc_theora_state *_state,int _frame); +# if defined(OC_DUMP_IMAGES) +int oc_state_dump_frame(const oc_theora_state *_state,int _frame, + const char *_suf); +# endif + +/*Default pure-C implementations of shared accelerated functions.*/ +void oc_frag_copy_c(unsigned char *_dst, + const unsigned char *_src,int _src_ystride); +void oc_frag_copy_list_c(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); +void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride, + const ogg_int16_t _residue[64]); +void oc_frag_recon_inter_c(unsigned char *_dst, + const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]); +void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1, + const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]); +void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); +void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi, + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state, + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); +void oc_restore_fpu_c(void); + +/*We need a way to call a few encoder functions without introducing a link-time + dependency into the decoder, while still allowing the old alpha API which + does not distinguish between encoder and decoder objects to be used. + We do this by placing a function table at the start of the encoder object + which can dispatch into the encoder library. 
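To illustrate the dispatch pattern just described, a minimal sketch with hypothetical names (hyp_enc_ctx and hyp_clear are not libtheora code; only oc_state_dispatch_vtable, declared below, is real):

/* A private context that leads with its dispatch table, mirroring the
   oc_state_dispatch_vtable declared below. */
typedef struct{
  const oc_state_dispatch_vtable *dispatch;  /* must be the first member */
  /* ...encoder-private state follows... */
}hyp_enc_ctx;

/* Code linked only against the decoder library can still tear down an
   encoder: it reads the table from the front of the opaque context and
   calls through it, with no link-time reference to encoder symbols. */
static void hyp_clear(theora_state *_th,void *_internal){
  const oc_state_dispatch_vtable *vt;
  vt=*(const oc_state_dispatch_vtable **)_internal;
  (*vt->clear)(_th);
}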
+ We do a similar thing for the decoder in case we ever decide to split off a + common base library.*/ +typedef void (*oc_state_clear_func)(theora_state *_th); +typedef int (*oc_state_control_func)(theora_state *th,int _req, + void *_buf,size_t _buf_sz); +typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th, + ogg_int64_t _granulepos); +typedef double (*oc_state_granule_time_func)(theora_state *_th, + ogg_int64_t _granulepos); + + +struct oc_state_dispatch_vtable{ + oc_state_clear_func clear; + oc_state_control_func control; + oc_state_granule_frame_func granule_frame; + oc_state_granule_time_func granule_time; +}; + +#endif diff --git a/thirdparty/libtheora/theora/codec.h b/thirdparty/libtheora/theora/codec.h index 5c2669630c..29b8602325 100644 --- a/thirdparty/libtheora/theora/codec.h +++ b/thirdparty/libtheora/theora/codec.h @@ -16,11 +16,12 @@ ********************************************************************/ /**\mainpage - * + * * \section intro Introduction * - * This is the documentation for <tt>libtheora</tt> C API. - * The current reference + * This is the documentation for the <tt>libtheora</tt> C API. + * + * The \c libtheora package is the current reference * implementation for <a href="http://www.theora.org/">Theora</a>, a free, * patent-unencumbered video codec. * Theora is derived from On2's VP3 codec with additional features and @@ -30,29 +31,31 @@ * <a href="http://www.theora.org/doc/Theora.pdf">the Theora * specification</a>. * - * \subsection Organization + * \section Organization * - * The functions documented here are actually subdivided into three + * The functions documented here are divided between two * separate libraries: - * - <tt>libtheoraenc</tt> contains the encoder interface, + * - \c libtheoraenc contains the encoder interface, * described in \ref encfuncs. - * - <tt>libtheoradec</tt> contains the decoder interface and - * routines shared with the encoder. - * You must also link to this if you link to <tt>libtheoraenc</tt>. - * The routines in this library are described in \ref decfuncs and - * \ref basefuncs. - * - <tt>libtheora</tt> contains the \ref oldfuncs. + * - \c libtheoradec contains the decoder interface, + * described in \ref decfuncs, \n + * and additional \ref basefuncs. + * + * New code should link to \c libtheoradec. If using encoder + * features, it must also link to \c libtheoraenc. * - * New code should link to <tt>libtheoradec</tt> and, if using encoder - * features, <tt>libtheoraenc</tt>. Together these two export both - * the standard and the legacy API, so this is all that is needed by - * any code. The older <tt>libtheora</tt> library is provided just for - * compatibility with older build configurations. + * During initial development, prior to the 1.0 release, + * \c libtheora exported a different \ref oldfuncs which + * combined both encode and decode functions. + * In general, legacy API symbols can be identified + * by their \c theora_ or \c OC_ namespace prefixes. + * The current API uses \c th_ or \c TH_ instead. * - * In general the recommended 1.x API symbols can be distinguished - * by their <tt>th_</tt> or <tt>TH_</tt> namespace prefix. - * The older, legacy API uses <tt>theora_</tt> or <tt>OC_</tt> - * prefixes instead. + * While deprecated, \c libtheoraenc and \c libtheoradec + * together export the legacy API as well as the one documented above. + * Likewise, the legacy \c libtheora included with this package + * exports the new 1.x API. 
Older code and build scripts can therefore + * be updated independently to the current scheme. */ /**\file @@ -168,7 +171,7 @@ typedef struct{ typedef th_img_plane th_ycbcr_buffer[3]; /**Theora bitstream information. - * This contains the basic playback parameters for a stream, and corresponds to + * This contains the basic playback parameters for a stream, and corresponds to * the initial 'info' header packet. * To initialize an encoder, the application fills in this structure and * passes it to th_encode_alloc(). @@ -317,7 +320,7 @@ typedef struct{ * In filling in this structure, th_decode_headerin() will null-terminate * the user_comment strings for safety. * However, the bitstream format itself treats them as 8-bit clean vectors, - * possibly containing null characters, and so the length array should be + * possibly containing null characters, so the length array should be * treated as their authoritative length. */ typedef struct th_comment{ @@ -448,7 +451,13 @@ typedef struct{ /**\defgroup basefuncs Functions Shared by Encode and Decode*/ /*@{*/ -/**\name Basic shared functions*/ +/**\name Basic shared functions + * These functions return information about the library itself, + * or provide high-level information about codec state + * and packet type. + * + * You must link to \c libtheoradec if you use any of the + * functions in this section.*/ /*@{*/ /**Retrieves a human-readable string to identify the library vendor and * version. @@ -510,7 +519,12 @@ extern int th_packet_iskeyframe(ogg_packet *_op); /*@}*/ -/**\name Functions for manipulating header data*/ +/**\name Functions for manipulating header data + * These functions manipulate the #th_info and #th_comment structures + * which describe video parameters and key-value metadata, respectively. + * + * You must link to \c libtheoradec if you use any of the + * functions in this section.*/ /*@{*/ /**Initializes a th_info structure. * This should be called on a freshly allocated #th_info structure before @@ -537,7 +551,7 @@ extern void th_comment_init(th_comment *_tc); * \param _tc The #th_comment struct to add the comment to. * \param _comment Must be a null-terminated UTF-8 string containing the * comment in "TAG=the value" form.*/ -extern void th_comment_add(th_comment *_tc, char *_comment); +extern void th_comment_add(th_comment *_tc,const char *_comment); /**Add a comment to an initialized #th_comment structure. * \note Neither th_comment_add() nor th_comment_add_tag() support * comments containing null values, although the bitstream format does @@ -545,10 +559,11 @@ extern void th_comment_add(th_comment *_tc, char *_comment); * To add such comments you will need to manipulate the #th_comment * structure directly. * \param _tc The #th_comment struct to add the comment to. - * \param _tag A null-terminated string containing the tag associated with + * \param _tag A null-terminated string containing the tag associated with * the comment. * \param _val The corresponding value as a null-terminated string.*/ -extern void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val); +extern void th_comment_add_tag(th_comment *_tc,const char *_tag, + const char *_val); /**Look up a comment value by its tag. * \param _tc An initialized #th_comment structure. * \param _tag The tag to look up. @@ -564,15 +579,15 @@ extern void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val); * It should not be modified or freed by the application, and * modifications to the structure may invalidate the pointer. 
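As a usage sketch of the two query functions declared here (hedged: the "ARTIST" tag and the surrounding variables are illustrative, not part of the API):

/* Enumerate every value stored for one tag in a populated th_comment. */
th_comment tc;   /* filled in earlier, e.g. by th_decode_headerin() */
int n;
int i;
n=th_comment_query_count(&tc,"ARTIST");
for(i=0;i<n;i++){
  char *val;
  val=th_comment_query(&tc,"ARTIST",i);
  /* 'val' points into 'tc'; copy it before th_comment_clear() if the
     value must outlive the structure. */
}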
* \retval NULL If no matching tag is found.*/ -extern char *th_comment_query(th_comment *_tc,char *_tag,int _count); +extern char *th_comment_query(th_comment *_tc,const char *_tag,int _count); /**Look up the number of instances of a tag. * Call this first when querying for a specific tag and then iterate over the * number of instances with separate calls to th_comment_query() to * retrieve all the values for that tag in order. * \param _tc An initialized #th_comment structure. * \param _tag The tag to look up. - * \return The number on instances of this particular tag.*/ -extern int th_comment_query_count(th_comment *_tc,char *_tag); + * \return The number of instances of this particular tag.*/ +extern int th_comment_query_count(th_comment *_tc,const char *_tag); /**Clears a #th_comment structure. * This should be called on a #th_comment structure after it is no longer * needed. diff --git a/thirdparty/libtheora/theora/theora.h b/thirdparty/libtheora/theora/theora.h index af6eb6f380..a729a76890 100644 --- a/thirdparty/libtheora/theora/theora.h +++ b/thirdparty/libtheora/theora/theora.h @@ -34,41 +34,41 @@ extern "C" * * \section intro Introduction * - * This is the documentation for the libtheora legacy C API, declared in + * This is the documentation for the libtheora legacy C API, declared in * the theora.h header, which describes the old interface used before * the 1.0 release. This API was widely deployed for several years and - * remains supported, but for new code we recommend the cleaner API + * remains supported, but for new code we recommend the cleaner API * declared in theoradec.h and theoraenc.h. * * libtheora is the reference implementation for * <a href="http://www.theora.org/">Theora</a>, a free video codec. * Theora is derived from On2's VP3 codec with improved integration with * Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>. - * + * * \section overview Overview * - * This library will both decode and encode theora packets to/from raw YUV + * This library will both decode and encode theora packets to/from raw YUV * frames. In either case, the packets will most likely either come from or - * need to be embedded in an Ogg stream. Use - * <a href="http://xiph.org/ogg/">libogg</a> or + * need to be embedded in an Ogg stream. Use + * <a href="http://xiph.org/ogg/">libogg</a> or * <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a> * to extract/package these packets. * * \section decoding Decoding Process * * Decoding can be separated into the following steps: - * -# initialise theora_info and theora_comment structures using + * -# initialise theora_info and theora_comment structures using * theora_info_init() and theora_comment_init(): \verbatim theora_info info; theora_comment comment; - + theora_info_init(&info); theora_comment_init(&comment); \endverbatim - * -# retrieve header packets from Ogg stream (there should be 3) and decode - * into theora_info and theora_comment structures using - * theora_decode_header(). See \ref identification for more information on + * -# retrieve header packets from Ogg stream (there should be 3) and decode + * into theora_info and theora_comment structures using + * theora_decode_header(). See \ref identification for more information on * identifying which packets are theora packets. \verbatim int i; @@ -79,14 +79,14 @@ extern "C" } \endverbatim * -# initialise the decoder based on the information retrieved into the - * theora_info struct by theora_decode_header(). 
You will need a + * theora_state struct. \verbatim theora_state state; - + theora_decode_init(&state, &info); \endverbatim - * -# pass in packets and retrieve decoded frames! See the yuv_buffer + * -# pass in packets and retrieve decoded frames! See the yuv_buffer * documentation for information on how to retrieve raw YUV data. \verbatim yuv_buffer buffer; while(last packet was not e_o_s){ theora_decode_packetin(&state, &packet); theora_decode_YUVout(&state, &buffer); } \endverbatim - * + * * * \subsection identification Identifying Theora Packets * - * All streams inside an Ogg file have a unique serial_no attached to the - * stream. Typically, you will want to - * - retrieve the serial_no for each b_o_s (beginning of stream) page - * encountered within the Ogg file; - * - test the first (only) packet on that page to determine if it is a theora + * All streams inside an Ogg file have a unique serial_no attached to the + * stream. Typically, you will want to + * - retrieve the serial_no for each b_o_s (beginning of stream) page + * encountered within the Ogg file; + * - test the first (only) packet on that page to determine if it is a theora * packet; - * - once you have found a theora b_o_s page then use the retrieved serial_no + * - once you have found a theora b_o_s page then use the retrieved serial_no * to identify future packets belonging to the same theora stream. - * - * Note that you \e cannot use theora_packet_isheader() to determine if a + * + * Note that you \e cannot use theora_packet_isheader() to determine if a * packet is a theora packet or not, as this function does not perform any * checking beyond whether a header bit is present. Instead, use the * theora_decode_header() function and check the return value; or examine the @@ -124,9 +124,9 @@ extern "C" * A YUV buffer for passing uncompressed frames to and from the codec. * This holds a Y'CbCr frame in planar format. The CbCr planes can be * subsampled and have their own separate dimensions and row stride - * offsets. Note that the strides may be negative in some + * offsets. Note that the strides may be negative in some * configurations. For theora the width and height of the largest plane - * must be a multiple of 16. The actual meaningful picture size and + * must be a multiple of 16. The actual meaningful picture size and * offset are stored in the theora_info structure; frames returned by * the decoder may need to be cropped for display. * @@ -135,8 +135,8 @@ extern "C" * are ordered from left to right. * * During decode, the yuv_buffer struct is allocated by the user, but all - * fields (including luma and chroma pointers) are filled by the library. - * These pointers address library-internal memory and their contents should + * fields (including luma and chroma pointers) are filled by the library. + * These pointers address library-internal memory and their contents should * not be modified. * * Conversely, during encode the user allocates the struct and fills out all @@ -179,14 +179,14 @@ typedef enum { OC_PF_420, /**< Chroma subsampling by 2 in each direction (4:2:0) */ OC_PF_RSVD, /**< Reserved value */ OC_PF_422, /**< Horizontal chroma subsampling by 2 (4:2:2) */ - OC_PF_444, /**< No chroma subsampling at all (4:4:4) */ + OC_PF_444 /**< No chroma subsampling at all (4:4:4) */ } theora_pixelformat; /** * Theora bitstream info. * Contains the basic playback parameters for a stream, * corresponding to the initial 'info' header packet. 
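A small sketch of the frame/picture size relationship noted above (field names follow the theora_info struct documented below; the sample values and the (x+15)&~15 rounding are illustrative choices, not library requirements beyond the multiple-of-16 rule):

/* Derive legal encoded-frame dimensions for a 638x479 picture. */
theora_info ti;
theora_info_init(&ti);
ti.frame_width=638;                  /* visible picture size */
ti.frame_height=479;
ti.width=(ti.frame_width+15)&~15;    /* 640: padded up to a multiple of 16 */
ti.height=(ti.frame_height+15)&~15;  /* 480 */
ti.offset_x=0;                       /* picture region sits at the origin */
ti.offset_y=0;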
- * + * Encoded theora frames must be a multiple of 16 in width and height. * To handle other frame sizes, a crop rectangle is specified in * frame_height and frame_width, offset_x and * offset_y. The offset @@ -198,10 +198,10 @@ typedef enum { * fraction. Aspect ratio is also stored as a rational fraction, and * refers to the aspect ratio of the frame pixels, not of the * overall frame itself. - * + * * See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c"> * examples/encoder_example.c</a> for usage examples of the - * other paramters and good default settings for the encoder parameters. + * other parameters and good default settings for the encoder parameters. */ typedef struct { ogg_uint32_t width; /**< encoded frame width */ @@ -253,14 +253,14 @@ typedef struct{ } theora_state; -/** +/** * Comment header metadata. * * This structure holds the in-stream metadata corresponding to * the 'comment' header packet. * * Meta data is stored as a series of (tag, value) pairs, in - * length-encoded string vectors. The first occurence of the + * length-encoded string vectors. The first occurrence of the * '=' character delimits the tag and value. A particular tag * may occur more than once. The character set encoding for * the strings is always UTF-8, but the tag names are limited @@ -285,7 +285,7 @@ typedef struct theora_comment{ /* \anchor decctlcodes_old * These are the available request codes for theora_control() * when called with a decoder instance. - * By convention decoder control codes are odd, to distinguish + * By convention decoder control codes are odd, to distinguish * them from \ref encctlcodes_old "encoder control codes" which * are even. * @@ -306,7 +306,7 @@ typedef struct theora_comment{ #define TH_DECCTL_GET_PPLEVEL_MAX (1) /**Set the post-processing level. - * Sets the level of post-processing to use when decoding the + * Sets the level of post-processing to use when decoding the * compressed stream. This must be a value between zero (off) * and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX. */ @@ -345,9 +345,9 @@ typedef struct theora_comment{ * \param[in] buf #th_quant_info * \retval OC_FAULT \a theora_state is <tt>NULL</tt>. * \retval OC_EINVAL Encoding has already begun, the quantization parameters - * are not acceptable to this version of the encoder, - * \a buf is <tt>NULL</tt> and \a buf_sz is not zero, - * or \a buf is non-<tt>NULL</tt> and \a buf_sz is + * are not acceptable to this version of the encoder, + * \a buf is <tt>NULL</tt> and \a buf_sz is not zero, + * or \a buf is non-<tt>NULL</tt> and \a buf_sz is * not <tt>sizeof(#th_quant_info)</tt>. * \retval OC_IMPL Not supported by this implementation.*/ #define TH_ENCCTL_SET_QUANT_PARAMS (2) @@ -424,7 +424,7 @@ typedef struct theora_comment{ #define OC_NEWPACKET -25 /**< Packet is an (ignorable) unhandled extension */ #define OC_DUPFRAME 1 /**< Packet is a dropped frame */ -/** +/** * Retrieve a human-readable string to identify the encoder vendor and version. * \returns A version string. */ @@ -462,7 +462,7 @@ extern int theora_encode_init(theora_state *th, theora_info *ti); extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv); /** - * Request the next packet of encoded video. + * Request the next packet of encoded video. * The encoded data is placed in a user-provided ogg_packet structure. * \param t A theora_state handle previously initialized for encoding. * \param last_p whether this is the last packet the encoder should produce. 
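To make the packet flow concrete, a minimal per-frame sketch using the legacy calls documented here ('state' comes from theora_encode_init(); error handling is omitted, and the convention that theora_encode_packetout() returns a positive value when a packet is ready is assumed):

yuv_buffer frame;
ogg_packet op;
/* ...fill 'frame' with one uncompressed Y'CbCr image... */
theora_encode_YUVin(&state,&frame);
/* last_p is 0 here; pass a nonzero value with the final frame so the
   encoder can mark the end of the stream. */
while(theora_encode_packetout(&state,0,&op)>0){
  /* hand 'op' to the container layer, e.g. libogg's ogg_stream_packetin() */
}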
@@ -496,7 +496,11 @@ extern int theora_encode_header(theora_state *t, ogg_packet *op); * \param op An ogg_packet structure to fill. libtheora will set all * elements of this structure, including a pointer to the encoded * comment data. The memory for the comment data is owned by - * libtheora. + * the application, and must be freed by it using _ogg_free(). + * On some systems (such as Windows when using dynamic linking), this + * may mean the free is executed in a different module from the + * malloc, which will crash; there is no way to free this memory on + * such systems. * \retval 0 Success */ extern int theora_encode_comment(theora_comment *tc, ogg_packet *op); @@ -581,8 +585,8 @@ extern int theora_decode_packetin(theora_state *th,ogg_packet *op); * \param th A theora_state handle previously initialized for decoding. * \param yuv A yuv_buffer in which libtheora should place the decoded data. * Note that the buffer struct itself is allocated by the user, but - * that the luma and chroma pointers will be filled in by the - * library. Also note that these luma and chroma regions should be + * that the luma and chroma pointers will be filled in by the + * library. Also note that these luma and chroma regions should be * considered read-only by the user. * \retval 0 Success */ extern int theora_decode_YUVout(theora_state *th, yuv_buffer *yuv); @@ -617,22 +621,22 @@ extern int theora_packet_iskeyframe(ogg_packet *op); /** * Report the granulepos shift radix * - * When embedded in Ogg, Theora uses a two-part granulepos, + * When embedded in Ogg, Theora uses a two-part granulepos, * splitting the 64-bit field into two pieces. The more-significant * section represents the frame count at the last keyframe, * and the less-significant section represents the count of * frames since the last keyframe. In this way the overall * field is still non-decreasing with time, but usefully encodes * a pointer to the last keyframe, which is necessary for - * correctly restarting decode after a seek. + * correctly restarting decode after a seek. * * This function reports the number of bits used to represent * the distance to the last keyframe, and thus how the granulepos * field must be shifted or masked to obtain the two parts. - * + * * Since libtheora returns compressed data in an ogg_packet * structure, this may be generally useful even if the Theora - * packets are not being used in an Ogg container. + * packets are not being used in an Ogg container. * * \param ti A previously initialized theora_info struct * \returns The bit shift dividing the two granulepos fields @@ -644,7 +648,7 @@ int theora_granule_shift(theora_info *ti); /** * Convert a granulepos to an absolute frame index, starting at 0. * The granulepos is interpreted in the context of a given theora_state handle. - * + * * Note that while the granulepos encodes the frame count (i.e. starting * from 1) this call returns the frame index, starting from zero. Thus * one can calculate the presentation time by multiplying the index by @@ -670,9 +674,7 @@ extern ogg_int64_t theora_granule_frame(theora_state *th,ogg_int64_t granulepos) * This is the "end time" for the frame, or the latest time it should * be displayed. * It is not the presentation time. - * \retval -1. The given granulepos is undefined (i.e. negative), or - * \retval -1. The function has been disabled because floating - * point support is not available. + * \retval -1. The given granulepos is undefined (i.e. negative). 
*/ extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos); @@ -699,7 +701,7 @@ extern void theora_clear(theora_state *t); /** * Initialize an allocated theora_comment structure - * \param tc An allocated theora_comment structure + * \param tc An allocated theora_comment structure **/ extern void theora_comment_init(theora_comment *tc); @@ -720,7 +722,7 @@ extern void theora_comment_add(theora_comment *tc, char *comment); /** * Add a comment to an initialized theora_comment structure. * \param tc A previously initialized theora comment structure - * \param tag A null-terminated string containing the tag + * \param tag A null-terminated string containing the tag * associated with the comment. * \param value The corresponding value as a null-terminated string * @@ -752,9 +754,9 @@ extern char *theora_comment_query(theora_comment *tc, char *tag, int count); * \param tc An initialized theora_comment structure * \param tag The tag to look up * \returns The number of instances of a particular tag. - * + * * Call this first when querying for a specific tag and then iterate - * over the number of instances with separate calls to + * over the number of instances with separate calls to * theora_comment_query() to retrieve all instances in order. **/ extern int theora_comment_query_count(theora_comment *tc, char *tag); @@ -769,7 +771,7 @@ extern void theora_comment_clear(theora_comment *tc); * This is used to provide advanced control of the encoding process. * \param th A #theora_state handle. * \param req The control code to process. - * See \ref encctlcodes_old "the list of available + * See \ref encctlcodes_old "the list of available * control codes" for details. * \param buf The parameters for this control code. * \param buf_sz The size of the parameter buffer.*/ diff --git a/thirdparty/libtheora/theora/theoradec.h b/thirdparty/libtheora/theora/theoradec.h index b20f0e3a64..77bef81909 100644 --- a/thirdparty/libtheora/theora/theoradec.h +++ b/thirdparty/libtheora/theora/theoradec.h @@ -92,13 +92,17 @@ extern "C" { * <tt>sizeof(th_stripe_callback)</tt>.*/ #define TH_DECCTL_SET_STRIPE_CB (7) -/**Enables telemetry and sets the macroblock display mode */ +/**Sets the macroblock display mode. Set to 0 to disable displaying + * macroblocks.*/ #define TH_DECCTL_SET_TELEMETRY_MBMODE (9) -/**Enables telemetry and sets the motion vector display mode */ +/**Sets the motion vector display mode. Set to 0 to disable displaying motion + * vectors.*/ #define TH_DECCTL_SET_TELEMETRY_MV (11) -/**Enables telemetry and sets the adaptive quantization display mode */ +/**Sets the adaptive quantization display mode. Set to 0 to disable displaying + * adaptive quantization. */ #define TH_DECCTL_SET_TELEMETRY_QI (13) -/**Enables telemetry and sets the bitstream breakdown visualization mode */ +/**Sets the bitstream breakdown visualization mode. Set to 0 to disable + * displaying bitstream breakdown.*/ #define TH_DECCTL_SET_TELEMETRY_BITS (15) /*@}*/ @@ -171,7 +175,7 @@ typedef struct th_setup_info th_setup_info; /**\defgroup decfuncs Functions for Decoding*/ /*@{*/ /**\name Functions for decoding - * You must link to <tt>libtheoradec</tt> if you use any of the + * You must link to <tt>libtheoradec</tt> if you use any of the * functions in this section. * * The functions are listed in the order they are used in a typical decode. @@ -267,7 +271,10 @@ extern void th_setup_free(th_setup_info *_setup); * See \ref decctlcodes "the list of available control codes" * for details. 
* \param _buf The parameters for this control code. - * \param _buf_sz The size of the parameter buffer.*/ + * \param _buf_sz The size of the parameter buffer. + * \return Possible return values depend on the control code used. + * See \ref decctlcodes "the list of control codes" for + * specific values. Generally 0 indicates success.*/ extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf, size_t _buf_sz); /**Submits a packet containing encoded video data to the decoder. @@ -283,7 +290,8 @@ extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf, * \retval 0 Success. * A new decoded frame can be retrieved by calling * th_decode_ycbcr_out(). - * \retval TH_DUPFRAME The packet represented a dropped (0-byte) frame. + * \retval TH_DUPFRAME The packet represented a dropped frame (either a + * 0-byte frame or an INTER frame with no coded blocks). * The player can skip the call to th_decode_ycbcr_out(), * as the contents of the decoded frame buffer have not * changed. diff --git a/thirdparty/libtheora/theora/theoraenc.h b/thirdparty/libtheora/theora/theoraenc.h index fdf2ab21e2..79b1c2b880 100644 --- a/thirdparty/libtheora/theora/theoraenc.h +++ b/thirdparty/libtheora/theora/theoraenc.h @@ -43,7 +43,7 @@ extern "C" { * <tt>NULL</tt> may be specified to revert to the default tables. * * \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt> - * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc is <tt>NULL</tt>. * \retval TH_EINVAL Encoding has already begun or one or more of the given * tables is not full or prefix-free, \a _buf is * <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is @@ -57,8 +57,8 @@ extern "C" { * <tt>NULL</tt> may be specified to revert to the default parameters. * * \param[in] _buf #th_quant_info - * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>. - * \retval TH_EINVAL Encoding has already begun, \a _buf is + * \retval TH_EFAULT \a _enc is <tt>NULL</tt>. + * \retval TH_EINVAL Encoding has already begun, \a _buf is * <tt>NULL</tt> and \a _buf_sz is not zero, * or \a _buf is non-<tt>NULL</tt> and * \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>. @@ -73,7 +73,7 @@ extern "C" { * \param[in] _buf <tt>ogg_uint32_t</tt>: The maximum distance between key * frames. * \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>. * \retval TH_EIMPL Not supported by this implementation.*/ #define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4) @@ -101,7 +101,7 @@ extern "C" { * 4:2:0, the picture region is smaller than the full frame, * or if encoding has begun, preventing the quantization * tables and codebooks from being set. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>. * \retval TH_EIMPL Not supported by this implementation.*/ #define TH_ENCCTL_SET_VP3_COMPATIBLE (10) @@ -114,7 +114,7 @@ extern "C" { * the current encoding mode (VBR vs. constant quality, etc.). * * \param[out] _buf <tt>int</tt>: The maximum encoding speed level. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>. 
* \retval TH_EIMPL Not supported by this implementation in the current * encoding mode.*/ @@ -124,7 +124,7 @@ extern "C" { * * \param[in] _buf <tt>int</tt>: The new encoding speed level. * 0 is slowest, larger values use less CPU. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the * encoding speed level is out of bounds. * The maximum encoding speed level may be @@ -142,7 +142,7 @@ extern "C" { * * \param[out] _buf <tt>int</tt>: The current encoding speed level. * 0 is slowest, larger values use less CPU. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>. * \retval TH_EIMPL Not supported by this implementation in the current * encoding mode.*/ @@ -162,7 +162,7 @@ extern "C" { * * \param[in] _buf <tt>int</tt>: The number of duplicates to produce. * If this is negative or zero, no duplicates will be produced. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the * number of duplicates is greater than or equal to the * maximum keyframe interval. @@ -187,7 +187,7 @@ extern "C" { * use. * - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls * later. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control * is not enabled. * \retval TH_EIMPL Not supported by this implementation in the current @@ -211,7 +211,7 @@ extern "C" { * \param[in] _buf <tt>int</tt>: Requested size of the reservoir measured in * frames. * \param[out] _buf <tt>int</tt>: The actual size of the reservoir set. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control * is not enabled. The buffer has an implementation * defined minimum and maximum size and the value in _buf @@ -243,7 +243,7 @@ extern "C" { * application. * \retval >=0 The number of bytes of metric data available in the * returned buffer. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target * bitrate has been set, or the first call was made after * the first frame was submitted for encoding. @@ -283,7 +283,7 @@ extern "C" { * of bytes consumed. * \retval >0 The number of bytes of metric data required/consumed. * \retval 0 No more data is required before the next frame. - * \retval TH_EFAULT \a _enc_ctx is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc is <tt>NULL</tt>. * \retval TH_EINVAL No target bitrate has been set, or the first call was * made after the first frame was submitted for * encoding. @@ -306,7 +306,7 @@ extern "C" { * \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63, * inclusive. * \retval 0 Success. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL A target bitrate has already been specified, or the * quality index was not in the range 0...63. 
* \retval TH_EIMPL Not supported by this implementation.*/ @@ -328,10 +328,54 @@ extern "C" { * * \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second. * \retval 0 Success. - * \retval TH_EFAULT \a _enc_ctx or \a _buf is <tt>NULL</tt>. + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. * \retval TH_EINVAL The target bitrate was not positive. - * \retval TH_EIMPL Not supported by this implementation.*/ + * A future version of this library may allow passing 0 + * to disable rate-controlled mode and return to a + * quality-based mode, in which case this function will + * not return an error for that value. + * \retval TH_EIMPL Not supported by this implementation.*/ #define TH_ENCCTL_SET_BITRATE (30) +/**Sets the configuration to be compatible with that from the given setup + * header. + * This sets the Huffman codebooks and quantization parameters to match those + * found in the given setup header. + * This guarantees that packets encoded by this encoder will be decodable using + * a decoder configured with the passed-in setup header. + * It does <em>not</em> guarantee that th_encode_flushheader() will produce a + * bit-identical setup header, only that they will be compatible. + * If you need a bit-identical setup header, then use the one you passed into + * this command, and not the one returned by th_encode_flushheader(). + * + * This also does <em>not</em> enable or disable VP3 compatibility; that is not + * signaled in the setup header (or anywhere else in the encoded stream), and + * is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function. + * If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks + * and quantization parameters to match the given setup header, you should + * enable VP3 compatibility before invoking this command, otherwise the + * codebooks and quantization parameters will be reset to the VP3 defaults. + * + * The current encoder does not support Huffman codebooks which do not contain + * codewords for all 32 tokens. + * Such codebooks are legal, according to the specification, but cannot be + * configured with this function. + * + * \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy + * the configuration from. + * This should be the original, + * undecoded setup header packet, + * and <em>not</em> a #th_setup_info + * structure filled in by + * th_decode_headerin(). + * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>. + * \retval TH_EINVAL Encoding has already begun, so the codebooks and + * quantization parameters cannot be changed, or the + * data in the setup header was not supported by this + * encoder. + * \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet. + * \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all. + * \retval TH_EIMPL Not supported by this implementation.*/ +#define TH_ENCCTL_SET_COMPAT_CONFIG (32) /*@}*/ @@ -342,7 +386,8 @@ extern "C" { /*@{*/ /**Drop frames to keep within bitrate buffer constraints. * This can have a severe impact on quality, but is the only way to ensure that - * bitrate targets are met at low rates during sudden bursts of activity.*/ + * bitrate targets are met at low rates during sudden bursts of activity. + * It is enabled by default.*/ #define TH_RATECTL_DROP_FRAMES (0x1) /**Ignore bitrate buffer overflows. * If the encoder uses so few bits that the reservoir of available bits @@ -350,14 +395,14 @@ extern "C" { * The encoder will not try to use these extra bits in future frames. 
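These flags are applied with th_encode_ctl(); a brief sketch (TH_ENCCTL_SET_RATE_FLAGS is the rate-flags control code from elsewhere in theoraenc.h, and 'enc' is an assumed th_enc_ctx handle):

/* Keep the two default behaviors explicit: drop frames under pressure and
   cap reservoir overflows, per the flag documentation in this section. */
int flags=TH_RATECTL_DROP_FRAMES|TH_RATECTL_CAP_OVERFLOW;
int ret=th_encode_ctl(enc,TH_ENCCTL_SET_RATE_FLAGS,&flags,sizeof(flags));
/* ret is 0 on success, or TH_EINVAL if rate control is not enabled. */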
* At high rates this may cause the result to be undersized, but allows a * client to play the stream using a finite buffer; it should normally be - * enabled.*/ + * enabled, which is the default.*/ #define TH_RATECTL_CAP_OVERFLOW (0x2) /**Ignore bitrate buffer underflows. * If the encoder uses so many bits that the reservoir of available bits * underflows, ignore the deficit. * The encoder will not try to make up these extra bits in future frames. * At low rates this may cause the result to be oversized; it should normally - * be disabled.*/ + * be disabled, which is the default.*/ #define TH_RATECTL_CAP_UNDERFLOW (0x4) /*@}*/ @@ -401,8 +446,8 @@ typedef struct th_enc_ctx th_enc_ctx; * packets. * - For each uncompressed frame: * - Submit the uncompressed frame via th_encode_ycbcr_in() - * - Repeatedly call th_encode_packetout() to retrieve any video data packets - * that are ready. + * - Repeatedly call th_encode_packetout() to retrieve any video + * data packets that are ready. * - Call th_encode_free() to release all encoder memory.*/ /*@{*/ /**Allocates an encoder instance. @@ -417,7 +462,10 @@ extern th_enc_ctx *th_encode_alloc(const th_info *_info); * See \ref encctlcodes "the list of available control codes" * for details. * \param _buf The parameters for this control code. - * \param _buf_sz The size of the parameter buffer.*/ + * \param _buf_sz The size of the parameter buffer. + * \return Possible return values depend on the control code used. + * See \ref encctlcodes "the list of control codes" for + * specific values. Generally 0 indicates success.*/ extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz); /**Outputs the next header packet. * This should be called repeatedly after encoder initialization until it @@ -441,11 +489,25 @@ extern int th_encode_flushheader(th_enc_ctx *_enc, /**Submits an uncompressed frame to the encoder. * \param _enc A #th_enc_ctx handle. * \param _ycbcr A buffer of Y'CbCr data to encode. + * If the width and height of the buffer match the frame size + * the encoder was initialized with, the encoder will only + * reference the portion inside the picture region. + * Any data outside this region will be ignored, and need not map + * to a valid address. + * Alternatively, you can pass a buffer equal to the size of the + * picture region, if this is less than the full frame size. + * When using subsampled chroma planes, odd picture sizes or odd + * picture offsets may require an unexpected chroma plane size, + * and their use is generally discouraged, as they will not be + * well-supported by players and other media frameworks. + * See Section 4.4 of + * <a href="http://www.theora.org/doc/Theora.pdf">the Theora + * specification</a> for details if you wish to use them anyway. * \retval 0 Success. * \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>. - * \retval TH_EINVAL The buffer size does not match the frame size the encoder - * was initialized with, or encoding has already - * completed.*/ + * \retval TH_EINVAL The buffer size matches neither the frame size nor the + * picture size the encoder was initialized with, or + * encoding has already completed.*/ extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr); /**Retrieves encoded video data packets. 
* This should be called repeatedly after each frame is submitted to flush any diff --git a/thirdparty/libtheora/tokenize.c b/thirdparty/libtheora/tokenize.c index 60574c3594..57b7aa8da9 100644 --- a/thirdparty/libtheora/tokenize.c +++ b/thirdparty/libtheora/tokenize.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: tokenize.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #include <stdlib.h> @@ -20,27 +20,26 @@ +static unsigned char OC_DCT_EOB_TOKEN[31]={ + 0,1,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 +}; + static int oc_make_eob_token(int _run_count){ - if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1; - else{ - int cat; - cat=OC_ILOGNZ_32(_run_count)-3; - cat=OC_MINI(cat,3); - return OC_DCT_REPEAT_RUN0_TOKEN+cat; - } + return _run_count<32?OC_DCT_EOB_TOKEN[_run_count-1]:OC_DCT_REPEAT_RUN3_TOKEN; } +static unsigned char OC_DCT_EOB_EB[31]={ + 0,0,0,0,1,2,3,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +}; + static int oc_make_eob_token_full(int _run_count,int *_eb){ - if(_run_count<4){ - *_eb=0; - return OC_DCT_EOB1_TOKEN+_run_count-1; + if(_run_count<32){ + *_eb=OC_DCT_EOB_EB[_run_count-1]; + return OC_DCT_EOB_TOKEN[_run_count-1]; } else{ - int cat; - cat=OC_ILOGNZ_32(_run_count)-3; - cat=OC_MINI(cat,3); - *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat); - return OC_DCT_REPEAT_RUN0_TOKEN+cat; + *_eb=_run_count; + return OC_DCT_REPEAT_RUN3_TOKEN; } } @@ -49,86 +48,330 @@ static int oc_decode_eob_token(int _token,int _eb){ return (0x20820C41U>>_token*5&0x1F)+_eb; } -/*TODO: This is now only used during DCT tokenization, and never for runs; it - should be simplified.*/ -static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){ - int neg; - int zero_run; - int token; - int eb; - neg=_val<0; - _val=abs(_val); - zero_run=_zzj-_zzi; - if(zero_run>0){ - int adj; - /*Implement a minor restriction on stack 1 so that we know during DC fixups - that extending a dctrun token from stack 1 will never overflow.*/ - adj=_zzi!=1; - if(_val<2&&zero_run<17+adj){ - if(zero_run<6){ - token=OC_DCT_RUN_CAT1A+zero_run-1; - eb=neg; - } - else if(zero_run<10){ - token=OC_DCT_RUN_CAT1B; - eb=zero_run-6+(neg<<2); - } - else{ - token=OC_DCT_RUN_CAT1C; - eb=zero_run-10+(neg<<3); - } - } - else if(_val<4&&zero_run<3+adj){ - if(zero_run<2){ - token=OC_DCT_RUN_CAT2A; - eb=_val-2+(neg<<1); - } - else{ - token=OC_DCT_RUN_CAT2B; - eb=zero_run-2+(_val-2<<1)+(neg<<2); - } - } - else{ - if(zero_run<9)token=OC_DCT_SHORT_ZRL_TOKEN; - else token=OC_DCT_ZRL_TOKEN; - eb=zero_run-1; - } - } - else if(_val<3){ - token=OC_ONE_TOKEN+(_val-1<<1)+neg; - eb=0; - } - else if(_val<7){ - token=OC_DCT_VAL_CAT2+_val-3; - eb=neg; - } - else if(_val<9){ - token=OC_DCT_VAL_CAT3; - eb=_val-7+(neg<<1); - } - else if(_val<13){ - token=OC_DCT_VAL_CAT4; - eb=_val-9+(neg<<2); - } - else if(_val<21){ - token=OC_DCT_VAL_CAT5; - eb=_val-13+(neg<<3); - } - else if(_val<37){ - token=OC_DCT_VAL_CAT6; - eb=_val-21+(neg<<4); - } - else if(_val<69){ - token=OC_DCT_VAL_CAT7; - eb=_val-37+(neg<<5); - } - else{ - token=OC_DCT_VAL_CAT8; - eb=_val-69+(neg<<9); - } - *_eb=eb; - return token; -} +/*Some tables for fast construction of value tokens.*/ + +static const unsigned char OC_DCT_VALUE_TOKEN[1161]={ + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 
22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,21,21,21,21,21,21,21,21, + 21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21, + 21,21,21,21,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20, + 19,19,19,19,19,19,19,19,18,18,18,18,17,17,16,15,14,13,12,10, + 7, + 9,11,13,14,15,16,17,17,18,18,18,18,19,19,19,19,19,19,19,19, + 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,21,21,21,21, + 21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21, + 21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22, + 22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22 +}; + +static const 
ogg_uint16_t OC_DCT_VALUE_EB[1161]={ + 1023,1022,1021,1020,1019,1018,1017,1016,1015,1014, + 1013,1012,1011,1010,1009,1008,1007,1006,1005,1004, + 1003,1002,1001,1000, 999, 998, 997, 996, 995, 994, + 993, 992, 991, 990, 989, 988, 987, 986, 985, 984, + 983, 982, 981, 980, 979, 978, 977, 976, 975, 974, + 973, 972, 971, 970, 969, 968, 967, 966, 965, 964, + 963, 962, 961, 960, 959, 958, 957, 956, 955, 954, + 953, 952, 951, 950, 949, 948, 947, 946, 945, 944, + 943, 942, 941, 940, 939, 938, 937, 936, 935, 934, + 933, 932, 931, 930, 929, 928, 927, 926, 925, 924, + 923, 922, 921, 920, 919, 918, 917, 916, 915, 914, + 913, 912, 911, 910, 909, 908, 907, 906, 905, 904, + 903, 902, 901, 900, 899, 898, 897, 896, 895, 894, + 893, 892, 891, 890, 889, 888, 887, 886, 885, 884, + 883, 882, 881, 880, 879, 878, 877, 876, 875, 874, + 873, 872, 871, 870, 869, 868, 867, 866, 865, 864, + 863, 862, 861, 860, 859, 858, 857, 856, 855, 854, + 853, 852, 851, 850, 849, 848, 847, 846, 845, 844, + 843, 842, 841, 840, 839, 838, 837, 836, 835, 834, + 833, 832, 831, 830, 829, 828, 827, 826, 825, 824, + 823, 822, 821, 820, 819, 818, 817, 816, 815, 814, + 813, 812, 811, 810, 809, 808, 807, 806, 805, 804, + 803, 802, 801, 800, 799, 798, 797, 796, 795, 794, + 793, 792, 791, 790, 789, 788, 787, 786, 785, 784, + 783, 782, 781, 780, 779, 778, 777, 776, 775, 774, + 773, 772, 771, 770, 769, 768, 767, 766, 765, 764, + 763, 762, 761, 760, 759, 758, 757, 756, 755, 754, + 753, 752, 751, 750, 749, 748, 747, 746, 745, 744, + 743, 742, 741, 740, 739, 738, 737, 736, 735, 734, + 733, 732, 731, 730, 729, 728, 727, 726, 725, 724, + 723, 722, 721, 720, 719, 718, 717, 716, 715, 714, + 713, 712, 711, 710, 709, 708, 707, 706, 705, 704, + 703, 702, 701, 700, 699, 698, 697, 696, 695, 694, + 693, 692, 691, 690, 689, 688, 687, 686, 685, 684, + 683, 682, 681, 680, 679, 678, 677, 676, 675, 674, + 673, 672, 671, 670, 669, 668, 667, 666, 665, 664, + 663, 662, 661, 660, 659, 658, 657, 656, 655, 654, + 653, 652, 651, 650, 649, 648, 647, 646, 645, 644, + 643, 642, 641, 640, 639, 638, 637, 636, 635, 634, + 633, 632, 631, 630, 629, 628, 627, 626, 625, 624, + 623, 622, 621, 620, 619, 618, 617, 616, 615, 614, + 613, 612, 611, 610, 609, 608, 607, 606, 605, 604, + 603, 602, 601, 600, 599, 598, 597, 596, 595, 594, + 593, 592, 591, 590, 589, 588, 587, 586, 585, 584, + 583, 582, 581, 580, 579, 578, 577, 576, 575, 574, + 573, 572, 571, 570, 569, 568, 567, 566, 565, 564, + 563, 562, 561, 560, 559, 558, 557, 556, 555, 554, + 553, 552, 551, 550, 549, 548, 547, 546, 545, 544, + 543, 542, 541, 540, 539, 538, 537, 536, 535, 534, + 533, 532, 531, 530, 529, 528, 527, 526, 525, 524, + 523, 522, 521, 520, 519, 518, 517, 516, 515, 514, + 513, 512, 63, 62, 61, 60, 59, 58, 57, 56, + 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, + 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, + 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, + 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 1, 1, 1, 0, 0, + 0, + 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, + 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 
81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, + 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, + 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, + 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, + 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, + 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, + 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, + 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, + 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, + 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, + 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, + 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, + 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, + 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, + 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, + 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, + 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, + 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, + 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, + 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, + 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, + 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, + 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, + 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, + 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, + 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, + 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, + 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, + 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, + 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, + 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, + 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, + 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, + 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, + 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, + 502, 503, 504, 505, 506, 507, 508, 509, 510, 511 +}; + +/*The first DCT coefficient that both has a smaller magnitude and gets coded + with a different token.*/ +static const ogg_int16_t OC_DCT_TRELLIS_ALT_VALUE[1161]={ + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, 
-68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + -68, -68, -36, -36, -36, -36, -36, -36, -36, -36, + -36, -36, -36, -36, -36, -36, -36, -36, -36, -36, + -36, -36, -36, -36, -36, -36, -36, -36, -36, -36, + -36, -36, -36, -36, -20, -20, -20, -20, -20, -20, + -20, -20, -20, -20, -20, -20, -20, -20, -20, -20, + -12, -12, -12, -12, -12, -12, -12, -12, -8, -8, + -8, -8, -6, -6, -5, -4, -3, -2, -1, 0, + 0, + 0, 1, 2, 3, 4, 5, 6, 6, 8, 8, + 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 
68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68 +}; + +#define OC_DCT_VALUE_TOKEN_PTR (OC_DCT_VALUE_TOKEN+580) +#define OC_DCT_VALUE_EB_PTR (OC_DCT_VALUE_EB+580) +#define OC_DCT_TRELLIS_ALT_VALUE_PTR (OC_DCT_TRELLIS_ALT_VALUE+580) + +/*Some tables for fast construction of combo tokens.*/ + +static const unsigned char OC_DCT_RUN_CAT1_TOKEN[17]={ + 23,24,25,26,27,28,28,28,28,29,29,29,29,29,29,29,29 +}; + +static const unsigned char OC_DCT_RUN_CAT1_EB[17][2]={ + {0,1},{0,1},{0, 1},{0, 1},{0, 1},{0, 4},{1, 5},{2, 6},{3,7}, + {0,8},{1,9},{2,10},{3,11},{4,12},{5,13},{6,14},{7,15} +}; + +static const unsigned char OC_DCT_RUN_CAT2_EB[3][2][2]={ + { {0,1},{2,3} },{ {0,2},{4,6} },{ {1,3},{5,7} } +}; /*Token logging to allow a few fragments of efficient rollback. Late SKIP analysis is tied up in the tokenization process, so we need to be @@ -211,10 +454,11 @@ struct oc_quant_token{ /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then dequantizes and de-zig-zags the result. - The DC coefficient is not preserved; it should be restored by the caller.*/ + The AC coefficients of _idct must be pre-initialized to zero.*/ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, - ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct, - int _zzi,oc_token_checkpoint **_stack,int _acmin){ + ogg_int16_t *_idct,const ogg_int16_t *_qdct, + const ogg_uint16_t *_dequant,const ogg_int16_t *_dct, + int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){ oc_token_checkpoint *stack; ogg_int64_t zflags; ogg_int64_t nzflags; @@ -242,31 +486,29 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, d2_accum[0]=0; zzj=64; for(zzi=OC_MINI(_zzi,63);zzi>0;zzi--){ - ogg_int32_t lambda; ogg_uint32_t best_cost; int best_bits=best_bits; int best_next=best_next; int best_token=best_token; int best_eb=best_eb; int best_qc=best_qc; - int flush_bits; ogg_uint32_t d2; int dq; + int qc_m; int e; int c; int s; int tj; - lambda=_enc->lambda; qc=_qdct[zzi]; s=-(qc<0); - qc=qc+s^s; - c=_dct[OC_FZIG_ZAG[zzi]]; - if(qc<=1){ + qc_m=qc+s^s; + c=_dct[zzi]; + /*The hard case: try a zero run.*/ + if(qc_m<=1){ ogg_uint32_t sum_d2; int nzeros; int dc_reserve; - /*The hard case: try a zero run.*/ - if(!qc){ + if(!qc_m){ /*Skip runs that are already quantized to zeros. 
If we considered each zero coefficient in turn, we might theoretically find a better way to partition long zero runs (e.g., @@ -281,15 +523,14 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, d2=0; } else{ - c=c+s^s; d2=c*(ogg_int32_t)c; + c=c+s^s; } eob=eob_run[zzi]; nzeros=zzj-zzi; zzj&=63; sum_d2=d2+d2_accum[zzj]; d2_accum[zzi]=sum_d2; - flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0; /*We reserve 1 spot for combo run tokens that start in the 1st AC stack to ensure they can be extended to include the DC coefficient if necessary; this greatly simplifies stack-rewriting later on.*/ @@ -297,7 +538,6 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, best_cost=0xFFFFFFFF; for(;;){ if(nzflags>>zzj&1){ - int cat; int val; int val_s; int zzk; @@ -306,11 +546,10 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, tk=next&1; zzk=next>>1; /*Try a pure zero run to this point.*/ - cat=nzeros+55>>6; - token=OC_DCT_SHORT_ZRL_TOKEN+cat; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); + token=OC_DCT_SHORT_ZRL_TOKEN+(nzeros+55>>6); + bits=oc_token_bits(_enc,huffi,zzi,token); d2=sum_d2-d2_accum[zzj]; - cost=d2+lambda*bits+tokens[zzj][1].cost; + cost=d2+_lambda*bits+tokens[zzj][1].cost; if(cost<=best_cost){ best_next=(zzj<<1)+1; best_token=token; @@ -319,25 +558,18 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, best_bits=bits+tokens[zzj][1].bits; best_qc=0; } - if(nzeros<16+dc_reserve){ + if(nzeros<17+dc_reserve){ val=_qdct[zzj]; val_s=-(val<0); val=val+val_s^val_s; if(val<=2){ /*Try a +/- 1 combo token.*/ - if(nzeros<6){ - token=OC_DCT_RUN_CAT1A+nzeros-1; - eb=-val_s; - } - else{ - cat=nzeros+54>>6; - token=OC_DCT_RUN_CAT1B+cat; - eb=(-val_s<<cat+2)+nzeros-6-(cat<<2); - } - e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]; + token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1]; + eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-val_s]; + e=_dct[zzj]-(_dequant[zzj]+val_s^val_s); d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj]; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits+tokens[zzk][tk].cost; + bits=oc_token_bits(_enc,huffi,zzi,token); + cost=d2+_lambda*bits+tokens[zzk][tk].cost; if(cost<=best_cost){ best_next=next; best_token=token; @@ -347,22 +579,23 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, best_qc=1+val_s^val_s; } } - if(nzeros<2+dc_reserve&&2<=val&&val<=4){ + if(nzeros<3+dc_reserve&&2<=val&&val<=4){ + int sval; /*Try a +/- 2/3 combo token.*/ - cat=nzeros>>1; - token=OC_DCT_RUN_CAT2A+cat; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - val=2+((val+val_s^val_s)>2); - e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val; + token=OC_DCT_RUN_CAT2A+(nzeros>>1); + bits=oc_token_bits(_enc,huffi,zzi,token); + val=2+(val>2); + sval=val+val_s^val_s; + e=_dct[zzj]-_dequant[zzj]*sval; d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj]; - cost=d2+lambda*bits+tokens[zzk][tk].cost; + cost=d2+_lambda*bits+tokens[zzk][tk].cost; if(cost<=best_cost){ best_cost=cost; best_bits=bits+tokens[zzk][tk].bits; best_next=next; best_token=token; - best_eb=(-val_s<<1+cat)+(val-2<<cat)+(nzeros-1>>1); - best_qc=val+val_s^val_s; + best_eb=OC_DCT_RUN_CAT2_EB[nzeros-1][-val_s][val-2]; + best_qc=sval; } } } @@ -378,10 +611,10 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, /*We made it all the way to the end of the block; try an EOB token.*/ if(eob<4095){ bits=oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob+1)) - -flush_bits; + 
-(eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0); } else bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN); - cost=sum_d2+bits*lambda; + cost=sum_d2+bits*_lambda; /*If the best route so far is still a pure zero run to the end of the block, force coding it as an EOB. Even if it's not optimal for this block, it has a good chance of @@ -408,20 +641,20 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, tokens[zzi][0].bits=best_bits; tokens[zzi][0].qc=best_qc; zflags|=(ogg_int64_t)1<<zzi; - if(qc){ + if(qc_m){ dq=_dequant[zzi]; - if(zzi<_acmin)lambda=0; + if(zzi<_acmin)_lambda=0; e=dq-c; d2=e*(ogg_int32_t)e; token=OC_ONE_TOKEN-s; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); + bits=oc_token_bits(_enc,huffi,zzi,token); zzj=zzi+1&63; tj=best_flags>>zzj&1; next=(zzj<<1)+tj; tokens[zzi][1].next=(unsigned char)next; tokens[zzi][1].token=(signed char)token; tokens[zzi][1].eb=0; - tokens[zzi][1].cost=d2+lambda*bits+tokens[zzj][tj].cost; + tokens[zzi][1].cost=d2+_lambda*bits+tokens[zzj][tj].cost; tokens[zzi][1].bits=bits+tokens[zzj][tj].bits; tokens[zzi][1].qc=1+s^s; nzflags|=(ogg_int64_t)1<<zzi; @@ -430,200 +663,38 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, } } else{ + int alt_qc; eob=eob_run[zzi]; - if(zzi<_acmin)lambda=0; - c=c+s^s; + if(zzi<_acmin)_lambda=0; dq=_dequant[zzi]; /*No zero run can extend past this point.*/ d2_accum[zzi]=0; - flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0; - if(qc<=2){ - e=2*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_TWO_TOKEN-s; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e-=dq; - d2=e*(ogg_int32_t)e; - token=OC_ONE_TOKEN-s; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<=best_cost){ - best_token=token; - best_bits=bits; - best_cost=cost; - qc--; - } - best_eb=0; - } - else if(qc<=3){ - e=3*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT2; - best_eb=-s; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e-=dq; - d2=e*(ogg_int32_t)e; - token=OC_TWO_TOKEN-s; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<=best_cost){ - best_token=token; - best_eb=0; - best_bits=bits; - best_cost=cost; - qc--; - } - } - else if(qc<=6){ - e=qc*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT2+qc-3; - best_eb=-s; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e-=dq; - d2=e*(ogg_int32_t)e; - token=best_token-1; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<=best_cost){ - best_token=token; - best_bits=bits; - best_cost=cost; - qc--; - } - } - else if(qc<=8){ - e=qc*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT3; - best_eb=(-s<<1)+qc-7; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e=6*dq-c; - d2=e*(ogg_int32_t)e; - token=OC_DCT_VAL_CAT2+3; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<=best_cost){ - best_token=token; - best_eb=-s; - best_bits=bits; - best_cost=cost; - qc=6; - } - } - else if(qc<=12){ - e=qc*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT4; - best_eb=(-s<<2)+qc-9; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e=8*dq-c; - d2=e*(ogg_int32_t)e; - token=best_token-1; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - 
cost=d2+lambda*bits; - if(cost<=best_cost){ - best_token=token; - best_eb=(-s<<1)+1; - best_bits=bits; - best_cost=cost; - qc=8; - } - } - else if(qc<=20){ - e=qc*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT5; - best_eb=(-s<<3)+qc-13; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e=12*dq-c; - d2=e*(ogg_int32_t)e; - token=best_token-1; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<=best_cost){ - best_token=token; - best_eb=(-s<<2)+3; - best_bits=bits; - best_cost=cost; - qc=12; - } - } - else if(qc<=36){ - e=qc*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT6; - best_eb=(-s<<4)+qc-21; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e=20*dq-c; - d2=e*(ogg_int32_t)e; - token=best_token-1; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<=best_cost){ - best_token=token; - best_eb=(-s<<3)+7; - best_bits=bits; - best_cost=cost; - qc=20; - } - } - else if(qc<=68){ - e=qc*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT7; - best_eb=(-s<<5)+qc-37; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e=36*dq-c; - d2=e*(ogg_int32_t)e; - token=best_token-1; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<best_cost){ - best_token=token; - best_eb=(-s<<4)+15; - best_bits=bits; - best_cost=cost; - qc=36; - } - } - else{ - e=qc*dq-c; - d2=e*(ogg_int32_t)e; - best_token=OC_DCT_VAL_CAT8; - best_eb=(-s<<9)+qc-69; - best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token); - best_cost=d2+lambda*best_bits; - e=68*dq-c; - d2=e*(ogg_int32_t)e; - token=best_token-1; - bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token); - cost=d2+lambda*bits; - if(cost<best_cost){ - best_token=token; - best_eb=(-s<<5)+31; - best_bits=bits; - best_cost=cost; - qc=68; - } + e=qc*dq-c; + d2=e*(ogg_int32_t)e; + best_token=*(OC_DCT_VALUE_TOKEN_PTR+qc); + best_bits=oc_token_bits(_enc,huffi,zzi,best_token); + best_cost=d2+_lambda*best_bits; + alt_qc=*(OC_DCT_TRELLIS_ALT_VALUE_PTR+qc); + e=alt_qc*dq-c; + d2=e*(ogg_int32_t)e; + token=*(OC_DCT_VALUE_TOKEN_PTR+alt_qc); + bits=oc_token_bits(_enc,huffi,zzi,token); + cost=d2+_lambda*bits; + if(cost<best_cost){ + best_token=token; + best_bits=bits; + best_cost=cost; + qc=alt_qc; } zzj=zzi+1&63; tj=best_flags>>zzj&1; next=(zzj<<1)+tj; tokens[zzi][1].next=(unsigned char)next; tokens[zzi][1].token=(signed char)best_token; - tokens[zzi][1].eb=best_eb; + tokens[zzi][1].eb=*(OC_DCT_VALUE_EB_PTR+qc); tokens[zzi][1].cost=best_cost+tokens[zzj][tj].cost; tokens[zzi][1].bits=best_bits+tokens[zzj][tj].bits; - tokens[zzi][1].qc=qc+s^s; + tokens[zzi][1].qc=qc; nzflags|=(ogg_int64_t)1<<zzi; best_flags|=(ogg_int64_t)1<<zzi; } @@ -631,9 +702,6 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, } /*Emit the tokens from the best path through the trellis.*/ stack=*_stack; - /*We blow away the first entry here so that things vectorize better. 
- The DC coefficient is not actually stored in the array yet.*/ - for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0; dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag; zzi=1; ti=best_flags>>1&1; @@ -643,12 +711,15 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, eob=eob_run[zzi]; if(tokens[zzi][ti].token<OC_NDCT_EOB_TOKEN_MAX){ if(++eob>=4095){ - oc_enc_eob_log(_enc,_pli,zzi,eob); + oc_enc_token_log(_enc,_pli,zzi,OC_DCT_REPEAT_RUN3_TOKEN,eob); eob=0; } eob_run[zzi]=eob; /*We don't include the actual EOB cost for this block in the return value. - It will be paid for by the fragment that terminates the EOB run.*/ + It is very likely to eventually be spread over several blocks, and + including it more harshly penalizes the first few blocks in a long EOB + run. + Omitting it here gives a small PSNR and SSIM gain.*/ bits-=tokens[zzi][ti].bits; zzi=_zzi; break; @@ -664,7 +735,7 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, zzj=(next>>1)-1&63; /*TODO: It may be worth saving the dequantized coefficient in the trellis above; we had to compute it to measure the error anyway.*/ - _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]); + _idct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]); zzi=next>>1; ti=next&1; } @@ -673,6 +744,237 @@ int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, return bits; } +/*Simplistic R/D tokenizer. + The AC coefficients of _idct must be pre-initialized to zero. + This could be made more accurate by using more sophisticated + rate predictions for zeros. + It could be made faster by switching from R/D decisions to static + lambda-derived rounding biases.*/ +int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi, + ogg_int16_t *_idct,const ogg_int16_t *_qdct, + const ogg_uint16_t *_dequant,const ogg_int16_t *_dct, + int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){ + const unsigned char *dct_fzig_zag; + ogg_uint16_t *eob_run; + oc_token_checkpoint *stack; + int huffi; + int zzi; + int zzj; + int zzk; + int total_bits; + int zr[4]; + stack=*_stack; + total_bits=0; + /*The apparent bit-cost of coding a zero from observing the trellis + quantizer is pre-combined with lambda. + Four predictive cases are considered: the last optimized value is zero (+2) + or non-zero and the non-optimized value is zero (+1) or non-zero.*/ + zr[0]=3*_lambda>>1; + zr[1]=_lambda; + zr[2]=4*_lambda; + zr[3]=7*_lambda>>1; + eob_run=_enc->eob_run[_pli]; + dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag; + huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1]; + for(zzj=zzi=1;zzj<_zzi&&!_qdct[zzj];zzj++); + while(zzj<_zzi){ + int v; + int d0; + int d1; + int sign; + int k; + int eob; + int dq0; + int dq1; + int dd0; + int dd1; + int next_zero; + int eob_bits; + int dct_fzig_zzj; + dct_fzig_zzj=dct_fzig_zag[zzj]; + v=_dct[zzj]; + d0=_qdct[zzj]; + eob=eob_run[zzi]; + for(zzk=zzj+1;zzk<_zzi&&!_qdct[zzk];zzk++); + next_zero=zzk-zzj+62>>6; + dq0=d0*_dequant[zzj]; + dd0=dq0-v; + dd0*=dd0; + sign=-(d0<0); + k=d0+sign^sign; + d1=(k-(zzj>_acmin))+sign^sign; + dq1=d1*_dequant[zzj]; + dd1=dq1-v; + dd1*=dd1; + /*The cost of ending an eob run is included when the alternative is to + extend this eob run. + A per qi/zzi weight would probably be useful. + Including it in the overall tokenization cost was not helpful. 
+ The same is true at the far end of the zero run plus token case.*/ + if(eob>0&&d1==0&&zzk==_zzi){ + eob_bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN); + } + else eob_bits=0; + if(zzj==zzi){ + /*No active zero run.*/ + int best_token; + int best_eb; + int token; + int best_bits; + int bits; + int cost; + best_token=*(OC_DCT_VALUE_TOKEN_PTR+d0); + best_bits=oc_token_bits(_enc,huffi,zzi,best_token); + if(d1!=0){ + token=*(OC_DCT_VALUE_TOKEN_PTR+d1); + bits=oc_token_bits(_enc,huffi,zzi,token); + cost=dd1+(bits+eob_bits)*_lambda; + } + else{ + token=bits=0; + cost=dd1+zr[next_zero]; + } + if((dd0+(best_bits+eob_bits)*_lambda)>cost){ + _idct[dct_fzig_zzj]=dq1; + if(d1==0){ + zzj=zzk; + continue; + } + best_bits=bits; + best_token=token; + best_eb=*(OC_DCT_VALUE_EB_PTR+d1); + } + else{ + best_eb=*(OC_DCT_VALUE_EB_PTR+d0); + _idct[dct_fzig_zzj]=dq0; + } + oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi); + if(eob>0){ + oc_enc_eob_log(_enc,_pli,zzi,eob); + eob_run[zzi]=0; + } + oc_enc_token_log(_enc,_pli,zzi,best_token,best_eb); + total_bits+=best_bits; + } + else{ + int d; + int dc_reserve; + int best_token; + int best_eb; + int best_bits; + int best_cost; + int best_bits1; + int best_token1; + int best_eb1; + int zr_bits; + int eob2; + int eob_bits2; + int bits; + int token; + int nzeros; + nzeros=zzj-zzi; + dc_reserve=zzi+62>>6; + /*A zero run, followed by the value alone.*/ + best_token=best_token1=OC_DCT_SHORT_ZRL_TOKEN+(nzeros+55>>6); + best_eb=best_eb1=nzeros-1; + eob2=eob_run[zzj]; + eob_bits2=eob2>0?oc_token_bits(_enc,huffi,zzj,OC_DCT_EOB1_TOKEN):0; + zr_bits=oc_token_bits(_enc,huffi,zzi,best_token)+eob_bits2; + best_bits=zr_bits + +oc_token_bits(_enc,huffi,zzj,*(OC_DCT_VALUE_TOKEN_PTR+d0)); + d=d0; + best_bits1=0; + if(d1!=0){ + best_bits1=zr_bits + +oc_token_bits(_enc,huffi,zzj,*(OC_DCT_VALUE_TOKEN_PTR+d1)); + } + if(nzeros<17+dc_reserve){ + if(k<=2){ + /*+/- 1 combo token.*/ + token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1]; + bits=oc_token_bits(_enc,huffi,zzi,token); + if(k==2&&bits<=best_bits1){ + best_bits1=bits; + best_token1=token; + best_eb1=OC_DCT_RUN_CAT1_EB[nzeros-1][-sign]; + } + if(k==1&&bits<=best_bits){ + best_bits=bits; + best_token=token; + best_eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-sign]; + } + } + if(nzeros<3+dc_reserve&&2<=k&&k<=4){ + /*+/- 2/3 combo token.*/ + token=OC_DCT_RUN_CAT2A+(nzeros>>1); + bits=oc_token_bits(_enc,huffi,zzi,token); + if(k==4&&bits<=best_bits1){ + best_bits1=bits; + best_token1=token; + best_eb1=OC_DCT_RUN_CAT2_EB[nzeros-1][-sign][1]; + } + if(k!=4&&bits<=best_bits){ + best_bits=bits; + best_token=token; + best_eb=OC_DCT_RUN_CAT2_EB[nzeros-1][-sign][k-2]; + } + } + } + best_cost=dd0+(best_bits+eob_bits)*_lambda; + if(d1==0&&(dd1+zr[2+next_zero])<=best_cost){ + zzj=zzk; + continue; + } + if(d1!=0&&dd1+(best_bits1+eob_bits)*_lambda<best_cost){ + best_bits=best_bits1; + best_token=best_token1; + best_eb=best_eb1; + d=d1; + _idct[dct_fzig_zzj]=dq1; + } + else _idct[dct_fzig_zzj]=dq0; + oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi); + if(eob){ + oc_enc_eob_log(_enc,_pli,zzi,eob); + eob_run[zzi]=0; + } + oc_enc_token_log(_enc,_pli,zzi,best_token,best_eb); + /*If a zero run won vs. 
the combo token we still need to code this
+ value.*/
+ if(best_token<=OC_DCT_ZRL_TOKEN){
+ oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzj);
+ if(eob2){
+ oc_enc_eob_log(_enc,_pli,zzj,eob2);
+ /*The cost of any EOB run we disrupted is ignored because doing so
+ improved PSNR/SSIM by a small amount.*/
+ best_bits-=eob_bits2;
+ eob_run[zzj]=0;
+ }
+ oc_enc_token_log(_enc,_pli,zzj,
+ *(OC_DCT_VALUE_TOKEN_PTR+d),*(OC_DCT_VALUE_EB_PTR+d));
+ }
+ total_bits+=best_bits;
+ }
+ zzi=zzj+1;
+ zzj=zzk;
+ }
+ /*Code an EOB run to complete this block.
+ The cost of the EOB run is not included in the total as explained
+ in a comment in the trellis tokenizer above.*/
+ if(zzi<64){
+ int eob;
+ eob=eob_run[zzi]+1;
+ oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+ if(eob>=4095){
+ oc_enc_token_log(_enc,_pli,zzi,OC_DCT_REPEAT_RUN3_TOKEN,eob);
+ eob=0;
+ }
+ eob_run[zzi]=eob;
+ }
+ *_stack=stack;
+ return total_bits;
+}
+
 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
 int _pli,int _fragy0,int _frag_yend){
 const oc_fragment_plane *fplane;
@@ -695,10 +997,10 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
 predictor for the same reference frame.*/
 for(fragx=0;fragx<nhfrags;fragx++,fragi++){
 if(frags[fragi].coded){
- int ref;
- ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
- frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
- pred_last[ref]=frags[fragi].dc;
+ int refi;
+ refi=frags[fragi].refi;
+ frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[refi]);
+ pred_last[refi]=frags[fragi].dc;
 }
 }
 }
@@ -710,27 +1012,24 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
 u_frags=frags-nhfrags;
 l_ref=-1;
 ul_ref=-1;
- u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+ u_ref=u_frags[fragi].refi;
 for(fragx=0;fragx<nhfrags;fragx++,fragi++){
 int ur_ref;
 if(fragx+1>=nhfrags)ur_ref=-1;
- else{
- ur_ref=u_frags[fragi+1].coded?
- OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
- }
+ else ur_ref=u_frags[fragi+1].refi;
 if(frags[fragi].coded){
 int pred;
- int ref;
- ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ int refi;
+ refi=frags[fragi].refi;
 /*We break out a separate case based on which of our neighbors use
 the same reference frames.
 This is somewhat faster than trying to make a generic case which handles
 all of them, since it reduces lots of poorly predicted jumps to one
 switch statement, and also lets a number of the multiplications be
 optimized out by strength reduction.*/
- switch((l_ref==ref)|(ul_ref==ref)<<1|
- (u_ref==ref)<<2|(ur_ref==ref)<<3){
- default:pred=pred_last[ref];break;
+ switch((l_ref==refi)|(ul_ref==refi)<<1|
+ (u_ref==refi)<<2|(ur_ref==refi)<<3){
+ default:pred=pred_last[refi];break;
 case 1:
 case 3:pred=frags[fragi-1].dc;break;
 case 2:pred=u_frags[fragi-1].dc;break;
@@ -764,8 +1063,8 @@ void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
 }break;
 }
 frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
- pred_last[ref]=frags[fragi].dc;
- l_ref=ref;
+ pred_last[refi]=frags[fragi].dc;
+ l_ref=refi;
 }
 else l_ref=-1;
 ul_ref=u_ref;
@@ -850,9 +1149,8 @@ void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
 ti0++;
 eob_run0=0;
 }
- token=oc_make_dct_token_full(0,0,val,&eb);
- dct_tokens0[ti0]=(unsigned char)token;
- extra_bits0[ti0]=(ogg_uint16_t)eb;
+ dct_tokens0[ti0]=*(OC_DCT_VALUE_TOKEN_PTR+val);
+ extra_bits0[ti0]=*(OC_DCT_VALUE_EB_PTR+val);
 ti0++;
 }
 else{
@@ -863,9 +1161,8 @@ void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
 /*We're in the middle of an active EOB run in stack 1.
Move it to stack 0.*/ if(++eob_run0>=4095){ - token=oc_make_eob_token_full(eob_run0,&eb); - dct_tokens0[ti0]=(unsigned char)token; - extra_bits0[ti0]=(ogg_uint16_t)eb; + dct_tokens0[ti0]=OC_DCT_REPEAT_RUN3_TOKEN; + extra_bits0[ti0]=eob_run0; ti0++; eob_run0=0; } @@ -996,9 +1293,8 @@ void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli, neobs1--; /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/ if(eob_run1-neobs1>=4095){ - token=oc_make_eob_token_full(4095,&eb); - dct_tokens1[ti1w]=(unsigned char)token; - extra_bits1[ti1w]=(ogg_uint16_t)eb; + dct_tokens1[ti1w]=OC_DCT_REPEAT_RUN3_TOKEN; + extra_bits1[ti1w]=4095; ti1w++; eob_run1-=4095; } diff --git a/thirdparty/libtheora/x86/mmxencfrag.c b/thirdparty/libtheora/x86/mmxencfrag.c index c79ff01fcc..cc9be8d867 100644 --- a/thirdparty/libtheora/x86/mmxencfrag.c +++ b/thirdparty/libtheora/x86/mmxencfrag.c @@ -65,7 +65,7 @@ unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, "paddw %%mm6,%%mm0\n\t" "paddw %%mm2,%%mm0\n\t" "movd %%mm0,%[ret]\n\t" - :[ret]"=a"(ret),[src]"+%r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3) + :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3) :[ystride]"r"((ptrdiff_t)_ystride) ); return (unsigned)ret; @@ -87,7 +87,9 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, The latter is exactly 1 too large when the low bit of two corresponding \ bytes is only set in one of them. \ Therefore we pxor the operands, pand to mask out the low bits, and psubb to \ - correct the output of pavgb.*/ \ + correct the output of pavgb. \ + TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \ + schedules better; currently, however, this function is unused.*/ \ "movq %%mm0,%%mm6\n\t" \ "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \ "pxor %%mm1,%%mm0\n\t" \ @@ -153,7 +155,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, OC_SAD2_LOOP OC_SAD2_LOOP OC_SAD2_TAIL - :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+%r"(_ref1),[ref2]"+r"(_ref2) + :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2) :[ystride]"r"((ptrdiff_t)_ystride) ); return (unsigned)ret; @@ -163,54 +165,54 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, 16-bit difference in %%mm0...%%mm7.*/ #define OC_LOAD_SUB_8x4(_off) \ "#OC_LOAD_SUB_8x4\n\t" \ - "movd "_off"(%[src]),%%mm0\n\t" \ - "movd "_off"(%[ref]),%%mm4\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \ + "movd "#_off"(%[src]),%%mm0\n\t" \ + "movd "#_off"(%[ref]),%%mm4\n\t" \ + "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \ "lea (%[src],%[src_ystride],2),%[src]\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \ + "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ - "movd "_off"(%[src]),%%mm2\n\t" \ - "movd "_off"(%[ref]),%%mm7\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \ + "movd "#_off"(%[src]),%%mm2\n\t" \ + "movd "#_off"(%[ref]),%%mm7\n\t" \ + "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \ + "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \ "punpcklbw %%mm4,%%mm0\n\t" \ "lea (%[src],%[src_ystride],2),%[src]\n\t" \ "punpcklbw %%mm4,%%mm4\n\t" \ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ "psubw %%mm4,%%mm0\n\t" \ - "movd "_off"(%[src]),%%mm4\n\t" \ - "movq %%mm0,"_off"*2(%[buf])\n\t" \ - "movd "_off"(%[ref]),%%mm0\n\t" \ + "movd "#_off"(%[src]),%%mm4\n\t" \ + "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \ + "movd "#_off"(%[ref]),%%mm0\n\t" \ "punpcklbw 
%%mm5,%%mm1\n\t" \ "punpcklbw %%mm5,%%mm5\n\t" \ "psubw %%mm5,%%mm1\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \ + "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \ "punpcklbw %%mm7,%%mm2\n\t" \ "punpcklbw %%mm7,%%mm7\n\t" \ "psubw %%mm7,%%mm2\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \ + "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \ "punpcklbw %%mm6,%%mm3\n\t" \ "lea (%[src],%[src_ystride],2),%[src]\n\t" \ "punpcklbw %%mm6,%%mm6\n\t" \ "psubw %%mm6,%%mm3\n\t" \ - "movd "_off"(%[src]),%%mm6\n\t" \ + "movd "#_off"(%[src]),%%mm6\n\t" \ "punpcklbw %%mm0,%%mm4\n\t" \ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ "punpcklbw %%mm0,%%mm0\n\t" \ "lea (%[src],%[src_ystride],2),%[src]\n\t" \ "psubw %%mm0,%%mm4\n\t" \ - "movd "_off"(%[ref]),%%mm0\n\t" \ + "movd "#_off"(%[ref]),%%mm0\n\t" \ "punpcklbw %%mm7,%%mm5\n\t" \ "neg %[src_ystride]\n\t" \ "punpcklbw %%mm7,%%mm7\n\t" \ "psubw %%mm7,%%mm5\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \ + "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \ "punpcklbw %%mm0,%%mm6\n\t" \ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ "punpcklbw %%mm0,%%mm0\n\t" \ "neg %[ref_ystride]\n\t" \ "psubw %%mm0,%%mm6\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \ + "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \ "lea (%[src],%[src_ystride],8),%[src]\n\t" \ "punpcklbw %%mm0,%%mm7\n\t" \ "neg %[src_ystride]\n\t" \ @@ -218,24 +220,24 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \ "psubw %%mm0,%%mm7\n\t" \ "neg %[ref_ystride]\n\t" \ - "movq "_off"*2(%[buf]),%%mm0\n\t" \ + "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \ /*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/ #define OC_LOAD_8x4(_off) \ "#OC_LOAD_8x4\n\t" \ - "movd "_off"(%[src]),%%mm0\n\t" \ - "movd "_off"(%[src],%[ystride]),%%mm1\n\t" \ - "movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \ + "movd "#_off"(%[src]),%%mm0\n\t" \ + "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \ + "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \ "pxor %%mm7,%%mm7\n\t" \ - "movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \ + "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \ "punpcklbw %%mm7,%%mm0\n\t" \ - "movd "_off"(%[src4]),%%mm4\n\t" \ + "movd "#_off"(%[src4]),%%mm4\n\t" \ "punpcklbw %%mm7,%%mm1\n\t" \ - "movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \ + "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \ "punpcklbw %%mm7,%%mm2\n\t" \ - "movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \ + "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \ "punpcklbw %%mm7,%%mm3\n\t" \ - "movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \ + "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \ "punpcklbw %%mm4,%%mm4\n\t" \ "punpcklbw %%mm5,%%mm5\n\t" \ "psrlw $8,%%mm4\n\t" \ @@ -248,7 +250,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, /*Performs the first two stages of an 8-point 1-D Hadamard transform. The transform is performed in place, except that outputs 0-3 are swapped with outputs 4-7. - Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to + Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to perform this stage in place with no temporary registers).*/ #define OC_HADAMARD_AB_8x4 \ "#OC_HADAMARD_AB_8x4\n\t" \ @@ -281,7 +283,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, "psubw %%mm5,%%mm7\n\t" \ /*Performs the last stage of an 8-point 1-D Hadamard transform in place. 
- Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in + Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in place with no temporary registers).*/ #define OC_HADAMARD_C_8x4 \ "#OC_HADAMARD_C_8x4\n\t" \ @@ -324,8 +326,8 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \ This implementation is only 26 (+4 for spilling registers).*/ \ "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \ - "movq %%mm7,"_r7"(%[buf])\n\t" \ - "movq %%mm6,"_r6"(%[buf])\n\t" \ + "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \ + "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \ /*mm7={0x7FFF}x4 \ mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \ "pcmpeqb %%mm7,%%mm7\n\t" \ @@ -343,14 +345,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, "pmaxsw %%mm5,%%mm4\n\t" \ "paddw %%mm3,%%mm6\n\t" \ "paddw %%mm5,%%mm1\n\t" \ - "movq "_r7"(%[buf]),%%mm3\n\t" \ + "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \ /*Performs the second part of the final stage of the Hadamard transform and summing of absolute values.*/ #define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \ "paddsw %%mm7,%%mm6\n\t" \ - "movq "_r6"(%[buf]),%%mm5\n\t" \ + "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \ "paddsw %%mm7,%%mm1\n\t" \ "psubw %%mm6,%%mm2\n\t" \ "psubw %%mm1,%%mm4\n\t" \ @@ -391,7 +393,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, #define OC_TRANSPOSE_4x4x2(_off) \ "#OC_TRANSPOSE_4x4x2\n\t" \ /*First 4x4 transpose:*/ \ - "movq %%mm5,0x10+"_off"(%[buf])\n\t" \ + "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \ /*mm0 = e3 e2 e1 e0 \ mm1 = f3 f2 f1 f0 \ mm2 = g3 g2 g1 g0 \ @@ -411,13 +413,13 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, "punpckhdq %%mm2,%%mm1\n\t" \ "movq %%mm3,%%mm2\n\t" \ "punpckhdq %%mm5,%%mm3\n\t" \ - "movq %%mm0,0x40+"_off"(%[buf])\n\t" \ + "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \ "punpckldq %%mm5,%%mm2\n\t" \ /*mm0 = h0 g0 f0 e0 \ mm1 = h1 g1 f1 e1 \ mm2 = h2 g2 f2 e2 \ mm3 = h3 g3 f3 e3*/ \ - "movq 0x10+"_off"(%[buf]),%%mm5\n\t" \ + "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \ /*Second 4x4 transpose:*/ \ /*mm4 = a3 a2 a1 a0 \ mm5 = b3 b2 b1 b0 \ @@ -425,11 +427,11 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, mm7 = d3 d2 d1 d0*/ \ "movq %%mm6,%%mm0\n\t" \ "punpcklwd %%mm7,%%mm6\n\t" \ - "movq %%mm1,0x50+"_off"(%[buf])\n\t" \ + "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \ "punpckhwd %%mm7,%%mm0\n\t" \ "movq %%mm4,%%mm7\n\t" \ "punpcklwd %%mm5,%%mm4\n\t" \ - "movq %%mm2,0x60+"_off"(%[buf])\n\t" \ + "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \ "punpckhwd %%mm5,%%mm7\n\t" \ /*mm4 = b1 a1 b0 a0 \ mm7 = b3 a3 b2 a2 \ @@ -437,7 +439,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, mm0 = d3 c3 d2 c2*/ \ "movq %%mm4,%%mm5\n\t" \ "punpckldq %%mm6,%%mm4\n\t" \ - "movq %%mm3,0x70+"_off"(%[buf])\n\t" \ + "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \ "punpckhdq %%mm6,%%mm5\n\t" \ "movq %%mm7,%%mm6\n\t" \ "punpckhdq %%mm0,%%mm7\n\t" \ @@ -447,100 +449,102 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, mm6 = d2 c2 b2 a2 \ mm7 = d3 c3 b3 a3*/ \ -static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, - int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret; - unsigned ret2; - bufp=buf; +static unsigned oc_int_frag_satd_mmxext(int *_dc, + 
const unsigned char *_src,int _src_ystride, + const unsigned char *_ref,int _ref_ystride){ + OC_ALIGN8(ogg_int16_t buf[64]); + unsigned ret; + unsigned ret2; + int dc; __asm__ __volatile__( - OC_LOAD_SUB_8x4("0x00") + OC_LOAD_SUB_8x4(0x00) OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x00") + OC_TRANSPOSE_4x4x2(0x00) /*Finish swapping out this 8x4 block to make room for the next one. mm0...mm3 have been swapped out already.*/ - "movq %%mm4,0x00(%[buf])\n\t" - "movq %%mm5,0x10(%[buf])\n\t" - "movq %%mm6,0x20(%[buf])\n\t" - "movq %%mm7,0x30(%[buf])\n\t" - OC_LOAD_SUB_8x4("0x04") + "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t" + "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t" + "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t" + "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t" + OC_LOAD_SUB_8x4(0x04) OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x08") + OC_TRANSPOSE_4x4x2(0x08) /*Here the first 4x4 block of output from the last transpose is the second 4x4 block of input for the next transform. We have cleverly arranged that it already be in the appropriate place, so we only have to do half the loads.*/ - "movq 0x10(%[buf]),%%mm1\n\t" - "movq 0x20(%[buf]),%%mm2\n\t" - "movq 0x30(%[buf]),%%mm3\n\t" - "movq 0x00(%[buf]),%%mm0\n\t" - OC_HADAMARD_ABS_ACCUM_8x4("0x28","0x38") + "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t" + "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t" + "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t" + "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t" + /*We split out the stages here so we can save the DC coefficient in the + middle.*/ + OC_HADAMARD_AB_8x4 + OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) + "movd %%mm1,%[dc]\n\t" + OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) /*Up to this point, everything fit in 16 bits (8 input + 1 for the difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 for the factor of two we dropped + 3 for the vertical accumulation). Now we finally have to promote things to dwords. 
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
 latency of pmaddwd by starting the next series of loads now.*/
- "mov %[thresh],%[ret2]\n\t"
 "pmaddwd %%mm7,%%mm0\n\t"
- "movq 0x50(%[buf]),%%mm1\n\t"
- "movq 0x58(%[buf]),%%mm5\n\t"
+ "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+ "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
 "movq %%mm0,%%mm4\n\t"
- "movq 0x60(%[buf]),%%mm2\n\t"
+ "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
 "punpckhdq %%mm0,%%mm0\n\t"
- "movq 0x68(%[buf]),%%mm6\n\t"
+ "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
 "paddd %%mm0,%%mm4\n\t"
- "movq 0x70(%[buf]),%%mm3\n\t"
- "movd %%mm4,%[ret]\n\t"
- "movq 0x78(%[buf]),%%mm7\n\t"
- /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
- added to them, and a factor of two removed; correct the final sum here.*/
- "lea -32(%[ret],%[ret]),%[ret]\n\t"
- "movq 0x40(%[buf]),%%mm0\n\t"
- "cmp %[ret2],%[ret]\n\t"
- "movq 0x48(%[buf]),%%mm4\n\t"
- "jae 1f\n\t"
- OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+ "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
+ "movd %%mm4,%[ret2]\n\t"
+ "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
+ "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+ "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+ OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
 "pmaddwd %%mm7,%%mm0\n\t"
- /*There isn't much to stick in here to hide the latency this time, but the
- alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
- latency is even worse.*/
- "sub $32,%[ret]\n\t"
+ /*Subtract abs(dc) from 2*ret2.*/
+ "movsx %w[dc],%[dc]\n\t"
+ "cdq\n\t"
+ "lea (%[ret],%[ret2],2),%[ret2]\n\t"
 "movq %%mm0,%%mm4\n\t"
 "punpckhdq %%mm0,%%mm0\n\t"
+ "xor %[dc],%[ret]\n\t"
 "paddd %%mm0,%%mm4\n\t"
- "movd %%mm4,%[ret2]\n\t"
- "lea (%[ret],%[ret2],2),%[ret]\n\t"
- ".p2align 4,,15\n\t"
- "1:\n\t"
- /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+ /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+ added to them, a factor of two removed, and the DC value included;
+ correct the final sum here.*/
+ "sub %[ret],%[ret2]\n\t"
+ "movd %%mm4,%[ret]\n\t"
+ "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+ /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
 and %[ret2] with some of the inputs, since for once we don't write to
- them until after we're done using everything but %[buf] (which is also
- listed as an output to ensure gcc _doesn't_ alias them against it).*/
+ them until after we're done using everything but %[buf].*/
 /*Note that _src_ystride and _ref_ystride must be given non-overlapping
 constraints, otherwise if gcc can prove they're equal it will allocate
 them to the same register (which is bad);
 _src and _ref face a similar problem, though those are never actually the
 same.*/
- :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
+ :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
+ [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
 :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
- [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride),
- [thresh]"m"(_thresh)
+ [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
 /*We have to use neg, so we actually clobber the condition codes for once
 (not to mention cmp, sub, and add).*/
 :"cc"
 );
+ *_dc=dc;
 return ret;
}
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
- return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+ return
oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride); } /*Our internal implementation of frag_copy2 takes an extra stride parameter so - we can share code with oc_enc_frag_satd2_thresh_mmxext().*/ -static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, + we can share code with oc_enc_frag_satd2_mmxext().*/ +void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ __asm__ __volatile__( /*Load the first 3 rows.*/ @@ -649,55 +653,53 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, "psubb %%mm4,%%mm2\n\t" /*%%mm2 (row 7) is done, write it out.*/ "movq %%mm2,(%[dst],%[dst_ystride])\n\t" - :[dst]"+r"(_dst),[src1]"+%r"(_src1),[src2]"+r"(_src2) + :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2) :[dst_ystride]"r"((ptrdiff_t)_dst_ystride), [src_ystride]"r"((ptrdiff_t)_src_ystride) :"memory" ); } -unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh){ +unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){ OC_ALIGN8(unsigned char ref[64]); oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); - return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh); + return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8); } -unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, - int _ystride){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret; - unsigned ret2; - bufp=buf; +unsigned oc_enc_frag_intra_satd_mmxext(int *_dc, + const unsigned char *_src,int _ystride){ + OC_ALIGN8(ogg_int16_t buf[64]); + unsigned ret; + unsigned ret2; + int dc; __asm__ __volatile__( - OC_LOAD_8x4("0x00") + OC_LOAD_8x4(0x00) OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x00") + OC_TRANSPOSE_4x4x2(0x00) /*Finish swapping out this 8x4 block to make room for the next one. mm0...mm3 have been swapped out already.*/ - "movq %%mm4,0x00(%[buf])\n\t" - "movq %%mm5,0x10(%[buf])\n\t" - "movq %%mm6,0x20(%[buf])\n\t" - "movq %%mm7,0x30(%[buf])\n\t" - OC_LOAD_8x4("0x04") + "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t" + "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t" + "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t" + "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t" + OC_LOAD_8x4(0x04) OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x08") + OC_TRANSPOSE_4x4x2(0x08) /*Here the first 4x4 block of output from the last transpose is the second 4x4 block of input for the next transform. We have cleverly arranged that it already be in the appropriate place, so we only have to do half the loads.*/ - "movq 0x10(%[buf]),%%mm1\n\t" - "movq 0x20(%[buf]),%%mm2\n\t" - "movq 0x30(%[buf]),%%mm3\n\t" - "movq 0x00(%[buf]),%%mm0\n\t" + "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t" + "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t" + "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t" + "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t" /*We split out the stages here so we can save the DC coefficient in the middle.*/ OC_HADAMARD_AB_8x4 - OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38") - "movd %%mm1,%[ret]\n\t" - OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38") + OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) + "movd %%mm1,%[dc]\n\t" + OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) /*Up to this point, everything fit in 16 bits (8 input + 1 for the difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 for the factor of two we dropped + 3 for the vertical accumulation). 
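The 16-bit budget quoted in the comment above works out to 8 input bits + 1 for the difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 for the dropped factor of two + 3 for the vertical accumulation = 16, which is why the whole accumulation can stay in packed words until the final pmaddwd. For readers following the assembly, here is a minimal scalar sketch of the quantity these routines compute; hadamard8() and frag_satd_ref() are illustrative names only (they are not part of libtheora), and the sketch omits the dropped factor of two and the rounding-offset corrections, so its absolute scale differs from the MMX return value by a constant factor.

#include <stdlib.h>

/*Unnormalized 8-point Walsh-Hadamard transform, in place.*/
static void hadamard8(int _x[8]){
  int i;
  int j;
  /*Three butterfly stages; each stage adds one bit to the dynamic range.*/
  for(j=4;j>0;j>>=1){
    for(i=0;i<8;i++)if(!(i&j)){
      int a;
      int b;
      a=_x[i];
      b=_x[i+j];
      _x[i]=a+b;
      _x[i+j]=a-b;
    }
  }
}

/*Scalar model of the SATD computed above: transform the source/reference
  difference along rows and columns, then sum absolute coefficients,
  reporting the DC term separately and leaving it out of the sum, matching
  the new *_dc interface.*/
static unsigned frag_satd_ref(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  int      buf[64];
  unsigned satd;
  int      i;
  int      j;
  /*8x8 difference block.*/
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    buf[i*8+j]=_src[i*_ystride+j]-_ref[i*_ystride+j];
  }
  /*Row transforms, then column transforms.*/
  for(i=0;i<8;i++)hadamard8(buf+i*8);
  for(j=0;j<8;j++){
    int col[8];
    for(i=0;i<8;i++)col[i]=buf[i*8+j];
    hadamard8(col);
    for(i=0;i<8;i++)buf[i*8+j]=col[i];
  }
  /*buf[0] now holds the sum of all 64 differences (the DC term).*/
  *_dc=buf[0];
  satd=0;
  for(i=1;i<64;i++)satd+=abs(buf[i]);
  return satd;
}

oc_enc_frag_satd2_mmxext() computes the same quantity after averaging two references with oc_int_frag_copy2_mmxext(), and the intra variant continued in the next hunk transforms the source block directly, which is why its comment can assume the DC coefficient is always positive.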
@@ -705,41 +707,43 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long latency of pmaddwd by starting the next series of loads now.*/ "pmaddwd %%mm7,%%mm0\n\t" - "movq 0x50(%[buf]),%%mm1\n\t" - "movq 0x58(%[buf]),%%mm5\n\t" - "movq 0x60(%[buf]),%%mm2\n\t" + "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t" + "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t" + "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t" "movq %%mm0,%%mm4\n\t" - "movq 0x68(%[buf]),%%mm6\n\t" + "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t" "punpckhdq %%mm0,%%mm0\n\t" - "movq 0x70(%[buf]),%%mm3\n\t" + "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t" "paddd %%mm0,%%mm4\n\t" - "movq 0x78(%[buf]),%%mm7\n\t" - "movd %%mm4,%[ret2]\n\t" - "movq 0x40(%[buf]),%%mm0\n\t" - "movq 0x48(%[buf]),%%mm4\n\t" - OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78") + "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t" + "movd %%mm4,%[ret]\n\t" + "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t" + "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t" + OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) "pmaddwd %%mm7,%%mm0\n\t" /*We assume that the DC coefficient is always positive (which is true, because the input to the INTRA transform was not a difference).*/ - "movzx %w[ret],%[ret]\n\t" - "add %[ret2],%[ret2]\n\t" - "sub %[ret],%[ret2]\n\t" + "movzx %w[dc],%[dc]\n\t" + "add %[ret],%[ret]\n\t" + "sub %[dc],%[ret]\n\t" "movq %%mm0,%%mm4\n\t" "punpckhdq %%mm0,%%mm0\n\t" "paddd %%mm0,%%mm4\n\t" - "movd %%mm4,%[ret]\n\t" - "lea -64(%[ret2],%[ret],2),%[ret]\n\t" - /*Although it looks like we're using 7 registers here, gcc can alias %[ret] + "movd %%mm4,%[ret2]\n\t" + "lea -64(%[ret],%[ret2],2),%[ret]\n\t" + /*Although it looks like we're using 8 registers here, gcc can alias %[ret] and %[ret2] with some of the inputs, since for once we don't write to them until after we're done using everything but %[buf] (which is also listed as an output to ensure gcc _doesn't_ alias them against it).*/ - :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp) + :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc), + [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64)) :[src]"r"(_src),[src4]"r"(_src+4*_ystride), [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride) /*We have to use sub, so we actually clobber the condition codes for once (not to mention add).*/ :"cc" ); + *_dc=dc; return ret; } diff --git a/thirdparty/libtheora/x86/mmxfdct.c b/thirdparty/libtheora/x86/mmxfdct.c index 211875255e..17668358b8 100644 --- a/thirdparty/libtheora/x86/mmxfdct.c +++ b/thirdparty/libtheora/x86/mmxfdct.c @@ -12,6 +12,7 @@ /*MMX fDCT implementation for x86_32*/ /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ #include "x86enc.h" +#include "x86zigzag.h" #if defined(OC_X86_ASM) @@ -462,8 +463,9 @@ mm7 = d3 c3 b3 a3*/ \ /*MMX implementation of the fDCT.*/ -void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - ptrdiff_t a; +void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ + OC_ALIGN8(ogg_int16_t buf[64]); + ptrdiff_t a; __asm__ __volatile__( /*Add two extra bits of working precision to improve accuracy; any more and we could overflow.*/ @@ -586,77 +588,88 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ "movq 0x30(%[y]),%%mm3\n\t" OC_FDCT_STAGE1_8x4 OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38") - OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38") - /*mm0={-2}x4*/ - "pcmpeqw %%mm0,%%mm0\n\t" - "paddw %%mm0,%%mm0\n\t" - /*Round the results.*/ - "psubw %%mm0,%%mm1\n\t" - 
"psubw %%mm0,%%mm2\n\t" - "psraw $2,%%mm1\n\t" - "psubw %%mm0,%%mm3\n\t" - "movq %%mm1,0x18(%[y])\n\t" - "psraw $2,%%mm2\n\t" - "psubw %%mm0,%%mm4\n\t" - "movq 0x08(%[y]),%%mm1\n\t" - "psraw $2,%%mm3\n\t" - "psubw %%mm0,%%mm5\n\t" + /*mm2={-2}x4*/ + "pcmpeqw %%mm2,%%mm2\n\t" + "paddw %%mm2,%%mm2\n\t" + /*Round and store the results (no transpose).*/ + "movq 0x10(%[y]),%%mm7\n\t" + "psubw %%mm2,%%mm4\n\t" + "psubw %%mm2,%%mm6\n\t" "psraw $2,%%mm4\n\t" - "psubw %%mm0,%%mm6\n\t" - "psraw $2,%%mm5\n\t" - "psubw %%mm0,%%mm7\n\t" + "psubw %%mm2,%%mm0\n\t" + "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t" + "movq 0x30(%[y]),%%mm4\n\t" "psraw $2,%%mm6\n\t" - "psubw %%mm0,%%mm1\n\t" + "psubw %%mm2,%%mm5\n\t" + "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t" + "psraw $2,%%mm0\n\t" + "psubw %%mm2,%%mm3\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t" + "psraw $2,%%mm5\n\t" + "psubw %%mm2,%%mm1\n\t" + "movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t" + "psraw $2,%%mm3\n\t" + "psubw %%mm2,%%mm7\n\t" + "movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t" + "psraw $2,%%mm1\n\t" + "psubw %%mm2,%%mm4\n\t" + "movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t" "psraw $2,%%mm7\n\t" + "movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t" + "psraw $2,%%mm4\n\t" + "movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t" + /*Load the next block.*/ "movq 0x40(%[y]),%%mm0\n\t" - "psraw $2,%%mm1\n\t" - "movq %%mm7,0x30(%[y])\n\t" "movq 0x78(%[y]),%%mm7\n\t" - "movq %%mm1,0x08(%[y])\n\t" "movq 0x50(%[y]),%%mm1\n\t" - "movq %%mm6,0x20(%[y])\n\t" "movq 0x68(%[y]),%%mm6\n\t" - "movq %%mm2,0x28(%[y])\n\t" "movq 0x60(%[y]),%%mm2\n\t" - "movq %%mm5,0x10(%[y])\n\t" "movq 0x58(%[y]),%%mm5\n\t" - "movq %%mm3,0x38(%[y])\n\t" "movq 0x70(%[y]),%%mm3\n\t" - "movq %%mm4,0x00(%[y])\n\t" "movq 0x48(%[y]),%%mm4\n\t" OC_FDCT_STAGE1_8x4 OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78") - OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78") - /*mm0={-2}x4*/ - "pcmpeqw %%mm0,%%mm0\n\t" - "paddw %%mm0,%%mm0\n\t" - /*Round the results.*/ - "psubw %%mm0,%%mm1\n\t" - "psubw %%mm0,%%mm2\n\t" - "psraw $2,%%mm1\n\t" - "psubw %%mm0,%%mm3\n\t" - "movq %%mm1,0x58(%[y])\n\t" - "psraw $2,%%mm2\n\t" - "psubw %%mm0,%%mm4\n\t" - "movq 0x48(%[y]),%%mm1\n\t" - "psraw $2,%%mm3\n\t" - "psubw %%mm0,%%mm5\n\t" - "movq %%mm2,0x68(%[y])\n\t" + /*mm2={-2}x4*/ + "pcmpeqw %%mm2,%%mm2\n\t" + "paddw %%mm2,%%mm2\n\t" + /*Round and store the results (no transpose).*/ + "movq 0x50(%[y]),%%mm7\n\t" + "psubw %%mm2,%%mm4\n\t" + "psubw %%mm2,%%mm6\n\t" "psraw $2,%%mm4\n\t" - "psubw %%mm0,%%mm6\n\t" - "movq %%mm3,0x78(%[y])\n\t" - "psraw $2,%%mm5\n\t" - "psubw %%mm0,%%mm7\n\t" - "movq %%mm4,0x40(%[y])\n\t" + "psubw %%mm2,%%mm0\n\t" + "movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t" + "movq 0x70(%[y]),%%mm4\n\t" "psraw $2,%%mm6\n\t" - "psubw %%mm0,%%mm1\n\t" - "movq %%mm5,0x50(%[y])\n\t" - "psraw $2,%%mm7\n\t" - "movq %%mm6,0x60(%[y])\n\t" + "psubw %%mm2,%%mm5\n\t" + "movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t" + "psraw $2,%%mm0\n\t" + "psubw %%mm2,%%mm3\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t" + "psraw $2,%%mm5\n\t" + "psubw %%mm2,%%mm1\n\t" + "movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t" + "psraw $2,%%mm3\n\t" + "psubw %%mm2,%%mm7\n\t" + "movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t" "psraw $2,%%mm1\n\t" - "movq %%mm7,0x70(%[y])\n\t" - "movq %%mm1,0x48(%[y])\n\t" - :[a]"=&r"(a) + "psubw %%mm2,%%mm4\n\t" + "movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t" + "psraw $2,%%mm7\n\t" + "movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t" + "psraw $2,%%mm4\n\t" + "movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t" + /*Final transpose and zig-zag.*/ +#define 
OC_ZZ_LOAD_ROW_LO(_row,_reg) \ + "movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \ + +#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \ + "movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \ + + OC_TRANSPOSE_ZIG_ZAG_MMXEXT +#undef OC_ZZ_LOAD_ROW_LO +#undef OC_ZZ_LOAD_ROW_HI + :[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64)) :[y]"r"(_y),[x]"r"(_x) :"memory" ); diff --git a/thirdparty/libtheora/x86/mmxfrag.c b/thirdparty/libtheora/x86/mmxfrag.c index 2c732939c3..b3ec508956 100644 --- a/thirdparty/libtheora/x86/mmxfrag.c +++ b/thirdparty/libtheora/x86/mmxfrag.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -22,17 +22,92 @@ The iteration each instruction belongs to is marked in the comments as #i.*/ #include <stddef.h> #include "x86int.h" -#include "mmxfrag.h" #if defined(OC_X86_ASM) /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes between rows.*/ +# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ + do{ \ + const unsigned char *src; \ + unsigned char *dst; \ + ptrdiff_t ystride3; \ + src=(_src); \ + dst=(_dst); \ + __asm__ __volatile__( \ + /*src+0*ystride*/ \ + "movq (%[src]),%%mm0\n\t" \ + /*src+1*ystride*/ \ + "movq (%[src],%[ystride]),%%mm1\n\t" \ + /*ystride3=ystride*3*/ \ + "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ + /*src+2*ystride*/ \ + "movq (%[src],%[ystride],2),%%mm2\n\t" \ + /*src+3*ystride*/ \ + "movq (%[src],%[ystride3]),%%mm3\n\t" \ + /*dst+0*ystride*/ \ + "movq %%mm0,(%[dst])\n\t" \ + /*dst+1*ystride*/ \ + "movq %%mm1,(%[dst],%[ystride])\n\t" \ + /*Pointer to next 4.*/ \ + "lea (%[src],%[ystride],4),%[src]\n\t" \ + /*dst+2*ystride*/ \ + "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ + /*dst+3*ystride*/ \ + "movq %%mm3,(%[dst],%[ystride3])\n\t" \ + /*Pointer to next 4.*/ \ + "lea (%[dst],%[ystride],4),%[dst]\n\t" \ + /*src+0*ystride*/ \ + "movq (%[src]),%%mm0\n\t" \ + /*src+1*ystride*/ \ + "movq (%[src],%[ystride]),%%mm1\n\t" \ + /*src+2*ystride*/ \ + "movq (%[src],%[ystride],2),%%mm2\n\t" \ + /*src+3*ystride*/ \ + "movq (%[src],%[ystride3]),%%mm3\n\t" \ + /*dst+0*ystride*/ \ + "movq %%mm0,(%[dst])\n\t" \ + /*dst+1*ystride*/ \ + "movq %%mm1,(%[dst],%[ystride])\n\t" \ + /*dst+2*ystride*/ \ + "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ + /*dst+3*ystride*/ \ + "movq %%mm3,(%[dst],%[ystride3])\n\t" \ + :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \ + :[ystride]"r"((ptrdiff_t)(_ystride)) \ + :"memory" \ + ); \ + } \ + while(0) + +/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes + between rows.*/ void oc_frag_copy_mmx(unsigned char *_dst, const unsigned char *_src,int _ystride){ OC_FRAG_COPY_MMX(_dst,_src,_ystride); } +/*Copies the fragments specified by the lists of fragment indices from one + frame to another. + _dst_frame: The reference frame to copy to. + _src_frame: The reference frame to copy from. + _ystride: The row stride of the reference frames. + _fragis: A pointer to a list of fragment indices. + _nfragis: The number of fragment indices to copy. 
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/ +void oc_frag_copy_list_mmx(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ + ptrdiff_t fragii; + for(fragii=0;fragii<_nfragis;fragii++){ + ptrdiff_t frag_buf_off; + frag_buf_off=_frag_buf_offs[_fragis[fragii]]; + OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off, + _src_frame+frag_buf_off,_ystride); + } +} + + void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, const ogg_int16_t *_residue){ __asm__ __volatile__( @@ -280,7 +355,7 @@ void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, /*Advance dest ptr.*/ "lea (%[dst],%[ystride],2),%[dst]\n\t" :[dst]"+r"(_dst),[residue]"+r"(_residue), - [src1]"+%r"(_src1),[src2]"+r"(_src2) + [src1]"+r"(_src1),[src2]"+r"(_src2) :[ystride]"r"((ptrdiff_t)_ystride) :"memory" ); diff --git a/thirdparty/libtheora/x86/mmxfrag.h b/thirdparty/libtheora/x86/mmxfrag.h deleted file mode 100644 index a398427629..0000000000 --- a/thirdparty/libtheora/x86/mmxfrag.h +++ /dev/null @@ -1,64 +0,0 @@ -#if !defined(_x86_mmxfrag_H) -# define _x86_mmxfrag_H (1) -# include <stddef.h> -# include "x86int.h" - -#if defined(OC_X86_ASM) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ - do{ \ - const unsigned char *src; \ - unsigned char *dst; \ - ptrdiff_t ystride3; \ - src=(_src); \ - dst=(_dst); \ - __asm__ __volatile__( \ - /*src+0*ystride*/ \ - "movq (%[src]),%%mm0\n\t" \ - /*src+1*ystride*/ \ - "movq (%[src],%[ystride]),%%mm1\n\t" \ - /*ystride3=ystride*3*/ \ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ - /*src+2*ystride*/ \ - "movq (%[src],%[ystride],2),%%mm2\n\t" \ - /*src+3*ystride*/ \ - "movq (%[src],%[ystride3]),%%mm3\n\t" \ - /*dst+0*ystride*/ \ - "movq %%mm0,(%[dst])\n\t" \ - /*dst+1*ystride*/ \ - "movq %%mm1,(%[dst],%[ystride])\n\t" \ - /*Pointer to next 4.*/ \ - "lea (%[src],%[ystride],4),%[src]\n\t" \ - /*dst+2*ystride*/ \ - "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ - /*dst+3*ystride*/ \ - "movq %%mm3,(%[dst],%[ystride3])\n\t" \ - /*Pointer to next 4.*/ \ - "lea (%[dst],%[ystride],4),%[dst]\n\t" \ - /*src+0*ystride*/ \ - "movq (%[src]),%%mm0\n\t" \ - /*src+1*ystride*/ \ - "movq (%[src],%[ystride]),%%mm1\n\t" \ - /*src+2*ystride*/ \ - "movq (%[src],%[ystride],2),%%mm2\n\t" \ - /*src+3*ystride*/ \ - "movq (%[src],%[ystride3]),%%mm3\n\t" \ - /*dst+0*ystride*/ \ - "movq %%mm0,(%[dst])\n\t" \ - /*dst+1*ystride*/ \ - "movq %%mm1,(%[dst],%[ystride])\n\t" \ - /*dst+2*ystride*/ \ - "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ - /*dst+3*ystride*/ \ - "movq %%mm3,(%[dst],%[ystride3])\n\t" \ - :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \ - :[ystride]"r"((ptrdiff_t)(_ystride)) \ - :"memory" \ - ); \ - } \ - while(0) - -# endif -#endif diff --git a/thirdparty/libtheora/x86/mmxidct.c b/thirdparty/libtheora/x86/mmxidct.c index 76424e6364..b8e3077066 100644 --- a/thirdparty/libtheora/x86/mmxidct.c +++ b/thirdparty/libtheora/x86/mmxidct.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -30,89 +30,66 @@ -/*A table of constants used by the MMX routines.*/ -static const ogg_uint16_t __attribute__((aligned(8),used)) - OC_IDCT_CONSTS[(7+1)*4]={ - 
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - 8, 8, 8, 8 -}; - -/*Converts the expression in the argument to a string.*/ -#define OC_M2STR(_s) #_s - /*38 cycles*/ -#define OC_IDCT_BEGIN \ +#define OC_IDCT_BEGIN(_y,_x) \ "#OC_IDCT_BEGIN\n\t" \ - "movq "OC_I(3)",%%mm2\n\t" \ - "movq "OC_C(3)",%%mm6\n\t" \ + "movq "OC_I(3,_x)",%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ "movq %%mm2,%%mm4\n\t" \ - "movq "OC_J(5)",%%mm7\n\t" \ + "movq "OC_J(5,_x)",%%mm7\n\t" \ "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_C(5)",%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ "pmulhw %%mm7,%%mm6\n\t" \ "movq %%mm1,%%mm5\n\t" \ "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_I(1)",%%mm3\n\t" \ + "movq "OC_I(1,_x)",%%mm3\n\t" \ "pmulhw %%mm7,%%mm5\n\t" \ - "movq "OC_C(1)",%%mm0\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ "paddw %%mm2,%%mm4\n\t" \ "paddw %%mm7,%%mm6\n\t" \ "paddw %%mm1,%%mm2\n\t" \ - "movq "OC_J(7)",%%mm1\n\t" \ + "movq "OC_J(7,_x)",%%mm1\n\t" \ "paddw %%mm5,%%mm7\n\t" \ "movq %%mm0,%%mm5\n\t" \ "pmulhw %%mm3,%%mm0\n\t" \ "paddw %%mm7,%%mm4\n\t" \ "pmulhw %%mm1,%%mm5\n\t" \ - "movq "OC_C(7)",%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \ "psubw %%mm2,%%mm6\n\t" \ "paddw %%mm3,%%mm0\n\t" \ "pmulhw %%mm7,%%mm3\n\t" \ - "movq "OC_I(2)",%%mm2\n\t" \ + "movq "OC_I(2,_x)",%%mm2\n\t" \ "pmulhw %%mm1,%%mm7\n\t" \ "paddw %%mm1,%%mm5\n\t" \ "movq %%mm2,%%mm1\n\t" \ - "pmulhw "OC_C(2)",%%mm2\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \ "psubw %%mm5,%%mm3\n\t" \ - "movq "OC_J(6)",%%mm5\n\t" \ + "movq "OC_J(6,_x)",%%mm5\n\t" \ "paddw %%mm7,%%mm0\n\t" \ "movq %%mm5,%%mm7\n\t" \ "psubw %%mm4,%%mm0\n\t" \ - "pmulhw "OC_C(2)",%%mm5\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ "paddw %%mm1,%%mm2\n\t" \ - "pmulhw "OC_C(6)",%%mm1\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ "paddw %%mm4,%%mm4\n\t" \ "paddw %%mm0,%%mm4\n\t" \ "psubw %%mm6,%%mm3\n\t" \ "paddw %%mm7,%%mm5\n\t" \ "paddw %%mm6,%%mm6\n\t" \ - "pmulhw "OC_C(6)",%%mm7\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ "paddw %%mm3,%%mm6\n\t" \ - "movq %%mm4,"OC_I(1)"\n\t" \ + "movq %%mm4,"OC_I(1,_y)"\n\t" \ "psubw %%mm5,%%mm1\n\t" \ - "movq "OC_C(4)",%%mm4\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ "movq %%mm3,%%mm5\n\t" \ "pmulhw %%mm4,%%mm3\n\t" \ "paddw %%mm2,%%mm7\n\t" \ - "movq %%mm6,"OC_I(2)"\n\t" \ + "movq %%mm6,"OC_I(2,_y)"\n\t" \ "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0)",%%mm6\n\t" \ + "movq "OC_I(0,_x)",%%mm6\n\t" \ "pmulhw %%mm4,%%mm0\n\t" \ "paddw %%mm3,%%mm5\n\t" \ - "movq "OC_J(4)",%%mm3\n\t" \ + "movq "OC_J(4,_x)",%%mm3\n\t" \ "psubw %%mm1,%%mm5\n\t" \ "paddw %%mm0,%%mm2\n\t" \ "psubw %%mm3,%%mm6\n\t" \ @@ -126,18 +103,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "paddw %%mm0,%%mm6\n\t" \ "psubw %%mm2,%%mm6\n\t" \ "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1)",%%mm0\n\t" \ + "movq "OC_I(1,_y)",%%mm0\n\t" \ "paddw %%mm6,%%mm2\n\t" \ "paddw %%mm3,%%mm4\n\t" \ "psubw 
%%mm1,%%mm2\n\t" \ "#end OC_IDCT_BEGIN\n\t" \ /*38+8=46 cycles.*/ -#define OC_ROW_IDCT \ +#define OC_ROW_IDCT(_y,_x) \ "#OC_ROW_IDCT\n" \ - OC_IDCT_BEGIN \ + OC_IDCT_BEGIN(_y,_x) \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r4=E'=E-G*/ \ "psubw %%mm7,%%mm4\n\t" \ /*r1=H'+H'*/ \ @@ -162,7 +139,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "psubw %%mm0,%%mm7\n\t" \ "paddw %%mm0,%%mm0\n\t" \ /*Save R1.*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r0=R0=G.+C.*/ \ "paddw %%mm7,%%mm0\n\t" \ "#end OC_ROW_IDCT\n\t" \ @@ -195,11 +172,11 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) Since r1 is free at entry, we calculate the Js first.*/ /*19 cycles.*/ -#define OC_TRANSPOSE \ +#define OC_TRANSPOSE(_y) \ "#OC_TRANSPOSE\n\t" \ "movq %%mm4,%%mm1\n\t" \ "punpcklwd %%mm5,%%mm4\n\t" \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "punpckhwd %%mm5,%%mm1\n\t" \ "movq %%mm6,%%mm0\n\t" \ "punpcklwd %%mm7,%%mm6\n\t" \ @@ -207,17 +184,17 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "punpckldq %%mm6,%%mm4\n\t" \ "punpckhdq %%mm6,%%mm5\n\t" \ "movq %%mm1,%%mm6\n\t" \ - "movq %%mm4,"OC_J(4)"\n\t" \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ "punpckhwd %%mm7,%%mm0\n\t" \ - "movq %%mm5,"OC_J(5)"\n\t" \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ "punpckhdq %%mm0,%%mm6\n\t" \ - "movq "OC_I(0)",%%mm4\n\t" \ + "movq "OC_I(0,_y)",%%mm4\n\t" \ "punpckldq %%mm0,%%mm1\n\t" \ - "movq "OC_I(1)",%%mm5\n\t" \ + "movq "OC_I(1,_y)",%%mm5\n\t" \ "movq %%mm4,%%mm0\n\t" \ - "movq %%mm6,"OC_J(7)"\n\t" \ + "movq %%mm6,"OC_J(7,_y)"\n\t" \ "punpcklwd %%mm5,%%mm0\n\t" \ - "movq %%mm1,"OC_J(6)"\n\t" \ + "movq %%mm1,"OC_J(6,_y)"\n\t" \ "punpckhwd %%mm5,%%mm4\n\t" \ "movq %%mm2,%%mm5\n\t" \ "punpcklwd %%mm3,%%mm2\n\t" \ @@ -225,20 +202,20 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "punpckldq %%mm2,%%mm0\n\t" \ "punpckhdq %%mm2,%%mm1\n\t" \ "movq %%mm4,%%mm2\n\t" \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "punpckhwd %%mm3,%%mm5\n\t" \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ "punpckhdq %%mm5,%%mm4\n\t" \ "punpckldq %%mm5,%%mm2\n\t" \ - "movq %%mm4,"OC_I(3)"\n\t" \ - "movq %%mm2,"OC_I(2)"\n\t" \ + "movq %%mm4,"OC_I(3,_y)"\n\t" \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ "#end OC_TRANSPOSE\n\t" \ /*38+19=57 cycles.*/ -#define OC_COLUMN_IDCT \ +#define OC_COLUMN_IDCT(_y) \ "#OC_COLUMN_IDCT\n" \ - OC_IDCT_BEGIN \ - "paddw "OC_8",%%mm2\n\t" \ + OC_IDCT_BEGIN(_y,_y) \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ /*r1=H'+H'*/ \ "paddw %%mm1,%%mm1\n\t" \ /*r1=R1=A''+H'*/ \ @@ -250,18 +227,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) /*r1=NR1*/ \ "psraw $4,%%mm1\n\t" \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r7=G+G*/ \ "paddw %%mm7,%%mm7\n\t" \ /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2)"\n\t" \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ /*r7=G'=E+G*/ \ "paddw %%mm4,%%mm7\n\t" \ /*Store NR1 at I(1).*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r4=R4=E'-D'*/ \ "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_8",%%mm4\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ /*r3=D'+D'*/ \ "paddw %%mm3,%%mm3\n\t" \ /*r3=R3=E'+D'*/ \ @@ -272,7 +249,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "psubw %%mm5,%%mm6\n\t" \ /*r3=NR3*/ \ "psraw $4,%%mm3\n\t" \ - "paddw "OC_8",%%mm6\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ /*r5=B''+B''*/ \ "paddw %%mm5,%%mm5\n\t" \ /*r5=R5=F'+B''*/ \ @@ -280,14 +257,14 
@@ static const ogg_uint16_t __attribute__((aligned(8),used)) /*r6=NR6*/ \ "psraw $4,%%mm6\n\t" \ /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4)"\n\t" \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ /*r5=NR5*/ \ "psraw $4,%%mm5\n\t" \ /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3)"\n\t" \ + "movq %%mm3,"OC_I(3,_y)"\n\t" \ /*r7=R7=G'-C'*/ \ "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_8",%%mm7\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ /*r0=C'+C'*/ \ "paddw %%mm0,%%mm0\n\t" \ /*r0=R0=G'+C'*/ \ @@ -295,113 +272,121 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) /*r7=NR7*/ \ "psraw $4,%%mm7\n\t" \ /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6)"\n\t" \ + "movq %%mm6,"OC_J(6,_y)"\n\t" \ /*r0=NR0*/ \ "psraw $4,%%mm0\n\t" \ /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5)"\n\t" \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7)"\n\t" \ + "movq %%mm7,"OC_J(7,_y)"\n\t" \ /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "#end OC_COLUMN_IDCT\n\t" \ -#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])" -#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1) -#define OC_8 OC_MID(OC_EIGHT_OFFSET,0) - -static void oc_idct8x8_slow(ogg_int16_t _y[64]){ +static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + int i; /*This routine accepts an 8x8 matrix, but in partially transposed form. Every 4x4 block is transposed.*/ __asm__ __volatile__( -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])" - OC_ROW_IDCT - OC_TRANSPOSE +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) + OC_ROW_IDCT(y,x) + OC_TRANSPOSE(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+64)"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+72)"(%[y])" - OC_ROW_IDCT - OC_TRANSPOSE +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y) + OC_ROW_IDCT(y,x) + OC_TRANSPOSE(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(y) #undef OC_I #undef OC_J - : - :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS) + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) ); + __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); + for(i=0;i<4;i++){ + __asm__ __volatile__( + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t" + :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) + ); + } } /*25 cycles.*/ -#define OC_IDCT_BEGIN_10 \ +#define OC_IDCT_BEGIN_10(_y,_x) \ "#OC_IDCT_BEGIN_10\n\t" \ - "movq "OC_I(3)",%%mm2\n\t" \ + "movq "OC_I(3,_x)",%%mm2\n\t" \ "nop\n\t" \ - "movq "OC_C(3)",%%mm6\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ "movq %%mm2,%%mm4\n\t" \ - "movq "OC_C(5)",%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_I(1)",%%mm3\n\t" \ + "movq "OC_I(1,_x)",%%mm3\n\t" \ "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_C(1)",%%mm0\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ "paddw %%mm2,%%mm4\n\t" \ "pxor %%mm6,%%mm6\n\t" \ "paddw 
%%mm1,%%mm2\n\t" \ - "movq "OC_I(2)",%%mm5\n\t" \ + "movq "OC_I(2,_x)",%%mm5\n\t" \ "pmulhw %%mm3,%%mm0\n\t" \ "movq %%mm5,%%mm1\n\t" \ "paddw %%mm3,%%mm0\n\t" \ - "pmulhw "OC_C(7)",%%mm3\n\t" \ + "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ "psubw %%mm2,%%mm6\n\t" \ - "pmulhw "OC_C(2)",%%mm5\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ "psubw %%mm4,%%mm0\n\t" \ - "movq "OC_I(2)",%%mm7\n\t" \ + "movq "OC_I(2,_x)",%%mm7\n\t" \ "paddw %%mm4,%%mm4\n\t" \ "paddw %%mm5,%%mm7\n\t" \ "paddw %%mm0,%%mm4\n\t" \ - "pmulhw "OC_C(6)",%%mm1\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ "psubw %%mm6,%%mm3\n\t" \ - "movq %%mm4,"OC_I(1)"\n\t" \ + "movq %%mm4,"OC_I(1,_y)"\n\t" \ "paddw %%mm6,%%mm6\n\t" \ - "movq "OC_C(4)",%%mm4\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ "paddw %%mm3,%%mm6\n\t" \ "movq %%mm3,%%mm5\n\t" \ "pmulhw %%mm4,%%mm3\n\t" \ - "movq %%mm6,"OC_I(2)"\n\t" \ + "movq %%mm6,"OC_I(2,_y)"\n\t" \ "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0)",%%mm6\n\t" \ + "movq "OC_I(0,_x)",%%mm6\n\t" \ "pmulhw %%mm4,%%mm0\n\t" \ "paddw %%mm3,%%mm5\n\t" \ "paddw %%mm0,%%mm2\n\t" \ "psubw %%mm1,%%mm5\n\t" \ "pmulhw %%mm4,%%mm6\n\t" \ - "paddw "OC_I(0)",%%mm6\n\t" \ + "paddw "OC_I(0,_x)",%%mm6\n\t" \ "paddw %%mm1,%%mm1\n\t" \ "movq %%mm6,%%mm4\n\t" \ "paddw %%mm5,%%mm1\n\t" \ "psubw %%mm2,%%mm6\n\t" \ "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1)",%%mm0\n\t" \ + "movq "OC_I(1,_y)",%%mm0\n\t" \ "paddw %%mm6,%%mm2\n\t" \ "psubw %%mm1,%%mm2\n\t" \ "nop\n\t" \ "#end OC_IDCT_BEGIN_10\n\t" \ /*25+8=33 cycles.*/ -#define OC_ROW_IDCT_10 \ +#define OC_ROW_IDCT_10(_y,_x) \ "#OC_ROW_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10 \ + OC_IDCT_BEGIN_10(_y,_x) \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r4=E'=E-G*/ \ "psubw %%mm7,%%mm4\n\t" \ /*r1=H'+H'*/ \ @@ -426,16 +411,16 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ "psubw %%mm0,%%mm7\n\t" \ "paddw %%mm0,%%mm0\n\t" \ /*Save R1.*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r0=R0=G'+C'*/ \ "paddw %%mm7,%%mm0\n\t" \ "#end OC_ROW_IDCT_10\n\t" \ /*25+19=44 cycles'*/ -#define OC_COLUMN_IDCT_10 \ +#define OC_COLUMN_IDCT_10(_y) \ "#OC_COLUMN_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10 \ - "paddw "OC_8",%%mm2\n\t" \ + OC_IDCT_BEGIN_10(_y,_y) \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ /*r1=H'+H'*/ \ "paddw %%mm1,%%mm1\n\t" \ /*r1=R1=A''+H'*/ \ @@ -447,18 +432,18 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r1=NR1*/ \ "psraw $4,%%mm1\n\t" \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r7=G+G*/ \ "paddw %%mm7,%%mm7\n\t" \ /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2)"\n\t" \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ /*r7=G'=E+G*/ \ "paddw %%mm4,%%mm7\n\t" \ /*Store NR1 at I(1).*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r4=R4=E'-D'*/ \ "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_8",%%mm4\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ /*r3=D'+D'*/ \ "paddw %%mm3,%%mm3\n\t" \ /*r3=R3=E'+D'*/ \ @@ -469,7 +454,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ "psubw %%mm5,%%mm6\n\t" \ /*r3=NR3*/ \ "psraw $4,%%mm3\n\t" \ - "paddw "OC_8",%%mm6\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ /*r5=B''+B''*/ \ "paddw %%mm5,%%mm5\n\t" \ /*r5=R5=F'+B''*/ \ @@ -477,14 +462,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r6=NR6*/ \ "psraw $4,%%mm6\n\t" \ /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4)"\n\t" \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ /*r5=NR5*/ \ "psraw $4,%%mm5\n\t" \ /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3)"\n\t" \ + "movq 
%%mm3,"OC_I(3,_y)"\n\t" \ /*r7=R7=G'-C'*/ \ "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_8",%%mm7\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ /*r0=C'+C'*/ \ "paddw %%mm0,%%mm0\n\t" \ /*r0=R0=G'+C'*/ \ @@ -492,46 +477,55 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r7=NR7*/ \ "psraw $4,%%mm7\n\t" \ /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6)"\n\t" \ + "movq %%mm6,"OC_J(6,_y)"\n\t" \ /*r0=NR0*/ \ "psraw $4,%%mm0\n\t" \ /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5)"\n\t" \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7)"\n\t" \ + "movq %%mm7,"OC_J(7,_y)"\n\t" \ /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "#end OC_COLUMN_IDCT_10\n\t" \ -static void oc_idct8x8_10(ogg_int16_t _y[64]){ +static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ __asm__ __volatile__( -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])" +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) /*Done with dequant, descramble, and partial transpose. Now do the iDCT itself.*/ - OC_ROW_IDCT_10 - OC_TRANSPOSE + OC_ROW_IDCT_10(y,x) + OC_TRANSPOSE(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(y) #undef OC_I #undef OC_J - : - :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS) + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) + ); + __asm__ __volatile__( + "pxor %%mm0,%%mm0\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" + :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28) ); } /*Performs an inverse 8x8 Type-II DCT transform. The input is assumed to be scaled by a factor of 4 relative to orthonormal version of the transform.*/ -void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ /*_last_zzi is subtly different from an actual count of the number of coefficients we decoded for this block. It contains the value of zzi BEFORE the final token in the block was @@ -557,8 +551,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ gets. 
Needless to say we inherited this approach from VP3.*/ /*Then perform the iDCT.*/ - if(_last_zzi<10)oc_idct8x8_10(_y); - else oc_idct8x8_slow(_y); + if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x); + else oc_idct8x8_slow_mmx(_y,_x); } #endif diff --git a/thirdparty/libtheora/x86/mmxloop.h b/thirdparty/libtheora/x86/mmxloop.h index 2e870c795d..1f6090b567 100644 --- a/thirdparty/libtheora/x86/mmxloop.h +++ b/thirdparty/libtheora/x86/mmxloop.h @@ -9,88 +9,191 @@ On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ #define OC_LOOP_FILTER8_MMX \ - "#OC_LOOP_FILTER8_MMX\n\t" \ - /*mm7=0*/ \ - "pxor %%mm7,%%mm7\n\t" \ - /*mm6:mm0={a0,...,a7}*/ \ - "movq %%mm0,%%mm6\n\t" \ - "punpcklbw %%mm7,%%mm0\n\t" \ - "punpckhbw %%mm7,%%mm6\n\t" \ - /*mm3:mm5={d0,...,d7}*/ \ - "movq %%mm3,%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm3\n\t" \ - "punpckhbw %%mm7,%%mm5\n\t" \ - /*mm6:mm0={a0-d0,...,a7-d7}*/ \ - "psubw %%mm3,%%mm0\n\t" \ - "psubw %%mm5,%%mm6\n\t" \ - /*mm3:mm1={b0,...,b7}*/ \ - "movq %%mm1,%%mm3\n\t" \ - "punpcklbw %%mm7,%%mm1\n\t" \ - "movq %%mm2,%%mm4\n\t" \ - "punpckhbw %%mm7,%%mm3\n\t" \ - /*mm5:mm4={c0,...,c7}*/ \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm4\n\t" \ - "punpckhbw %%mm7,%%mm5\n\t" \ - /*mm7={3}x4 \ - mm5:mm4={c0-b0,...,c7-b7}*/ \ - "pcmpeqw %%mm7,%%mm7\n\t" \ - "psubw %%mm1,%%mm4\n\t" \ - "psrlw $14,%%mm7\n\t" \ - "psubw %%mm3,%%mm5\n\t" \ - /*Scale by 3.*/ \ - "pmullw %%mm7,%%mm4\n\t" \ - "pmullw %%mm7,%%mm5\n\t" \ - /*mm7={4}x4 \ - mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ - "psrlw $1,%%mm7\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "psllw $2,%%mm7\n\t" \ - "movq (%[ll]),%%mm0\n\t" \ - "paddw %%mm6,%%mm5\n\t" \ - /*R_i has the range [-127,128], so we compute -R_i instead. \ - mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ - "psubw %%mm7,%%mm4\n\t" \ - "psubw %%mm7,%%mm5\n\t" \ - "psraw $3,%%mm4\n\t" \ - "psraw $3,%%mm5\n\t" \ - "pcmpeqb %%mm7,%%mm7\n\t" \ - "packsswb %%mm5,%%mm4\n\t" \ - "pxor %%mm6,%%mm6\n\t" \ - "pxor %%mm7,%%mm4\n\t" \ - "packuswb %%mm3,%%mm1\n\t" \ - /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \ - /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ - we have to split things by sign (the other option is to work in 16 bits, \ - but working in 8 bits gives much better parallelism). \ - We compute abs(R_i), but save a mask of which terms were negative in mm6. \ - Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). 
\ - Finally, we split mm4 into positive and negative pieces using the mask in \ - mm6, and add and subtract them as appropriate.*/ \ - /*mm4=abs(-R_i)*/ \ - /*mm7=255-2*L*/ \ - "pcmpgtb %%mm4,%%mm6\n\t" \ - "psubb %%mm0,%%mm7\n\t" \ - "pxor %%mm6,%%mm4\n\t" \ - "psubb %%mm0,%%mm7\n\t" \ - "psubb %%mm6,%%mm4\n\t" \ - /*mm7=255-max(2*L-abs(R_i),0)*/ \ - "paddusb %%mm4,%%mm7\n\t" \ - /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ - "paddusb %%mm7,%%mm4\n\t" \ - "psubusb %%mm7,%%mm4\n\t" \ - /*Now split mm4 by the original sign of -R_i.*/ \ - "movq %%mm4,%%mm5\n\t" \ - "pand %%mm6,%%mm4\n\t" \ - "pandn %%mm5,%%mm6\n\t" \ - /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ - /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ - "paddusb %%mm4,%%mm1\n\t" \ - "psubusb %%mm4,%%mm2\n\t" \ - "psubusb %%mm6,%%mm1\n\t" \ - "paddusb %%mm6,%%mm2\n\t" \ + "#OC_LOOP_FILTER8_MMX\n\t" \ + /*mm7=0*/ \ + "pxor %%mm7,%%mm7\n\t" \ + /*mm6:mm0={a0,...,a7}*/ \ + "movq %%mm0,%%mm6\n\t" \ + "punpcklbw %%mm7,%%mm0\n\t" \ + "punpckhbw %%mm7,%%mm6\n\t" \ + /*mm3:mm5={d0,...,d7}*/ \ + "movq %%mm3,%%mm5\n\t" \ + "punpcklbw %%mm7,%%mm3\n\t" \ + "punpckhbw %%mm7,%%mm5\n\t" \ + /*mm6:mm0={a0-d0,...,a7-d7}*/ \ + "psubw %%mm3,%%mm0\n\t" \ + "psubw %%mm5,%%mm6\n\t" \ + /*mm3:mm1={b0,...,b7}*/ \ + "movq %%mm1,%%mm3\n\t" \ + "punpcklbw %%mm7,%%mm1\n\t" \ + "movq %%mm2,%%mm4\n\t" \ + "punpckhbw %%mm7,%%mm3\n\t" \ + /*mm5:mm4={c0,...,c7}*/ \ + "movq %%mm2,%%mm5\n\t" \ + "punpcklbw %%mm7,%%mm4\n\t" \ + "punpckhbw %%mm7,%%mm5\n\t" \ + /*mm7={3}x4 \ + mm5:mm4={c0-b0,...,c7-b7}*/ \ + "pcmpeqw %%mm7,%%mm7\n\t" \ + "psubw %%mm1,%%mm4\n\t" \ + "psrlw $14,%%mm7\n\t" \ + "psubw %%mm3,%%mm5\n\t" \ + /*Scale by 3.*/ \ + "pmullw %%mm7,%%mm4\n\t" \ + "pmullw %%mm7,%%mm5\n\t" \ + /*mm7={4}x4 \ + mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ + "psrlw $1,%%mm7\n\t" \ + "paddw %%mm0,%%mm4\n\t" \ + "psllw $2,%%mm7\n\t" \ + "movq (%[ll]),%%mm0\n\t" \ + "paddw %%mm6,%%mm5\n\t" \ + /*R_i has the range [-127,128], so we compute -R_i instead. \ + mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ + "psubw %%mm7,%%mm4\n\t" \ + "psubw %%mm7,%%mm5\n\t" \ + "psraw $3,%%mm4\n\t" \ + "psraw $3,%%mm5\n\t" \ + "pcmpeqb %%mm7,%%mm7\n\t" \ + "packsswb %%mm5,%%mm4\n\t" \ + "pxor %%mm6,%%mm6\n\t" \ + "pxor %%mm7,%%mm4\n\t" \ + "packuswb %%mm3,%%mm1\n\t" \ + /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \ + /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ + we have to split things by sign (the other option is to work in 16 bits, \ + but working in 8 bits gives much better parallelism). \ + We compute abs(R_i), but save a mask of which terms were negative in mm6. \ + Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). 
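The clamp these comments describe is lflim() from Section 7.10 of the Theora specification ("sepc" is an upstream typo for "spec"). As a scalar sketch of what one lane of OC_LOOP_FILTER8_MMX computes, with hypothetical helper names and assuming arithmetic right shift on negative ints (which is what psraw provides):

/*One lane of the loop filter: R=(a-d+3*(c-b)+4>>3) is limited by
   lflim(R,L)=sign(R)*min(|R|,max(2*L-|R|,0)) and then added to b and
   subtracted from c with unsigned saturation, as the trailing
   paddusb/psubusb pairs do for all eight lanes at once.*/
static int oc_lflim_scalar(int _r,int _l){
  int mag;
  int lim;
  mag=_r<0?-_r:_r;
  lim=2*_l-mag;
  if(lim<0)lim=0;
  if(mag>lim)mag=lim;
  return _r<0?-mag:mag;
}

static void oc_filter_lane_scalar(unsigned char *_b,unsigned char *_c,
 int _a,int _d,int _l){
  int r;
  int bb;
  int cc;
  r=oc_lflim_scalar(_a-_d+3*(*_c-*_b)+4>>3,_l);
  bb=*_b+r;
  cc=*_c-r;
  *_b=(unsigned char)(bb<0?0:bb>255?255:bb);
  *_c=(unsigned char)(cc<0?0:cc>255?255:cc);
}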
\ + Finally, we split mm4 into positive and negative pieces using the mask in \ + mm6, and add and subtract them as appropriate.*/ \ + /*mm4=abs(-R_i)*/ \ + /*mm7=255-2*L*/ \ + "pcmpgtb %%mm4,%%mm6\n\t" \ + "psubb %%mm0,%%mm7\n\t" \ + "pxor %%mm6,%%mm4\n\t" \ + "psubb %%mm0,%%mm7\n\t" \ + "psubb %%mm6,%%mm4\n\t" \ + /*mm7=255-max(2*L-abs(R_i),0)*/ \ + "paddusb %%mm4,%%mm7\n\t" \ + /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ + "paddusb %%mm7,%%mm4\n\t" \ + "psubusb %%mm7,%%mm4\n\t" \ + /*Now split mm4 by the original sign of -R_i.*/ \ + "movq %%mm4,%%mm5\n\t" \ + "pand %%mm6,%%mm4\n\t" \ + "pandn %%mm5,%%mm6\n\t" \ + /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ + /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ + "paddusb %%mm4,%%mm1\n\t" \ + "psubusb %%mm4,%%mm2\n\t" \ + "psubusb %%mm6,%%mm1\n\t" \ + "paddusb %%mm6,%%mm2\n\t" \ -#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \ +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. + On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and + mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}. + All other MMX registers are clobbered.*/ +#define OC_LOOP_FILTER8_MMXEXT \ + "#OC_LOOP_FILTER8_MMXEXT\n\t" \ + /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \ + -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \ + /*This first part is based on the transformation \ + f = -(3*(c-b)+a-d+4>>3) \ + = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \ + = -(3*(c+~b)+(a+~d)-1016>>3) \ + = 127-(3*(c+~b)+(a+~d)>>3) \ + = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \ + Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \ + fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \ + Using this, the last expression above can be computed in 8 bits of working \ + precision via: \ + u = ~pavgb(~b,c); \ + v = pavgb(b,~c); \ + This mask is 0 or 0xFF, and controls whether t is biased up or down: \ + m = u-v; \ + t = m^pavgb(m^~a,m^d); \ + f = 128+pavgb(pavgb(t,u),v); \ + This required some careful analysis to ensure that carries are propagated \ + correctly in all cases, but has been checked exhaustively.*/ \ + /*input (a, b, c, d, ., ., ., .)*/ \ + /*ff=0xFF; \ + u=b; \ + v=c; \ + ll=255-2*L;*/ \ + "pcmpeqb %%mm7,%%mm7\n\t" \ + "movq %%mm1,%%mm4\n\t" \ + "movq %%mm2,%%mm5\n\t" \ + "movq (%[ll]),%%mm6\n\t" \ + /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \ + /*u^=ff; \ + v^=ff;*/ \ + "pxor %%mm7,%%mm4\n\t" \ + "pxor %%mm7,%%mm5\n\t" \ + /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \ + /*u=pavgb(u,c); \ + v=pavgb(v,b);*/ \ + "pavgb %%mm2,%%mm4\n\t" \ + "pavgb %%mm1,%%mm5\n\t" \ + /*u^=ff; \ + a^=ff;*/ \ + "pxor %%mm7,%%mm4\n\t" \ + "pxor %%mm7,%%mm0\n\t" \ + /*m=u-v;*/ \ + "psubb %%mm5,%%mm4\n\t" \ + /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \ + /*a^=m; \ + d^=m;*/ \ + "pxor %%mm4,%%mm0\n\t" \ + "pxor %%mm4,%%mm3\n\t" \ + /*t=pavgb(a,d);*/ \ + "pavgb %%mm3,%%mm0\n\t" \ + "psllw $7,%%mm7\n\t" \ + /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \ + /*t^=m; \ + u=m+v;*/ \ + "pxor %%mm4,%%mm0\n\t" \ + "paddb %%mm5,%%mm4\n\t" \ + /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \ + /*f=pavgb(f,u); \ + of=128;*/ \ + "pavgb %%mm4,%%mm0\n\t" \ + "packsswb %%mm7,%%mm7\n\t" \ + /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \ + /*f=pavgb(f,v);*/ \ + "pavgb %%mm5,%%mm0\n\t" \ + "movq %%mm7,%%mm3\n\t" \ + "movq %%mm6,%%mm4\n\t" \ + /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \ + /*Now compute lflim of R_i=-(128+mm0) cf. 
Section 7.10 of the sepc.*/ \ + /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ + we have to split things by sign (the other option is to work in 16 bits, \ + but staying in 8 bits gives much better parallelism).*/ \ + /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \ + This is the same number of instructions as computing a mask and splitting \ + after the lflim computation, but has shorter dependency chains.*/ \ + /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\ + mm3=R_i>0?R_i:0* (denoted abs(R_i>0))*/ \ + "psubusb %%mm0,%%mm3\n\t" \ + "psubusb %%mm7,%%mm0\n\t" \ + /*mm6=255-max(2*L-abs(R_i<0),0) \ + mm4=255-max(2*L-abs(R_i>0),0)*/ \ + "paddusb %%mm3,%%mm4\n\t" \ + "paddusb %%mm0,%%mm6\n\t" \ + /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \ + mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \ + "paddusb %%mm4,%%mm3\n\t" \ + "paddusb %%mm6,%%mm0\n\t" \ + "psubusb %%mm4,%%mm3\n\t" \ + "psubusb %%mm6,%%mm0\n\t" \ + /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ + /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ + "paddusb %%mm3,%%mm1\n\t" \ + "psubusb %%mm3,%%mm2\n\t" \ + "psubusb %%mm0,%%mm1\n\t" \ + "paddusb %%mm0,%%mm2\n\t" \ + +#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \ do{ \ ptrdiff_t ystride3__; \ __asm__ __volatile__( \ @@ -104,7 +207,7 @@ "movq (%[pix],%[ystride]),%%mm1\n\t" \ /*mm2={c0,...,c7}*/ \ "movq (%[pix],%[ystride],2),%%mm2\n\t" \ - OC_LOOP_FILTER8_MMX \ + _filter \ /*Write it back out.*/ \ "movq %%mm1,(%[pix],%[ystride])\n\t" \ "movq %%mm2,(%[pix],%[ystride],2)\n\t" \ @@ -116,7 +219,7 @@ } \ while(0) -#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \ +#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \ do{ \ unsigned char *pix__; \ ptrdiff_t ystride3__; \ @@ -174,7 +277,7 @@ "punpckldq %%mm5,%%mm2\n\t" \ /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ "punpckhdq %%mm5,%%mm3\n\t" \ - OC_LOOP_FILTER8_MMX \ + _filter \ /*mm2={b0+R_0'',...,b7+R_7''}*/ \ "movq %%mm1,%%mm0\n\t" \ /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ diff --git a/thirdparty/libtheora/x86/mmxstate.c b/thirdparty/libtheora/x86/mmxstate.c index 808b0a789b..eebea14fba 100644 --- a/thirdparty/libtheora/x86/mmxstate.c +++ b/thirdparty/libtheora/x86/mmxstate.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -19,23 +19,23 @@ Originally written by Rudolf Marek.*/ #include <string.h> #include "x86int.h" -#include "mmxfrag.h" #include "mmxloop.h" #if defined(OC_X86_ASM) void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){ + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ unsigned char *dst; ptrdiff_t frag_buf_off; int ystride; - int mb_mode; + int refi; /*Apply the inverse transform.*/ /*Special case only having a DC component.*/ if(_last_zzi<2){ /*Note that this value must be unsigned, to keep the __asm__ block from sign-extending it when it puts it in a register.*/ ogg_uint16_t p; + int i; /*We round this dequant product (and not any of the others) because there's no iDCT rounding.*/ p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); @@ -47,81 +47,48 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, "punpcklwd %%mm0,%%mm0\n\t" /*mm0=AAAA AAAA AAAA AAAA*/ "punpckldq %%mm0,%%mm0\n\t" - "movq 
%%mm0,(%[y])\n\t" - "movq %%mm0,8(%[y])\n\t" - "movq %%mm0,16(%[y])\n\t" - "movq %%mm0,24(%[y])\n\t" - "movq %%mm0,32(%[y])\n\t" - "movq %%mm0,40(%[y])\n\t" - "movq %%mm0,48(%[y])\n\t" - "movq %%mm0,56(%[y])\n\t" - "movq %%mm0,64(%[y])\n\t" - "movq %%mm0,72(%[y])\n\t" - "movq %%mm0,80(%[y])\n\t" - "movq %%mm0,88(%[y])\n\t" - "movq %%mm0,96(%[y])\n\t" - "movq %%mm0,104(%[y])\n\t" - "movq %%mm0,112(%[y])\n\t" - "movq %%mm0,120(%[y])\n\t" : - :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p) - :"memory" + :[p]"r"((unsigned)p) ); + for(i=0;i<4;i++){ + __asm__ __volatile__( + "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t" + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16) + ); + } } else{ /*Dequantize the DC coefficient.*/ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8_mmx(_dct_coeffs,_last_zzi); + oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); } /*Fill in the target buffer.*/ frag_buf_off=_state->frag_buf_offs[_fragi]; - mb_mode=_state->frags[_fragi].mb_mode; + refi=_state->frags[_fragi].refi; ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off; - if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs); + dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; + if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); else{ const unsigned char *ref; int mvoffsets[2]; - ref= - _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]] - +frag_buf_off; + ref=_state->ref_frame_data[refi]+frag_buf_off; if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){ + _state->frag_mvs[_fragi])>1){ oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs); + _dct_coeffs+64); } - else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs); + else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); } } /*We copy these entire function to inline the actual MMX routines so that we use only a single indirect call.*/ -/*Copies the fragments specified by the lists of fragment indices from one - frame to another. - _fragis: A pointer to a list of fragment indices. - _nfragis: The number of fragment indices to copy. - _dst_frame: The reference frame to copy to. - _src_frame: The reference frame to copy from. - _pli: The color plane the fragments lie in.*/ -void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli){ - const ptrdiff_t *frag_buf_offs; - const unsigned char *src_frame_data; - unsigned char *dst_frame_data; - ptrdiff_t fragii; - int ystride; - dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]]; - src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]]; - ystride=_state->ref_ystride[_pli]; - frag_buf_offs=_state->frag_buf_offs; - for(fragii=0;fragii<_nfragis;fragii++){ - ptrdiff_t frag_buf_off; - frag_buf_off=frag_buf_offs[_fragis[fragii]]; - OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off, - src_frame_data+frag_buf_off,ystride); - } +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ + memset(_bv,_flimit,8); } /*Apply the loop filter to a given set of fragment rows in the given plane. 
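In scalar form, the DC-only special case above amounts to the following sketch (hypothetical name, assuming the usual ogg integer typedefs; the real routine writes into _dct_coeffs+64 with four MMX stores per 16 coefficients):

/*When at most the DC coefficient was coded, every residual sample is the
   same rounded dequant product, so the iDCT is skipped and the value is
   simply replicated; as noted above, this is the only dequant product that
   gets rounded, since there is no iDCT rounding to absorb the error.*/
static void oc_dc_only_block(ogg_int16_t _y[64],
 ogg_int16_t _dc_coeff,ogg_uint16_t _dc_quant){
  int         i;
  ogg_int16_t p;
  p=(ogg_int16_t)(_dc_coeff*(ogg_int32_t)_dc_quant+15>>5);
  for(i=0;i<64;i++)_y[i]=p;
}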
@@ -133,7 +100,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, _fragy0: The Y coordinate of the first fragment row to filter. _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ OC_ALIGN8(unsigned char ll[8]); const oc_fragment_plane *fplane; const oc_fragment *frags; @@ -170,13 +137,84 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, if(frags[fragi].coded){ unsigned char *ref; ref=ref_frame_data+frag_buf_offs[fragi]; - if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll); - if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll); + if(fragi>fragi0){ + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll); + } + if(fragi0>fragi_top){ + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll); + } + if(fragi+1<fragi_end&&!frags[fragi+1].coded){ + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll); + } + if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll); + } + } + fragi++; + } + fragi0+=nhfrags; + } +} + +void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){ + memset(_bv,~(_flimit<<1),8); +} + +/*Apply the loop filter to a given set of fragment rows in the given plane. + The filter may be run on the bottom edge, affecting pixels in the next row of + fragments, so this row also needs to be available. + _bv: The bounding values array. + _refi: The index of the frame buffer to filter. + _pli: The color plane to filter. + _fragy0: The Y coordinate of the first fragment row to filter. + _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ +void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ + const oc_fragment_plane *fplane; + const oc_fragment *frags; + const ptrdiff_t *frag_buf_offs; + unsigned char *ref_frame_data; + ptrdiff_t fragi_top; + ptrdiff_t fragi_bot; + ptrdiff_t fragi0; + ptrdiff_t fragi0_end; + int ystride; + int nhfrags; + fplane=_state->fplanes+_pli; + nhfrags=fplane->nhfrags; + fragi_top=fplane->froffset; + fragi_bot=fragi_top+fplane->nfrags; + fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; + fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; + ystride=_state->ref_ystride[_pli]; + frags=_state->frags; + frag_buf_offs=_state->frag_buf_offs; + ref_frame_data=_state->ref_frame_data[_refi]; + /*The following loops are constructed somewhat non-intuitively on purpose. + The main idea is: if a block boundary has at least one coded fragment on + it, the filter is applied to it. 
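OC_LOOP_FILTER8_MMXEXT above leans on the rounding identity ~pavgb(~a,~b)=(a+b>>1) cited in its comments; a standalone scalar check of that claim (hypothetical names):

/*pavgb computes the average rounded up, (a+b+1>>1), so on complemented
   8-bit inputs the complement of the result is the average rounded down.
   Exhaustive check over all byte pairs.*/
#include <stdio.h>
static unsigned pavgb_model(unsigned _a,unsigned _b){
  return _a+_b+1>>1;
}
int main(void){
  unsigned a;
  unsigned b;
  for(a=0;a<256;a++)for(b=0;b<256;b++){
    if((~pavgb_model(~a&0xFF,~b&0xFF)&0xFF)!=(a+b>>1)){
      printf("counterexample: a=%u b=%u\n",a,b);
      return 1;
    }
  }
  return 0;
}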
+ However, the order that the filters are applied in matters, and VP3 chose + the somewhat strange ordering used below.*/ + while(fragi0<fragi0_end){ + ptrdiff_t fragi; + ptrdiff_t fragi_end; + fragi=fragi0; + fragi_end=fragi+nhfrags; + while(fragi<fragi_end){ + if(frags[fragi].coded){ + unsigned char *ref; + ref=ref_frame_data+frag_buf_offs[fragi]; + if(fragi>fragi0){ + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); + } + if(fragi0>fragi_top){ + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); + } if(fragi+1<fragi_end&&!frags[fragi+1].coded){ - OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll); + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv); } if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ - OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll); + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv); } } fragi++; diff --git a/thirdparty/libtheora/x86/sse2encfrag.c b/thirdparty/libtheora/x86/sse2encfrag.c new file mode 100644 index 0000000000..43aeb17711 --- /dev/null +++ b/thirdparty/libtheora/x86/sse2encfrag.c @@ -0,0 +1,501 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ + + ********************************************************************/ +#include <stddef.h> +#include "x86enc.h" +#include "sse2trans.h" + +#if defined(OC_X86_ASM) + +/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their + 16-bit differences. + On output, these are stored in _m0, xmm1, xmm2, and xmm3. + xmm4 and xmm5 are clobbered.*/ +#define OC_LOAD_SUB_4x8(_m0) \ + "#OC_LOAD_SUB_4x8\n\t" \ + /*Load the first three rows.*/ \ + "movq (%[src]),"_m0"\n\t" \ + "movq (%[ref]),%%xmm4\n\t" \ + "movq (%[src],%[ystride]),%%xmm1\n\t" \ + "movq (%[ref],%[ystride]),%%xmm3\n\t" \ + "movq (%[src],%[ystride],2),%%xmm2\n\t" \ + "movq (%[ref],%[ystride],2),%%xmm5\n\t" \ + /*Unpack and subtract.*/ \ + "punpcklbw %%xmm4,"_m0"\n\t" \ + "punpcklbw %%xmm4,%%xmm4\n\t" \ + "punpcklbw %%xmm3,%%xmm1\n\t" \ + "punpcklbw %%xmm3,%%xmm3\n\t" \ + "psubw %%xmm4,"_m0"\n\t" \ + "psubw %%xmm3,%%xmm1\n\t" \ + /*Load the last row.*/ \ + "movq (%[src],%[ystride3]),%%xmm3\n\t" \ + "movq (%[ref],%[ystride3]),%%xmm4\n\t" \ + /*Unpack, subtract, and advance the pointers.*/ \ + "punpcklbw %%xmm5,%%xmm2\n\t" \ + "punpcklbw %%xmm5,%%xmm5\n\t" \ + "lea (%[src],%[ystride],4),%[src]\n\t" \ + "psubw %%xmm5,%%xmm2\n\t" \ + "punpcklbw %%xmm4,%%xmm3\n\t" \ + "punpcklbw %%xmm4,%%xmm4\n\t" \ + "lea (%[ref],%[ystride],4),%[ref]\n\t" \ + "psubw %%xmm4,%%xmm3\n\t" \ + +/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3. 
+ On output, xmm0 contains the sum of two of the rows, and the other two are + added to xmm7.*/ +#define OC_SSD_4x8(_m0) \ + "pmaddwd "_m0","_m0"\n\t" \ + "pmaddwd %%xmm1,%%xmm1\n\t" \ + "pmaddwd %%xmm2,%%xmm2\n\t" \ + "pmaddwd %%xmm3,%%xmm3\n\t" \ + "paddd %%xmm1,"_m0"\n\t" \ + "paddd %%xmm3,%%xmm2\n\t" \ + "paddd %%xmm2,%%xmm7\n\t" \ + +unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src, + const unsigned char *_ref,int _ystride){ + unsigned ret; + __asm__ __volatile__( + OC_LOAD_SUB_4x8("%%xmm7") + OC_SSD_4x8("%%xmm7") + OC_LOAD_SUB_4x8("%%xmm0") + OC_SSD_4x8("%%xmm0") + "paddd %%xmm0,%%xmm7\n\t" + "movdqa %%xmm7,%%xmm6\n\t" + "punpckhqdq %%xmm7,%%xmm7\n\t" + "paddd %%xmm6,%%xmm7\n\t" + "pshufd $1,%%xmm7,%%xmm6\n\t" + "paddd %%xmm6,%%xmm7\n\t" + "movd %%xmm7,%[ret]\n\t" + :[ret]"=a"(ret) + :[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride), + [ystride3]"r"((ptrdiff_t)_ystride*3) + ); + return ret; +} + +static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={ + 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 +}; + +/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their + horizontal sums as well as their 16-bit differences subject to a mask. + %%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/ +#define OC_LOAD_SUB_MASK_2x8 \ + "#OC_LOAD_SUB_MASK_2x8\n\t" \ + /*Start the loads and expand the next 8 bits of the mask.*/ \ + "shl $8,%[m]\n\t" \ + "movq (%[src]),%%xmm0\n\t" \ + "mov %h[m],%b[m]\n\t" \ + "movq (%[ref]),%%xmm2\n\t" \ + "movd %[m],%%xmm4\n\t" \ + "shr $8,%[m]\n\t" \ + "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \ + "mov %h[m],%b[m]\n\t" \ + "pand %%xmm6,%%xmm4\n\t" \ + "pcmpeqb %%xmm6,%%xmm4\n\t" \ + /*Perform the masking.*/ \ + "pand %%xmm4,%%xmm0\n\t" \ + "pand %%xmm4,%%xmm2\n\t" \ + /*Finish the loads while unpacking the first set of rows, and expand the next + 8 bits of the mask.*/ \ + "movd %[m],%%xmm4\n\t" \ + "movq (%[src],%[ystride]),%%xmm1\n\t" \ + "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \ + "movq (%[ref],%[ystride]),%%xmm3\n\t" \ + "pand %%xmm6,%%xmm4\n\t" \ + "punpcklbw %%xmm2,%%xmm0\n\t" \ + "pcmpeqb %%xmm6,%%xmm4\n\t" \ + "punpcklbw %%xmm2,%%xmm2\n\t" \ + /*Mask and unpack the second set of rows.*/ \ + "pand %%xmm4,%%xmm1\n\t" \ + "pand %%xmm4,%%xmm3\n\t" \ + "punpcklbw %%xmm3,%%xmm1\n\t" \ + "punpcklbw %%xmm3,%%xmm3\n\t" \ + "psubw %%xmm2,%%xmm0\n\t" \ + "psubw %%xmm3,%%xmm1\n\t" \ + +unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src, + const unsigned char *_ref,int _ystride,ogg_int64_t _mask){ + ptrdiff_t ystride; + unsigned ret; + int i; + ystride=_ystride; + __asm__ __volatile__( + "pxor %%xmm7,%%xmm7\n\t" + "movq %[c],%%xmm6\n\t" + : + :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8)) + ); + for(i=0;i<4;i++){ + unsigned m; + m=_mask&0xFFFF; + _mask>>=16; + if(m){ + __asm__ __volatile__( + OC_LOAD_SUB_MASK_2x8 + "pmaddwd %%xmm0,%%xmm0\n\t" + "pmaddwd %%xmm1,%%xmm1\n\t" + "paddd %%xmm0,%%xmm7\n\t" + "paddd %%xmm1,%%xmm7\n\t" + :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m) + ); + } + _src+=2*ystride; + _ref+=2*ystride; + } + __asm__ __volatile__( + "movdqa %%xmm7,%%xmm6\n\t" + "punpckhqdq %%xmm7,%%xmm7\n\t" + "paddd %%xmm6,%%xmm7\n\t" + "pshufd $1,%%xmm7,%%xmm6\n\t" + "paddd %%xmm6,%%xmm7\n\t" + "movd %%xmm7,%[ret]\n\t" + :[ret]"=a"(ret) + ); + return ret; +} + + +/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their + 16-bit difference in %%xmm0...%%xmm7.*/ +#define OC_LOAD_SUB_8x8 \ + "#OC_LOAD_SUB_8x8\n\t" \ + "movq (%[src]),%%xmm0\n\t" 
\ + "movq (%[ref]),%%xmm4\n\t" \ + "movq (%[src],%[src_ystride]),%%xmm1\n\t" \ + "lea (%[src],%[src_ystride],2),%[src]\n\t" \ + "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \ + "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ + "movq (%[src]),%%xmm2\n\t" \ + "movq (%[ref]),%%xmm7\n\t" \ + "movq (%[src],%[src_ystride]),%%xmm3\n\t" \ + "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \ + "punpcklbw %%xmm4,%%xmm0\n\t" \ + "lea (%[src],%[src_ystride],2),%[src]\n\t" \ + "punpcklbw %%xmm4,%%xmm4\n\t" \ + "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ + "psubw %%xmm4,%%xmm0\n\t" \ + "movq (%[src]),%%xmm4\n\t" \ + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + "movq (%[ref]),%%xmm0\n\t" \ + "punpcklbw %%xmm5,%%xmm1\n\t" \ + "punpcklbw %%xmm5,%%xmm5\n\t" \ + "psubw %%xmm5,%%xmm1\n\t" \ + "movq (%[src],%[src_ystride]),%%xmm5\n\t" \ + "punpcklbw %%xmm7,%%xmm2\n\t" \ + "punpcklbw %%xmm7,%%xmm7\n\t" \ + "psubw %%xmm7,%%xmm2\n\t" \ + "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \ + "punpcklbw %%xmm6,%%xmm3\n\t" \ + "lea (%[src],%[src_ystride],2),%[src]\n\t" \ + "punpcklbw %%xmm6,%%xmm6\n\t" \ + "psubw %%xmm6,%%xmm3\n\t" \ + "movq (%[src]),%%xmm6\n\t" \ + "punpcklbw %%xmm0,%%xmm4\n\t" \ + "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ + "punpcklbw %%xmm0,%%xmm0\n\t" \ + "lea (%[src],%[src_ystride],2),%[src]\n\t" \ + "psubw %%xmm0,%%xmm4\n\t" \ + "movq (%[ref]),%%xmm0\n\t" \ + "punpcklbw %%xmm7,%%xmm5\n\t" \ + "neg %[src_ystride]\n\t" \ + "punpcklbw %%xmm7,%%xmm7\n\t" \ + "psubw %%xmm7,%%xmm5\n\t" \ + "movq (%[src],%[src_ystride]),%%xmm7\n\t" \ + "punpcklbw %%xmm0,%%xmm6\n\t" \ + "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ + "punpcklbw %%xmm0,%%xmm0\n\t" \ + "neg %[ref_ystride]\n\t" \ + "psubw %%xmm0,%%xmm6\n\t" \ + "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \ + "punpcklbw %%xmm0,%%xmm7\n\t" \ + "punpcklbw %%xmm0,%%xmm0\n\t" \ + "psubw %%xmm0,%%xmm7\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ + +/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/ +#define OC_LOAD_8x8 \ + "#OC_LOAD_8x8\n\t" \ + "movq (%[src]),%%xmm0\n\t" \ + "movq (%[src],%[ystride]),%%xmm1\n\t" \ + "movq (%[src],%[ystride],2),%%xmm2\n\t" \ + "pxor %%xmm7,%%xmm7\n\t" \ + "movq (%[src],%[ystride3]),%%xmm3\n\t" \ + "punpcklbw %%xmm7,%%xmm0\n\t" \ + "movq (%[src4]),%%xmm4\n\t" \ + "punpcklbw %%xmm7,%%xmm1\n\t" \ + "movq (%[src4],%[ystride]),%%xmm5\n\t" \ + "punpcklbw %%xmm7,%%xmm2\n\t" \ + "movq (%[src4],%[ystride],2),%%xmm6\n\t" \ + "punpcklbw %%xmm7,%%xmm3\n\t" \ + "movq (%[src4],%[ystride3]),%%xmm7\n\t" \ + "punpcklbw %%xmm4,%%xmm4\n\t" \ + "punpcklbw %%xmm5,%%xmm5\n\t" \ + "psrlw $8,%%xmm4\n\t" \ + "psrlw $8,%%xmm5\n\t" \ + "punpcklbw %%xmm6,%%xmm6\n\t" \ + "punpcklbw %%xmm7,%%xmm7\n\t" \ + "psrlw $8,%%xmm6\n\t" \ + "psrlw $8,%%xmm7\n\t" \ + +/*Performs the first two stages of an 8-point 1-D Hadamard transform in place. 
+ Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to + perform this stage in place with no temporary registers).*/ +#define OC_HADAMARD_AB_8x8 \ + "#OC_HADAMARD_AB_8x8\n\t" \ + /*Stage A:*/ \ + "paddw %%xmm5,%%xmm1\n\t" \ + "paddw %%xmm6,%%xmm2\n\t" \ + "paddw %%xmm5,%%xmm5\n\t" \ + "paddw %%xmm6,%%xmm6\n\t" \ + "psubw %%xmm1,%%xmm5\n\t" \ + "psubw %%xmm2,%%xmm6\n\t" \ + "paddw %%xmm7,%%xmm3\n\t" \ + "paddw %%xmm4,%%xmm0\n\t" \ + "paddw %%xmm7,%%xmm7\n\t" \ + "paddw %%xmm4,%%xmm4\n\t" \ + "psubw %%xmm3,%%xmm7\n\t" \ + "psubw %%xmm0,%%xmm4\n\t" \ + /*Stage B:*/ \ + "paddw %%xmm2,%%xmm0\n\t" \ + "paddw %%xmm3,%%xmm1\n\t" \ + "paddw %%xmm6,%%xmm4\n\t" \ + "paddw %%xmm7,%%xmm5\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm3,%%xmm3\n\t" \ + "paddw %%xmm6,%%xmm6\n\t" \ + "paddw %%xmm7,%%xmm7\n\t" \ + "psubw %%xmm0,%%xmm2\n\t" \ + "psubw %%xmm1,%%xmm3\n\t" \ + "psubw %%xmm4,%%xmm6\n\t" \ + "psubw %%xmm5,%%xmm7\n\t" \ + +/*Performs the last stage of an 8-point 1-D Hadamard transform in place. + Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in + place with no temporary registers).*/ +#define OC_HADAMARD_C_8x8 \ + "#OC_HADAMARD_C_8x8\n\t" \ + /*Stage C:*/ \ + "paddw %%xmm1,%%xmm0\n\t" \ + "paddw %%xmm3,%%xmm2\n\t" \ + "paddw %%xmm5,%%xmm4\n\t" \ + "paddw %%xmm7,%%xmm6\n\t" \ + "paddw %%xmm1,%%xmm1\n\t" \ + "paddw %%xmm3,%%xmm3\n\t" \ + "paddw %%xmm5,%%xmm5\n\t" \ + "paddw %%xmm7,%%xmm7\n\t" \ + "psubw %%xmm0,%%xmm1\n\t" \ + "psubw %%xmm2,%%xmm3\n\t" \ + "psubw %%xmm4,%%xmm5\n\t" \ + "psubw %%xmm6,%%xmm7\n\t" \ + +/*Performs an 8-point 1-D Hadamard transform in place. + Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform + in place with no temporary registers).*/ +#define OC_HADAMARD_8x8 \ + OC_HADAMARD_AB_8x8 \ + OC_HADAMARD_C_8x8 \ + +/*Performs the first part of the final stage of the Hadamard transform and + summing of absolute values. + At the end of this part, %%xmm1 will contain the DC coefficient of the + transform.*/ +#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \ + /*We use the fact that \ + (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ + to merge the final butterfly with the abs and the first stage of \ + accumulation. \ + Thus we can avoid using pabsw, which is not available until SSSE3. \ + Emulating pabsw takes 3 instructions, so the straightforward SSE2 \ + implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ + registers). \ + Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). 
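The fusion described in the comments above rests on the identity (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)); restated as scalar C (assuming <stdlib.h>):

/*(|a+b|+|a-b|)/2==max(|a|,|b|): summing the magnitudes of a butterfly's
   two outputs and halving recovers the larger input magnitude, which is
   what lets the final butterfly, the absolute values, and the first
   accumulation stage be merged without pabsw.*/
#include <stdlib.h>
static int oc_max_abs_model(int _a,int _b){
  return (abs(_a+_b)+abs(_a-_b))/2;
}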
\ + This implementation is only 26 (+4 for spilling registers).*/ \ + "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \ + "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*xmm7={0x7FFF}x4 \ + xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \ + "pcmpeqb %%xmm7,%%xmm7\n\t" \ + "movdqa %%xmm4,%%xmm6\n\t" \ + "psrlw $1,%%xmm7\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "pmaxsw %%xmm5,%%xmm4\n\t" \ + "paddsw %%xmm7,%%xmm6\n\t" \ + "psubw %%xmm6,%%xmm4\n\t" \ + /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \ + xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \ + "movdqa %%xmm2,%%xmm6\n\t" \ + "movdqa %%xmm0,%%xmm5\n\t" \ + "pmaxsw %%xmm3,%%xmm2\n\t" \ + "pmaxsw %%xmm1,%%xmm0\n\t" \ + "paddw %%xmm3,%%xmm6\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \ + "paddw %%xmm5,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \ + +/*Performs the second part of the final stage of the Hadamard transform and + summing of absolute values.*/ +#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \ + "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \ + "paddsw %%xmm7,%%xmm6\n\t" \ + "paddsw %%xmm7,%%xmm1\n\t" \ + "psubw %%xmm6,%%xmm2\n\t" \ + "psubw %%xmm1,%%xmm0\n\t" \ + /*xmm7={1}x4 (needed for the horizontal add that follows) \ + xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \ + "movdqa %%xmm3,%%xmm6\n\t" \ + "pmaxsw %%xmm5,%%xmm3\n\t" \ + "paddw %%xmm2,%%xmm0\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "paddw %%xmm4,%%xmm0\n\t" \ + "paddsw %%xmm7,%%xmm6\n\t" \ + "paddw %%xmm3,%%xmm0\n\t" \ + "psrlw $14,%%xmm7\n\t" \ + "psubw %%xmm6,%%xmm0\n\t" \ + +/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the + absolute value of each component, and accumulates everything into xmm0.*/ +#define OC_HADAMARD_C_ABS_ACCUM_8x8 \ + OC_HADAMARD_C_ABS_ACCUM_A_8x8 \ + OC_HADAMARD_C_ABS_ACCUM_B_8x8 \ + +/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each + component, and accumulates everything into xmm0. + Note that xmm0 will have an extra 4 added to each column, and that after + removing this value, the remainder will be half the conventional value.*/ +#define OC_HADAMARD_ABS_ACCUM_8x8 \ + OC_HADAMARD_AB_8x8 \ + OC_HADAMARD_C_ABS_ACCUM_8x8 + +static unsigned oc_int_frag_satd_sse2(int *_dc, + const unsigned char *_src,int _src_ystride, + const unsigned char *_ref,int _ref_ystride){ + OC_ALIGN16(ogg_int16_t buf[16]); + unsigned ret; + unsigned ret2; + int dc; + __asm__ __volatile__( + OC_LOAD_SUB_8x8 + OC_HADAMARD_8x8 + OC_TRANSPOSE_8x8 + /*We split out the stages here so we can save the DC coefficient in the + middle.*/ + OC_HADAMARD_AB_8x8 + OC_HADAMARD_C_ABS_ACCUM_A_8x8 + "movd %%xmm1,%[dc]\n\t" + OC_HADAMARD_C_ABS_ACCUM_B_8x8 + /*Up to this point, everything fit in 16 bits (8 input + 1 for the + difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 + for the factor of two we dropped + 3 for the vertical accumulation). + Now we finally have to promote things to dwords. 
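/*The identity behind the trick above is easy to verify exhaustively in
   scalar code; a throwaway self-check (assumes only the standard library):*/
#include <assert.h>
#include <stdlib.h>
static void oc_check_max_abs_identity(void){
  int a;
  int b;
  for(a=-512;a<=512;a++)for(b=-512;b<=512;b++){
    int m;
    m=abs(a)>abs(b)?abs(a):abs(b);
    /*(abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)), with no rounding loss.*/
    assert(abs(a+b)+abs(a-b)==2*m);
  }
}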
+ We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
+ latency of pmaddwd by starting to compute abs(dc) here.*/
+ "pmaddwd %%xmm7,%%xmm0\n\t"
+ "movsx %w[dc],%[dc]\n\t"
+ "cdq\n\t"
+ "movdqa %%xmm0,%%xmm1\n\t"
+ "punpckhqdq %%xmm0,%%xmm0\n\t"
+ "paddd %%xmm1,%%xmm0\n\t"
+ "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
+ "paddd %%xmm1,%%xmm0\n\t"
+ "movd %%xmm0,%[ret]\n\t"
+ /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
+ added to them, a factor of two removed, and the DC value included;
+ correct the final sum here.*/
+ "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+ "xor %[dc],%[ret2]\n\t"
+ "sub %[ret2],%[ret]\n\t"
+ /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+ and %[dc] with some of the inputs, since for once we don't write to
+ them until after we're done using everything but %[buf].*/
+ /*Note that _src_ystride and _ref_ystride must be given non-overlapping
+ constraints, otherwise if gcc can prove they're equal it will allocate
+ them to the same register (which is bad); _src and _ref face a similar
+ problem.
+ All four are destructively modified, but if we list them as output
+ constraints, gcc can't alias them with other outputs.*/
+ :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
+ [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
+ :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
+ [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
+ /*We have to use neg, so we actually clobber the condition codes for once
+ (not to mention sub, and add).*/
+ :"cc"
+ );
+ *_dc=dc;
+ return ret;
+}
+
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+ return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
+}
+
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+ OC_ALIGN8(unsigned char ref[64]);
+ oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+ return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
+}
+
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+ const unsigned char *_src,int _ystride){
+ OC_ALIGN16(ogg_int16_t buf[16]);
+ unsigned ret;
+ int dc;
+ __asm__ __volatile__(
+ OC_LOAD_8x8
+ OC_HADAMARD_8x8
+ OC_TRANSPOSE_8x8
+ /*We split out the stages here so we can save the DC coefficient in the
+ middle.*/
+ OC_HADAMARD_AB_8x8
+ OC_HADAMARD_C_ABS_ACCUM_A_8x8
+ "movd %%xmm1,%[dc]\n\t"
+ OC_HADAMARD_C_ABS_ACCUM_B_8x8
+ /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+ difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+ for the factor of two we dropped + 3 for the vertical accumulation).
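/*Putting the pieces together, here is a plain-C model of the sum the SATD
   routines above compute: a 2-D Hadamard transform of the difference block,
   then a sum of absolute values with the DC term excluded.
  Per the comments above, the SIMD code produces half of this sum (its final
   butterfly is folded into the abs) before the lea/xor/sub fixups.
  A hedged sketch reusing the hypothetical oc_hadamard8_c helper from earlier
   and <stdlib.h> abs():*/
static unsigned oc_frag_satd_c(int *_dc,ogg_int16_t _buf[64]){
  unsigned satd;
  int i;
  int j;
  /*1-D transform on each row, transpose, then on each row again (i.e., on
     each original column).*/
  for(i=0;i<8;i++)oc_hadamard8_c(_buf+i*8);
  for(i=0;i<8;i++)for(j=0;j<i;j++){
    ogg_int16_t t;
    t=_buf[i*8+j];
    _buf[i*8+j]=_buf[j*8+i];
    _buf[j*8+i]=t;
  }
  for(i=0;i<8;i++)oc_hadamard8_c(_buf+i*8);
  *_dc=_buf[0];
  satd=0;
  for(i=0;i<64;i++)satd+=abs(_buf[i]);
  return satd-abs(_buf[0]);
}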
+ Now we finally have to promote things to dwords.*/ + "pmaddwd %%xmm7,%%xmm0\n\t" + /*We assume that the DC coefficient is always positive (which is true, + because the input to the INTRA transform was not a difference).*/ + "movzx %w[dc],%[dc]\n\t" + "movdqa %%xmm0,%%xmm1\n\t" + "punpckhqdq %%xmm0,%%xmm0\n\t" + "paddd %%xmm1,%%xmm0\n\t" + "pshuflw $0xE,%%xmm0,%%xmm1\n\t" + "paddd %%xmm1,%%xmm0\n\t" + "movd %%xmm0,%[ret]\n\t" + "lea -64(%[ret],%[ret]),%[ret]\n\t" + "sub %[dc],%[ret]\n\t" + /*Although it looks like we're using 7 registers here, gcc can alias %[ret] + and %[dc] with some of the inputs, since for once we don't write to + them until after we're done using everything but %[buf].*/ + :[ret]"=a"(ret),[dc]"=r"(dc), + [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)) + :[src]"r"(_src),[src4]"r"(_src+4*_ystride), + [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride) + /*We have to use sub, so we actually clobber the condition codes for once.*/ + :"cc" + ); + *_dc=dc; + return ret; +} + +#endif diff --git a/thirdparty/libtheora/x86/sse2fdct.c b/thirdparty/libtheora/x86/sse2fdct.c index 86c17d68b1..64c1d27372 100644 --- a/thirdparty/libtheora/x86/sse2fdct.c +++ b/thirdparty/libtheora/x86/sse2fdct.c @@ -13,12 +13,14 @@ /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ #include <stddef.h> #include "x86enc.h" +#include "x86zigzag.h" +#include "sse2trans.h" #if defined(OC_X86_64_ASM) -# define OC_FDCT8x8 \ +# define OC_FDCT_8x8 \ /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \ - "#OC_FDCT8x8\n\t" \ + "#OC_FDCT_8x8\n\t" \ /*Stage 1:*/ \ "movdqa %%xmm0,%%xmm11\n\t" \ "movdqa %%xmm1,%%xmm10\n\t" \ @@ -349,81 +351,6 @@ "psubw %%xmm14,%%xmm10\n\t" \ "paddw %%xmm10,%%xmm7\n\t " \ -# define OC_TRANSPOSE8x8 \ - "#OC_TRANSPOSE8x8\n\t" \ - "movdqa %%xmm4,%%xmm8\n\t" \ - /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ - "punpcklwd %%xmm5,%%xmm4\n\t" \ - /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ - "punpckhwd %%xmm5,%%xmm8\n\t" \ - /*xmm5 is free.*/ \ - "movdqa %%xmm0,%%xmm5\n\t" \ - /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ - "punpcklwd %%xmm1,%%xmm0\n\t" \ - /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ - "punpckhwd %%xmm1,%%xmm5\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm6,%%xmm1\n\t" \ - /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ - "punpcklwd %%xmm7,%%xmm6\n\t" \ - /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ - "punpckhwd %%xmm7,%%xmm1\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm2,%%xmm7\n\t" \ - /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ - "punpcklwd %%xmm3,%%xmm7\n\t" \ - /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ - "punpckhwd %%xmm3,%%xmm2\n\t" \ - /*xmm3 is free.*/ \ - "movdqa %%xmm0,%%xmm3\n\t" \ - /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ - "punpckldq %%xmm7,%%xmm0\n\t" \ - /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ - "punpckhdq %%xmm7,%%xmm3\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm5,%%xmm7\n\t" \ - /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ - "punpckldq %%xmm2,%%xmm5\n\t" \ - /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ - "punpckhdq %%xmm2,%%xmm7\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm4,%%xmm2\n\t" \ - /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ - "punpckldq %%xmm6,%%xmm2\n\t" \ - /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ - "punpckhdq %%xmm6,%%xmm4\n\t" \ - /*xmm6 is free.*/ \ - "movdqa %%xmm8,%%xmm6\n\t" \ - /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ - "punpckldq %%xmm1,%%xmm6\n\t" \ - /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ - "punpckhdq %%xmm1,%%xmm8\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm0,%%xmm1\n\t" \ - /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ - "punpcklqdq %%xmm2,%%xmm0\n\t" \ - /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ - "punpckhqdq %%xmm2,%%xmm1\n\t" \ - 
/*xmm2 is free.*/ \ - "movdqa %%xmm3,%%xmm2\n\t" \ - /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ - "punpcklqdq %%xmm4,%%xmm2\n\t" \ - /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ - "punpckhqdq %%xmm4,%%xmm3\n\t" \ - /*xmm4 is free.*/ \ - "movdqa %%xmm5,%%xmm4\n\t" \ - /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ - "punpcklqdq %%xmm6,%%xmm4\n\t" \ - /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ - "punpckhqdq %%xmm6,%%xmm5\n\t" \ - /*xmm6 is free.*/ \ - "movdqa %%xmm7,%%xmm6\n\t" \ - /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ - "punpcklqdq %%xmm8,%%xmm6\n\t" \ - /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ - "punpckhqdq %%xmm8,%%xmm7\n\t" \ - /*xmm8 is free.*/ \ - /*SSE2 implementation of the fDCT for x86-64 only. Because of the 8 extra XMM registers on x86-64, this version can operate without any temporary stack access at all.*/ @@ -482,12 +409,10 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/ "psubw %%xmm9,%%xmm1\n\t" /*Transform columns.*/ - OC_FDCT8x8 + OC_FDCT_8x8 /*Transform rows.*/ - OC_TRANSPOSE8x8 - OC_FDCT8x8 - /*TODO: zig-zag ordering?*/ - OC_TRANSPOSE8x8 + OC_TRANSPOSE_8x8 + OC_FDCT_8x8 /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/ "paddw %%xmm14,%%xmm14\n\t" "psubw %%xmm14,%%xmm0\n\t" @@ -506,15 +431,19 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ "psubw %%xmm14,%%xmm7\n\t" "psraw $2,%%xmm6\n\t" "psraw $2,%%xmm7\n\t" - /*Store the result.*/ - "movdqa %%xmm0,0x00(%[y])\n\t" - "movdqa %%xmm1,0x10(%[y])\n\t" - "movdqa %%xmm2,0x20(%[y])\n\t" - "movdqa %%xmm3,0x30(%[y])\n\t" - "movdqa %%xmm4,0x40(%[y])\n\t" - "movdqa %%xmm5,0x50(%[y])\n\t" - "movdqa %%xmm6,0x60(%[y])\n\t" - "movdqa %%xmm7,0x70(%[y])\n\t" + /*Transpose, zig-zag, and store the result.*/ + /*We could probably do better using SSSE3's palignr, but re-using MMXEXT + version will do for now.*/ +#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \ + "movdq2q %%xmm"#_row","_reg"\n\t" \ + +#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \ + "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \ + "movdq2q %%xmm"#_row","_reg"\n\t" \ + + OC_TRANSPOSE_ZIG_ZAG_MMXEXT +#undef OC_ZZ_LOAD_ROW_LO +#undef OC_ZZ_LOAD_ROW_HI :[a]"=&r"(a) :[y]"r"(_y),[x]"r"(_x) :"memory" diff --git a/thirdparty/libtheora/x86/sse2idct.c b/thirdparty/libtheora/x86/sse2idct.c new file mode 100644 index 0000000000..4597ab074f --- /dev/null +++ b/thirdparty/libtheora/x86/sse2idct.c @@ -0,0 +1,456 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
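/*What the transpose+zig-zag store in the fdct above accomplishes, in scalar
   terms: the coefficients land in _y already in zig-zag scan order, so later
   stages can walk them linearly.
  A hedged sketch, assuming a raster-to-zig-zag index table in the style of
   the codec's OC_FZIG_ZAG:*/
static void oc_store_zig_zag_c(ogg_int16_t _y[64],const ogg_int16_t _dct[64],
 const unsigned char _fzig_zag[64]){
  int ci;
  for(ci=0;ci<64;ci++)_y[_fzig_zag[ci]]=_dct[ci];
}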
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ + + ********************************************************************/ + +/*SSE2 acceleration of Theora's iDCT.*/ +#include "x86int.h" +#include "sse2trans.h" +#include "../dct.h" + +#if defined(OC_X86_ASM) + +/*A table of constants used by the MMX routines.*/ +const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ + 8, 8, 8, 8, 8, 8, 8, 8, + OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, + OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, + OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, + OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, + OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, + OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, + OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 +}; + + +/*Performs the first three stages of the iDCT. + xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input + (accessed in that order). + The remaining rows must be in _x at their corresponding locations. + On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 + contain rows 4 through 7.*/ +#define OC_IDCT_8x8_ABC(_x) \ + "#OC_IDCT_8x8_ABC\n\t" \ + /*Stage 1:*/ \ + /*2-3 rotation by 6pi/16. \ + xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \ + "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \ + "movdqa %%xmm1,%%xmm0\n\t" \ + "pmulhw %%xmm2,%%xmm1\n\t" \ + "movdqa %%xmm4,%%xmm7\n\t" \ + "pmulhw %%xmm6,%%xmm0\n\t" \ + "pmulhw %%xmm2,%%xmm7\n\t" \ + "pmulhw %%xmm6,%%xmm4\n\t" \ + "paddw %%xmm6,%%xmm0\n\t" \ + "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \ + "paddw %%xmm1,%%xmm2\n\t" \ + "psubw %%xmm0,%%xmm7\n\t" \ + "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + "paddw %%xmm4,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + /*5-6 rotation by 3pi/16. \ + xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \ + "movdqa %%xmm4,%%xmm2\n\t" \ + "movdqa %%xmm6,%%xmm1\n\t" \ + "pmulhw %%xmm3,%%xmm4\n\t" \ + "pmulhw %%xmm5,%%xmm1\n\t" \ + "pmulhw %%xmm3,%%xmm6\n\t" \ + "pmulhw %%xmm5,%%xmm2\n\t" \ + "paddw %%xmm3,%%xmm4\n\t" \ + "paddw %%xmm5,%%xmm3\n\t" \ + "paddw %%xmm6,%%xmm3\n\t" \ + "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \ + "paddw %%xmm5,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \ + "paddw %%xmm3,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ + "psubw %%xmm4,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \ + /*4-7 rotation by 7pi/16. \ + xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \ + "movdqa %%xmm3,%%xmm0\n\t" \ + "movdqa %%xmm4,%%xmm7\n\t" \ + "pmulhw %%xmm5,%%xmm3\n\t" \ + "pmulhw %%xmm5,%%xmm7\n\t" \ + "pmulhw %%xmm6,%%xmm4\n\t" \ + "pmulhw %%xmm6,%%xmm0\n\t" \ + "paddw %%xmm6,%%xmm4\n\t" \ + "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \ + "paddw %%xmm5,%%xmm7\n\t" \ + "psubw %%xmm4,%%xmm3\n\t" \ + "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ + "paddw %%xmm7,%%xmm0\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \ + /*0-1 butterfly. 
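/*All of the rotations above multiply by cosine constants scaled by 2**16 and
   rely on pmulhw, which returns the high 16 bits of a signed product.
  Constants that do not fit in a signed word (e.g., OC_C4S4==46341) are seen
   by pmulhw as c-65536, so the code adds the input back in afterwards (the
   paddw following each such pmulhw).
  A scalar model of the idiom (hypothetical helper):*/
static ogg_int16_t oc_mulhw_c(ogg_int16_t _a,ogg_uint16_t _c){
  ogg_int32_t p;
  /*pmulhw treats both operands as signed 16-bit values.*/
  p=_a*(ogg_int32_t)(ogg_int16_t)_c>>16;
  /*For _c>=0x8000 the signed interpretation is _c-65536; adding _a restores
     the intended (_a*_c)>>16, since (_a*65536)>>16==_a exactly.*/
  if(_c&0x8000)p+=_a;
  return (ogg_int16_t)p;
}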
\ + xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \ + "paddw %%xmm7,%%xmm6\n\t" \ + "movdqa %%xmm4,%%xmm5\n\t" \ + "pmulhw %%xmm6,%%xmm4\n\t" \ + "paddw %%xmm7,%%xmm7\n\t" \ + "psubw %%xmm6,%%xmm7\n\t" \ + "paddw %%xmm6,%%xmm4\n\t" \ + /*Stage 2:*/ \ + /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \ + 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \ + "movdqa %%xmm3,%%xmm6\n\t" \ + "paddw %%xmm1,%%xmm3\n\t" \ + "psubw %%xmm1,%%xmm6\n\t" \ + "movdqa %%xmm5,%%xmm1\n\t" \ + "pmulhw %%xmm7,%%xmm5\n\t" \ + "paddw %%xmm7,%%xmm5\n\t" \ + "movdqa %%xmm0,%%xmm7\n\t" \ + "paddw %%xmm2,%%xmm0\n\t" \ + "psubw %%xmm2,%%xmm7\n\t" \ + "movdqa %%xmm1,%%xmm2\n\t" \ + "pmulhw %%xmm6,%%xmm1\n\t" \ + "pmulhw %%xmm7,%%xmm2\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ + "paddw %%xmm7,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ + /*Stage 3: \ + 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ + 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ + 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ + "paddw %%xmm2,%%xmm1\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "paddw %%xmm4,%%xmm7\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm4,%%xmm4\n\t" \ + "paddw %%xmm5,%%xmm5\n\t" \ + "psubw %%xmm1,%%xmm2\n\t" \ + "psubw %%xmm7,%%xmm4\n\t" \ + "psubw %%xmm6,%%xmm5\n\t" \ + +/*Performs the last stage of the iDCT. + On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 + contain rows 4 through 7. + On output, xmm0 through xmm7 contain the corresponding rows.*/ +#define OC_IDCT_8x8_D \ + "#OC_IDCT_8x8_D\n\t" \ + /*Stage 4: \ + 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ + 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ + 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ + 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ + "psubw %%xmm0,%%xmm7\n\t" \ + "psubw %%xmm1,%%xmm6\n\t" \ + "psubw %%xmm2,%%xmm5\n\t" \ + "psubw %%xmm3,%%xmm4\n\t" \ + "paddw %%xmm0,%%xmm0\n\t" \ + "paddw %%xmm1,%%xmm1\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm3,%%xmm3\n\t" \ + "paddw %%xmm7,%%xmm0\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "paddw %%xmm5,%%xmm2\n\t" \ + "paddw %%xmm4,%%xmm3\n\t" \ + +/*Performs the last stage of the iDCT. + On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 + contain rows 4 through 7. 
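/*Each stage-4 butterfly above uses the psubw/paddw/paddw pattern to produce
   both outputs with no scratch register; the same trick in scalar form
   (hypothetical helper):*/
static void oc_butterfly_c(ogg_int16_t *_a,ogg_int16_t *_b){
  /*On exit *_a holds a-b and *_b holds a+b, mirroring the register roles in
     the asm.*/
  *_a=(ogg_int16_t)(*_a-*_b);  /*a-b*/
  *_b=(ogg_int16_t)(*_b+*_b);  /*2*b*/
  *_b=(ogg_int16_t)(*_b+*_a);  /*2*b+(a-b)==a+b*/
}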
+ On output, xmm0 through xmm7 contain the corresponding rows.*/ +#define OC_IDCT_8x8_D_STORE \ + "#OC_IDCT_8x8_D_STORE\n\t" \ + /*Stage 4: \ + 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ + 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ + 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ + 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ + "psubw %%xmm3,%%xmm4\n\t" \ + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \ + "psubw %%xmm0,%%xmm7\n\t" \ + "psubw %%xmm1,%%xmm6\n\t" \ + "psubw %%xmm2,%%xmm5\n\t" \ + "paddw %%xmm4,%%xmm7\n\t" \ + "paddw %%xmm4,%%xmm6\n\t" \ + "paddw %%xmm4,%%xmm5\n\t" \ + "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \ + "paddw %%xmm0,%%xmm0\n\t" \ + "paddw %%xmm1,%%xmm1\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm3,%%xmm3\n\t" \ + "paddw %%xmm7,%%xmm0\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "psraw $4,%%xmm0\n\t" \ + "paddw %%xmm5,%%xmm2\n\t" \ + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \ + "psraw $4,%%xmm1\n\t" \ + "paddw %%xmm4,%%xmm3\n\t" \ + "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \ + "psraw $4,%%xmm2\n\t" \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \ + "psraw $4,%%xmm3\n\t" \ + "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \ + "psraw $4,%%xmm4\n\t" \ + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ + "psraw $4,%%xmm5\n\t" \ + "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \ + "psraw $4,%%xmm6\n\t" \ + "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \ + "psraw $4,%%xmm7\n\t" \ + "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \ + +static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + OC_ALIGN16(ogg_int16_t buf[16]); + int i; + /*This routine accepts an 8x8 matrix pre-transposed.*/ + __asm__ __volatile__( + /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ + "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" + "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" + "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" + "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" + OC_IDCT_8x8_ABC(x) + OC_IDCT_8x8_D + OC_TRANSPOSE_8x8 + /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ + "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" + "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" + OC_IDCT_8x8_ABC(y) + OC_IDCT_8x8_D_STORE + :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), + [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) + :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), + [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) + ); + __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::); + /*Clear input data for next block (decoder only).*/ + for(i=0;i<2;i++){ + __asm__ __volatile__( + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" + "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" + :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) + ); + } +} + +/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only + need to work with four columns at a time. + Doing this in MMX is faster on processors with a 64-bit data path.*/ +#define OC_IDCT_8x8_10_MMX \ + "#OC_IDCT_8x8_10_MMX\n\t" \ + /*Stage 1:*/ \ + /*2-3 rotation by 6pi/16. 
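/*Besides the butterflies, OC_IDCT_8x8_D_STORE above folds in the final iDCT
   rounding: the first row of OC_IDCT_CONSTS is eight 8s, and every value is
   written out as (t+8)>>4.
  Scalar equivalent of the store step (hedged sketch; _t stands for the
   stage-4 results):*/
static void oc_idct_store_c(ogg_int16_t _y[64],const ogg_int16_t _t[64]){
  int i;
  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_t[i]+8>>4);
}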
\ + mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \ + "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \ + "pmulhw %%mm2,%%mm6\n\t" \ + "pmulhw %%mm2,%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \ + "paddw %%mm6,%%mm2\n\t" \ + "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \ + "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*5-6 rotation by 3pi/16. \ + mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \ + "pmulhw %%mm3,%%mm5\n\t" \ + "pmulhw %%mm3,%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \ + "paddw %%mm3,%%mm5\n\t" \ + "paddw %%mm3,%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ + /*4-7 rotation by 7pi/16. \ + mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \ + "pmulhw %%mm1,%%mm3\n\t" \ + "pmulhw %%mm1,%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ + "movq %%mm3,%%mm6\n\t" \ + "paddw %%mm1,%%mm7\n\t" \ + /*0-1 butterfly. \ + mm4=C4, mm0=X0, X4=0.*/ \ + /*Stage 2:*/ \ + /*4-5 butterfly: mm3=t[4], mm5=t[5] \ + 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \ + "psubw %%mm5,%%mm3\n\t" \ + "paddw %%mm5,%%mm6\n\t" \ + "movq %%mm4,%%mm1\n\t" \ + "pmulhw %%mm0,%%mm4\n\t" \ + "paddw %%mm0,%%mm4\n\t" \ + "movq %%mm7,%%mm0\n\t" \ + "movq %%mm4,%%mm5\n\t" \ + "paddw %%mm2,%%mm0\n\t" \ + "psubw %%mm2,%%mm7\n\t" \ + "movq %%mm1,%%mm2\n\t" \ + "pmulhw %%mm6,%%mm1\n\t" \ + "pmulhw %%mm7,%%mm2\n\t" \ + "paddw %%mm6,%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \ + "paddw %%mm7,%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \ + /*Stage 3: \ + 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \ + 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \ + 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \ + "paddw %%mm2,%%mm1\n\t" \ + "paddw %%mm5,%%mm6\n\t" \ + "paddw %%mm4,%%mm7\n\t" \ + "paddw %%mm2,%%mm2\n\t" \ + "paddw %%mm4,%%mm4\n\t" \ + "paddw %%mm5,%%mm5\n\t" \ + "psubw %%mm1,%%mm2\n\t" \ + "psubw %%mm7,%%mm4\n\t" \ + "psubw %%mm6,%%mm5\n\t" \ + /*Stage 4: \ + 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \ + 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \ + 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \ + 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \ + "psubw %%mm0,%%mm7\n\t" \ + "psubw %%mm1,%%mm6\n\t" \ + "psubw %%mm2,%%mm5\n\t" \ + "psubw %%mm3,%%mm4\n\t" \ + "paddw %%mm0,%%mm0\n\t" \ + "paddw %%mm1,%%mm1\n\t" \ + "paddw %%mm2,%%mm2\n\t" \ + "paddw %%mm3,%%mm3\n\t" \ + "paddw %%mm7,%%mm0\n\t" \ + "paddw %%mm6,%%mm1\n\t" \ + "paddw %%mm5,%%mm2\n\t" \ + "paddw %%mm4,%%mm3\n\t" \ + +#define OC_IDCT_8x8_10_ABC \ + "#OC_IDCT_8x8_10_ABC\n\t" \ + /*Stage 1:*/ \ + /*2-3 rotation by 6pi/16. \ + xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \ + "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \ + "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \ + "pmulhw %%xmm2,%%xmm6\n\t" \ + "pmulhw %%xmm2,%%xmm7\n\t" \ + "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \ + "paddw %%xmm6,%%xmm2\n\t" \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \ + "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*5-6 rotation by 3pi/16. \ + xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \ + "pmulhw %%xmm3,%%xmm5\n\t" \ + "pmulhw %%xmm3,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \ + "paddw %%xmm3,%%xmm5\n\t" \ + "paddw %%xmm3,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ + /*4-7 rotation by 7pi/16. 
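/*Why the 10-coefficient path only needs a 4x4 corner: the first ten zig-zag
   positions all have row and column indices of at most 3, so with
   _last_zzi<=10 the high rows and the high half of each low row are known to
   be zero.
  A quick scalar check, assuming the standard 8x8 zig-zag scan (the tables
   here are illustrative, not the codec's own):*/
static int oc_first10_fit_4x4(void){
  static const unsigned char OC_ZZ_ROW[10]={0,0,1,2,1,0,0,1,2,3};
  static const unsigned char OC_ZZ_COL[10]={0,1,0,0,1,2,3,2,1,0};
  int zzi;
  for(zzi=0;zzi<10;zzi++)if(OC_ZZ_ROW[zzi]>3||OC_ZZ_COL[zzi]>3)return 0;
  return 1;
}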
\ + xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \ + "pmulhw %%xmm1,%%xmm3\n\t" \ + "pmulhw %%xmm1,%%xmm7\n\t" \ + "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ + "movdqa %%xmm3,%%xmm6\n\t" \ + "paddw %%xmm1,%%xmm7\n\t" \ + /*0-1 butterfly. \ + xmm4=C4, xmm0=X0, X4=0.*/ \ + /*Stage 2:*/ \ + /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \ + 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \ + "psubw %%xmm5,%%xmm3\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "movdqa %%xmm4,%%xmm1\n\t" \ + "pmulhw %%xmm0,%%xmm4\n\t" \ + "paddw %%xmm0,%%xmm4\n\t" \ + "movdqa %%xmm7,%%xmm0\n\t" \ + "movdqa %%xmm4,%%xmm5\n\t" \ + "paddw %%xmm2,%%xmm0\n\t" \ + "psubw %%xmm2,%%xmm7\n\t" \ + "movdqa %%xmm1,%%xmm2\n\t" \ + "pmulhw %%xmm6,%%xmm1\n\t" \ + "pmulhw %%xmm7,%%xmm2\n\t" \ + "paddw %%xmm6,%%xmm1\n\t" \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ + "paddw %%xmm7,%%xmm2\n\t" \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ + /*Stage 3: \ + 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ + 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ + 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ + "paddw %%xmm2,%%xmm1\n\t" \ + "paddw %%xmm5,%%xmm6\n\t" \ + "paddw %%xmm4,%%xmm7\n\t" \ + "paddw %%xmm2,%%xmm2\n\t" \ + "paddw %%xmm4,%%xmm4\n\t" \ + "paddw %%xmm5,%%xmm5\n\t" \ + "psubw %%xmm1,%%xmm2\n\t" \ + "psubw %%xmm7,%%xmm4\n\t" \ + "psubw %%xmm6,%%xmm5\n\t" \ + +static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + OC_ALIGN16(ogg_int16_t buf[16]); + /*This routine accepts an 8x8 matrix pre-transposed.*/ + __asm__ __volatile__( + "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" + "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" + "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" + "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" + OC_IDCT_8x8_10_MMX + OC_TRANSPOSE_8x4_MMX2SSE + OC_IDCT_8x8_10_ABC + OC_IDCT_8x8_D_STORE + :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)), + [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) + ); + /*Clear input data for next block (decoder only).*/ + __asm__ __volatile__( + "pxor %%mm0,%%mm0\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" + :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) + ); +} + +/*Performs an inverse 8x8 Type-II DCT transform. + The input is assumed to be scaled by a factor of 4 relative to orthonormal + version of the transform.*/ +void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ + /*_last_zzi is subtly different from an actual count of the number of + coefficients we decoded for this block. + It contains the value of zzi BEFORE the final token in the block was + decoded. + In most cases this is an EOB token (the continuation of an EOB run from a + previous block counts), and so this is the same as the coefficient count. + However, in the case that the last token was NOT an EOB token, but filled + the block up with exactly 64 coefficients, _last_zzi will be less than 64. + Provided the last token was not a pure zero run, the minimum value it can + be is 46, and so that doesn't affect any of the cases in this routine. + However, if the last token WAS a pure zero run of length 63, then _last_zzi + will be 1 while the number of coefficients decoded is 64. + Thus, we will trigger the following special case, where the real + coefficient count would not. 
+ Note also that a zero run of length 64 will give _last_zzi a value of 0, + but we still process the DC coefficient, which might have a non-zero value + due to DC prediction. + Although convoluted, this is arguably the correct behavior: it allows us to + use a smaller transform when the block ends with a long zero run instead + of a normal EOB token. + It could be smarter... multiple separate zero runs at the end of a block + will fool it, but an encoder that generates these really deserves what it + gets. + Needless to say we inherited this approach from VP3.*/ + /*Then perform the iDCT.*/ + if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); + else oc_idct8x8_slow_sse2(_y,_x); +} + +#endif diff --git a/thirdparty/libtheora/x86/sse2trans.h b/thirdparty/libtheora/x86/sse2trans.h new file mode 100644 index 0000000000..e76da5140b --- /dev/null +++ b/thirdparty/libtheora/x86/sse2trans.h @@ -0,0 +1,242 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ + + ********************************************************************/ + +#if !defined(_x86_sse2trans_H) +# define _x86_sse2trans_H (1) +# include "x86int.h" + +# if defined(OC_X86_64_ASM) +/*On x86-64 we can transpose in-place without spilling registers. 
+ By clever choices of the order to apply the butterflies and the order of + their outputs, we can take the rows in order and output the columns in order + without any extra operations and using just one temporary register.*/ +# define OC_TRANSPOSE_8x8 \ + "#OC_TRANSPOSE_8x8\n\t" \ + "movdqa %%xmm4,%%xmm8\n\t" \ + /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ + "punpcklwd %%xmm5,%%xmm4\n\t" \ + /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ + "punpckhwd %%xmm5,%%xmm8\n\t" \ + /*xmm5 is free.*/ \ + "movdqa %%xmm0,%%xmm5\n\t" \ + /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ + "punpcklwd %%xmm1,%%xmm0\n\t" \ + /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ + "punpckhwd %%xmm1,%%xmm5\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm6,%%xmm1\n\t" \ + /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ + "punpcklwd %%xmm7,%%xmm6\n\t" \ + /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ + "punpckhwd %%xmm7,%%xmm1\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm2,%%xmm7\n\t" \ + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "punpckhwd %%xmm3,%%xmm2\n\t" \ + /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ + "punpcklwd %%xmm3,%%xmm7\n\t" \ + /*xmm3 is free.*/ \ + "movdqa %%xmm0,%%xmm3\n\t" \ + /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ + "punpckldq %%xmm7,%%xmm0\n\t" \ + /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ + "punpckhdq %%xmm7,%%xmm3\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm5,%%xmm7\n\t" \ + /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ + "punpckldq %%xmm2,%%xmm5\n\t" \ + /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ + "punpckhdq %%xmm2,%%xmm7\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm4,%%xmm2\n\t" \ + /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ + "punpckhdq %%xmm6,%%xmm4\n\t" \ + /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ + "punpckldq %%xmm6,%%xmm2\n\t" \ + /*xmm6 is free.*/ \ + "movdqa %%xmm8,%%xmm6\n\t" \ + /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ + "punpckldq %%xmm1,%%xmm6\n\t" \ + /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "punpckhdq %%xmm1,%%xmm8\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm0,%%xmm1\n\t" \ + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "punpcklqdq %%xmm2,%%xmm0\n\t" \ + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ + "punpckhqdq %%xmm2,%%xmm1\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm3,%%xmm2\n\t" \ + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ + "punpckhqdq %%xmm4,%%xmm3\n\t" \ + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ + "punpcklqdq %%xmm4,%%xmm2\n\t" \ + /*xmm4 is free.*/ \ + "movdqa %%xmm5,%%xmm4\n\t" \ + /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ + "punpckhqdq %%xmm6,%%xmm5\n\t" \ + /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ + "punpcklqdq %%xmm6,%%xmm4\n\t" \ + /*xmm6 is free.*/ \ + "movdqa %%xmm7,%%xmm6\n\t" \ + /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ + "punpckhqdq %%xmm8,%%xmm7\n\t" \ + /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ + "punpcklqdq %%xmm8,%%xmm6\n\t" \ + /*xmm8 is free.*/ \ + +# else +/*Otherwise, we need to spill some values to %[buf] temporarily. 
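/*Functionally, both OC_TRANSPOSE_8x8 variants implement an ordinary 8x8
   transpose; the word/dword/qword interleave network is just a way to do it
   in three passes without scalar loads.
  Scalar equivalent (hypothetical helper):*/
static void oc_transpose8x8_c(ogg_int16_t _t[64]){
  int i;
  int j;
  for(i=0;i<8;i++)for(j=i+1;j<8;j++){
    ogg_int16_t tmp;
    tmp=_t[i*8+j];
    _t[i*8+j]=_t[j*8+i];
    _t[j*8+i]=tmp;
  }
}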
+ Again, the butterflies are carefully arranged to get the columns to come out + in order, minimizing register spills and maximizing the delay between a load + and when the value loaded is actually used.*/ +# define OC_TRANSPOSE_8x8 \ + "#OC_TRANSPOSE_8x8\n\t" \ + /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \ + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*xmm0 is free.*/ \ + "movdqa %%xmm2,%%xmm0\n\t" \ + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "punpckhwd %%xmm3,%%xmm2\n\t" \ + /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ + "punpcklwd %%xmm3,%%xmm0\n\t" \ + /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \ + /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm6,%%xmm2\n\t" \ + /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ + "punpcklwd %%xmm7,%%xmm6\n\t" \ + /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ + "punpckhwd %%xmm7,%%xmm2\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm4,%%xmm7\n\t" \ + /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ + "punpcklwd %%xmm5,%%xmm4\n\t" \ + /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ + "punpckhwd %%xmm5,%%xmm7\n\t" \ + /*xmm5 is free.*/ \ + "movdqa %%xmm3,%%xmm5\n\t" \ + /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ + "punpcklwd %%xmm1,%%xmm3\n\t" \ + /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ + "punpckhwd %%xmm1,%%xmm5\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm7,%%xmm1\n\t" \ + /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ + "punpckldq %%xmm2,%%xmm7\n\t" \ + /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "punpckhdq %%xmm2,%%xmm1\n\t" \ + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \ + /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \ + /*xmm1 is free.*/ \ + "movdqa %%xmm3,%%xmm1\n\t" \ + /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ + "punpckhdq %%xmm0,%%xmm3\n\t" \ + /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ + "punpckldq %%xmm0,%%xmm1\n\t" \ + /*xmm0 is free.*/ \ + "movdqa %%xmm4,%%xmm0\n\t" \ + /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ + "punpckhdq %%xmm6,%%xmm4\n\t" \ + /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ + "punpckldq %%xmm6,%%xmm0\n\t" \ + /*xmm6 is free.*/ \ + "movdqa %%xmm5,%%xmm6\n\t" \ + /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ + "punpckldq %%xmm2,%%xmm5\n\t" \ + /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ + "punpckhdq %%xmm2,%%xmm6\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm1,%%xmm2\n\t" \ + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ + "punpckhqdq %%xmm0,%%xmm1\n\t" \ + /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "punpcklqdq %%xmm0,%%xmm2\n\t" \ + /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ + /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ + /*xmm2 is free.*/ \ + "movdqa %%xmm3,%%xmm2\n\t" \ + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ + "punpckhqdq %%xmm4,%%xmm3\n\t" \ + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ + "punpcklqdq %%xmm4,%%xmm2\n\t" \ + /*xmm4 is free.*/ \ + "movdqa %%xmm5,%%xmm4\n\t" \ + /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ + "punpckhqdq %%xmm7,%%xmm5\n\t" \ + /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ + "punpcklqdq %%xmm7,%%xmm4\n\t" \ + /*xmm7 is free.*/ \ + "movdqa %%xmm6,%%xmm7\n\t" \ + /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ + "punpcklqdq %%xmm0,%%xmm6\n\t" \ + /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ + "punpckhqdq %%xmm0,%%xmm7\n\t" \ + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \ + +# endif + +/*Transpose 4 values in each of 8 MMX registers into 8 values in the first + four SSE registers. 
+ No need to be clever here; we have plenty of room.*/ +# define OC_TRANSPOSE_8x4_MMX2SSE \ + "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \ + "movq2dq %%mm0,%%xmm0\n\t" \ + "movq2dq %%mm1,%%xmm1\n\t" \ + /*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \ + "punpcklwd %%xmm1,%%xmm0\n\t" \ + "movq2dq %%mm2,%%xmm3\n\t" \ + "movq2dq %%mm3,%%xmm2\n\t" \ + /*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \ + "punpcklwd %%xmm2,%%xmm3\n\t" \ + "movq2dq %%mm4,%%xmm4\n\t" \ + "movq2dq %%mm5,%%xmm5\n\t" \ + /*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \ + "punpcklwd %%xmm5,%%xmm4\n\t" \ + "movq2dq %%mm6,%%xmm7\n\t" \ + "movq2dq %%mm7,%%xmm6\n\t" \ + /*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \ + "punpcklwd %%xmm6,%%xmm7\n\t" \ + "movdqa %%xmm0,%%xmm2\n\t" \ + /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ + "punpckldq %%xmm3,%%xmm0\n\t" \ + /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ + "punpckhdq %%xmm3,%%xmm2\n\t" \ + "movdqa %%xmm4,%%xmm5\n\t" \ + /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ + "punpckldq %%xmm7,%%xmm4\n\t" \ + /*xmm3 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ + "punpckhdq %%xmm7,%%xmm5\n\t" \ + "movdqa %%xmm0,%%xmm1\n\t" \ + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ + "punpcklqdq %%xmm4,%%xmm0\n\t" \ + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ + "punpckhqdq %%xmm4,%%xmm1\n\t" \ + "movdqa %%xmm2,%%xmm3\n\t" \ + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ + "punpcklqdq %%xmm5,%%xmm2\n\t" \ + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ + "punpckhqdq %%xmm5,%%xmm3\n\t" \ + +#endif diff --git a/thirdparty/libtheora/x86/x86cpu.c b/thirdparty/libtheora/x86/x86cpu.c new file mode 100644 index 0000000000..49fd76d0ac --- /dev/null +++ b/thirdparty/libtheora/x86/x86cpu.c @@ -0,0 +1,182 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + CPU capability detection for x86 processors. + Originally written by Rudolf Marek. 
+ + function: + last mod: $Id$ + + ********************************************************************/ + +#include "x86cpu.h" + +#if !defined(OC_X86_ASM) +ogg_uint32_t oc_cpu_flags_get(void){ + return 0; +} +#else +# if defined(__amd64__)||defined(__x86_64__) +/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when + compiling with -fPIC.*/ +# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ + __asm__ __volatile__( \ + "cpuid\n\t" \ + :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ + :"a"(_op) \ + :"cc" \ + ) +# else +/*On x86-32, not so much.*/ +# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ + __asm__ __volatile__( \ + "xchgl %%ebx,%[ebx]\n\t" \ + "cpuid\n\t" \ + "xchgl %%ebx,%[ebx]\n\t" \ + :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ + :"a"(_op) \ + :"cc" \ + ) +# endif + +static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ + ogg_uint32_t flags; + /*If there isn't even MMX, give up.*/ + if(!(_edx&0x00800000))return 0; + flags=OC_CPU_X86_MMX; + if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE; + if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2; + if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI; + if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3; + if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1; + if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2; + return flags; +} + +static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ + ogg_uint32_t flags; + /*If there isn't even MMX, give up.*/ + if(!(_edx&0x00800000))return 0; + flags=OC_CPU_X86_MMX; + if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT; + if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW; + if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT; + if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A; + if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5; + return flags; +} + +ogg_uint32_t oc_cpu_flags_get(void){ + ogg_uint32_t flags; + ogg_uint32_t eax; + ogg_uint32_t ebx; + ogg_uint32_t ecx; + ogg_uint32_t edx; +# if !defined(__amd64__)&&!defined(__x86_64__) + /*Not all x86-32 chips support cpuid, so we have to check.*/ + __asm__ __volatile__( + "pushfl\n\t" + "pushfl\n\t" + "popl %[a]\n\t" + "movl %[a],%[b]\n\t" + "xorl $0x200000,%[a]\n\t" + "pushl %[a]\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %[a]\n\t" + "popfl\n\t" + :[a]"=r"(eax),[b]"=r"(ebx) + : + :"cc" + ); + /*No cpuid.*/ + if(eax==ebx)return 0; +# endif + cpuid(0,eax,ebx,ecx,edx); + /* l e t n I e n i u n e G*/ + if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547|| + /* 6 8 x M T e n i u n e G*/ + ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){ + int family; + int model; + /*Intel, Transmeta (tested with Crusoe TM5800):*/ + cpuid(1,eax,ebx,ecx,edx); + flags=oc_parse_intel_flags(edx,ecx); + family=(eax>>8)&0xF; + model=(eax>>4)&0xF; + /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX + unit, so don't use it.*/ + if(family==6&&(model==9||model==13||model==14)){ + flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI); + } + } + /* D M A c i t n e h t u A*/ + else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541|| + /* C S N y b e d o e G*/ + ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){ + /*AMD, Geode:*/ + cpuid(0x80000000,eax,ebx,ecx,edx); + if(eax<0x80000001)flags=0; + else{ + cpuid(0x80000001,eax,ebx,ecx,edx); + flags=oc_parse_amd_flags(edx,ecx); + } + /*Also check for SSE.*/ + cpuid(1,eax,ebx,ecx,edx); + flags|=oc_parse_intel_flags(edx,ecx); + } + /*Technically some VIA chips can be configured in the BIOS to return any + string here the user wants. 
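/*The magic numbers in the comparisons above are just the vendor ID bytes:
   cpuid leaf 0 packs the 12-character vendor string little-endian into ebx,
   edx, and ecx, in that order.
  A hedged sketch of the same test for a little-endian x86 target
   (hypothetical helper; assumes <string.h>):*/
#include <string.h>
static int oc_cpu_vendor_is(ogg_uint32_t _ebx,ogg_uint32_t _edx,
 ogg_uint32_t _ecx,const char *_vendor){
  char s[12];
  memcpy(s+0,&_ebx,4);
  memcpy(s+4,&_edx,4);
  memcpy(s+8,&_ecx,4);
  return memcmp(s,_vendor,12)==0;
}
/*E.g., oc_cpu_vendor_is(ebx,edx,ecx,"GenuineIntel") matches the first test
   above.*/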
+ There is a special detection method that can be used to identify such + processors, but in my opinion, if the user really wants to change it, they + deserve what they get.*/ + /* s l u a H r u a t n e C*/ + else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){ + /*VIA:*/ + /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming) + chips (thanks to the engineers from Centaur Technology who provided it). + These chips support Intel-like cpuid info. + The C3-2 (Nehemiah) cores appear to, as well.*/ + cpuid(1,eax,ebx,ecx,edx); + flags=oc_parse_intel_flags(edx,ecx); + if(eax>=0x80000001){ + /*The (non-Nehemiah) C3 processors support AMD-like cpuid info. + We need to check this even if the Intel test succeeds to pick up 3DNow! + support on these processors. + Unlike actual AMD processors, we cannot _rely_ on this info, since + some cores (e.g., the 693 stepping of the Nehemiah) claim to support + this function, yet return edx=0, despite the Intel test indicating + MMX support. + Therefore the features detected here are strictly added to those + detected by the Intel test.*/ + /*TODO: How about earlier chips?*/ + cpuid(0x80000001,eax,ebx,ecx,edx); + /*Note: As of the C7, this function returns Intel-style extended feature + flags, not AMD-style. + Currently, this only defines bits 11, 20, and 29 (0x20100800), which + do not conflict with any of the AMD flags we inspect. + For the remaining bits, Intel tells us, "Do not count on their value", + but VIA assures us that they will all be zero (at least on the C7 and + Isaiah chips). + In the (unlikely) event a future processor uses bits 18, 19, 30, or 31 + (0xC0C00000) for something else, we will have to add code to detect + the model to decide when it is appropriate to inspect them.*/ + flags|=oc_parse_amd_flags(edx,ecx); + } + } + else{ + /*Implement me.*/ + flags=0; + } + return flags; +} +#endif diff --git a/thirdparty/libtheora/cpu.h b/thirdparty/libtheora/x86/x86cpu.h index a43c957a39..e0192d52e2 100644 --- a/thirdparty/libtheora/cpu.h +++ b/thirdparty/libtheora/x86/x86cpu.h @@ -10,13 +10,13 @@ * * ******************************************************************** function: - last mod: $Id: cpu.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ -#if !defined(_x86_cpu_H) -# define _x86_cpu_H (1) -#include "internal.h" +#if !defined(_x86_x86cpu_H) +# define _x86_x86cpu_H (1) +#include "../internal.h" #define OC_CPU_X86_MMX (1<<0) #define OC_CPU_X86_3DNOW (1<<1) @@ -31,4 +31,6 @@ #define OC_CPU_X86_SSE4A (1<<10) #define OC_CPU_X86_SSE5 (1<<11) +ogg_uint32_t oc_cpu_flags_get(void); + #endif diff --git a/thirdparty/libtheora/x86/x86enc.c b/thirdparty/libtheora/x86/x86enc.c index 43b7be3ea3..ffa9c14a42 100644 --- a/thirdparty/libtheora/x86/x86enc.c +++ b/thirdparty/libtheora/x86/x86enc.c @@ -18,32 +18,46 @@ #if defined(OC_X86_ASM) -#include "../cpu.c" - -void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){ +void oc_enc_accel_init_x86(oc_enc_ctx *_enc){ ogg_uint32_t cpu_flags; - cpu_flags=oc_cpu_flags_get(); - oc_enc_vtable_init_c(_enc); + cpu_flags=_enc->state.cpu_flags; + oc_enc_accel_init_c(_enc); +# if defined(OC_ENC_USE_VTABLE) if(cpu_flags&OC_CPU_X86_MMX){ _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx; _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx; _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; - _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx; } if(cpu_flags&OC_CPU_X86_MMXEXT){ 
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext; _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext; _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext; - _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext; - _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext; + _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext; + _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext; _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext; _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext; + _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext; } if(cpu_flags&OC_CPU_X86_SSE2){ -# if defined(OC_X86_64_ASM) - /*_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/ +# if defined(OC_X86_64_ASM) + _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2; +# endif + _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2; + _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2; + _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2; + _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2; + _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2; + _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86; + _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86; + _enc->opt_vtable.quantize=oc_enc_quantize_sse2; +# else + (void) cpu_flags; # endif + _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t); + _enc->opt_data.enquant_table_alignment=16; +# if defined(OC_ENC_USE_VTABLE) } +# endif } #endif diff --git a/thirdparty/libtheora/x86/x86enc.h b/thirdparty/libtheora/x86/x86enc.h index 06c3908bcd..c258247d67 100644 --- a/thirdparty/libtheora/x86/x86enc.h +++ b/thirdparty/libtheora/x86/x86enc.h @@ -17,11 +17,62 @@ #if !defined(_x86_x86enc_H) # define _x86_x86enc_H (1) -# include "../encint.h" # include "x86int.h" -void oc_enc_vtable_init_x86(oc_enc_ctx *_enc); +# if defined(OC_X86_ASM) +# define oc_enc_accel_init oc_enc_accel_init_x86 +# if defined(OC_X86_64_ASM) +/*x86-64 guarantees SIMD support up through at least SSE2. 
+ If the best routine we have available only needs SSE2 (which at the moment + covers all of them), then we can avoid runtime detection and the indirect + call.*/ +# define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \ + oc_enc_frag_sub_mmx(_diff,_x,_y,_stride) +# define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \ + oc_enc_frag_sub_128_mmx(_diff,_x,_stride) +# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \ + oc_enc_frag_sad_mmxext(_src,_ref,_ystride) +# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \ + oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh) +# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \ + oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh) +# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \ + oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride) +# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \ + oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride) +# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \ + oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride) +# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \ + oc_enc_frag_ssd_sse2(_src,_ref,_ystride) +# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \ + oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask) +# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \ + oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride) +# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \ + oc_enc_enquant_table_init_x86(_enquant,_dequant) +# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \ + oc_enc_enquant_table_fixup_x86(_enquant,_nqis) +# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \ + oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant) +# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \ + oc_frag_recon_intra_mmx(_dst,_ystride,_residue) +# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \ + oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue) +# define oc_enc_fdct8x8(_enc,_y,_x) \ + oc_enc_fdct8x8_x86_64sse2(_y,_x) +# else +# define OC_ENC_USE_VTABLE (1) +# endif +# endif + +# include "../encint.h" +void oc_enc_accel_init_x86(oc_enc_ctx *_enc); + +void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64], + const unsigned char *_x,const unsigned char *_y,int _stride); +void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64], + const unsigned char *_x,int _stride); unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, const unsigned char *_ref,int _ystride); unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, @@ -29,19 +80,35 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, unsigned _thresh); -unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh); -unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh); -unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride); -void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64], - const unsigned char *_x,const unsigned char *_y,int _stride); -void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64], - const unsigned char *_x,int _stride); +unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src, + const unsigned char *_ref,int _ystride); +unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char 
*_src, + const unsigned char *_ref,int _ystride); +unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride); +unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride); +unsigned oc_enc_frag_intra_satd_mmxext(int *_dc, + const unsigned char *_src,int _ystride); +unsigned oc_enc_frag_intra_satd_sse2(int *_dc, + const unsigned char *_src,int _ystride); +unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src, + const unsigned char *_ref,int _ystride); +unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src, + const unsigned char *_ref,int _ystride,ogg_int64_t _mask); +void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, + const unsigned char *_src1,const unsigned char *_src2,int _src_ystride); void oc_enc_frag_copy2_mmxext(unsigned char *_dst, const unsigned char *_src1,const unsigned char *_src2,int _ystride); -void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]); +void oc_enc_enquant_table_init_x86(void *_enquant, + const ogg_uint16_t _dequant[64]); +void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis); +int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64], + const ogg_uint16_t _dequant[64],const void *_enquant); +void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]); + +# if defined(OC_X86_64_ASM) void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]); +# endif #endif diff --git a/thirdparty/libtheora/x86/x86enquant.c b/thirdparty/libtheora/x86/x86enquant.c new file mode 100644 index 0000000000..39477ecc21 --- /dev/null +++ b/thirdparty/libtheora/x86/x86enquant.c @@ -0,0 +1,149 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $ + + ********************************************************************/ + +#include "x86enc.h" + +#if defined(OC_X86_ASM) + + + +/*The default enquant table is not quite suitable for SIMD purposes. + First, the m and l parameters need to be separated so that an entire row full + of m's or l's can be loaded at a time. + Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to + emulate one with a multiply. 
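/*How the multiply stands in for the missing shift: for any q>=0 and l>=2,
   (q*(1<<16-l))>>16==q>>l, so the table stores the scale factor 1<<16-l and
   the second pmulhw in the quantizer below performs the per-element shift.
  A scalar model of the two-multiply quantization step (hypothetical helper;
   _m and _s are the m and 1<<16-l values baked into the table):*/
static ogg_int16_t oc_iquant_apply_c(ogg_int16_t _t,ogg_int16_t _m,
 ogg_int16_t _s){
  ogg_int32_t q;
  /*High half of the multiply plus the add-back, as in the pmulhw/paddw
     pair.*/
  q=(_t*(ogg_int32_t)_m>>16)+_t;
  /*The second pmulhw: multiplying by 1<<16-l and keeping the high half
     emulates q>>l.*/
  return (ogg_int16_t)(q*(ogg_int32_t)_s>>16);
}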
+ Therefore we translate the shift count into a scale factor.*/ +void oc_enc_enquant_table_init_x86(void *_enquant, + const ogg_uint16_t _dequant[64]){ + ogg_int16_t *m; + ogg_int16_t *l; + int zzi; + m=(ogg_int16_t *)_enquant; + l=m+64; + for(zzi=0;zzi<64;zzi++){ + oc_iquant q; + oc_iquant_init(&q,_dequant[zzi]); + m[zzi]=q.m; + /*q.l must be at least 2 for this to work; fortunately, once all the scale + factors are baked in, the minimum quantizer is much larger than that.*/ + l[zzi]=1<<16-q.l; + } +} + +void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){ + int pli; + int qii; + int qti; + for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){ + ((ogg_int16_t *)_enquant[pli][qii][qti])[0]= + ((ogg_int16_t *)_enquant[pli][0][qti])[0]; + ((ogg_int16_t *)_enquant[pli][qii][qti])[64]= + ((ogg_int16_t *)_enquant[pli][0][qti])[64]; + } +} + +int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64], + const ogg_uint16_t _dequant[64],const void *_enquant){ + ptrdiff_t r; + __asm__ __volatile__( + "xor %[r],%[r]\n\t" + /*Loop through two rows at a time.*/ + ".p2align 4\n\t" + "0:\n\t" + /*Load the first two rows of the data and the quant matrices.*/ + "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t" + "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t" + "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t" + "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t" + "movdqa 0x00(%[q],%[r]),%%xmm4\n\t" + "movdqa 0x10(%[q],%[r]),%%xmm5\n\t" + /*Double the input and propagate its sign to the rounding factor. + Using SSSE3's psignw would help here, but we need the mask later anyway.*/ + "movdqa %%xmm0,%%xmm6\n\t" + "psraw $15,%%xmm0\n\t" + "movdqa %%xmm1,%%xmm7\n\t" + "paddw %%xmm6,%%xmm6\n\t" + "psraw $15,%%xmm1\n\t" + "paddw %%xmm7,%%xmm7\n\t" + "paddw %%xmm0,%%xmm2\n\t" + "paddw %%xmm1,%%xmm3\n\t" + "pxor %%xmm0,%%xmm2\n\t" + "pxor %%xmm1,%%xmm3\n\t" + /*Add the rounding factor and perform the first multiply.*/ + "paddw %%xmm2,%%xmm6\n\t" + "paddw %%xmm3,%%xmm7\n\t" + "pmulhw %%xmm6,%%xmm4\n\t" + "pmulhw %%xmm7,%%xmm5\n\t" + "movdqa 0x80(%[q],%[r]),%%xmm2\n\t" + "movdqa 0x90(%[q],%[r]),%%xmm3\n\t" + "paddw %%xmm4,%%xmm6\n\t" + "paddw %%xmm5,%%xmm7\n\t" + /*Emulate an element-wise right-shift via a second multiply.*/ + "pmulhw %%xmm2,%%xmm6\n\t" + "pmulhw %%xmm3,%%xmm7\n\t" + "add $32,%[r]\n\t" + "cmp $96,%[r]\n\t" + /*Correct for the sign.*/ + "psubw %%xmm0,%%xmm6\n\t" + "psubw %%xmm1,%%xmm7\n\t" + /*Save the result.*/ + "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t" + "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t" + "jle 0b\n\t" + /*Now find the location of the last non-zero value.*/ + "movdqa 0x50(%[qdct]),%%xmm5\n\t" + "movdqa 0x40(%[qdct]),%%xmm4\n\t" + "packsswb %%xmm7,%%xmm6\n\t" + "packsswb %%xmm5,%%xmm4\n\t" + "pxor %%xmm0,%%xmm0\n\t" + "mov $-1,%k[dq]\n\t" + "pcmpeqb %%xmm0,%%xmm6\n\t" + "pcmpeqb %%xmm0,%%xmm4\n\t" + "pmovmskb %%xmm6,%k[q]\n\t" + "pmovmskb %%xmm4,%k[r]\n\t" + "shl $16,%k[q]\n\t" + "or %k[r],%k[q]\n\t" + "mov $32,%[r]\n\t" + /*We have to use xor here instead of not in order to set the flags.*/ + "xor %k[dq],%k[q]\n\t" + "jnz 1f\n\t" + "movdqa 0x30(%[qdct]),%%xmm7\n\t" + "movdqa 0x20(%[qdct]),%%xmm6\n\t" + "movdqa 0x10(%[qdct]),%%xmm5\n\t" + "movdqa 0x00(%[qdct]),%%xmm4\n\t" + "packsswb %%xmm7,%%xmm6\n\t" + "packsswb %%xmm5,%%xmm4\n\t" + "pcmpeqb %%xmm0,%%xmm6\n\t" + "pcmpeqb %%xmm0,%%xmm4\n\t" + "pmovmskb %%xmm6,%k[q]\n\t" + "pmovmskb %%xmm4,%k[r]\n\t" + "shl $16,%k[q]\n\t" + "or %k[r],%k[q]\n\t" + "xor %[r],%[r]\n\t" + "not %k[q]\n\t" + "or $1,%k[q]\n\t" + "1:\n\t" + "bsr %k[q],%k[q]\n\t" + 
"add %k[q],%k[r]\n\t" + :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant) + :[dct]"r"(_dct),[qdct]"r"(_qdct) + :"cc","memory" + ); + return (int)r; +} + +#endif diff --git a/thirdparty/libtheora/x86/x86int.h b/thirdparty/libtheora/x86/x86int.h index ede724f5aa..ceb2dbb0ec 100644 --- a/thirdparty/libtheora/x86/x86int.h +++ b/thirdparty/libtheora/x86/x86int.h @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -19,24 +19,104 @@ # define _x86_x86int_H (1) # include "../internal.h" -void oc_state_vtable_init_x86(oc_theora_state *_state); +# if defined(OC_X86_ASM) +# define oc_state_accel_init oc_state_accel_init_x86 +# if defined(OC_X86_64_ASM) +/*x86-64 guarantees SIMD support up through at least SSE2. + If the best routine we have available only needs SSE2 (which at the moment + covers all of them), then we can avoid runtime detection and the indirect + call.*/ +# define oc_frag_copy(_state,_dst,_src,_ystride) \ + oc_frag_copy_mmx(_dst,_src,_ystride) +# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs) \ + oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \ + _fragis,_nfragis,_frag_buf_offs) +# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \ + oc_frag_recon_intra_mmx(_dst,_ystride,_residue) +# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \ + oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue) +# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \ + oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue) +# define oc_idct8x8(_state,_y,_x,_last_zzi) \ + oc_idct8x8_sse2(_y,_x,_last_zzi) +# define oc_state_frag_recon oc_state_frag_recon_mmx +# define oc_loop_filter_init(_state,_bv,_flimit) \ + oc_loop_filter_init_mmxext(_bv,_flimit) +# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext +# define oc_restore_fpu(_state) \ + oc_restore_fpu_mmx() +# else +# define OC_STATE_USE_VTABLE (1) +# endif +# endif + +# include "../state.h" +# include "x86cpu.h" + +/*Converts the expression in the argument to a string.*/ +#define OC_M2STR(_s) #_s + +/*Memory operands do not always include an offset. + To avoid warnings, we force an offset with %H (which adds 8).*/ +# if __GNUC_PREREQ(4,0) +# define OC_MEM_OFFS(_offs,_name) \ + OC_M2STR(_offs-8+%H[_name]) +# endif +/*If your gcc version does't support %H, then you get to suffer the warnings. + Note that Apple's gas breaks on things like _offs+(%esp): it throws away the + whole offset, instead of substituting in 0 for the missing operand to +.*/ +# if !defined(OC_MEM_OFFS) +# define OC_MEM_OFFS(_offs,_name) \ + OC_M2STR(_offs+%[_name]) +# endif + +/*Declare an array operand with an exact size. + This tells gcc we're going to clobber this memory region, without having to + clobber all of "memory" and lets us access local buffers directly using the + stack pointer, without allocating a separate register to point to them.*/ +#define OC_ARRAY_OPERAND(_type,_ptr,_size) \ + (*({ \ + struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \ + array_addr__; \ + })) + +/*Declare an array operand with an exact size. 
+ This tells gcc we're going to clobber this memory region, without having to + clobber all of "memory" and lets us access local buffers directly using the + stack pointer, without allocating a separate register to point to them.*/ +#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \ + (*({ \ + const struct{_type array_value__[(_size)];} *array_addr__= \ + (const void *)(_ptr); \ + array_addr__; \ + })) + +extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64]; + +void oc_state_accel_init_x86(oc_theora_state *_state); void oc_frag_copy_mmx(unsigned char *_dst, const unsigned char *_src,int _ystride); +void oc_frag_copy_list_mmx(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, const ogg_int16_t *_residue); void oc_frag_recon_inter_mmx(unsigned char *_dst, const unsigned char *_src,int _ystride,const ogg_int16_t *_residue); void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); -void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi); +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); +void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli); + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit); +void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit); void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); +void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); void oc_restore_fpu_mmx(void); #endif diff --git a/thirdparty/libtheora/x86/x86state.c b/thirdparty/libtheora/x86/x86state.c index a786bec284..9f8bceb534 100644 --- a/thirdparty/libtheora/x86/x86state.c +++ b/thirdparty/libtheora/x86/x86state.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -19,8 +19,7 @@ #if defined(OC_X86_ASM) -#include "../cpu.c" - +#if defined(OC_STATE_USE_VTABLE) /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into each quadrant of the destination.*/ static const unsigned char OC_FZIG_ZAG_MMX[128]={ @@ -39,24 +38,60 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={ 64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64 +}; +#endif + +/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into + the destination.*/ +static const unsigned char OC_FZIG_ZAG_SSE2[128]={ + 0, 8, 1, 2, 9,16,24,17, + 10, 3, 4,11,18,25,32,40, + 33,26,19,12, 5, 6,13,20, + 27,34,41,48,56,49,42,35, + 28,21,14, 7,15,22,29,36, + 43,50,57,58,51,44,37,30, + 23,31,38,45,52,59,60,53, + 
46,39,47,54,61,62,55,63, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64, + 64,64,64,64,64,64,64,64 }; -void oc_state_vtable_init_x86(oc_theora_state *_state){ +void oc_state_accel_init_x86(oc_theora_state *_state){ + oc_state_accel_init_c(_state); _state->cpu_flags=oc_cpu_flags_get(); +# if defined(OC_STATE_USE_VTABLE) if(_state->cpu_flags&OC_CPU_X86_MMX){ _state->opt_vtable.frag_copy=oc_frag_copy_mmx; + _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx; _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx; _state->opt_vtable.idct8x8=oc_idct8x8_mmx; _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx; - _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx; + _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx; _state->opt_vtable.state_loop_filter_frag_rows= oc_state_loop_filter_frag_rows_mmx; _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx; _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX; } - else oc_state_vtable_init_c(_state); + if(_state->cpu_flags&OC_CPU_X86_MMXEXT){ + _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext; + _state->opt_vtable.state_loop_filter_frag_rows= + oc_state_loop_filter_frag_rows_mmxext; + } + if(_state->cpu_flags&OC_CPU_X86_SSE2){ + _state->opt_vtable.idct8x8=oc_idct8x8_sse2; +# endif + _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2; +# if defined(OC_STATE_USE_VTABLE) + } +# endif } #endif diff --git a/thirdparty/libtheora/x86/x86zigzag.h b/thirdparty/libtheora/x86/x86zigzag.h new file mode 100644 index 0000000000..fb21e0bb43 --- /dev/null +++ b/thirdparty/libtheora/x86/x86zigzag.h @@ -0,0 +1,244 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ + + ********************************************************************/ + +#if !defined(_x86_x86zigzag_H) +# define _x86_x86zigzag_H (1) +# include "x86enc.h" + + +/*Converts DCT coefficients from transposed order into zig-zag scan order and + stores them in %[y]. + This relies on two macros to load the contents of each row: + OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load + the first four and second four entries of each row into the specified + register, respectively. + OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row + (because when the rows are already in SSE2 registers, loading the high half + destructively modifies the register). 
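Both zig-zag macros in this header have a compact scalar meaning. Assuming OC_FZIG_ZAG[zzi] gives the raster index of the zzi-th coefficient in scan order (as libtheora's forward zig-zag table does), the transposed variant defined just below and the natural-order variant further down reduce to this reference sketch (illustrative code, not the shipped routines):

/*Reference model for the zig-zag macros that follow. The fDCT skips
  its final transpose, so the transposed variant fetches each
  coefficient from the mirrored position instead.*/
static void oc_zig_zag_ref(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
 int _transposed){
  int zzi;
  for(zzi=0;zzi<64;zzi++){
    int ri;
    ri=OC_FZIG_ZAG[zzi];
    if(_transposed)ri=(ri&7)<<3|ri>>3;
    _qdct[zzi]=_dct[ri];
  }
}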
+ The index of each output element in the original 64-element array should wind + up in the following 8x8 matrix (the letters indicate the order we compute + each 4-tuple below): + A 0 8 1 2 9 16 24 17 B + C 10 3 4 11 18 25 32 40 E + F 33 26 19 12 5 6 13 20 D + G 27 34 41 48 56 49 42 35 I + L 28 21 14 7 15 22 29 36 M + H 43 50 57 58 51 44 37 30 O + N 23 31 38 45 52 59 60 53 J + P 46 39 47 54 61 62 55 63 K + The order of the coefficients within each tuple is reversed in the comments + below to reflect the usual MSB to LSB notation.*/ +#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \ + OC_ZZ_LOAD_ROW_LO(0,"%%mm0") /*mm0=03 02 01 00*/ \ + OC_ZZ_LOAD_ROW_LO(1,"%%mm1") /*mm1=11 10 09 08*/ \ + OC_ZZ_LOAD_ROW_LO(2,"%%mm2") /*mm2=19 18 17 16*/ \ + OC_ZZ_LOAD_ROW_LO(3,"%%mm3") /*mm3=27 26 25 24*/ \ + OC_ZZ_LOAD_ROW_HI(0,"%%mm4") /*mm4=07 06 05 04*/ \ + OC_ZZ_LOAD_ROW_HI(1,"%%mm5") /*mm5=15 14 13 12*/ \ + OC_ZZ_LOAD_ROW_HI(2,"%%mm6") /*mm6=23 22 21 20*/ \ + "movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \ + "punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \ + "pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \ + "punpcklwd %%mm0,%%mm1\n\t" /*mm1=03 09 02 08*/ \ + "pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \ + "punpcklwd %%mm1,%%mm7\n\t" /*mm7=02 01 08 00 *A*/ \ + "movq %%mm7,0x00(%[y])\n\t" \ + "punpckhwd %%mm4,%%mm1\n\t" /*mm1=04 03 07 09*/ \ + "movq %%mm2,%%mm7\n\t" /*mm7=19 18 17 16*/ \ + "punpckhdq %%mm1,%%mm0\n\t" /*mm0=04 03 11 10*/ \ + "punpckhwd %%mm5,%%mm7\n\t" /*mm7=12 19 15 18*/ \ + "punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \ + "punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \ + "punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \ + OC_ZZ_LOAD_ROW_LO(4,"%%mm2") /*mm2=35 34 33 32*/ \ + "movq %%mm1,0x08(%[y])\n\t" \ + OC_ZZ_LOAD_ROW_LO(5,"%%mm1") /*mm1=43 42 41 40*/ \ + "pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \ + "movq %%mm0,0x10(%[y])\n\t" \ + "punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \ + "punpckldq %%mm5,%%mm4\n\t" /*mm4=20 13 06 05 *D*/ \ + "movq %%mm4,0x28(%[y])\n\t" \ + "psrlq $16,%%mm3\n\t" /*mm3=.. 27 26 25*/ \ + "pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \ + "movq %%mm7,%%mm4\n\t" /*mm4=12 19 15 18*/ \ + "punpcklwd %%mm3,%%mm2\n\t" /*mm2=26 33 25 32*/ \ + "punpcklwd %%mm1,%%mm4\n\t" /*mm4=41 15 40 18*/ \ + "punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \ + "punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \ + "punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \ + OC_ZZ_LOAD_ROW_LO(6,"%%mm0") /*mm0=51 50 49 48*/ \ + "pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \ + "movq %%mm4,0x18(%[y])\n\t" \ + OC_ZZ_LOAD_ROW_LO(7,"%%mm4") /*mm4=59 58 57 56*/ \ + "punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \ + "movq %%mm2,0x20(%[y])\n\t" \ + "pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? 
??*/ \ + "pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \ + "movq %%mm3,%%mm2\n\t" /*mm2=35 42 34 27*/ \ + "punpckhwd %%mm0,%%mm1\n\t" /*mm1=50 43 48 41*/ \ + "pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \ + "punpckldq %%mm1,%%mm3\n\t" /*mm3=48 41 34 27 *G*/ \ + "movq %%mm3,0x30(%[y])\n\t" \ + "punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \ + "movq %%mm1,0x50(%[y])\n\t" \ + OC_ZZ_LOAD_ROW_HI(7,"%%mm1") /*mm1=63 62 61 60*/ \ + "punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \ + OC_ZZ_LOAD_ROW_HI(6,"%%mm0") /*mm0=55 54 53 52*/ \ + "psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \ + "movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \ + "punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \ + OC_ZZ_LOAD_ROW_HI(3,"%%mm2") /*mm2=31 30 29 28*/ \ + "movq %%mm4,0x38(%[y])\n\t" \ + "punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \ + "punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \ + "movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \ + "punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \ + "punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \ + OC_ZZ_LOAD_ROW_HI(4,"%%mm0") /*mm0=39 38 37 36*/ \ + "pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \ + "movq %%mm3,0x68(%[y])\n\t" \ + "movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 54 51*/ \ + "pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \ + "punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \ + OC_ZZ_LOAD_ROW_HI(5,"%%mm1") /*mm1=47 46 45 44*/ \ + "movq %%mm4,0x78(%[y])\n\t" \ + "punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \ + "punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \ + "punpckhdq %%mm6,%%mm5\n\t" /*mm5=28 07 21 14*/ \ + "pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \ + "pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \ + "movq %%mm5,0x40(%[y])\n\t" \ + "punpckhdq %%mm2,%%mm7\n\t" /*mm7=36 29 22 15 *M*/ \ + "movq %%mm7,0x48(%[y])\n\t" \ + "pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \ + "punpckhwd %%mm1,%%mm0\n\t" /*mm0=46 39 45 38*/ \ + "punpcklwd %%mm1,%%mm3\n\t" /*mm3=47 54 44 51*/ \ + "punpckldq %%mm0,%%mm6\n\t" /*mm6=45 38 31 23 *N*/ \ + "movq %%mm6,0x60(%[y])\n\t" \ + "punpckhdq %%mm3,%%mm0\n\t" /*mm0=47 54 46 39*/ \ + "punpckldq %%mm2,%%mm3\n\t" /*mm3=30 37 44 51 *O*/ \ + "movq %%mm3,0x58(%[y])\n\t" \ + "pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \ + "movq %%mm0,0x70(%[y])\n\t" \ + +/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan + order and stores them in %[qdct]. 
+ The index of each output element in the original 64-element array should wind + up in the following 8x8 matrix (the letters indicate the order we compute + each 4-tuple below): + A 0 1 8 16 9 2 3 10 B + C 17 24 32 25 18 11 4 5 D + E 12 19 26 33 40 48 41 34 I + H 27 20 13 6 7 14 21 28 G + K 35 42 49 56 57 50 43 36 J + F 29 22 15 23 30 37 44 51 M + P 58 59 52 45 38 31 39 46 L + N 53 60 61 54 47 55 62 63 O + The order of the coefficients within each tuple is reversed in the comments + below to reflect the usual MSB to LSB notation.*/ +#define OC_ZIG_ZAG_MMXEXT \ + "movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \ + "movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \ + "movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \ + "movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \ + "movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \ + "movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \ + "movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \ + "punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \ + "movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \ + "punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \ + "movq %%mm0,0x00(%[qdct])\n\t" \ + "movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \ + "punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \ + "psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \ + "punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \ + "punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \ + "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \ + "movq %%mm6,0x08(%[qdct])\n\t" \ + "psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \ + "movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \ + "punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \ + "movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \ + "punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \ + "por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \ + "punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \ + "movq %%mm2,0x10(%[qdct])\n\t" \ + "movq %%mm3,0x18(%[qdct])\n\t" \ + "movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \ + "movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \ + "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \ + "punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \ + "punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \ + "punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \ + "punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \ + "punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \ + "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \ + "psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \ + "punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \ + "punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \ + "punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \ + "movq %%mm0,0x20(%[qdct])\n\t" \ + "movq %%mm3,0x50(%[qdct])\n\t" \ + "movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \ + "movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \ + "movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \ + "punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \ + "psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \ + "movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \ + "punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \ + "punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \ + "movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \ + "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \ + "movq %%mm2,0x30(%[qdct])\n\t" \ + "movq %%mm6,0x38(%[qdct])\n\t" \ + "movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \ + "punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \ + "movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \ + "punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \ + "psrlq $32,%%mm3\n\t" /*mm3=.. .. 
51 50*/ \ + "punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \ + "punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \ + "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \ + "movq %%mm0,0x28(%[qdct])\n\t" \ + "punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \ + "punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \ + "punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \ + "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \ + "movq %%mm4,0x40(%[qdct])\n\t" \ + "movq %%mm6,0x48(%[qdct])\n\t" \ + "movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \ + "movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \ + "psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \ + "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \ + "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \ + "punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \ + "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \ + "punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \ + "punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \ + "movq %%mm2,0x68(%[qdct])\n\t" \ + "movq %%mm1,0x58(%[qdct])\n\t" \ + "punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \ + "punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \ + "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \ + "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \ + "punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \ + "punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \ + "movq %%mm6,0x70(%[qdct])\n\t" \ + "movq %%mm5,0x78(%[qdct])\n\t" \ + "movq %%mm7,0x60(%[qdct])\n\t" \ + +#endif diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c index 94f1d06513..a6be819135 100644 --- a/thirdparty/libtheora/x86_vc/mmxencfrag.c +++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c @@ -266,7 +266,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, /*Performs the first two stages of an 8-point 1-D Hadamard transform. The transform is performed in place, except that outputs 0-3 are swapped with outputs 4-7. - Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to + Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to perform this stage in place with no temporary registers).*/ #define OC_HADAMARD_AB_8x4 __asm{ \ /*Stage A: \ @@ -299,7 +299,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, } /*Performs the last stage of an 8-point 1-D Hadamard transform in place. 
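The output reordering and sign flips described in these Hadamard comments are free because SATD only ever sums absolute values of the transform outputs. For reference, a plain 8-point Walsh-Hadamard butterfly, which the MMX stages compute up to permutation and sign, looks like this sketch:

/*Unoptimized 8-point Walsh-Hadamard transform, in place: three
  butterfly stages with strides 4, 2, and 1. The MMX macros produce the
  same values up to reordering and negation, which the absolute-value
  accumulation hides.*/
static void oc_wht8_ref(int _t[8]){
  int s;
  for(s=4;s>0;s>>=1){
    int i;
    for(i=0;i<8;i++)if(!(i&s)){
      int a;
      int b;
      a=_t[i];
      b=_t[i+s];
      _t[i]=a+b;
      _t[i+s]=a-b;
    }
  }
}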
- Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in + Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in place with no temporary registers).*/ #define OC_HADAMARD_C_8x4 __asm{ \ /*Stage C:*/ \ @@ -468,12 +468,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, mm7 = d3 c3 b3 a3*/ \ } -static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, - int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret1; - unsigned ret2; +static unsigned oc_int_frag_satd_mmxext(int *_dc, + const unsigned char *_src,int _src_ystride, + const unsigned char *_ref,int _ref_ystride){ + OC_ALIGN8(ogg_int16_t buf[64]); + ogg_int16_t *bufp; + unsigned ret; + unsigned ret2; + int dc; bufp=buf; __asm{ #define SRC esi @@ -481,8 +483,10 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, #define SRC_YSTRIDE ecx #define REF_YSTRIDE edx #define BUF edi -#define RET eax -#define RET2 edx +#define RET edx +#define RET2 ecx +#define DC eax +#define DC_WORD ax mov SRC,_src mov SRC_YSTRIDE,_src_ystride mov REF,_ref @@ -508,14 +512,18 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, movq mm2,[0x20+BUF] movq mm3,[0x30+BUF] movq mm0,[0x00+BUF] - OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38) + /*We split out the stages here so we can save the DC coefficient in the + middle.*/ + OC_HADAMARD_AB_8x4 + OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) + movd DC,mm1 + OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) /*Up to this point, everything fit in 16 bits (8 input + 1 for the difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 for the factor of two we dropped + 3 for the vertical accumulation). Now we finally have to promote things to dwords. 
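The bit budget in the comment above does sum to exactly 16 (8+1+2*3-1-1+3=16), which is why the promotion to dwords can wait until this point. A few lines below, the asm also removes |dc| from the total with a branchless absolute value (cdq produces the sign mask, then xor and sub apply it); the same identity in C:

/*Branchless 32-bit absolute value: s is 0 for non-negative _v and -1
  otherwise (assuming the usual arithmetic right shift on signed ints),
  and (_v^s)-s==abs(_v).*/
static int oc_abs32(int _v){
  int s;
  s=_v>>31;
  return (_v^s)-s;
}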
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long latency of pmaddwd by starting the next series of loads now.*/ - mov RET2,_thresh pmaddwd mm0,mm7 movq mm1,[0x50+BUF] movq mm5,[0x58+BUF] @@ -525,29 +533,28 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, movq mm6,[0x68+BUF] paddd mm4,mm0 movq mm3,[0x70+BUF] - movd RET,mm4 + movd RET2,mm4 movq mm7,[0x78+BUF] - /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 - added to them, and a factor of two removed; correct the final sum here.*/ - lea RET,[RET+RET-32] movq mm0,[0x40+BUF] - cmp RET,RET2 movq mm4,[0x48+BUF] - jae at_end OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) pmaddwd mm0,mm7 - /*There isn't much to stick in here to hide the latency this time, but the - alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose - latency is even worse.*/ - sub RET,32 + /*Subtract abs(dc) from 2*ret2.*/ + movsx DC,DC_WORD + cdq + lea RET2,[RET+RET2*2] movq mm4,mm0 punpckhdq mm0,mm0 + xor RET,DC paddd mm4,mm0 - movd RET2,mm4 - lea RET,[RET+RET2*2] - align 16 -at_end: - mov ret1,RET + /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 + added to them, a factor of two removed, and the DC value included; + correct the final sum here.*/ + sub RET2,RET + movd RET,mm4 + lea RET,[RET2+RET*2-64] + mov ret,RET + mov dc,DC #undef SRC #undef REF #undef SRC_YSTRIDE @@ -555,18 +562,21 @@ at_end: #undef BUF #undef RET #undef RET2 +#undef DC +#undef DC_WORD } - return ret1; + *_dc=dc; + return ret; } -unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh){ - return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh); +unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src, + const unsigned char *_ref,int _ystride){ + return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride); } /*Our internal implementation of frag_copy2 takes an extra stride parameter so - we can share code with oc_enc_frag_satd2_thresh_mmxext().*/ + we can share code with oc_enc_frag_satd2_mmxext().*/ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ __asm{ @@ -694,30 +704,31 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, } } -unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh){ +unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){ OC_ALIGN8(unsigned char ref[64]); oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); - return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh); + return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8); } -unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, +unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src, int _ystride){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret1; - unsigned ret2; + OC_ALIGN8(ogg_int16_t buf[64]); + ogg_int16_t *bufp; + unsigned ret1; + unsigned ret2; + int dc; bufp=buf; __asm{ #define SRC eax #define SRC4 esi #define BUF edi -#define RET eax -#define RET_WORD ax -#define RET2 ecx #define YSTRIDE edx #define YSTRIDE3 ecx +#define RET eax +#define RET2 ecx +#define DC edx +#define DC_WORD dx mov SRC,_src mov BUF,bufp mov YSTRIDE,_ystride @@ -749,7 +760,7 @@ unsigned 
oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, middle.*/ OC_HADAMARD_AB_8x4 OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) - movd RET,mm1 + movd DC,mm1 OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) /*Up to this point, everything fit in 16 bits (8 input + 1 for the difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 @@ -767,31 +778,34 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, movq mm3,[0x70+BUF] paddd mm4,mm0 movq mm7,[0x78+BUF] - movd RET2,mm4 + movd RET,mm4 movq mm0,[0x40+BUF] movq mm4,[0x48+BUF] OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) pmaddwd mm0,mm7 /*We assume that the DC coefficient is always positive (which is true, because the input to the INTRA transform was not a difference).*/ - movzx RET,RET_WORD - add RET2,RET2 - sub RET2,RET + movzx DC,DC_WORD + add RET,RET + sub RET,DC movq mm4,mm0 punpckhdq mm0,mm0 paddd mm4,mm0 - movd RET,mm4 - lea RET,[-64+RET2+RET*2] + movd RET2,mm4 + lea RET,[-64+RET+RET2*2] + mov [dc],DC mov [ret1],RET #undef SRC #undef SRC4 #undef BUF -#undef RET -#undef RET_WORD -#undef RET2 #undef YSTRIDE #undef YSTRIDE3 +#undef RET +#undef RET2 +#undef DC +#undef DC_WORD } + *_dc=dc; return ret1; } diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c index d908ce2413..c9ee530ea2 100644 --- a/thirdparty/libtheora/x86_vc/mmxfdct.c +++ b/thirdparty/libtheora/x86_vc/mmxfdct.c @@ -12,6 +12,7 @@ /*MMX fDCT implementation for x86_32*/ /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ #include "x86enc.h" +#include "x86zigzag.h" #if defined(OC_X86_ASM) @@ -462,18 +463,22 @@ } /*MMX implementation of the fDCT.*/ -void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - ptrdiff_t a; +void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ + OC_ALIGN8(ogg_int16_t buf[64]); + ogg_int16_t *bufp; + bufp=buf; __asm{ +#define X edx #define Y eax #define A ecx -#define X edx +#define BUF esi /*Add two extra bits of working precision to improve accuracy; any more and we could overflow.*/ /*We also add biases to correct for some systematic error that remains in the full fDCT->iDCT round trip.*/ mov X, _x mov Y, _y + mov BUF, bufp movq mm0,[0x00+X] movq mm1,[0x10+X] movq mm2,[0x20+X] @@ -591,79 +596,90 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ movq mm3,[0x30+Y] OC_FDCT_STAGE1_8x4 OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) - OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38) /*mm0={-2}x4*/ - pcmpeqw mm0,mm0 - paddw mm0,mm0 - /*Round the results.*/ - psubw mm1,mm0 - psubw mm2,mm0 - psraw mm1,2 - psubw mm3,mm0 - movq [0x18+Y],mm1 - psraw mm2,2 - psubw mm4,mm0 - movq mm1,[0x08+Y] - psraw mm3,2 - psubw mm5,mm0 + pcmpeqw mm2,mm2 + paddw mm2,mm2 + /*Round and store the results (no transpose).*/ + movq mm7,[Y+0x10] + psubw mm4,mm2 + psubw mm6,mm2 psraw mm4,2 - psubw mm6,mm0 - psraw mm5,2 - psubw mm7,mm0 + psubw mm0,mm2 + movq [BUF+0x00],mm4 + movq mm4,[Y+0x30] psraw mm6,2 - psubw mm1,mm0 + psubw mm5,mm2 + movq [BUF+0x20],mm6 + psraw mm0,2 + psubw mm3,mm2 + movq [BUF+0x40],mm0 + psraw mm5,2 + psubw mm1,mm2 + movq [BUF+0x50],mm5 + psraw mm3,2 + psubw mm7,mm2 + movq [BUF+0x60],mm3 + psraw mm1,2 + psubw mm4,mm2 + movq [BUF+0x70],mm1 psraw mm7,2 + movq [BUF+0x10],mm7 + psraw mm4,2 + movq [BUF+0x30],mm4 + /*Load the next block.*/ movq mm0,[0x40+Y] - psraw mm1,2 - movq [0x30+Y],mm7 movq mm7,[0x78+Y] - movq [0x08+Y],mm1 movq mm1,[0x50+Y] - movq [0x20+Y],mm6 movq mm6,[0x68+Y] - movq [0x28+Y],mm2 movq mm2,[0x60+Y] - movq [0x10+Y],mm5 movq 
mm5,[0x58+Y] - movq [0x38+Y],mm3 movq mm3,[0x70+Y] - movq [0x00+Y],mm4 movq mm4,[0x48+Y] OC_FDCT_STAGE1_8x4 OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) - OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78) /*mm0={-2}x4*/ - pcmpeqw mm0,mm0 - paddw mm0,mm0 - /*Round the results.*/ - psubw mm1,mm0 - psubw mm2,mm0 - psraw mm1,2 - psubw mm3,mm0 - movq [0x58+Y],mm1 - psraw mm2,2 - psubw mm4,mm0 - movq mm1,[0x48+Y] - psraw mm3,2 - psubw mm5,mm0 - movq [0x68+Y],mm2 + pcmpeqw mm2,mm2 + paddw mm2,mm2 + /*Round and store the results (no transpose).*/ + movq mm7,[Y+0x50] + psubw mm4,mm2 + psubw mm6,mm2 psraw mm4,2 - psubw mm6,mm0 - movq [0x78+Y],mm3 - psraw mm5,2 - psubw mm7,mm0 - movq [0x40+Y],mm4 + psubw mm0,mm2 + movq [BUF+0x08],mm4 + movq mm4,[Y+0x70] psraw mm6,2 - psubw mm1,mm0 - movq [0x50+Y],mm5 - psraw mm7,2 - movq [0x60+Y],mm6 + psubw mm5,mm2 + movq [BUF+0x28],mm6 + psraw mm0,2 + psubw mm3,mm2 + movq [BUF+0x48],mm0 + psraw mm5,2 + psubw mm1,mm2 + movq [BUF+0x58],mm5 + psraw mm3,2 + psubw mm7,mm2 + movq [BUF+0x68],mm3 psraw mm1,2 - movq [0x70+Y],mm7 - movq [0x48+Y],mm1 + psubw mm4,mm2 + movq [BUF+0x78],mm1 + psraw mm7,2 + movq [BUF+0x18],mm7 + psraw mm4,2 + movq [BUF+0x38],mm4 +#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \ + __asm movq _reg,[BUF+16*(_row)] \ + +#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \ + __asm movq _reg,[BUF+16*(_row)+8] \ + + OC_TRANSPOSE_ZIG_ZAG_MMXEXT +#undef OC_ZZ_LOAD_ROW_LO +#undef OC_ZZ_LOAD_ROW_HI +#undef X #undef Y #undef A -#undef X +#undef BUF } } diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.c b/thirdparty/libtheora/x86_vc/mmxfrag.c index 4eb2084dc6..248312ff90 100644 --- a/thirdparty/libtheora/x86_vc/mmxfrag.c +++ b/thirdparty/libtheora/x86_vc/mmxfrag.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $ + last mod: $Id$ ********************************************************************/ @@ -22,12 +22,63 @@ The iteration each instruction belongs to is marked in the comments as #i.*/ #include <stddef.h> #include "x86int.h" -#include "mmxfrag.h" #if defined(OC_X86_ASM) /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes between rows.*/ +# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ + do{ \ + const unsigned char *src; \ + unsigned char *dst; \ + src=(_src); \ + dst=(_dst); \ + __asm mov SRC,src \ + __asm mov DST,dst \ + __asm mov YSTRIDE,_ystride \ + /*src+0*ystride*/ \ + __asm movq mm0,[SRC] \ + /*src+1*ystride*/ \ + __asm movq mm1,[SRC+YSTRIDE] \ + /*ystride3=ystride*3*/ \ + __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ + /*src+2*ystride*/ \ + __asm movq mm2,[SRC+YSTRIDE*2] \ + /*src+3*ystride*/ \ + __asm movq mm3,[SRC+YSTRIDE3] \ + /*dst+0*ystride*/ \ + __asm movq [DST],mm0 \ + /*dst+1*ystride*/ \ + __asm movq [DST+YSTRIDE],mm1 \ + /*Pointer to next 4.*/ \ + __asm lea SRC,[SRC+YSTRIDE*4] \ + /*dst+2*ystride*/ \ + __asm movq [DST+YSTRIDE*2],mm2 \ + /*dst+3*ystride*/ \ + __asm movq [DST+YSTRIDE3],mm3 \ + /*Pointer to next 4.*/ \ + __asm lea DST,[DST+YSTRIDE*4] \ + /*src+0*ystride*/ \ + __asm movq mm0,[SRC] \ + /*src+1*ystride*/ \ + __asm movq mm1,[SRC+YSTRIDE] \ + /*src+2*ystride*/ \ + __asm movq mm2,[SRC+YSTRIDE*2] \ + /*src+3*ystride*/ \ + __asm movq mm3,[SRC+YSTRIDE3] \ + /*dst+0*ystride*/ \ + __asm movq [DST],mm0 \ + /*dst+1*ystride*/ \ + __asm movq [DST+YSTRIDE],mm1 \ + /*dst+2*ystride*/ \ + __asm movq [DST+YSTRIDE*2],mm2 \ + /*dst+3*ystride*/ \ + __asm movq [DST+YSTRIDE3],mm3 \ + } \ + while(0) + +/*Copies an 8x8 block 
of pixels from _src to _dst, assuming _ystride bytes + between rows.*/ void oc_frag_copy_mmx(unsigned char *_dst, const unsigned char *_src,int _ystride){ #define SRC edx @@ -41,6 +92,34 @@ void oc_frag_copy_mmx(unsigned char *_dst, #undef YSTRIDE3 } +/*Copies the fragments specified by the lists of fragment indices from one + frame to another. + _dst_frame: The reference frame to copy to. + _src_frame: The reference frame to copy from. + _ystride: The row stride of the reference frames. + _fragis: A pointer to a list of fragment indices. + _nfragis: The number of fragment indices to copy. + _frag_buf_offs: The offsets of fragments in the reference frames.*/ +void oc_frag_copy_list_mmx(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ + ptrdiff_t fragii; + for(fragii=0;fragii<_nfragis;fragii++){ + ptrdiff_t frag_buf_off; + frag_buf_off=_frag_buf_offs[_fragis[fragii]]; +#define SRC edx +#define DST eax +#define YSTRIDE ecx +#define YSTRIDE3 edi + OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off, + _src_frame+frag_buf_off,_ystride); +#undef SRC +#undef DST +#undef YSTRIDE +#undef YSTRIDE3 + } +} + void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, const ogg_int16_t *_residue){ __asm{ diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.h b/thirdparty/libtheora/x86_vc/mmxfrag.h deleted file mode 100644 index 45ee93e777..0000000000 --- a/thirdparty/libtheora/x86_vc/mmxfrag.h +++ /dev/null @@ -1,61 +0,0 @@ -#if !defined(_x86_vc_mmxfrag_H) -# define _x86_vc_mmxfrag_H (1) -# include <stddef.h> -# include "x86int.h" - -#if defined(OC_X86_ASM) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ - do{ \ - const unsigned char *src; \ - unsigned char *dst; \ - src=(_src); \ - dst=(_dst); \ - __asm mov SRC,src \ - __asm mov DST,dst \ - __asm mov YSTRIDE,_ystride \ - /*src+0*ystride*/ \ - __asm movq mm0,[SRC] \ - /*src+1*ystride*/ \ - __asm movq mm1,[SRC+YSTRIDE] \ - /*ystride3=ystride*3*/ \ - __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ - /*src+2*ystride*/ \ - __asm movq mm2,[SRC+YSTRIDE*2] \ - /*src+3*ystride*/ \ - __asm movq mm3,[SRC+YSTRIDE3] \ - /*dst+0*ystride*/ \ - __asm movq [DST],mm0 \ - /*dst+1*ystride*/ \ - __asm movq [DST+YSTRIDE],mm1 \ - /*Pointer to next 4.*/ \ - __asm lea SRC,[SRC+YSTRIDE*4] \ - /*dst+2*ystride*/ \ - __asm movq [DST+YSTRIDE*2],mm2 \ - /*dst+3*ystride*/ \ - __asm movq [DST+YSTRIDE3],mm3 \ - /*Pointer to next 4.*/ \ - __asm lea DST,[DST+YSTRIDE*4] \ - /*src+0*ystride*/ \ - __asm movq mm0,[SRC] \ - /*src+1*ystride*/ \ - __asm movq mm1,[SRC+YSTRIDE] \ - /*src+2*ystride*/ \ - __asm movq mm2,[SRC+YSTRIDE*2] \ - /*src+3*ystride*/ \ - __asm movq mm3,[SRC+YSTRIDE3] \ - /*dst+0*ystride*/ \ - __asm movq [DST],mm0 \ - /*dst+1*ystride*/ \ - __asm movq [DST+YSTRIDE],mm1 \ - /*dst+2*ystride*/ \ - __asm movq [DST+YSTRIDE*2],mm2 \ - /*dst+3*ystride*/ \ - __asm movq [DST+YSTRIDE3],mm3 \ - } \ - while(0) - -# endif -#endif diff --git a/thirdparty/libtheora/x86_vc/mmxidct.c b/thirdparty/libtheora/x86_vc/mmxidct.c index 8f5ff6803c..55e00aedcf 100644 --- a/thirdparty/libtheora/x86_vc/mmxidct.c +++ b/thirdparty/libtheora/x86_vc/mmxidct.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -24,15 +24,15 @@ 
/*These are offsets into the table of constants below.*/ /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/ -#define OC_COSINE_OFFSET (0) +#define OC_COSINE_OFFSET (8) /*A row of 8's.*/ -#define OC_EIGHT_OFFSET (56) +#define OC_EIGHT_OFFSET (0) /*A table of constants used by the MMX routines.*/ -static const __declspec(align(16))ogg_uint16_t - OC_IDCT_CONSTS[(7+1)*4]={ +static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={ + 8, 8, 8, 8, (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, @@ -46,28 +46,27 @@ static const __declspec(align(16))ogg_uint16_t (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - 8, 8, 8, 8 + (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1 }; /*38 cycles*/ -#define OC_IDCT_BEGIN __asm{ \ - __asm movq mm2,OC_I(3) \ +#define OC_IDCT_BEGIN(_y,_x) __asm{ \ + __asm movq mm2,OC_I(3,_x) \ __asm movq mm6,OC_C(3) \ __asm movq mm4,mm2 \ - __asm movq mm7,OC_J(5) \ + __asm movq mm7,OC_J(5,_x) \ __asm pmulhw mm4,mm6 \ __asm movq mm1,OC_C(5) \ __asm pmulhw mm6,mm7 \ __asm movq mm5,mm1 \ __asm pmulhw mm1,mm2 \ - __asm movq mm3,OC_I(1) \ + __asm movq mm3,OC_I(1,_x) \ __asm pmulhw mm5,mm7 \ __asm movq mm0,OC_C(1) \ __asm paddw mm4,mm2 \ __asm paddw mm6,mm7 \ __asm paddw mm2,mm1 \ - __asm movq mm1,OC_J(7) \ + __asm movq mm1,OC_J(7,_x) \ __asm paddw mm7,mm5 \ __asm movq mm5,mm0 \ __asm pmulhw mm0,mm3 \ @@ -77,13 +76,13 @@ static const __declspec(align(16))ogg_uint16_t __asm psubw mm6,mm2 \ __asm paddw mm0,mm3 \ __asm pmulhw mm3,mm7 \ - __asm movq mm2,OC_I(2) \ + __asm movq mm2,OC_I(2,_x) \ __asm pmulhw mm7,mm1 \ __asm paddw mm5,mm1 \ __asm movq mm1,mm2 \ __asm pmulhw mm2,OC_C(2) \ __asm psubw mm3,mm5 \ - __asm movq mm5,OC_J(6) \ + __asm movq mm5,OC_J(6,_x) \ __asm paddw mm0,mm7 \ __asm movq mm7,mm5 \ __asm psubw mm0,mm4 \ @@ -97,18 +96,18 @@ static const __declspec(align(16))ogg_uint16_t __asm paddw mm6,mm6 \ __asm pmulhw mm7,OC_C(6) \ __asm paddw mm6,mm3 \ - __asm movq OC_I(1),mm4 \ + __asm movq OC_I(1,_y),mm4 \ __asm psubw mm1,mm5 \ __asm movq mm4,OC_C(4) \ __asm movq mm5,mm3 \ __asm pmulhw mm3,mm4 \ __asm paddw mm7,mm2 \ - __asm movq OC_I(2),mm6 \ + __asm movq OC_I(2,_y),mm6 \ __asm movq mm2,mm0 \ - __asm movq mm6,OC_I(0) \ + __asm movq mm6,OC_I(0,_x) \ __asm pmulhw mm0,mm4 \ __asm paddw mm5,mm3 \ - __asm movq mm3,OC_J(4) \ + __asm movq mm3,OC_J(4,_x) \ __asm psubw mm5,mm1 \ __asm paddw mm2,mm0 \ __asm psubw mm6,mm3 \ @@ -122,17 +121,17 @@ static const __declspec(align(16))ogg_uint16_t __asm paddw mm6,mm0 \ __asm psubw mm6,mm2 \ __asm paddw mm2,mm2 \ - __asm movq mm0,OC_I(1) \ + __asm movq mm0,OC_I(1,_y) \ __asm paddw mm2,mm6 \ __asm paddw mm4,mm3 \ __asm psubw mm2,mm1 \ } /*38+8=46 cycles.*/ -#define OC_ROW_IDCT __asm{ \ - OC_IDCT_BEGIN \ +#define OC_ROW_IDCT(_y,_x) __asm{ \ + OC_IDCT_BEGIN(_y,_x) \ /*r3=D'*/ \ - __asm movq mm3,OC_I(2) \ + __asm movq mm3,OC_I(2,_y) \ /*r4=E'=E-G*/ \ __asm psubw mm4,mm7 \ /*r1=H'+H'*/ \ @@ -157,7 +156,7 @@ static const __declspec(align(16))ogg_uint16_t __asm psubw mm7,mm0 \ __asm paddw mm0,mm0 \ /*Save R1.*/ \ - __asm movq OC_I(1),mm1 \ + __asm movq OC_I(1,_y),mm1 \ /*r0=R0=G.+C.*/ \ __asm paddw mm0,mm7 \ } @@ -190,10 +189,10 @@ static const __declspec(align(16))ogg_uint16_t Since r1 is free at entry, we calculate the Js first.*/ /*19 cycles.*/ -#define OC_TRANSPOSE __asm{ \ +#define OC_TRANSPOSE(_y) __asm{ \ __asm movq mm1,mm4 \ 
__asm punpcklwd mm4,mm5 \ - __asm movq OC_I(0),mm0 \ + __asm movq OC_I(0,_y),mm0 \ __asm punpckhwd mm1,mm5 \ __asm movq mm0,mm6 \ __asm punpcklwd mm6,mm7 \ @@ -201,17 +200,17 @@ static const __declspec(align(16))ogg_uint16_t __asm punpckldq mm4,mm6 \ __asm punpckhdq mm5,mm6 \ __asm movq mm6,mm1 \ - __asm movq OC_J(4),mm4 \ + __asm movq OC_J(4,_y),mm4 \ __asm punpckhwd mm0,mm7 \ - __asm movq OC_J(5),mm5 \ + __asm movq OC_J(5,_y),mm5 \ __asm punpckhdq mm6,mm0 \ - __asm movq mm4,OC_I(0) \ + __asm movq mm4,OC_I(0,_y) \ __asm punpckldq mm1,mm0 \ - __asm movq mm5,OC_I(1) \ + __asm movq mm5,OC_I(1,_y) \ __asm movq mm0,mm4 \ - __asm movq OC_J(7),mm6 \ + __asm movq OC_J(7,_y),mm6 \ __asm punpcklwd mm0,mm5 \ - __asm movq OC_J(6),mm1 \ + __asm movq OC_J(6,_y),mm1 \ __asm punpckhwd mm4,mm5 \ __asm movq mm5,mm2 \ __asm punpcklwd mm2,mm3 \ @@ -219,18 +218,18 @@ static const __declspec(align(16))ogg_uint16_t __asm punpckldq mm0,mm2 \ __asm punpckhdq mm1,mm2 \ __asm movq mm2,mm4 \ - __asm movq OC_I(0),mm0 \ + __asm movq OC_I(0,_y),mm0 \ __asm punpckhwd mm5,mm3 \ - __asm movq OC_I(1),mm1 \ + __asm movq OC_I(1,_y),mm1 \ __asm punpckhdq mm4,mm5 \ __asm punpckldq mm2,mm5 \ - __asm movq OC_I(3),mm4 \ - __asm movq OC_I(2),mm2 \ + __asm movq OC_I(3,_y),mm4 \ + __asm movq OC_I(2,_y),mm2 \ } /*38+19=57 cycles.*/ -#define OC_COLUMN_IDCT __asm{ \ - OC_IDCT_BEGIN \ +#define OC_COLUMN_IDCT(_y) __asm{ \ + OC_IDCT_BEGIN(_y,_y) \ __asm paddw mm2,OC_8 \ /*r1=H'+H'*/ \ __asm paddw mm1,mm1 \ @@ -243,15 +242,15 @@ static const __declspec(align(16))ogg_uint16_t /*r1=NR1*/ \ __asm psraw mm1,4 \ /*r3=D'*/ \ - __asm movq mm3,OC_I(2) \ + __asm movq mm3,OC_I(2,_y) \ /*r7=G+G*/ \ __asm paddw mm7,mm7 \ /*Store NR2 at I(2).*/ \ - __asm movq OC_I(2),mm2 \ + __asm movq OC_I(2,_y),mm2 \ /*r7=G'=E+G*/ \ __asm paddw mm7,mm4 \ /*Store NR1 at I(1).*/ \ - __asm movq OC_I(1),mm1 \ + __asm movq OC_I(1,_y),mm1 \ /*r4=R4=E'-D'*/ \ __asm psubw mm4,mm3 \ __asm paddw mm4,OC_8 \ @@ -273,11 +272,11 @@ static const __declspec(align(16))ogg_uint16_t /*r6=NR6*/ \ __asm psraw mm6,4 \ /*Store NR4 at J(4).*/ \ - __asm movq OC_J(4),mm4 \ + __asm movq OC_J(4,_y),mm4 \ /*r5=NR5*/ \ __asm psraw mm5,4 \ /*Store NR3 at I(3).*/ \ - __asm movq OC_I(3),mm3 \ + __asm movq OC_I(3,_y),mm3 \ /*r7=R7=G'-C'*/ \ __asm psubw mm7,mm0 \ __asm paddw mm7,OC_8 \ @@ -288,71 +287,89 @@ static const __declspec(align(16))ogg_uint16_t /*r7=NR7*/ \ __asm psraw mm7,4 \ /*Store NR6 at J(6).*/ \ - __asm movq OC_J(6),mm6 \ + __asm movq OC_J(6,_y),mm6 \ /*r0=NR0*/ \ __asm psraw mm0,4 \ /*Store NR5 at J(5).*/ \ - __asm movq OC_J(5),mm5 \ + __asm movq OC_J(5,_y),mm5 \ /*Store NR7 at J(7).*/ \ - __asm movq OC_J(7),mm7 \ + __asm movq OC_J(7,_y),mm7 \ /*Store NR0 at I(0).*/ \ - __asm movq OC_I(0),mm0 \ + __asm movq OC_I(0,_y),mm0 \ } #define OC_MID(_m,_i) [CONSTS+_m+(_i)*8] #define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1) #define OC_8 OC_MID(OC_EIGHT_OFFSET,0) -static void oc_idct8x8_slow(ogg_int16_t _y[64]){ +static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + int i; /*This routine accepts an 8x8 matrix, but in partially transposed form. 
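The partially transposed layout mentioned here is easy to pin down: each 4x4 quadrant of the 8x8 matrix is stored transposed, so the logical coefficient at row _r, column _c lives at the index computed by this sketch (an illustrative helper, not part of the library):

/*Quadrant-transposed index: the 4x4 quadrant (high bits of the row and
  column) is kept, but the coordinates inside the quadrant are swapped.*/
static int oc_part_xpose_idx(int _r,int _c){
  return ((_r&4)|(_c&3))<<3|(_c&4)|(_r&3);
}

This is the same mapping that OC_FZIG_ZAG_MMX bakes into each quadrant of its destination indices.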
Every 4x4 block is transposed.*/ __asm{ #define CONSTS eax #define Y edx +#define X ecx mov CONSTS,offset OC_IDCT_CONSTS mov Y,_y -#define OC_I(_k) [Y+_k*16] -#define OC_J(_k) [Y+(_k-4)*16+8] - OC_ROW_IDCT - OC_TRANSPOSE + mov X,_x +#define OC_I(_k,_y) [(_y)+(_k)*16] +#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8] + OC_ROW_IDCT(Y,X) + OC_TRANSPOSE(Y) #undef OC_I #undef OC_J -#define OC_I(_k) [Y+(_k*16)+64] -#define OC_J(_k) [Y+(_k-4)*16+72] - OC_ROW_IDCT - OC_TRANSPOSE +#define OC_I(_k,_y) [(_y)+(_k)*16+64] +#define OC_J(_k,_y) [(_y)+((_k)-4)*16+72] + OC_ROW_IDCT(Y,X) + OC_TRANSPOSE(Y) #undef OC_I #undef OC_J -#define OC_I(_k) [Y+_k*16] -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT +#define OC_I(_k,_y) [(_y)+(_k)*16] +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(Y) #undef OC_I #undef OC_J -#define OC_I(_k) [Y+_k*16+8] -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT +#define OC_I(_k,_y) [(_y)+(_k)*16+8] +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(Y) #undef OC_I #undef OC_J #undef CONSTS #undef Y +#undef X + } + __asm pxor mm0,mm0; + for(i=0;i<4;i++){ + ogg_int16_t *x; + x=_x+16*i; +#define X ecx + __asm{ + mov X,x + movq [X+0x00],mm0 + movq [X+0x08],mm0 + movq [X+0x10],mm0 + movq [X+0x18],mm0 + } +#undef X } } /*25 cycles.*/ -#define OC_IDCT_BEGIN_10 __asm{ \ - __asm movq mm2,OC_I(3) \ +#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \ + __asm movq mm2,OC_I(3,_x) \ __asm nop \ __asm movq mm6,OC_C(3) \ __asm movq mm4,mm2 \ __asm movq mm1,OC_C(5) \ __asm pmulhw mm4,mm6 \ - __asm movq mm3,OC_I(1) \ + __asm movq mm3,OC_I(1,_x) \ __asm pmulhw mm1,mm2 \ __asm movq mm0,OC_C(1) \ __asm paddw mm4,mm2 \ __asm pxor mm6,mm6 \ __asm paddw mm2,mm1 \ - __asm movq mm5,OC_I(2) \ + __asm movq mm5,OC_I(2,_x) \ __asm pmulhw mm0,mm3 \ __asm movq mm1,mm5 \ __asm paddw mm0,mm3 \ @@ -360,43 +377,43 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ __asm psubw mm6,mm2 \ __asm pmulhw mm5,OC_C(2) \ __asm psubw mm0,mm4 \ - __asm movq mm7,OC_I(2) \ + __asm movq mm7,OC_I(2,_x) \ __asm paddw mm4,mm4 \ __asm paddw mm7,mm5 \ __asm paddw mm4,mm0 \ __asm pmulhw mm1,OC_C(6) \ __asm psubw mm3,mm6 \ - __asm movq OC_I(1),mm4 \ + __asm movq OC_I(1,_y),mm4 \ __asm paddw mm6,mm6 \ __asm movq mm4,OC_C(4) \ __asm paddw mm6,mm3 \ __asm movq mm5,mm3 \ __asm pmulhw mm3,mm4 \ - __asm movq OC_I(2),mm6 \ + __asm movq OC_I(2,_y),mm6 \ __asm movq mm2,mm0 \ - __asm movq mm6,OC_I(0) \ + __asm movq mm6,OC_I(0,_x) \ __asm pmulhw mm0,mm4 \ __asm paddw mm5,mm3 \ __asm paddw mm2,mm0 \ __asm psubw mm5,mm1 \ __asm pmulhw mm6,mm4 \ - __asm paddw mm6,OC_I(0) \ + __asm paddw mm6,OC_I(0,_x) \ __asm paddw mm1,mm1 \ __asm movq mm4,mm6 \ __asm paddw mm1,mm5 \ __asm psubw mm6,mm2 \ __asm paddw mm2,mm2 \ - __asm movq mm0,OC_I(1) \ + __asm movq mm0,OC_I(1,_y) \ __asm paddw mm2,mm6 \ __asm psubw mm2,mm1 \ __asm nop \ } /*25+8=33 cycles.*/ -#define OC_ROW_IDCT_10 __asm{ \ - OC_IDCT_BEGIN_10 \ +#define OC_ROW_IDCT_10(_y,_x) __asm{ \ + OC_IDCT_BEGIN_10(_y,_x) \ /*r3=D'*/ \ - __asm movq mm3,OC_I(2) \ + __asm movq mm3,OC_I(2,_y) \ /*r4=E'=E-G*/ \ __asm psubw mm4,mm7 \ /*r1=H'+H'*/ \ @@ -421,14 +438,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ __asm psubw mm7,mm0 \ __asm paddw mm0,mm0 \ /*Save R1.*/ \ - __asm movq OC_I(1),mm1 \ + __asm movq OC_I(1,_y),mm1 \ /*r0=R0=G'+C'*/ \ __asm paddw mm0,mm7 \ } /*25+19=44 cycles'*/ -#define OC_COLUMN_IDCT_10 __asm{ \ - OC_IDCT_BEGIN_10 \ +#define OC_COLUMN_IDCT_10(_y) __asm{ \ + OC_IDCT_BEGIN_10(_y,_y) \ __asm paddw mm2,OC_8 \ /*r1=H'+H'*/ \ __asm paddw mm1,mm1 \ @@ -441,15 +458,15 @@ static void oc_idct8x8_slow(ogg_int16_t 
_y[64]){ /*r1=NR1*/ \ __asm psraw mm1,4 \ /*r3=D'*/ \ - __asm movq mm3,OC_I(2) \ + __asm movq mm3,OC_I(2,_y) \ /*r7=G+G*/ \ __asm paddw mm7,mm7 \ /*Store NR2 at I(2).*/ \ - __asm movq OC_I(2),mm2 \ + __asm movq OC_I(2,_y),mm2 \ /*r7=G'=E+G*/ \ __asm paddw mm7,mm4 \ /*Store NR1 at I(1).*/ \ - __asm movq OC_I(1),mm1 \ + __asm movq OC_I(1,_y),mm1 \ /*r4=R4=E'-D'*/ \ __asm psubw mm4,mm3 \ __asm paddw mm4,OC_8 \ @@ -471,11 +488,11 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r6=NR6*/ \ __asm psraw mm6,4 \ /*Store NR4 at J(4).*/ \ - __asm movq OC_J(4),mm4 \ + __asm movq OC_J(4,_y),mm4 \ /*r5=NR5*/ \ __asm psraw mm5,4 \ /*Store NR3 at I(3).*/ \ - __asm movq OC_I(3),mm3 \ + __asm movq OC_I(3,_y),mm3 \ /*r7=R7=G'-C'*/ \ __asm psubw mm7,mm0 \ __asm paddw mm7,OC_8 \ @@ -486,50 +503,63 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r7=NR7*/ \ __asm psraw mm7,4 \ /*Store NR6 at J(6).*/ \ - __asm movq OC_J(6),mm6 \ + __asm movq OC_J(6,_y),mm6 \ /*r0=NR0*/ \ __asm psraw mm0,4 \ /*Store NR5 at J(5).*/ \ - __asm movq OC_J(5),mm5 \ + __asm movq OC_J(5,_y),mm5 \ /*Store NR7 at J(7).*/ \ - __asm movq OC_J(7),mm7 \ + __asm movq OC_J(7,_y),mm7 \ /*Store NR0 at I(0).*/ \ - __asm movq OC_I(0),mm0 \ + __asm movq OC_I(0,_y),mm0 \ } -static void oc_idct8x8_10(ogg_int16_t _y[64]){ +static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){ __asm{ #define CONSTS eax #define Y edx +#define X ecx mov CONSTS,offset OC_IDCT_CONSTS mov Y,_y -#define OC_I(_k) [Y+_k*16] -#define OC_J(_k) [Y+(_k-4)*16+8] + mov X,_x +#define OC_I(_k,_y) [(_y)+(_k)*16] +#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8] /*Done with dequant, descramble, and partial transpose. Now do the iDCT itself.*/ - OC_ROW_IDCT_10 - OC_TRANSPOSE + OC_ROW_IDCT_10(Y,X) + OC_TRANSPOSE(Y) #undef OC_I #undef OC_J -#define OC_I(_k) [Y+_k*16] -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 +#define OC_I(_k,_y) [(_y)+(_k)*16] +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(Y) #undef OC_I #undef OC_J -#define OC_I(_k) [Y+_k*16+8] -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 +#define OC_I(_k,_y) [(_y)+(_k)*16+8] +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(Y) #undef OC_I #undef OC_J #undef CONSTS #undef Y +#undef X + } +#define X ecx + __asm{ + pxor mm0,mm0; + mov X,_x + movq [X+0x00],mm0 + movq [X+0x10],mm0 + movq [X+0x20],mm0 + movq [X+0x30],mm0 } +#undef X } /*Performs an inverse 8x8 Type-II DCT transform. The input is assumed to be scaled by a factor of 4 relative to orthonormal version of the transform.*/ -void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ /*_last_zzi is subtly different from an actual count of the number of coefficients we decoded for this block. It contains the value of zzi BEFORE the final token in the block was @@ -555,8 +585,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ gets. 
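For the small-block path dispatched just below, the zig-zag matrix shown earlier in this patch explains why reading so little input suffices: the first ten scan positions all land in the top-left 4x4 of the block, so OC_IDCT_BEGIN_10 only ever loads OC_I(0..3,_x). A data sketch of those raster indices:

/*Raster indices of zig-zag positions 0..9 (rows 0-3, columns 0-3 only),
  matching the scan matrix given with the zig-zag macros above.*/
static const int OC_ZZ_RASTER_10_SKETCH[10]={0,1,8,16,9,2,3,10,17,24};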
Needless to say we inherited this approach from VP3.*/ /*Perform the iDCT.*/ - if(_last_zzi<10)oc_idct8x8_10(_y); - else oc_idct8x8_slow(_y); + if(_last_zzi<=10)oc_idct8x8_10(_y,_x); + else oc_idct8x8_slow(_y,_x); } #endif diff --git a/thirdparty/libtheora/x86_vc/mmxstate.c b/thirdparty/libtheora/x86_vc/mmxstate.c index 73bd1981cf..f532ee1b6f 100644 --- a/thirdparty/libtheora/x86_vc/mmxstate.c +++ b/thirdparty/libtheora/x86_vc/mmxstate.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $ + last mod: $Id$ ********************************************************************/ @@ -19,17 +19,16 @@ Originally written by Rudolf Marek.*/ #include <string.h> #include "x86int.h" -#include "mmxfrag.h" #include "mmxloop.h" #if defined(OC_X86_ASM) void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){ + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ unsigned char *dst; ptrdiff_t frag_buf_off; int ystride; - int mb_mode; + int refi; /*Apply the inverse transform.*/ /*Special case only having a DC component.*/ if(_last_zzi<2){ @@ -45,6 +44,7 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, #define P ecx mov Y,_dct_coeffs movzx P,p + lea Y,[Y+128] /*mm0=0000 0000 0000 AAAA*/ movd mm0,P /*mm0=0000 0000 AAAA AAAA*/ @@ -74,65 +74,32 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, else{ /*Dequantize the DC coefficient.*/ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8_mmx(_dct_coeffs,_last_zzi); + oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi); } /*Fill in the target buffer.*/ frag_buf_off=_state->frag_buf_offs[_fragi]; - mb_mode=_state->frags[_fragi].mb_mode; + refi=_state->frags[_fragi].refi; ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off; - if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs); + dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; + if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); else{ const unsigned char *ref; int mvoffsets[2]; - ref= - _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]] - +frag_buf_off; + ref=_state->ref_frame_data[refi]+frag_buf_off; if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){ + _state->frag_mvs[_fragi])>1){ oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs); + _dct_coeffs+64); } - else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs); + else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); } } /*We copy these entire function to inline the actual MMX routines so that we use only a single indirect call.*/ -/*Copies the fragments specified by the lists of fragment indices from one - frame to another. - _fragis: A pointer to a list of fragment indices. - _nfragis: The number of fragment indices to copy. - _dst_frame: The reference frame to copy to. - _src_frame: The reference frame to copy from. 
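The reconstruction routine above also illustrates the calling convention behind the new 128-entry coefficient array: the first 64 entries are the input the iDCT consumes (and re-zeroes), and the second 64 receive the output. A condensed sketch of the pattern:

/*Sketch mirroring oc_state_frag_recon_mmx: input in _dct_coeffs[0..63],
  output in [64..127]; the transform re-zeroes the input coefficients it
  read, so the caller can keep recycling an already-cleared buffer.*/
static void oc_frag_recon_sketch(ogg_int16_t _dct_coeffs[128],int _last_zzi,
 ogg_uint16_t _dc_quant){
  _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
  oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
}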
- _pli: The color plane the fragments lie in.*/ -void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli){ - const ptrdiff_t *frag_buf_offs; - const unsigned char *src_frame_data; - unsigned char *dst_frame_data; - ptrdiff_t fragii; - int ystride; - dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]]; - src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]]; - ystride=_state->ref_ystride[_pli]; - frag_buf_offs=_state->frag_buf_offs; - for(fragii=0;fragii<_nfragis;fragii++){ - ptrdiff_t frag_buf_off; - frag_buf_off=frag_buf_offs[_fragis[fragii]]; -#define SRC edx -#define DST eax -#define YSTRIDE ecx -#define YSTRIDE3 edi - OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off, - src_frame_data+frag_buf_off,ystride); -#undef SRC -#undef DST -#undef YSTRIDE -#undef YSTRIDE3 - } +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ + memset(_bv,~(_flimit<<1),8); } /*Apply the loop filter to a given set of fragment rows in the given plane. @@ -144,8 +111,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, _fragy0: The Y coordinate of the first fragment row to filter. _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ - OC_ALIGN8(unsigned char ll[8]); + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ const oc_fragment_plane *fplane; const oc_fragment *frags; const ptrdiff_t *frag_buf_offs; @@ -156,13 +122,12 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, ptrdiff_t fragi0_end; int ystride; int nhfrags; - memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); fplane=_state->fplanes+_pli; nhfrags=fplane->nhfrags; fragi_top=fplane->froffset; fragi_bot=fragi_top+fplane->nfrags; fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; - fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; + fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; ystride=_state->ref_ystride[_pli]; frags=_state->frags; frag_buf_offs=_state->frag_buf_offs; @@ -187,13 +152,13 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, #define LL edx #define D esi #define D_WORD si - if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll); - if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll); + if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv); + if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv); if(fragi+1<fragi_end&&!frags[fragi+1].coded){ - OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll); + OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv); } if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ - OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll); + OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv); } #undef PIX #undef YSTRIDE3 diff --git a/thirdparty/libtheora/cpu.c b/thirdparty/libtheora/x86_vc/x86cpu.c index a863aad7f3..6a1d8d850c 100644 --- a/thirdparty/libtheora/cpu.c +++ b/thirdparty/libtheora/x86_vc/x86cpu.c @@ -14,41 +14,17 @@ Originally written by Rudolf Marek. 
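In the CPU detection below, the vendor is identified by comparing ebx/edx/ecx against the CPUID vendor string packed as little-endian 32-bit words. A standalone sketch that decodes the Intel constants used there (assumes a 32-bit unsigned on a little-endian host):

#include <stdio.h>
#include <string.h>

/*Rebuilds the vendor string from the packed constants checked below:
  prints "GenuineIntel" on a little-endian host.*/
int main(void){
  unsigned v[3];
  char s[13];
  v[0]=0x756E6547;/*ebx*/
  v[1]=0x49656E69;/*edx*/
  v[2]=0x6C65746E;/*ecx*/
  memcpy(s,v,12);
  s[12]='\0';
  printf("%s\n",s);
  return 0;
}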
function: - last mod: $Id: cpu.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ -#include "cpu.h" +#include "x86cpu.h" #if !defined(OC_X86_ASM) -static ogg_uint32_t oc_cpu_flags_get(void){ +ogg_uint32_t oc_cpu_flags_get(void){ return 0; } #else -# if !defined(_MSC_VER) -# if defined(__amd64__)||defined(__x86_64__) -/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when - compiling with -fPIC.*/ -# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ - __asm__ __volatile__( \ - "cpuid\n\t" \ - :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ - :"a"(_op) \ - :"cc" \ - ) -# else -/*On x86-32, not so much.*/ -# define cpuid(_op,_eax,_ebx,_ecx,_edx) \ - __asm__ __volatile__( \ - "xchgl %%ebx,%[ebx]\n\t" \ - "cpuid\n\t" \ - "xchgl %%ebx,%[ebx]\n\t" \ - :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \ - :"a"(_op) \ - :"cc" \ - ) -# endif -# else /*Why does MSVC need this complicated rigamarole? At this point I honestly do not care.*/ @@ -95,7 +71,6 @@ static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){ mov [ecx],ebx } } -# endif static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ ogg_uint32_t flags; @@ -124,7 +99,7 @@ static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){ return flags; } -static ogg_uint32_t oc_cpu_flags_get(void){ +ogg_uint32_t oc_cpu_flags_get(void){ ogg_uint32_t flags; ogg_uint32_t eax; ogg_uint32_t ebx; @@ -132,25 +107,7 @@ static ogg_uint32_t oc_cpu_flags_get(void){ ogg_uint32_t edx; # if !defined(__amd64__)&&!defined(__x86_64__) /*Not all x86-32 chips support cpuid, so we have to check.*/ -# if !defined(_MSC_VER) - __asm__ __volatile__( - "pushfl\n\t" - "pushfl\n\t" - "popl %[a]\n\t" - "movl %[a],%[b]\n\t" - "xorl $0x200000,%[a]\n\t" - "pushl %[a]\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %[a]\n\t" - "popfl\n\t" - :[a]"=r"(eax),[b]"=r"(ebx) - : - :"cc" - ); -# else oc_detect_cpuid_helper(&eax,&ebx); -# endif /*No cpuid.*/ if(eax==ebx)return 0; # endif @@ -159,9 +116,18 @@ static ogg_uint32_t oc_cpu_flags_get(void){ if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547|| /* 6 8 x M T e n i u n e G*/ ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){ + int family; + int model; /*Intel, Transmeta (tested with Crusoe TM5800):*/ cpuid(1,eax,ebx,ecx,edx); flags=oc_parse_intel_flags(edx,ecx); + family=(eax>>8)&0xF; + model=(eax>>4)&0xF; + /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX + unit, so don't use it.*/ + if(family==6&&(model==9||model==13||model==14)){ + flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI); + } } /* D M A c i t n e h t u A*/ else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541|| diff --git a/thirdparty/libtheora/x86_vc/x86cpu.h b/thirdparty/libtheora/x86_vc/x86cpu.h new file mode 100644 index 0000000000..eea261d448 --- /dev/null +++ b/thirdparty/libtheora/x86_vc/x86cpu.h @@ -0,0 +1,36 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + function: + last mod: $Id$ + + ********************************************************************/ + +#if !defined(_x86_vc_x86cpu_H) +# define _x86_vc_x86cpu_H (1) +#include "../internal.h" + +#define OC_CPU_X86_MMX (1<<0) +#define OC_CPU_X86_3DNOW (1<<1) +#define OC_CPU_X86_3DNOWEXT (1<<2) +#define OC_CPU_X86_MMXEXT (1<<3) +#define OC_CPU_X86_SSE (1<<4) +#define OC_CPU_X86_SSE2 (1<<5) +#define OC_CPU_X86_PNI (1<<6) +#define OC_CPU_X86_SSSE3 (1<<7) +#define OC_CPU_X86_SSE4_1 (1<<8) +#define OC_CPU_X86_SSE4_2 (1<<9) +#define OC_CPU_X86_SSE4A (1<<10) +#define OC_CPU_X86_SSE5 (1<<11) + +ogg_uint32_t oc_cpu_flags_get(void); + +#endif diff --git a/thirdparty/libtheora/x86_vc/x86enc.c b/thirdparty/libtheora/x86_vc/x86enc.c index e1960e1f0b..e9d59e85e3 100644 --- a/thirdparty/libtheora/x86_vc/x86enc.c +++ b/thirdparty/libtheora/x86_vc/x86enc.c @@ -18,27 +18,25 @@ #if defined(OC_X86_ASM) -#include "../cpu.c" - -void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){ +void oc_enc_accel_init_x86(oc_enc_ctx *_enc){ ogg_uint32_t cpu_flags; - cpu_flags=oc_cpu_flags_get(); - oc_enc_vtable_init_c(_enc); + cpu_flags=_enc->state.cpu_flags; + oc_enc_accel_init_c(_enc); if(cpu_flags&OC_CPU_X86_MMX){ _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx; _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx; _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; - _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx; } if(cpu_flags&OC_CPU_X86_MMXEXT){ _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext; _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext; _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext; - _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext; - _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext; + _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext; + _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext; _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext; _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext; + _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext; } if(cpu_flags&OC_CPU_X86_SSE2){ # if defined(OC_X86_64_ASM) diff --git a/thirdparty/libtheora/x86_vc/x86enc.h b/thirdparty/libtheora/x86_vc/x86enc.h index 581484641f..885406a54d 100644 --- a/thirdparty/libtheora/x86_vc/x86enc.h +++ b/thirdparty/libtheora/x86_vc/x86enc.h @@ -17,10 +17,14 @@ #if !defined(_x86_vc_x86enc_H) # define _x86_vc_x86enc_H (1) -# include "../encint.h" # include "x86int.h" +# if defined(OC_X86_ASM) +# define oc_enc_accel_init oc_enc_accel_init_x86 +# define OC_ENC_USE_VTABLE (1) +# endif +# include "../encint.h" -void oc_enc_vtable_init_x86(oc_enc_ctx *_enc); +void oc_enc_accel_init_x86(oc_enc_ctx *_enc); unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, const unsigned char *_ref,int _ystride); @@ -29,19 +33,19 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, unsigned _thresh); -unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh); -unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - 
unsigned _thresh); -unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride); +unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src, + const unsigned char *_ref,int _ystride); +unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src, + const unsigned char *_ref1,const unsigned char *_ref2,int _ystride); +unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc, + const unsigned char *_src,int _ystride); void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64], const unsigned char *_x,const unsigned char *_y,int _stride); void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64], const unsigned char *_x,int _stride); void oc_enc_frag_copy2_mmxext(unsigned char *_dst, const unsigned char *_src1,const unsigned char *_src2,int _ystride); -void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]); +void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]); void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]); #endif diff --git a/thirdparty/libtheora/x86_vc/x86int.h b/thirdparty/libtheora/x86_vc/x86int.h index 4cca485311..318a09dca0 100644 --- a/thirdparty/libtheora/x86_vc/x86int.h +++ b/thirdparty/libtheora/x86_vc/x86int.h @@ -11,32 +11,39 @@ ******************************************************************** function: - last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ #if !defined(_x86_vc_x86int_H) # define _x86_vc_x86int_H (1) # include "../internal.h" +# if defined(OC_X86_ASM) +# define oc_state_accel_init oc_state_accel_init_x86 +# define OC_STATE_USE_VTABLE (1) +# endif +# include "../state.h" +# include "x86cpu.h" -void oc_state_vtable_init_x86(oc_theora_state *_state); +void oc_state_accel_init_x86(oc_theora_state *_state); void oc_frag_copy_mmx(unsigned char *_dst, const unsigned char *_src,int _ystride); +void oc_frag_copy_list_mmx(unsigned char *_dst_frame, + const unsigned char *_src_frame,int _ystride, + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs); void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, const ogg_int16_t *_residue); void oc_frag_recon_inter_mmx(unsigned char *_dst, const unsigned char *_src,int _ystride,const ogg_int16_t *_residue); void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); -void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi); +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi); void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli); + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant); +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit); void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); void oc_restore_fpu_mmx(void); #endif diff --git a/thirdparty/libtheora/x86_vc/x86state.c b/thirdparty/libtheora/x86_vc/x86state.c index a786bec284..fa3a0d42fc 100644 --- a/thirdparty/libtheora/x86_vc/x86state.c +++ b/thirdparty/libtheora/x86_vc/x86state.c @@ -11,7 +11,7 @@ 
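A note on the encoder API change above: the SATD helpers lose their unsigned _thresh early-out and instead return the block's DC term through a new unsigned *_dc out-parameter, the forward DCT moves from plain MMX to MMXEXT, and on the decode side oc_idct8x8_mmx grows a separate input array while _dct_coeffs widens to 128 entries (presumably so input and output blocks can share one buffer; the callers are not shown in this diff). A hedged call-site sketch for the new SATD convention (src, ystride and dc_penalty() are illustrative, not from the diff):

    unsigned dc;
    unsigned satd = oc_enc_frag_intra_satd_mmxext(&dc, src, ystride);
    /* the DC coefficient can now be rated separately from the AC energy
       instead of being folded into a hard threshold early-out: */
    unsigned cost = satd + dc_penalty(dc);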
******************************************************************** function: - last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -19,8 +19,6 @@ #if defined(OC_X86_ASM) -#include "../cpu.c" - /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into each quadrant of the destination.*/ static const unsigned char OC_FZIG_ZAG_MMX[128]={ @@ -42,21 +40,22 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={ 64,64,64,64,64,64,64,64, }; -void oc_state_vtable_init_x86(oc_theora_state *_state){ +void oc_state_accel_init_x86(oc_theora_state *_state){ _state->cpu_flags=oc_cpu_flags_get(); if(_state->cpu_flags&OC_CPU_X86_MMX){ _state->opt_vtable.frag_copy=oc_frag_copy_mmx; + _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx; _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx; _state->opt_vtable.idct8x8=oc_idct8x8_mmx; _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx; - _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx; + _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx; _state->opt_vtable.state_loop_filter_frag_rows= oc_state_loop_filter_frag_rows_mmx; _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx; _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX; } - else oc_state_vtable_init_c(_state); + else oc_state_accel_init_c(_state); } #endif diff --git a/thirdparty/libtheora/x86_vc/x86zigzag.h b/thirdparty/libtheora/x86_vc/x86zigzag.h new file mode 100644 index 0000000000..26f5ed2ea5 --- /dev/null +++ b/thirdparty/libtheora/x86_vc/x86zigzag.h @@ -0,0 +1,244 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ + + ********************************************************************/ + +#if !defined(_x86_vc_x86zigzag_H) +# define _x86_vc_x86zigzag_H (1) +# include "x86enc.h" + + +/*Converts DCT coefficients from transposed order into zig-zag scan order and + stores them in Y. + This relies on two macros to load the contents of each row: + OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the + first four and second four entries of each row into the specified register, + respectively. + OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row + (because when the rows are already in SSE2 registers, loading the high half + destructively modifies the register). 
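Before the two scan macros below, it helps to state the scalar operation they implement: a fixed permutation of 64 coefficients between natural (or transposed) order and zig-zag scan order. A reference sketch (hypothetical helper; the MMX code needs no table at all, performing the permutation in registers with punpck*/pshufw shuffles, four outputs per lettered tuple in the matrices that follow):

    static void zigzag_sketch(ogg_int16_t _qdct[64], const ogg_int16_t _dct[64],
     const unsigned char _scan[64]){
      int ci;
      /* scan-order output from natural-order input, the direction
         OC_ZIG_ZAG_MMXEXT documents below */
      for (ci = 0; ci < 64; ci++) _qdct[_scan[ci]] = _dct[ci];
    }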
+ The index of each output element in the original 64-element array should wind + up in the following 8x8 matrix (the letters indicate the order we compute + each 4-tuple below): + A 0 8 1 2 9 16 24 17 B + C 10 3 4 11 18 25 32 40 E + F 33 26 19 12 5 6 13 20 D + G 27 34 41 48 56 49 42 35 I + L 28 21 14 7 15 22 29 36 M + H 43 50 57 58 51 44 37 30 O + N 23 31 38 45 52 59 60 53 J + P 46 39 47 54 61 62 55 63 K + The order of the coefficients within each tuple is reversed in the comments + below to reflect the usual MSB to LSB notation.*/ +#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \ + OC_ZZ_LOAD_ROW_LO(0,mm0) /*mm0=03 02 01 00*/ \ + OC_ZZ_LOAD_ROW_LO(1,mm1) /*mm1=11 10 09 08*/ \ + OC_ZZ_LOAD_ROW_LO(2,mm2) /*mm2=19 18 17 16*/ \ + OC_ZZ_LOAD_ROW_LO(3,mm3) /*mm3=27 26 25 24*/ \ + OC_ZZ_LOAD_ROW_HI(0,mm4) /*mm4=07 06 05 04*/ \ + OC_ZZ_LOAD_ROW_HI(1,mm5) /*mm5=15 14 13 12*/ \ + OC_ZZ_LOAD_ROW_HI(2,mm6) /*mm6=23 22 21 20*/ \ + __asm movq mm7,mm0 /*mm7=03 02 01 00*/ \ + __asm punpckhdq mm0,mm1 /*mm0=11 10 03 02*/ \ + __asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \ + __asm punpcklwd mm1,mm0 /*mm1=03 09 02 08*/ \ + __asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \ + __asm punpcklwd mm7,mm1 /*mm7=02 01 08 00 *A*/ \ + __asm movq [Y+0x00],mm7 \ + __asm punpckhwd mm1,mm4 /*mm1=04 03 07 09*/ \ + __asm movq mm7,mm2 /*mm7=19 18 17 16*/ \ + __asm punpckhdq mm0,mm1 /*mm0=04 03 11 10*/ \ + __asm punpckhwd mm7,mm5 /*mm7=12 19 15 18*/ \ + __asm punpcklwd mm1,mm3 /*mm1=25 07 24 09*/ \ + __asm punpcklwd mm5,mm6 /*mm5=21 14 20 13*/ \ + __asm punpcklwd mm1,mm2 /*mm1=17 24 16 09 *B*/ \ + OC_ZZ_LOAD_ROW_LO(4,mm2) /*mm2=35 34 33 32*/ \ + __asm movq [Y+0x08],mm1 \ + OC_ZZ_LOAD_ROW_LO(5,mm1) /*mm1=43 42 41 40*/ \ + __asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \ + __asm movq [Y+0x10],mm0 \ + __asm punpckhdq mm6,mm4 /*mm6=?? 07 23 22*/ \ + __asm punpckldq mm4,mm5 /*mm4=20 13 06 05 *D*/ \ + __asm movq [Y+0x28],mm4 \ + __asm psrlq mm3,16 /*mm3=.. 27 26 25*/ \ + __asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \ + __asm movq mm4,mm7 /*mm4=12 19 15 18*/ \ + __asm punpcklwd mm2,mm3 /*mm2=26 33 25 32*/ \ + __asm punpcklwd mm4,mm1 /*mm4=41 15 40 18*/ \ + __asm punpckhwd mm3,mm1 /*mm3=43 .. 42 27*/ \ + __asm punpckldq mm4,mm2 /*mm4=25 32 40 18*/ \ + __asm punpcklwd mm3,mm0 /*mm3=35 42 34 27*/ \ + OC_ZZ_LOAD_ROW_LO(6,mm0) /*mm0=51 50 49 48*/ \ + __asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \ + __asm movq [Y+0x18],mm4 \ + OC_ZZ_LOAD_ROW_LO(7,mm4) /*mm4=59 58 57 56*/ \ + __asm punpckhdq mm2,mm7 /*mm2=12 19 26 33 *F*/ \ + __asm movq [Y+0x20],mm2 \ + __asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \ + __asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \ + __asm movq mm2,mm3 /*mm2=35 42 34 27*/ \ + __asm punpckhwd mm1,mm0 /*mm1=50 43 48 41*/ \ + __asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \ + __asm punpckldq mm3,mm1 /*mm3=48 41 34 27 *G*/ \ + __asm movq [Y+0x30],mm3 \ + __asm punpckhdq mm1,mm4 /*mm1=58 57 50 43 *H*/ \ + __asm movq [Y+0x50],mm1 \ + OC_ZZ_LOAD_ROW_HI(7,mm1) /*mm1=63 62 61 60*/ \ + __asm punpcklwd mm4,mm0 /*mm4=49 56 51 59*/ \ + OC_ZZ_LOAD_ROW_HI(6,mm0) /*mm0=55 54 53 52*/ \ + __asm psllq mm6,16 /*mm6=07 23 22 ..*/ \ + __asm movq mm3,mm4 /*mm3=49 56 51 59*/ \ + __asm punpckhdq mm4,mm2 /*mm4=35 42 49 56 *I*/ \ + OC_ZZ_LOAD_ROW_HI(3,mm2) /*mm2=31 30 29 28*/ \ + __asm movq [Y+0x38],mm4 \ + __asm punpcklwd mm3,mm1 /*mm3=61 51 60 59*/ \ + __asm punpcklwd mm7,mm6 /*mm7=22 15 .. 
??*/ \ + __asm movq mm4,mm3 /*mm4=61 51 60 59*/ \ + __asm punpcklwd mm3,mm0 /*mm3=53 60 52 59*/ \ + __asm punpckhwd mm4,mm0 /*mm4=55 61 54 51*/ \ + OC_ZZ_LOAD_ROW_HI(4,mm0) /*mm0=39 38 37 36*/ \ + __asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \ + __asm movq [Y+0x68],mm3 \ + __asm movq mm3,mm4 /*mm3=?? ?? 54 51*/ \ + __asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \ + __asm punpckhwd mm4,mm1 /*mm4=63 55 62 61 *K*/ \ + OC_ZZ_LOAD_ROW_HI(5,mm1) /*mm1=47 46 45 44*/ \ + __asm movq [Y+0x78],mm4 \ + __asm punpckhwd mm6,mm2 /*mm6=28 07 31 23*/ \ + __asm punpcklwd mm2,mm0 /*mm2=37 30 36 29*/ \ + __asm punpckhdq mm5,mm6 /*mm5=28 07 21 14*/ \ + __asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \ + __asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \ + __asm movq [Y+0x40],mm5 \ + __asm punpckhdq mm7,mm2 /*mm7=36 29 22 15 *M*/ \ + __asm movq [Y+0x48],mm7 \ + __asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \ + __asm punpckhwd mm0,mm1 /*mm0=46 39 45 38*/ \ + __asm punpcklwd mm3,mm1 /*mm3=47 54 44 51*/ \ + __asm punpckldq mm6,mm0 /*mm6=45 38 31 23 *N*/ \ + __asm movq [Y+0x60],mm6 \ + __asm punpckhdq mm0,mm3 /*mm0=47 54 46 39*/ \ + __asm punpckldq mm3,mm2 /*mm3=30 37 44 51 *O*/ \ + __asm movq [Y+0x58],mm3 \ + __asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \ + __asm movq [Y+0x70],mm0 \ + +/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan + order and stores them in %[qdct]. + The index of each output element in the original 64-element array should wind + up in the following 8x8 matrix (the letters indicate the order we compute + each 4-tuple below): + A 0 1 8 16 9 2 3 10 B + C 17 24 32 25 18 11 4 5 D + E 12 19 26 33 40 48 41 34 I + H 27 20 13 6 7 14 21 28 G + K 35 42 49 56 57 50 43 36 J + F 29 22 15 23 30 37 44 51 M + P 58 59 52 45 38 31 39 46 L + N 53 60 61 54 47 55 62 63 O + The order of the coefficients within each tuple is reversed in the comments + below to reflect the usual MSB to LSB notation.*/ +#define OC_ZIG_ZAG_MMXEXT \ + "movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \ + "movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \ + "movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \ + "movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \ + "movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \ + "movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \ + "movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \ + "punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \ + "movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \ + "punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \ + "movq %%mm0,0x00(%[qdct])\n\t" \ + "movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \ + "punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \ + "psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \ + "punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \ + "punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \ + "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \ + "movq %%mm6,0x08(%[qdct])\n\t" \ + "psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \ + "movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \ + "punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \ + "movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \ + "punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \ + "por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? 
??*/ \ + "punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \ + "movq %%mm2,0x10(%[qdct])\n\t" \ + "movq %%mm3,0x18(%[qdct])\n\t" \ + "movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \ + "movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \ + "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \ + "punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \ + "punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \ + "punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \ + "punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \ + "punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \ + "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \ + "psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \ + "punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \ + "punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \ + "punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \ + "movq %%mm0,0x20(%[qdct])\n\t" \ + "movq %%mm3,0x50(%[qdct])\n\t" \ + "movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \ + "movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \ + "movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \ + "punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \ + "psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \ + "movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \ + "punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \ + "punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \ + "movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \ + "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \ + "movq %%mm2,0x30(%[qdct])\n\t" \ + "movq %%mm6,0x38(%[qdct])\n\t" \ + "movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \ + "punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \ + "movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \ + "punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \ + "psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \ + "punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \ + "punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \ + "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \ + "movq %%mm0,0x28(%[qdct])\n\t" \ + "punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \ + "punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \ + "punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \ + "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \ + "movq %%mm4,0x40(%[qdct])\n\t" \ + "movq %%mm6,0x48(%[qdct])\n\t" \ + "movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \ + "movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \ + "psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \ + "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \ + "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \ + "punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \ + "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \ + "punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \ + "punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \ + "movq %%mm2,0x68(%[qdct])\n\t" \ + "movq %%mm1,0x58(%[qdct])\n\t" \ + "punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \ + "punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \ + "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \ + "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \ + "punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \ + "punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \ + "movq %%mm6,0x70(%[qdct])\n\t" \ + "movq %%mm5,0x78(%[qdct])\n\t" \ + "movq %%mm7,0x60(%[qdct])\n\t" \ + +#endif diff --git a/thirdparty/recastnavigation/Recast/Include/Recast.h b/thirdparty/recastnavigation/Recast/Include/Recast.h index 4d557389b5..246376bbee 100644 --- a/thirdparty/recastnavigation/Recast/Include/Recast.h +++ b/thirdparty/recastnavigation/Recast/Include/Recast.h @@ -22,13 +22,16 @@ /// The value of PI used by Recast. 
static const float RC_PI = 3.14159265f; +/// Used to ignore unused function parameters and silence any compiler warnings. +template<class T> void rcIgnoreUnused(const T&) { } + /// Recast log categories. /// @see rcContext enum rcLogCategory { RC_LOG_PROGRESS = 1, ///< A progress log entry. RC_LOG_WARNING, ///< A warning log entry. - RC_LOG_ERROR, ///< An error log entry. + RC_LOG_ERROR ///< An error log entry. }; /// Recast performance timer categories. @@ -101,7 +104,6 @@ enum rcTimerLabel class rcContext { public: - /// Contructor. /// @param[in] state TRUE if the logging and performance timers should be enabled. [Default: true] inline rcContext(bool state = true) : m_logEnabled(state), m_timerEnabled(state) {} @@ -140,31 +142,30 @@ public: inline int getAccumulatedTime(const rcTimerLabel label) const { return m_timerEnabled ? doGetAccumulatedTime(label) : -1; } protected: - /// Clears all log entries. - virtual void doResetLog() {} + virtual void doResetLog(); /// Logs a message. /// @param[in] category The category of the message. /// @param[in] msg The formatted message. /// @param[in] len The length of the formatted message. - virtual void doLog(const rcLogCategory /*category*/, const char* /*msg*/, const int /*len*/) {} + virtual void doLog(const rcLogCategory category, const char* msg, const int len) { rcIgnoreUnused(category); rcIgnoreUnused(msg); rcIgnoreUnused(len); } /// Clears all timers. (Resets all to unused.) virtual void doResetTimers() {} /// Starts the specified performance timer. /// @param[in] label The category of timer. - virtual void doStartTimer(const rcTimerLabel /*label*/) {} + virtual void doStartTimer(const rcTimerLabel label) { rcIgnoreUnused(label); } /// Stops the specified performance timer. /// @param[in] label The category of the timer. - virtual void doStopTimer(const rcTimerLabel /*label*/) {} + virtual void doStopTimer(const rcTimerLabel label) { rcIgnoreUnused(label); } /// Returns the total accumulated time of the specified performance timer. /// @param[in] label The category of the timer. /// @return The accumulated time of the timer, or -1 if timers are disabled or the timer has never been started. - virtual int doGetAccumulatedTime(const rcTimerLabel /*label*/) const { return -1; } + virtual int doGetAccumulatedTime(const rcTimerLabel label) const { rcIgnoreUnused(label); return -1; } /// True if logging is enabled. bool m_logEnabled; @@ -564,7 +565,7 @@ static const int RC_AREA_BORDER = 0x20000; enum rcBuildContoursFlags { RC_CONTOUR_TESS_WALL_EDGES = 0x01, ///< Tessellate solid (impassable) edges during contour simplification. - RC_CONTOUR_TESS_AREA_EDGES = 0x02, ///< Tessellate edges between areas during contour simplification. + RC_CONTOUR_TESS_AREA_EDGES = 0x02 ///< Tessellate edges between areas during contour simplification. }; /// Applied to the region id field of contour vertices in order to extract the region id. @@ -595,11 +596,6 @@ static const int RC_NOT_CONNECTED = 0x3f; /// @name General helper functions /// @{ -/// Used to ignore a function parameter. VS complains about unused parameters -/// and this silences the warning. -/// @param [in] _ Unused parameter -template<class T> void rcIgnoreUnused(const T&) { } - /// Swaps the values of the two parameters. 
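The rcContext hunks above swap the commented-out parameter names for explicit rcIgnoreUnused() calls, which keeps the names available for documentation and debuggers without tripping unused-parameter warnings; the trailing enum commas go away for the same pedantic-warning reasons. A hedged sketch of a user-defined context exercising these hooks (hypothetical subclass, not part of Recast):

    #include <cstdio>

    class PrintfContext : public rcContext
    {
    protected:
        virtual void doLog(const rcLogCategory category, const char* msg, const int len)
        {
            rcIgnoreUnused(len);  // rcContext::log() passes a nul-terminated msg
            std::printf("[recast %d] %s\n", (int)category, msg);
        }
    };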
/// @param[in,out] a Value A /// @param[in,out] b Value B @@ -996,6 +992,7 @@ void rcMarkConvexPolyArea(rcContext* ctx, const float* verts, const int nverts, /// @ingroup recast /// @param[in] verts The vertices of the polygon [Form: (x, y, z) * @p nverts] /// @param[in] nverts The number of vertices in the polygon. +/// @param[in] offset How much to offset the polygon by. [Units: wu] /// @param[out] outVerts The offset vertices (should hold up to 2 * @p nverts) [Form: (x, y, z) * return value] /// @param[in] maxOutVerts The max number of vertices that can be stored to @p outVerts. /// @returns Number of vertices in the offset polygon or 0 if too few vertices in @p outVerts. diff --git a/thirdparty/recastnavigation/Recast/Include/RecastAlloc.h b/thirdparty/recastnavigation/Recast/Include/RecastAlloc.h index 071278d659..8b166d736d 100644 --- a/thirdparty/recastnavigation/Recast/Include/RecastAlloc.h +++ b/thirdparty/recastnavigation/Recast/Include/RecastAlloc.h @@ -112,7 +112,7 @@ class rcVectorBase { typedef rcSizeType size_type; typedef T value_type; - rcVectorBase() : m_size(0), m_cap(0), m_data(0) {}; + rcVectorBase() : m_size(0), m_cap(0), m_data(0) {} rcVectorBase(const rcVectorBase<T, H>& other) : m_size(0), m_cap(0), m_data(0) { assign(other.begin(), other.end()); } explicit rcVectorBase(rcSizeType count) : m_size(0), m_cap(0), m_data(0) { resize(count); } rcVectorBase(rcSizeType count, const T& value) : m_size(0), m_cap(0), m_data(0) { resize(count, value); } @@ -142,8 +142,8 @@ class rcVectorBase { const T& front() const { rcAssert(m_size); return m_data[0]; } T& front() { rcAssert(m_size); return m_data[0]; } - const T& back() const { rcAssert(m_size); return m_data[m_size - 1]; }; - T& back() { rcAssert(m_size); return m_data[m_size - 1]; }; + const T& back() const { rcAssert(m_size); return m_data[m_size - 1]; } + T& back() { rcAssert(m_size); return m_data[m_size - 1]; } const T* data() const { return m_data; } T* data() { return m_data; } diff --git a/thirdparty/recastnavigation/Recast/Source/Recast.cpp b/thirdparty/recastnavigation/Recast/Source/Recast.cpp index 1b71710cdc..4cf145c981 100644 --- a/thirdparty/recastnavigation/Recast/Source/Recast.cpp +++ b/thirdparty/recastnavigation/Recast/Source/Recast.cpp @@ -94,6 +94,11 @@ void rcContext::log(const rcLogCategory category, const char* format, ...) doLog(category, msg, len); } +void rcContext::doResetLog() +{ + // Defined out of line to fix the weak v-tables warning +} + rcHeightfield* rcAllocHeightfield() { return rcNew<rcHeightfield>(RC_ALLOC_PERM); diff --git a/thirdparty/recastnavigation/Recast/Source/RecastMesh.cpp b/thirdparty/recastnavigation/Recast/Source/RecastMesh.cpp index e99eaebb79..ea09ee1de0 100644 --- a/thirdparty/recastnavigation/Recast/Source/RecastMesh.cpp +++ b/thirdparty/recastnavigation/Recast/Source/RecastMesh.cpp @@ -566,7 +566,6 @@ static bool canRemoveVertex(rcContext* ctx, rcPolyMesh& mesh, const unsigned sho const int nvp = mesh.nvp; // Count number of polygons to remove. 
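The out-of-line doResetLog() above is the usual fix for Clang's -Wweak-vtables: when every virtual member of a class is defined inline in the header, the vtable has no home translation unit and is emitted weakly in every file that uses the class. Defining one virtual function out of line anchors the vtable (and RTTI) in Recast.cpp. Minimal illustration with a hypothetical class:

    struct Widget {
        virtual ~Widget();                    // declared here...
        virtual int id() const { return 0; }  // other virtuals may stay inline
    };
    Widget::~Widget() {}                      // ...defined once in a .cpp file,
                                              // anchoring the vtable there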
- int numRemovedVerts = 0; int numTouchedVerts = 0; int numRemainingEdges = 0; for (int i = 0; i < mesh.npolys; ++i) @@ -586,7 +585,6 @@ static bool canRemoveVertex(rcContext* ctx, rcPolyMesh& mesh, const unsigned sho } if (numRemoved) { - numRemovedVerts += numRemoved; numRemainingEdges += numVerts-(numRemoved+1); } } diff --git a/thirdparty/recastnavigation/Recast/Source/RecastMeshDetail.cpp b/thirdparty/recastnavigation/Recast/Source/RecastMeshDetail.cpp index 1999200c1a..40bfc9b4bc 100644 --- a/thirdparty/recastnavigation/Recast/Source/RecastMeshDetail.cpp +++ b/thirdparty/recastnavigation/Recast/Source/RecastMeshDetail.cpp @@ -284,7 +284,7 @@ static unsigned short getHeight(const float fx, const float fy, const float fz, enum EdgeValues { EV_UNDEF = -1, - EV_HULL = -2, + EV_HULL = -2 }; static int findEdge(const int* edges, int nedges, int s, int t) diff --git a/thirdparty/recastnavigation/Recast/Source/RecastRasterization.cpp b/thirdparty/recastnavigation/Recast/Source/RecastRasterization.cpp index a4cef74909..673550e79e 100644 --- a/thirdparty/recastnavigation/Recast/Source/RecastRasterization.cpp +++ b/thirdparty/recastnavigation/Recast/Source/RecastRasterization.cpp @@ -264,7 +264,8 @@ static bool rasterizeTri(const float* v0, const float* v1, const float* v2, // Calculate the footprint of the triangle on the grid's y-axis int y0 = (int)((tmin[2] - bmin[2])*ics); int y1 = (int)((tmax[2] - bmin[2])*ics); - y0 = rcClamp(y0, 0, h-1); + // use -1 rather than 0 to cut the polygon properly at the start of the tile + y0 = rcClamp(y0, -1, h-1); y1 = rcClamp(y1, 0, h-1); // Clip the triangle into all grid cells it touches. @@ -283,7 +284,7 @@ static bool rasterizeTri(const float* v0, const float* v1, const float* v2, dividePoly(in, nvIn, inrow, &nvrow, p1, &nvIn, cz+cs, 2); rcSwap(in, p1); if (nvrow < 3) continue; - + if (y < 0) continue; // find the horizontal bounds in the row float minX = inrow[0], maxX = inrow[0]; for (int i=1; i<nvrow; ++i) @@ -293,7 +294,10 @@ static bool rasterizeTri(const float* v0, const float* v1, const float* v2, } int x0 = (int)((minX - bmin[0])*ics); int x1 = (int)((maxX - bmin[0])*ics); - x0 = rcClamp(x0, 0, w-1); + if (x1 < 0 || x0 >= w) { + continue; + } + x0 = rcClamp(x0, -1, w-1); x1 = rcClamp(x1, 0, w-1); int nv, nv2 = nvrow; @@ -305,7 +309,7 @@ static bool rasterizeTri(const float* v0, const float* v1, const float* v2, dividePoly(inrow, nv2, p1, &nv, p2, &nv2, cx+cs, 0); rcSwap(inrow, p2); if (nv < 3) continue; - + if (x < 0) continue; // Calculate min and max of the span. float smin = p1[1], smax = p1[1]; for (int i = 1; i < nv; ++i) |
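On the rasterizer change above: triangles overhanging the tile's low edge used to have y0/x0 clamped straight to 0, so the first row or column of cells received spans computed from geometry that was never cut at the tile boundary. Clamping to -1 keeps one extra dividePoly() pass at that boundary, and the new "if (y < 0) continue;" / "if (x < 0) continue;" guards then discard the out-of-tile remainder, so cell 0 only sees properly clipped polygons; the added "x1 < 0 || x0 >= w" test likewise skips rows whose footprint misses the tile entirely. A toy illustration of the clamp difference (standalone, not Recast code):

    #include <cstdio>

    static int clampi(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

    int main()
    {
        const int h = 8;      // grid rows in the tile
        const int yRaw = -2;  // triangle footprint starts below the tile
        std::printf("old y0=%d (row 0 sees unclipped geometry)\n", clampi(yRaw, 0, h - 1));
        std::printf("new y0=%d (one extra cut at the boundary; rows < 0 are skipped)\n",
            clampi(yRaw, -1, h - 1));
        return 0;
    }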