diff options
Diffstat (limited to 'thirdparty/cvtt/ConvectionKernels_ParallelMath.h')
-rw-r--r-- | thirdparty/cvtt/ConvectionKernels_ParallelMath.h | 1816 |
1 files changed, 1816 insertions, 0 deletions
diff --git a/thirdparty/cvtt/ConvectionKernels_ParallelMath.h b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h new file mode 100644 index 0000000000..9e25280f45 --- /dev/null +++ b/thirdparty/cvtt/ConvectionKernels_ParallelMath.h @@ -0,0 +1,1816 @@ +/* +Convection Texture Tools +Copyright (c) 2018-2019 Eric Lasota + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject +to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*/ +#pragma once +#ifndef __CVTT_PARALLELMATH_H__ +#define __CVTT_PARALLELMATH_H__ + +#include "ConvectionKernels.h" +#include "ConvectionKernels_Config.h" + +#ifdef CVTT_USE_SSE2 +#include <emmintrin.h> +#endif + +#include <float.h> +#include <assert.h> +#include <string.h> +#include <algorithm> +#include <math.h> + +#define UNREFERENCED_PARAMETER(n) ((void)n) + +// Parallel math implementation +// +// After preprocessor defs are handled, what this should do is expose the following types: +// SInt16 - Signed 16-bit integer +// UInt16 - Signed 16-bit integer +// UInt15 - Unsigned 15-bit integer +// SInt32 - Signed 32-bit integer +// UInt31 - Unsigned 31-bit integer +// AInt16 - 16-bit integer of unknown signedness (only used for storage) +// Int16CompFlag - Comparison flags from comparing 16-bit integers +// Int32CompFlag - Comparison flags from comparing 32-bit integers +// FloatCompFlag - Comparison flags from comparing 32-bit floats +// +// The reason for these distinctions are that depending on the instruction set, signed or unsigned versions of certain ops +// (particularly max, min, compares, and right shift) may not be available. In cases where ops are not available, it's +// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers. The 15-bit and 31-bit uint types +// can elide the bit flips if unsigned versions are not available. + +namespace cvtt +{ +#ifdef CVTT_USE_SSE2 + // SSE2 version + struct ParallelMath + { + typedef uint16_t ScalarUInt16; + typedef int16_t ScalarSInt16; + + template<unsigned int TRoundingMode> + struct RoundForScope + { + unsigned int m_oldCSR; + + RoundForScope() + { + m_oldCSR = _mm_getcsr(); + _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode)); + } + + ~RoundForScope() + { + _mm_setcsr(m_oldCSR); + } + }; + + struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO> + { + }; + + struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST> + { + }; + + struct RoundUpForScope : RoundForScope<_MM_ROUND_UP> + { + }; + + struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN> + { + }; + + static const int ParallelSize = 8; + + enum Int16Subtype + { + IntSubtype_Signed, + IntSubtype_UnsignedFull, + IntSubtype_UnsignedTruncated, + IntSubtype_Abstract, + }; + + template<int TSubtype> + struct VInt16 + { + __m128i m_value; + + inline VInt16 operator+(int16_t other) const + { + VInt16 result; + result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other))); + return result; + } + + inline VInt16 operator+(const VInt16 &other) const + { + VInt16 result; + result.m_value = _mm_add_epi16(m_value, other.m_value); + return result; + } + + inline VInt16 operator|(const VInt16 &other) const + { + VInt16 result; + result.m_value = _mm_or_si128(m_value, other.m_value); + return result; + } + + inline VInt16 operator&(const VInt16 &other) const + { + VInt16 result; + result.m_value = _mm_and_si128(m_value, other.m_value); + return result; + } + + inline VInt16 operator-(const VInt16 &other) const + { + VInt16 result; + result.m_value = _mm_sub_epi16(m_value, other.m_value); + return result; + } + + inline VInt16 operator<<(int bits) const + { + VInt16 result; + result.m_value = _mm_slli_epi16(m_value, bits); + return result; + } + + inline VInt16 operator^(const VInt16 &other) const + { + VInt16 result; + result.m_value = _mm_xor_si128(m_value, other.m_value); + return result; + } + }; + + typedef VInt16<IntSubtype_Signed> SInt16; + typedef VInt16<IntSubtype_UnsignedFull> UInt16; + typedef VInt16<IntSubtype_UnsignedTruncated> UInt15; + typedef VInt16<IntSubtype_Abstract> AInt16; + + template<int TSubtype> + struct VInt32 + { + __m128i m_values[2]; + + inline VInt32 operator+(const VInt32& other) const + { + VInt32 result; + result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]); + return result; + } + + inline VInt32 operator-(const VInt32& other) const + { + VInt32 result; + result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]); + return result; + } + + inline VInt32 operator<<(const int other) const + { + VInt32 result; + result.m_values[0] = _mm_slli_epi32(m_values[0], other); + result.m_values[1] = _mm_slli_epi32(m_values[1], other); + return result; + } + + inline VInt32 operator|(const VInt32& other) const + { + VInt32 result; + result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]); + return result; + } + }; + + typedef VInt32<IntSubtype_Signed> SInt32; + typedef VInt32<IntSubtype_UnsignedTruncated> UInt31; + typedef VInt32<IntSubtype_UnsignedFull> UInt32; + typedef VInt32<IntSubtype_Abstract> AInt32; + + template<class TTargetType> + struct LosslessCast + { +#ifdef CVTT_PERMIT_ALIASING + template<int TSrcSubtype> + static const TTargetType& Cast(const VInt32<TSrcSubtype> &src) + { + return reinterpret_cast<VInt32<TSubtype>&>(src); + } + + template<int TSrcSubtype> + static const TTargetType& Cast(const VInt16<TSrcSubtype> &src) + { + return reinterpret_cast<VInt16<TSubtype>&>(src); + } +#else + template<int TSrcSubtype> + static TTargetType Cast(const VInt32<TSrcSubtype> &src) + { + TTargetType result; + result.m_values[0] = src.m_values[0]; + result.m_values[1] = src.m_values[1]; + return result; + } + + template<int TSrcSubtype> + static TTargetType Cast(const VInt16<TSrcSubtype> &src) + { + TTargetType result; + result.m_value = src.m_value; + return result; + } +#endif + }; + + struct Int64 + { + __m128i m_values[4]; + }; + + struct Float + { + __m128 m_values[2]; + + inline Float operator+(const Float &other) const + { + Float result; + result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator+(float other) const + { + Float result; + result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other)); + result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other)); + return result; + } + + inline Float operator-(const Float& other) const + { + Float result; + result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator-() const + { + Float result; + result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]); + result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]); + return result; + } + + inline Float operator*(const Float& other) const + { + Float result; + result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator*(float other) const + { + Float result; + result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other)); + result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other)); + return result; + } + + inline Float operator/(const Float &other) const + { + Float result; + result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator/(float other) const + { + Float result; + result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other)); + result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other)); + return result; + } + }; + + struct Int16CompFlag + { + __m128i m_value; + + inline Int16CompFlag operator&(const Int16CompFlag &other) const + { + Int16CompFlag result; + result.m_value = _mm_and_si128(m_value, other.m_value); + return result; + } + + inline Int16CompFlag operator|(const Int16CompFlag &other) const + { + Int16CompFlag result; + result.m_value = _mm_or_si128(m_value, other.m_value); + return result; + } + }; + + struct Int32CompFlag + { + __m128i m_values[2]; + + inline Int32CompFlag operator&(const Int32CompFlag &other) const + { + Int32CompFlag result; + result.m_values[0] = _mm_and_si128(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_and_si128(m_values[1], other.m_values[1]); + return result; + } + + inline Int32CompFlag operator|(const Int32CompFlag &other) const + { + Int32CompFlag result; + result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]); + return result; + } + }; + + struct FloatCompFlag + { + __m128 m_values[2]; + + inline FloatCompFlag operator&(const FloatCompFlag &other) const + { + FloatCompFlag result; + result.m_values[0] = _mm_and_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_and_ps(m_values[1], other.m_values[1]); + return result; + } + + inline FloatCompFlag operator|(const FloatCompFlag &other) const + { + FloatCompFlag result; + result.m_values[0] = _mm_or_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_or_ps(m_values[1], other.m_values[1]); + return result; + } + }; + + template<int TSubtype> + static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) + { + VInt16<TSubtype> result; + result.m_value = _mm_add_epi16(a.m_value, b.m_value); + return result; + } + + template<int TSubtype> + static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) + { + VInt16<TSubtype> result; + result.m_value = _mm_sub_epi16(a.m_value, b.m_value); + return result; + } + + static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i])); + return result; + } + + template<int TSubtype> + static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) + { + VInt16<TSubtype> result; + result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value)); + return result; + } + + template<int TSubtype> + static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a) + { + VInt16<TSubtype> result; + result.m_value = _mm_and_si128(flag.m_value, a.m_value); + return result; + } + + template<int TSubtype> + static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) + { + dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value)); + } + + template<int TSubtype> + static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src) + { + __m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value); + __m128i highFlags = _mm_unpackhi_epi16(flag.m_value, flag.m_value); + dest.m_values[0] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[0]), _mm_and_si128(lowFlags, src.m_values[0])); + dest.m_values[1] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[1]), _mm_and_si128(highFlags, src.m_values[1])); + } + + static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src) + { + dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value)); + } + + static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v) + { + SInt16 result; + result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15)); + return result; + } + + template<int TSubtype> + static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) + { + dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value)); + } + + static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) + { + for (int i = 0; i < 2; i++) + dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i])); + } + + static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) + { + for (int i = 0; i < 2; i++) + dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i])); + } + + static void MakeSafeDenominator(Float& v) + { + ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f)); + } + + static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision) + { + int lostBits = 16 - precision; + if (lostBits == 0) + return v; + + SInt16 result; + result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits); + return result; + } + + static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision) + { + int lostBits = 16 - precision; + if (lostBits == 0) + return v; + + UInt16 result; + result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits); + return result; + } + + static UInt16 Min(const UInt16 &a, const UInt16 &b) + { + __m128i bitFlip = _mm_set1_epi16(-32768); + + UInt16 result; + result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip); + return result; + } + + static SInt16 Min(const SInt16 &a, const SInt16 &b) + { + SInt16 result; + result.m_value = _mm_min_epi16(a.m_value, b.m_value); + return result; + } + + static UInt15 Min(const UInt15 &a, const UInt15 &b) + { + UInt15 result; + result.m_value = _mm_min_epi16(a.m_value, b.m_value); + return result; + } + + static Float Min(const Float &a, const Float &b) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static UInt16 Max(const UInt16 &a, const UInt16 &b) + { + __m128i bitFlip = _mm_set1_epi16(-32768); + + UInt16 result; + result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip); + return result; + } + + static SInt16 Max(const SInt16 &a, const SInt16 &b) + { + SInt16 result; + result.m_value = _mm_max_epi16(a.m_value, b.m_value); + return result; + } + + static UInt15 Max(const UInt15 &a, const UInt15 &b) + { + UInt15 result; + result.m_value = _mm_max_epi16(a.m_value, b.m_value); + return result; + } + + static Float Max(const Float &a, const Float &b) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static Float Clamp(const Float &v, float min, float max) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min)); + return result; + } + + static Float Reciprocal(const Float &v) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_rcp_ps(v.m_values[i]); + return result; + } + + static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut) + { + int16_t values[8]; + for (int i = 0; i < 8; i++) + values[i] = inputBlocks[i].m_pixels[pxOffset][channel]; + + chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]); + } + + static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut) + { + int16_t values[8]; + for (int i = 0; i < 8; i++) + values[i] = inputBlocks[i].m_pixels[pxOffset][channel]; + + chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]); + } + + static Float MakeFloat(float v) + { + Float f; + f.m_values[0] = f.m_values[1] = _mm_set1_ps(v); + return f; + } + + static Float MakeFloatZero() + { + Float f; + f.m_values[0] = f.m_values[1] = _mm_setzero_ps(); + return f; + } + + static UInt16 MakeUInt16(uint16_t v) + { + UInt16 result; + result.m_value = _mm_set1_epi16(static_cast<short>(v)); + return result; + } + + static SInt16 MakeSInt16(int16_t v) + { + SInt16 result; + result.m_value = _mm_set1_epi16(static_cast<short>(v)); + return result; + } + + static AInt16 MakeAInt16(int16_t v) + { + AInt16 result; + result.m_value = _mm_set1_epi16(static_cast<short>(v)); + return result; + } + + static UInt15 MakeUInt15(uint16_t v) + { + UInt15 result; + result.m_value = _mm_set1_epi16(static_cast<short>(v)); + return result; + } + + static SInt32 MakeSInt32(int32_t v) + { + SInt32 result; + result.m_values[0] = _mm_set1_epi32(v); + result.m_values[1] = _mm_set1_epi32(v); + return result; + } + + static UInt31 MakeUInt31(uint32_t v) + { + UInt31 result; + result.m_values[0] = _mm_set1_epi32(v); + result.m_values[1] = _mm_set1_epi32(v); + return result; + } + + static uint16_t Extract(const UInt16 &v, int offset) + { + return reinterpret_cast<const uint16_t*>(&v.m_value)[offset]; + } + + static int16_t Extract(const SInt16 &v, int offset) + { + return reinterpret_cast<const int16_t*>(&v.m_value)[offset]; + } + + static uint16_t Extract(const UInt15 &v, int offset) + { + return reinterpret_cast<const uint16_t*>(&v.m_value)[offset]; + } + + static int16_t Extract(const AInt16 &v, int offset) + { + return reinterpret_cast<const int16_t*>(&v.m_value)[offset]; + } + + static int32_t Extract(const SInt32 &v, int offset) + { + return reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2])[offset & 3]; + } + + static float Extract(const Float &v, int offset) + { + return reinterpret_cast<const float*>(&v.m_values[offset >> 2])[offset & 3]; + } + + static bool Extract(const ParallelMath::Int16CompFlag &v, int offset) + { + return reinterpret_cast<const int16_t*>(&v.m_value)[offset] != 0; + } + + static void PutUInt16(UInt16 &dest, int offset, uint16_t v) + { + reinterpret_cast<uint16_t*>(&dest)[offset] = v; + } + + static void PutUInt15(UInt15 &dest, int offset, uint16_t v) + { + reinterpret_cast<uint16_t*>(&dest)[offset] = v; + } + + static void PutSInt16(SInt16 &dest, int offset, int16_t v) + { + reinterpret_cast<int16_t*>(&dest)[offset] = v; + } + + static float ExtractFloat(const Float& v, int offset) + { + return reinterpret_cast<const float*>(&v)[offset]; + } + + static void PutFloat(Float &dest, int offset, float v) + { + reinterpret_cast<float*>(&dest)[offset] = v; + } + + static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v) + { + reinterpret_cast<int16_t*>(&dest)[offset] = v ? -1 : 0; + } + + static Int32CompFlag Less(const UInt31 &a, const UInt31 &b) + { + Int32CompFlag result; + result.m_values[0] = _mm_cmplt_epi32(a.m_values[0], b.m_values[0]); + result.m_values[1] = _mm_cmplt_epi32(a.m_values[1], b.m_values[1]); + return result; + } + + static Int16CompFlag Less(const SInt16 &a, const SInt16 &b) + { + Int16CompFlag result; + result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); + return result; + } + + static Int16CompFlag Less(const UInt15 &a, const UInt15 &b) + { + Int16CompFlag result; + result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); + return result; + } + + static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b) + { + Int16CompFlag result; + result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); + return result; + } + + static FloatCompFlag Less(const Float &a, const Float &b) + { + FloatCompFlag result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static FloatCompFlag LessOrEqual(const Float &a, const Float &b) + { + FloatCompFlag result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]); + return result; + } + + template<int TSubtype> + static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) + { + Int16CompFlag result; + result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value); + return result; + } + + static FloatCompFlag Equal(const Float &a, const Float &b) + { + FloatCompFlag result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b) + { + Int16CompFlag notResult; + notResult.m_value = _mm_xor_si128(a.m_value, b.m_value); + return Not(notResult); + } + + static Float ToFloat(const UInt16 &v) + { + Float result; + result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128())); + result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128())); + return result; + } + + static UInt31 ToUInt31(const UInt16 &v) + { + UInt31 result; + result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); + result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); + return result; + } + + static SInt32 ToInt32(const UInt16 &v) + { + SInt32 result; + result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); + result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); + return result; + } + + static SInt32 ToInt32(const UInt15 &v) + { + SInt32 result; + result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); + result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); + return result; + } + + static SInt32 ToInt32(const SInt16 &v) + { + SInt32 result; + result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16); + result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16); + return result; + } + + static Float ToFloat(const SInt16 &v) + { + Float result; + result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16)); + result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16)); + return result; + } + + static Float ToFloat(const UInt15 &v) + { + Float result; + result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128())); + result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128())); + return result; + } + + static Float ToFloat(const UInt31 &v) + { + Float result; + result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]); + result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]); + return result; + } + + static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v) + { + __m128i lo = _mm_castps_si128(v.m_values[0]); + __m128i hi = _mm_castps_si128(v.m_values[1]); + + Int16CompFlag result; + result.m_value = _mm_packs_epi32(lo, hi); + return result; + } + + static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v) + { + __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value); + __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value); + + FloatCompFlag result; + result.m_values[0] = _mm_castsi128_ps(lo); + result.m_values[1] = _mm_castsi128_ps(hi); + return result; + } + + static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v) + { + __m128i lo = v.m_values[0]; + __m128i hi = v.m_values[1]; + + Int16CompFlag result; + result.m_value = _mm_packs_epi32(lo, hi); + return result; + } + + static Int16CompFlag MakeBoolInt16(bool b) + { + Int16CompFlag result; + if (b) + result.m_value = _mm_set1_epi16(-1); + else + result.m_value = _mm_setzero_si128(); + return result; + } + + static FloatCompFlag MakeBoolFloat(bool b) + { + FloatCompFlag result; + if (b) + result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1)); + else + result.m_values[0] = result.m_values[1] = _mm_setzero_ps(); + return result; + } + + static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b) + { + Int16CompFlag result; + result.m_value = _mm_andnot_si128(b.m_value, a.m_value); + return result; + } + + static Int16CompFlag Not(const Int16CompFlag &b) + { + Int16CompFlag result; + result.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-1)); + return result; + } + + static Int32CompFlag Not(const Int32CompFlag &b) + { + Int32CompFlag result; + result.m_values[0] = _mm_xor_si128(b.m_values[0], _mm_set1_epi32(-1)); + result.m_values[1] = _mm_xor_si128(b.m_values[1], _mm_set1_epi32(-1)); + return result; + } + + static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/) + { + __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768))); + __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768))); + + __m128i packed = _mm_packs_epi32(lo, hi); + + UInt16 result; + result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768)); + return result; + } + + static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/) + { + __m128i lo = _mm_cvtps_epi32(v.m_values[0]); + __m128i hi = _mm_cvtps_epi32(v.m_values[1]); + + __m128i packed = _mm_packs_epi32(lo, hi); + + UInt15 result; + result.m_value = _mm_packs_epi32(lo, hi); + return result; + } + + static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/) + { + __m128i lo = _mm_cvtps_epi32(v.m_values[0]); + __m128i hi = _mm_cvtps_epi32(v.m_values[1]); + + __m128i packed = _mm_packs_epi32(lo, hi); + + SInt16 result; + result.m_value = _mm_packs_epi32(lo, hi); + return result; + } + + static Float Sqrt(const Float &f) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_sqrt_ps(f.m_values[i]); + return result; + } + + static UInt16 Abs(const SInt16 &a) + { + __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15); + __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15); + + UInt16 result; + result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd); + return result; + } + + static Float Abs(const Float& a) + { + __m128 invMask = _mm_set1_ps(-0.0f); + + Float result; + result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]); + result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]); + return result; + } + + static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b) + { + __m128i diff = _mm_sub_epi16(a.m_value, b.m_value); + + UInt16 result; + result.m_value = _mm_mullo_epi16(diff, diff); + return result; + } + + static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b) + { + __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value)); + + __m128i mulHi = _mm_mulhi_epu16(diffU, diffU); + __m128i mulLo = _mm_mullo_epi16(diffU, diffU); + __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi); + __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi); + + Float result; + result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo); + result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi); + + return result; + } + + static Float TwosCLHalfToFloat(const SInt16 &v) + { + __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15)); + + __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768)); + __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff)); + __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00)); + + __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128()); + + // Convert exponent to high-bits + exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336)); + + __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336))); + + __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3))); + __m128i lowBits = _mm_slli_epi16(mantissa, 13); + + __m128i flow = _mm_unpacklo_epi16(lowBits, highBits); + __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits); + + __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh); + __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh); + + Float result; + result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow)); + result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh)); + + return result; + } + + static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) + { + Float fa = TwosCLHalfToFloat(a); + + Float diff = fa - b; + return diff * diff; + } + + static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) + { + Float fa = TwosCLHalfToFloat(a); + Float fb = TwosCLHalfToFloat(b); + + Float diff = fa - fb; + return diff * diff; + } + + static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b) + { + Float fa = TwosCLHalfToFloat(a) * aWeight; + + Float diff = fa - b; + return diff * diff; + } + + static UInt16 RightShift(const UInt16 &v, int bits) + { + UInt16 result; + result.m_value = _mm_srli_epi16(v.m_value, bits); + return result; + } + + static UInt31 RightShift(const UInt31 &v, int bits) + { + UInt31 result; + result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits); + result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits); + return result; + } + + static SInt16 RightShift(const SInt16 &v, int bits) + { + SInt16 result; + result.m_value = _mm_srai_epi16(v.m_value, bits); + return result; + } + + static UInt15 RightShift(const UInt15 &v, int bits) + { + UInt15 result; + result.m_value = _mm_srli_epi16(v.m_value, bits); + return result; + } + + static SInt32 RightShift(const SInt32 &v, int bits) + { + SInt32 result; + result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits); + result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits); + return result; + } + + static SInt16 ToSInt16(const SInt32 &v) + { + SInt16 result; + result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]); + return result; + } + + static SInt16 ToSInt16(const UInt16 &v) + { + SInt16 result; + result.m_value = v.m_value; + return result; + } + + static SInt16 ToSInt16(const UInt15 &v) + { + SInt16 result; + result.m_value = v.m_value; + return result; + } + + static UInt16 ToUInt16(const UInt32 &v) + { + __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); + __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16); + + UInt16 result; + result.m_value = _mm_packs_epi32(low, high); + return result; + } + + static UInt16 ToUInt16(const UInt31 &v) + { + __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); + __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16); + + UInt16 result; + result.m_value = _mm_packs_epi32(low, high); + return result; + } + + static UInt15 ToUInt15(const UInt31 &v) + { + UInt15 result; + result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]); + return result; + } + + static UInt15 ToUInt15(const SInt16 &v) + { + UInt15 result; + result.m_value = v.m_value; + return result; + } + + static UInt15 ToUInt15(const UInt16 &v) + { + UInt15 result; + result.m_value = v.m_value; + return result; + } + + static SInt32 XMultiply(const SInt16 &a, const SInt16 &b) + { + __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value); + __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); + + SInt32 result; + result.m_values[0] = _mm_unpacklo_epi16(low, high); + result.m_values[1] = _mm_unpackhi_epi16(low, high); + return result; + } + + static SInt32 XMultiply(const SInt16 &a, const UInt15 &b) + { + __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value); + __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); + + SInt32 result; + result.m_values[0] = _mm_unpacklo_epi16(low, high); + result.m_values[1] = _mm_unpackhi_epi16(low, high); + return result; + } + + static SInt32 XMultiply(const UInt15 &a, const SInt16 &b) + { + return XMultiply(b, a); + } + + static UInt32 XMultiply(const UInt16 &a, const UInt16 &b) + { + __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); + __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); + + UInt32 result; + result.m_values[0] = _mm_unpacklo_epi16(low, high); + result.m_values[1] = _mm_unpackhi_epi16(low, high); + return result; + } + + static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b) + { + UInt16 result; + result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); + return result; + } + + static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b) + { + UInt16 result; + result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); + return result; + } + + static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b) + { + SInt16 result; + result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); + return result; + } + + static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b) + { + SInt16 result; + result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); + return result; + } + + static UInt31 XMultiply(const UInt15 &a, const UInt15 &b) + { + __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); + __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); + + UInt31 result; + result.m_values[0] = _mm_unpacklo_epi16(low, high); + result.m_values[1] = _mm_unpackhi_epi16(low, high); + return result; + } + + static UInt31 XMultiply(const UInt16 &a, const UInt15 &b) + { + __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); + __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); + + UInt31 result; + result.m_values[0] = _mm_unpacklo_epi16(low, high); + result.m_values[1] = _mm_unpackhi_epi16(low, high); + return result; + } + + static UInt31 XMultiply(const UInt15 &a, const UInt16 &b) + { + return XMultiply(b, a); + } + + static bool AnySet(const Int16CompFlag &v) + { + return _mm_movemask_epi8(v.m_value) != 0; + } + + static bool AllSet(const Int16CompFlag &v) + { + return _mm_movemask_epi8(v.m_value) == 0xffff; + } + + static bool AnySet(const FloatCompFlag &v) + { + return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0; + } + + static bool AllSet(const FloatCompFlag &v) + { + return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf; + } + }; + +#else + // Scalar version + struct ParallelMath + { + struct RoundTowardZeroForScope + { + }; + + struct RoundTowardNearestForScope + { + }; + + struct RoundUpForScope + { + }; + + struct RoundDownForScope + { + }; + + static const int ParallelSize = 1; + + enum Int16Subtype + { + IntSubtype_Signed, + IntSubtype_UnsignedFull, + IntSubtype_UnsignedTruncated, + IntSubtype_Abstract, + }; + + typedef int32_t SInt16; + typedef int32_t UInt15; + typedef int32_t UInt16; + typedef int32_t AInt16; + + typedef int32_t SInt32; + typedef int32_t UInt31; + typedef int32_t UInt32; + typedef int32_t AInt32; + + typedef int32_t ScalarUInt16; + typedef int32_t ScalarSInt16; + + typedef float Float; + + template<class TTargetType> + struct LosslessCast + { + static const int32_t& Cast(const int32_t &src) + { + return src; + } + }; + + typedef bool Int16CompFlag; + typedef bool FloatCompFlag; + + static int32_t AbstractAdd(const int32_t &a, const int32_t &b) + { + return a + b; + } + + static int32_t AbstractSubtract(const int32_t &a, const int32_t &b) + { + return a - b; + } + + static float Select(bool flag, float a, float b) + { + return flag ? a : b; + } + + static int32_t Select(bool flag, int32_t a, int32_t b) + { + return flag ? a : b; + } + + static int32_t SelectOrZero(bool flag, int32_t a) + { + return flag ? a : 0; + } + + static void ConditionalSet(int32_t& dest, bool flag, int32_t src) + { + if (flag) + dest = src; + } + + static void ConditionalSet(bool& dest, bool flag, bool src) + { + if (flag) + dest = src; + } + + static int32_t ConditionalNegate(bool flag, int32_t v) + { + return (flag) ? -v : v; + } + + static void NotConditionalSet(int32_t& dest, bool flag, int32_t src) + { + if (!flag) + dest = src; + } + + static void ConditionalSet(float& dest, bool flag, float src) + { + if (flag) + dest = src; + } + + static void NotConditionalSet(float& dest, bool flag, float src) + { + if (!flag) + dest = src; + } + + static void MakeSafeDenominator(float& v) + { + if (v == 0.0f) + v = 1.0f; + } + + static int32_t SignedRightShift(int32_t v, int bits) + { + return v >> bits; + } + + static int32_t TruncateToPrecisionSigned(int32_t v, int precision) + { + v = (v << (32 - precision)) & 0xffffffff; + return SignedRightShift(v, 32 - precision); + } + + static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision) + { + return v & ((1 << precision) - 1); + } + + static int32_t Min(int32_t a, int32_t b) + { + if (a < b) + return a; + return b; + } + + static float Min(float a, float b) + { + if (a < b) + return a; + return b; + } + + static int32_t Max(int32_t a, int32_t b) + { + if (a > b) + return a; + return b; + } + + static float Max(float a, float b) + { + if (a > b) + return a; + return b; + } + + static float Abs(float a) + { + return fabsf(a); + } + + static int32_t Abs(int32_t a) + { + if (a < 0) + return -a; + return a; + } + + static float Clamp(float v, float min, float max) + { + if (v < min) + return min; + if (v > max) + return max; + return v; + } + + static float Reciprocal(float v) + { + return 1.0f / v; + } + + static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut) + { + chOut = inputBlocks[0].m_pixels[pxOffset][channel]; + } + + static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut) + { + chOut = inputBlocks[0].m_pixels[pxOffset][channel]; + } + + static float MakeFloat(float v) + { + return v; + } + + static float MakeFloatZero() + { + return 0.0f; + } + + static int32_t MakeUInt16(uint16_t v) + { + return v; + } + + static int32_t MakeSInt16(int16_t v) + { + return v; + } + + static int32_t MakeAInt16(int16_t v) + { + return v; + } + + static int32_t MakeUInt15(uint16_t v) + { + return v; + } + + static int32_t MakeSInt32(int32_t v) + { + return v; + } + + static int32_t MakeUInt31(int32_t v) + { + return v; + } + + static int32_t Extract(int32_t v, int offset) + { + UNREFERENCED_PARAMETER(offset); + return v; + } + + static bool Extract(bool v, int offset) + { + UNREFERENCED_PARAMETER(offset); + return v; + } + + static float Extract(float v, int offset) + { + UNREFERENCED_PARAMETER(offset); + return v; + } + + static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v) + { + UNREFERENCED_PARAMETER(offset); + dest = v; + } + + static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v) + { + UNREFERENCED_PARAMETER(offset); + dest = v; + } + + static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v) + { + UNREFERENCED_PARAMETER(offset); + dest = v; + } + + static float ExtractFloat(float v, int offset) + { + UNREFERENCED_PARAMETER(offset); + return v; + } + + static void PutFloat(float &dest, int offset, float v) + { + UNREFERENCED_PARAMETER(offset); + dest = v; + } + + static void PutBoolInt16(bool &dest, int offset, bool v) + { + UNREFERENCED_PARAMETER(offset); + dest = v; + } + + static bool Less(int32_t a, int32_t b) + { + return a < b; + } + + static bool Less(float a, float b) + { + return a < b; + } + + static bool LessOrEqual(int32_t a, int32_t b) + { + return a < b; + } + + static bool LessOrEqual(float a, float b) + { + return a < b; + } + + static bool Equal(int32_t a, int32_t b) + { + return a == b; + } + + static bool Equal(float a, float b) + { + return a == b; + } + + static float ToFloat(int32_t v) + { + return static_cast<float>(v); + } + + static int32_t ToUInt31(int32_t v) + { + return v; + } + + static int32_t ToInt32(int32_t v) + { + return v; + } + + static bool FloatFlagToInt16(bool v) + { + return v; + } + + static bool Int32FlagToInt16(bool v) + { + return v; + } + + static bool Int16FlagToFloat(bool v) + { + return v; + } + + static bool MakeBoolInt16(bool b) + { + return b; + } + + static bool MakeBoolFloat(bool b) + { + return b; + } + + static bool AndNot(bool a, bool b) + { + return a && !b; + } + + static bool Not(bool b) + { + return !b; + } + + static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz) + { + UNREFERENCED_PARAMETER(rtz); + return static_cast<int>(v); + } + + static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru) + { + UNREFERENCED_PARAMETER(ru); + return static_cast<int>(ceilf(v)); + } + + static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd) + { + UNREFERENCED_PARAMETER(rd); + return static_cast<int>(floorf(v)); + } + + static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn) + { + UNREFERENCED_PARAMETER(rtn); + return static_cast<int>(floorf(v + 0.5f)); + } + + template<class TRoundMode> + static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode) + { + return RoundAndConvertToInt(v, roundingMode); + } + + template<class TRoundMode> + static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode) + { + return RoundAndConvertToInt(v, roundingMode); + } + + template<class TRoundMode> + static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode) + { + return RoundAndConvertToInt(v, roundingMode); + } + + static float Sqrt(float f) + { + return sqrtf(f); + } + + static int32_t SqDiffUInt8(int32_t a, int32_t b) + { + int32_t delta = a - b; + return delta * delta; + } + + static int32_t SqDiffInt16(int32_t a, int32_t b) + { + int32_t delta = a - b; + return delta * delta; + } + + static int32_t SqDiffSInt16(int32_t a, int32_t b) + { + int32_t delta = a - b; + return delta * delta; + } + + static float TwosCLHalfToFloat(int32_t v) + { + int32_t absV = (v < 0) ? -v : v; + + int32_t signBits = (absV & -32768); + int32_t mantissa = (absV & 0x03ff); + int32_t exponent = (absV & 0x7c00); + + bool isDenormal = (exponent == 0); + + // Convert exponent to high-bits + exponent = (exponent >> 3) + 14336; + + int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16; + + int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13); + + float f, correction; + memcpy(&f, &fBits, 4); + memcpy(&correction, &denormalCorrection, 4); + + return f - correction; + } + + static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) + { + Float fa = TwosCLHalfToFloat(a); + + Float diff = fa - b; + return diff * diff; + } + + static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) + { + Float fa = TwosCLHalfToFloat(a); + Float fb = TwosCLHalfToFloat(b); + + Float diff = fa - fb; + return diff * diff; + } + + static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b) + { + Float fa = TwosCLHalfToFloat(a) * aWeight; + + Float diff = fa - b; + return diff * diff; + } + + static int32_t RightShift(int32_t v, int bits) + { + return SignedRightShift(v, bits); + } + + static int32_t ToSInt16(int32_t v) + { + return v; + } + + static int32_t ToUInt16(int32_t v) + { + return v; + } + + static int32_t ToUInt15(int32_t v) + { + return v; + } + + static int32_t XMultiply(int32_t a, int32_t b) + { + return a * b; + } + + static int32_t CompactMultiply(int32_t a, int32_t b) + { + return a * b; + } + + static bool AnySet(bool v) + { + return v; + } + + static bool AllSet(bool v) + { + return v; + } + }; + +#endif +} + +#endif |