diff options
Diffstat (limited to 'thirdparty/basis_universal/encoder/cppspmd_sse.h')
-rw-r--r-- | thirdparty/basis_universal/encoder/cppspmd_sse.h | 23 |
1 files changed, 11 insertions, 12 deletions
diff --git a/thirdparty/basis_universal/encoder/cppspmd_sse.h b/thirdparty/basis_universal/encoder/cppspmd_sse.h index 9a97eeb695..4c61bab7b1 100644 --- a/thirdparty/basis_universal/encoder/cppspmd_sse.h +++ b/thirdparty/basis_universal/encoder/cppspmd_sse.h @@ -1,13 +1,5 @@ // cppspmd_sse.h -// Note for Basis Universal: All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. if BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation. -// SSE 2 or 4.1 -// Originally written by Nicolas Guillemot, Jefferson Amstutz in the "CppSPMD" project. -// 4/20: Richard Geldreich: Macro control flow, more SIMD instruction sets, optimizations, supports using multiple SIMD instruction sets in same executable. Still a work in progress! -// -// Originally Copyright 2016 Nicolas Guillemot -// Changed from the MIT license to Apache 2.0 with permission from the author. -// -// Modifications/enhancements Copyright 2020-2021 Binomial LLC +// Copyright 2020-2022 Binomial LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,6 +12,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// +// Notes for Basis Universal: +// All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. if BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation. +// The techniques used in this code were originally demonstrated for AVX2 by Nicolas Guillemot, Jefferson Amstutz in their "CppSPMD" project. +// This is new code for use in Basis Universal, although it uses the same general SPMD techniques in SSE 2/4. #include <stdlib.h> #include <stdint.h> @@ -1330,7 +1327,7 @@ struct spmd_kernel __m128 temp = _mm_add_ps(_mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210); return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(temp, temp), temp)); } - + CPPSPMD_FORCE_INLINE int reduce_add(vint v) { __m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask); @@ -1668,14 +1665,16 @@ CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; } CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; } +CPPSPMD_FORCE_INLINE vint zero_vint() { return vint{ _mm_setzero_si128() }; } +CPPSPMD_FORCE_INLINE vfloat zero_vfloat() { return vfloat{ _mm_setzero_ps() }; } + CPPSPMD_FORCE_INLINE vint vint_lane_set(int v0, int v1, int v2, int v3) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; } CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set(float v0, float v1, float v2, float v3) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; } - CPPSPMD_FORCE_INLINE vint vint_lane_set_r(int v3, int v2, int v1, int v0) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; } CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set_r(float v3, float v2, float v1, float v0) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; } - // control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane. #define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control)) +#define VFLOAT_LANE_SHUFFLE_PS(a, b, control) vfloat(_mm_shuffle_ps((a).m_value, (b).m_value, control)) // control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane. #define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control)) |