diff options
Diffstat (limited to 'thirdparty/embree/common/simd/vboolf4_sse2.h')
-rw-r--r-- | thirdparty/embree/common/simd/vboolf4_sse2.h | 24 |
1 files changed, 20 insertions, 4 deletions
diff --git a/thirdparty/embree/common/simd/vboolf4_sse2.h b/thirdparty/embree/common/simd/vboolf4_sse2.h index fa84b1b6ee..9e0fdf5c6f 100644 --- a/thirdparty/embree/common/simd/vboolf4_sse2.h +++ b/thirdparty/embree/common/simd/vboolf4_sse2.h @@ -36,9 +36,11 @@ namespace embree __forceinline vboolf(__m128 input) : v(input) {} __forceinline operator const __m128&() const { return v; } + #if !defined(__EMSCRIPTEN__) __forceinline operator const __m128i() const { return _mm_castps_si128(v); } __forceinline operator const __m128d() const { return _mm_castps_pd(v); } - + #endif + __forceinline vboolf(bool a) : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} __forceinline vboolf(bool a, bool b) @@ -100,7 +102,7 @@ namespace embree __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) { -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) return _mm_blendv_ps(f, t, m); #else return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); @@ -114,6 +116,17 @@ namespace embree __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } +#if defined(__aarch64__) + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); + } + + template<int i0, int i1, int i2, int i3> + __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template<int i0, int i1, int i2, int i3> __forceinline vboolf4 shuffle(const vboolf4& v) { return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); @@ -123,6 +136,7 @@ namespace embree __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) { return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } +#endif template<int i0> __forceinline vboolf4 shuffle(const vboolf4& v) { @@ -135,7 +149,7 @@ namespace embree template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } #endif -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) && !defined(__aarch64__) template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); } @@ -157,7 +171,9 @@ namespace embree __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } -#if defined(__SSE4_2__) +#if defined(__aarch64__) + __forceinline size_t popcnt(const vboolf4& a) { return vaddvq_s32(vandq_u32(vreinterpretq_u32_f32(a.v),_mm_set1_epi32(1))); } +#elif defined(__SSE4_2__) __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } #else __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } |