diff options
Diffstat (limited to 'thirdparty/embree/common/simd/vfloat4_sse2.h')
-rw-r--r-- | thirdparty/embree/common/simd/vfloat4_sse2.h | 135 |
1 files changed, 108 insertions, 27 deletions
diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h index 5215bf9730..6d7e11fe72 100644 --- a/thirdparty/embree/common/simd/vfloat4_sse2.h +++ b/thirdparty/embree/common/simd/vfloat4_sse2.h @@ -42,6 +42,11 @@ namespace embree __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} +#if defined(__aarch64__) + __forceinline explicit vfloat(const vuint4& x) { + v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v)); + } +#else __forceinline explicit vfloat(const vuint4& x) { const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 @@ -49,7 +54,7 @@ namespace embree const __m128 bf = _mm_castsi128_ps(b); v = _mm_add_ps(af,bf); } - +#endif //////////////////////////////////////////////////////////////////////////////// /// Constants //////////////////////////////////////////////////////////////////////////////// @@ -107,7 +112,11 @@ namespace embree #endif } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vfloat4 load(const char* ptr) { + return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) static __forceinline vfloat4 load(const char* ptr) { return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); } @@ -117,7 +126,11 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vfloat4 load(const unsigned char* ptr) { + return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) static __forceinline vfloat4 load(const unsigned char* ptr) { return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); } @@ -128,7 +141,11 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + static __forceinline vfloat4 load(const short* ptr) { + return __m128(_mm_load4epi16_f32(((__m128i*)ptr))); + } +#elif defined(__SSE4_1__) static __forceinline vfloat4 load(const short* ptr) { return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr))); } @@ -145,15 +162,19 @@ namespace embree static __forceinline void store_nt(void* ptr, const vfloat4& v) { #if defined (__SSE4_1__) +#if defined(__aarch64__) _mm_stream_ps((float*)ptr,v); #else + _mm_stream_ps((float*)ptr,v); +#endif +#else _mm_store_ps((float*)ptr,v); #endif } template<int scale = 4> static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) return _mm_i32gather_ps(ptr, index, scale); #else return vfloat4( @@ -169,7 +190,7 @@ namespace embree vfloat4 r = zero; #if defined(__AVX512VL__) return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) +#elif defined(__AVX2__) && !defined(__aarch64__) return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); #else if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); @@ -223,8 +244,8 @@ namespace embree friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { #if defined(__AVX512VL__) return _mm_mask_blend_ps(m, f, t); -#elif defined(__SSE4_1__) - return _mm_blendv_ps(f, t, m); +#elif defined(__SSE4_1__) || (defined(__aarch64__)) + return _mm_blendv_ps(f, t, m); #else return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); #endif @@ -256,18 +277,34 @@ namespace embree __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } __forceinline vfloat4 operator +(const vfloat4& a) { return a; } +#if defined(__aarch64__) + __forceinline vfloat4 operator -(const vfloat4& a) { + return vnegq_f32(a); + } +#else __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } +#endif +#if defined(__aarch64__) + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } +#else __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } +#endif + #if defined(__AVX512VL__) __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } #else __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } #endif + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } - + __forceinline vfloat4 rcp(const vfloat4& a) { +#if defined(__aarch64__) + return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); +#else + #if defined(__AVX512VL__) const vfloat4 r = _mm_rcp14_ps(a); #else @@ -275,30 +312,39 @@ namespace embree #endif #if defined(__AVX2__) - return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); + return _mm_fmadd_ps(r, _mm_fnmadd_ps(a, r, vfloat4(1.0f)), r); // computes r + r * (1 - a * r) #else - return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); + return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r) #endif + +#endif //defined(__aarch64__) } __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } __forceinline vfloat4 rsqrt(const vfloat4& a) { +#if defined(__aarch64__) + vfloat4 r = _mm_rsqrt_ps(a); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); + return r; +#else + #if defined(__AVX512VL__) vfloat4 r = _mm_rsqrt14_ps(a); #else vfloat4 r = _mm_rsqrt_ps(a); #endif -#if defined(__ARM_NEON) - r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); - r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); -#elif defined(__AVX2__) +#if defined(__AVX2__) r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); #else r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); #endif + +#endif return r; } @@ -344,7 +390,8 @@ namespace embree __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a,vfloat4(b)); } __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) + __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -393,9 +440,10 @@ namespace embree __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } #else __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } + #endif //////////////////////////////////////////////////////////////////////////////// @@ -429,8 +477,13 @@ namespace embree __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } +#if defined(__aarch64__) + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } +#else __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } +#endif __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } #endif @@ -484,7 +537,7 @@ namespace embree return select(vboolf4(mask), t, f); #endif } - + __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { return madd(t,b-a,a); } @@ -506,10 +559,10 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__aarch64__) - __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } - __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } - __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } - __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } + __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf + __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf + __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 + __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? #elif defined (__SSE4_1__) __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } @@ -524,7 +577,9 @@ namespace embree __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } __forceinline vint4 floori(const vfloat4& a) { -#if defined(__SSE4_1__) +#if defined(__aarch64__) + return vcvtq_s32_f32(floor(a)); +#elif defined(__SSE4_1__) return vint4(floor(a)); #else return vint4(a-vfloat4(0.5f)); @@ -538,6 +593,16 @@ namespace embree __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } +#if defined(__aarch64__) + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& v) { + return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); + } + template<int i0, int i1, int i2, int i3> + __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { + return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); + } +#else template<int i0, int i1, int i2, int i3> __forceinline vfloat4 shuffle(const vfloat4& v) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); @@ -547,8 +612,9 @@ namespace embree __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) { return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); } +#endif -#if defined(__SSE3__) +#if defined(__SSE3__) && !defined(__aarch64__) template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); } template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); } template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } @@ -559,10 +625,14 @@ namespace embree return shuffle<i,i,i,i>(v); } +#if defined(__aarch64__) + template<int i> __forceinline float extract(const vfloat4& a) { return a[i]; } +#else template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); } template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } +#endif -#if defined (__SSE4_1__) +#if defined (__SSE4_1__) && !defined(__aarch64__) template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); } template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } @@ -664,14 +734,25 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// - +#if defined(__aarch64__) + __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } +#else __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } +#endif +#if defined(__aarch64__) + __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } + __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } + __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); } +#else __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } +#endif __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) { @@ -687,7 +768,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// __forceinline float dot(const vfloat4& a, const vfloat4& b) { |