diff options
Diffstat (limited to 'thirdparty/embree/common/math/vec2fa.h')
-rw-r--r-- | thirdparty/embree/common/math/vec2fa.h | 29 |
1 files changed, 23 insertions, 6 deletions
diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h index a51fb68fd0..4f222894c2 100644 --- a/thirdparty/embree/common/math/vec2fa.h +++ b/thirdparty/embree/common/math/vec2fa.h @@ -97,6 +97,12 @@ namespace embree __forceinline Vec2fa rcp ( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec2fa)reciprocal; +#else #if defined(__AVX512VL__) const Vec2fa r = _mm_rcp14_ps(a.m128); #else @@ -104,13 +110,15 @@ namespace embree #endif #if defined(__AVX2__) - const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); + const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) + const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n #else - const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0) + const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -118,12 +126,21 @@ namespace embree __forceinline Vec2fa rsqrt( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif } __forceinline Vec2fa zero_fix(const Vec2fa& a) { @@ -156,7 +173,7 @@ namespace embree __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -165,7 +182,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -227,7 +244,7 @@ namespace embree __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) |