summaryrefslogtreecommitdiff
path: root/thirdparty/embree/common/math/vec2fa.h
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/embree/common/math/vec2fa.h')
-rw-r--r--thirdparty/embree/common/math/vec2fa.h29
1 files changed, 23 insertions, 6 deletions
diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h
index a51fb68fd0..4f222894c2 100644
--- a/thirdparty/embree/common/math/vec2fa.h
+++ b/thirdparty/embree/common/math/vec2fa.h
@@ -97,6 +97,12 @@ namespace embree
__forceinline Vec2fa rcp ( const Vec2fa& a )
{
+#if defined(__aarch64__)
+ __m128 reciprocal = _mm_rcp_ps(a.m128);
+ reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+ reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+ return (const Vec2fa)reciprocal;
+#else
#if defined(__AVX512VL__)
const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
@@ -104,13 +110,15 @@ namespace embree
#endif
#if defined(__AVX2__)
- const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+ const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n
#else
- const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
- //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+ const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n
#endif
return res;
+#endif //defined(__aarch64__)
}
__forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
@@ -118,12 +126,21 @@ namespace embree
__forceinline Vec2fa rsqrt( const Vec2fa& a )
{
+#if defined(__aarch64__)
+ __m128 r = _mm_rsqrt_ps(a.m128);
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ return r;
+#else
+
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+
+#endif
}
__forceinline Vec2fa zero_fix(const Vec2fa& a) {
@@ -156,7 +173,7 @@ namespace embree
__forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@@ -165,7 +182,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@@ -227,7 +244,7 @@ namespace embree
__forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)