Diffstat (limited to 'thirdparty/embree/common/math/vec3fa.h')
-rw-r--r-- thirdparty/embree/common/math/vec3fa.h | 92
1 file changed, 75 insertions(+), 17 deletions(-)
diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h
index 586039741d..8564cf6d10 100644
--- a/thirdparty/embree/common/math/vec3fa.h
+++ b/thirdparty/embree/common/math/vec3fa.h
@@ -55,7 +55,13 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
static __forceinline Vec3fa load( const void* const a ) {
+#if defined(__aarch64__)
+ __m128 t = _mm_load_ps((float*)a);
+ t[3] = 0.0f;
+ return Vec3fa(t);
+#else
return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+#endif
}
static __forceinline Vec3fa loadu( const void* const a ) {
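
Note on the aarch64 path above: writing t[3] = 0.0f relies on the GCC/Clang vector-subscript extension for float32x4_t. A minimal sketch of the same "load xyz, zero the padding lane" idea in portable NEON intrinsics (the helper name is hypothetical, not embree code):

#include <arm_neon.h>

// Load 4 aligned floats and force the padding lane to zero,
// as Vec3fa::load does on aarch64.
static inline float32x4_t load_vec3_zero_w(const float* a) {
  float32x4_t t = vld1q_f32(a);       // loads x, y, z and whatever sits in w
  return vsetq_lane_f32(0.0f, t, 3);  // overwrite the w lane with 0
}
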
@@ -89,12 +95,20 @@ namespace embree
__forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
__forceinline Vec3fa operator -( const Vec3fa& a ) {
+#if defined(__aarch64__)
+ return vnegq_f32(a.m128);
+#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return _mm_xor_ps(a.m128, mask);
+#endif
}
__forceinline Vec3fa abs ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+ return _mm_abs_ps(a.m128);
+#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(a.m128, mask);
+#endif
}
__forceinline Vec3fa sign ( const Vec3fa& a ) {
return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
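
aarch64 has dedicated instructions for both operations, so the SSE sign-bit masks disappear. _mm_abs_ps is not a standard SSE intrinsic; it presumably comes from the port's SSE-to-NEON compatibility layer. The plain-NEON equivalents, as a standalone sketch:

#include <arm_neon.h>

static inline float32x4_t neg4(float32x4_t a) { return vnegq_f32(a); } // flips the sign bits
static inline float32x4_t abs4(float32x4_t a) { return vabsq_f32(a); } // clears the sign bits
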
@@ -102,6 +116,10 @@ namespace embree
__forceinline Vec3fa rcp ( const Vec3fa& a )
{
+#if defined(__aarch64__)
+ return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
+#else
+
#if defined(__AVX512VL__)
const Vec3fa r = _mm_rcp14_ps(a.m128);
#else
@@ -109,13 +127,15 @@ namespace embree
#endif
#if defined(__AVX2__)
- const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+ const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0f)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n
#else
- const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
- //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+ const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n
#endif
return res;
+#endif //defined(__aarch64__)
}
__forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
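
The rewritten refinement is one Newton-Raphson step: with r ≈ 1/a, the residual h = 1 - a*r is close to zero, and r + r*h roughly doubles the number of correct bits. Algebraically this equals the old r*(2 - a*r), but the residual form lets the AVX2 path compute both halves as fused multiply-adds, keeping a*r unrounded. A scalar illustration (standalone demo, not embree code):

#include <cstdio>

int main() {
  float a = 3.0f;
  float r = 0.3f;               // crude seed for 1/3, standing in for _mm_rcp_ps
  for (int i = 0; i < 2; ++i) {
    float h = 1.0f - a * r;     // residual, very close to 0
    r = r + r * h;              // refined estimate
  }
  std::printf("%.9f\n", r);     // prints ~0.333333333
  return 0;
}
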
@@ -123,12 +143,20 @@ namespace embree
__forceinline Vec3fa rsqrt( const Vec3fa& a )
{
+#if defined(__aarch64__)
+ __m128 r = _mm_rsqrt_ps(a.m128);
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ return r;
+#else
+
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
}
__forceinline Vec3fa zero_fix(const Vec3fa& a) {
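
vrsqrtsq_f32(x, y) returns (3 - x*y)/2, so each refinement line above is one Newton step r' = r * (3 - a*r*r)/2 for 1/sqrt(a); two steps take the ~8-bit hardware estimate to near full float precision. The same sequence in plain NEON, with the initial estimate spelled out (standalone sketch):

#include <arm_neon.h>

static inline float32x4_t rsqrt4(float32x4_t a) {
  float32x4_t r = vrsqrteq_f32(a);                     // rough estimate, ~8 correct bits
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));  // Newton step 1
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));  // Newton step 2
  return r;
}
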
@@ -161,7 +189,7 @@ namespace embree
__forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -170,7 +198,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
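
mini/maxi work because, for non-negative IEEE-754 floats, the raw bit patterns sort in the same order as signed 32-bit integers, so an integer min/max on the casts equals the float result; that invariant is why these helpers are only safe where inputs are known non-negative. A scalar check of the invariant (hypothetical demo, not embree code):

#include <algorithm>
#include <cassert>
#include <cstring>

static int bits_of(float f) { int i; std::memcpy(&i, &f, sizeof i); return i; }

int main() {
  float a = 1.5f, b = 2.25f;                 // both non-negative
  int m = std::min(bits_of(a), bits_of(b));  // integer min on the raw bits
  float r; std::memcpy(&r, &m, sizeof r);
  assert(r == std::min(a, b));               // matches the float min
  return 0;
}
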
@@ -187,16 +215,16 @@ namespace embree
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
-#if defined(__AVX2__)
+#if defined(__AVX2__) || defined(__ARM_NEON)
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
#else
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
- __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+ __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
#endif
__forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
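
Enabling the fused path under __ARM_NEON presumably means the compatibility layer maps the x86 FMA intrinsics onto NEON's fused ops. The operand order differs: _mm_fmadd_ps(a,b,c) computes a*b + c, while vfmaq_f32 takes the accumulator first. A sketch of the assumed mapping for two of the four variants:

#include <arm_neon.h>

// a*b + c  — NEON takes the accumulator as the first argument
static inline float32x4_t madd4 (float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(c, a, b); }
// -a*b + c — fused multiply-subtract from the accumulator
static inline float32x4_t nmadd4(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmsq_f32(c, a, b); }
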
@@ -218,8 +246,26 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+ __forceinline float reduce_add(const Vec3fa& v) {
+ float32x4_t t = v.m128;
+ t[3] = 0.0f;
+ return vaddvq_f32(t);
+ }
- __forceinline float reduce_add(const Vec3fa& v) {
+ __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+ __forceinline float reduce_min(const Vec3fa& v) {
+ float32x4_t t = v.m128;
+ t[3] = t[2];
+ return vminvq_f32(t);
+ }
+ __forceinline float reduce_max(const Vec3fa& v) {
+ float32x4_t t = v.m128;
+ t[3] = t[2];
+ return vmaxvq_f32(t);
+ }
+#else
+ __forceinline float reduce_add(const Vec3fa& v) {
const vfloat4 a(v.m128);
const vfloat4 b = shuffle<1>(a);
const vfloat4 c = shuffle<2>(a);
@@ -229,6 +275,7 @@ namespace embree
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
__forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
__forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+#endif
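
The aarch64 reductions lean on the across-vector instructions (vaddvq_f32, vminvq_f32, vmaxvq_f32), which fold all four lanes, so the unused w lane must be neutralized first: 0 for a sum, a copy of z for min/max. The min case in isolation, written with lane intrinsics instead of the subscript extension (standalone sketch):

#include <arm_neon.h>

static inline float reduce_min3(float32x4_t v) {
  v = vsetq_lane_f32(vgetq_lane_f32(v, 2), v, 3);  // w := z, so w cannot win
  return vminvq_f32(v);                            // min over all four lanes
}
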
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
@@ -241,8 +288,13 @@ namespace embree
__forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
__forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
__forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
- __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
- __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+ #if defined(__aarch64__)
+ __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
+ __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
+#else
+ __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
+ __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+#endif
__forceinline bool isvalid ( const Vec3fa& v ) {
return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
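
The NaN semantics shift slightly here: SSE's "not-less-equal" predicate is unordered (a NaN operand makes it true), while a plain greater-than — which is what NEON's vcgtq_f32 provides — is ordered (NaN makes it false). The aarch64 branch presumably accepts that difference in exchange for a single native compare. Scalar illustration:

#include <cmath>
#include <cstdio>

int main() {
  float nan = std::nanf("");
  std::printf("gt:  %d\n", nan > 1.0f);      // 0 — ordered, false for NaN
  std::printf("nle: %d\n", !(nan <= 1.0f));  // 1 — unordered, true for NaN
  return 0;
}
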
@@ -261,7 +313,7 @@ namespace embree
}
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
@@ -335,7 +387,11 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+ __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
+ __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
+ __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
+#elif defined (__SSE4_1__)
__forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
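
The three NEON rounding intrinsics map one-to-one onto rounding modes: vrndmq_f32 rounds toward -inf (floor), vrndpq_f32 toward +inf (ceil), and vrndq_f32 toward zero (a true trunc). Worth noting: the pre-existing SSE4.1 branch implements trunc with _MM_FROUND_TO_NEAREST_INT, i.e. round-to-nearest, a quirk the NEON path does not reproduce, so results differ whenever the fractional part is 0.5 or more:

#include <arm_neon.h>

// trunc(1.7) on this path yields 1.0, whereas the SSE4.1 branch's
// _MM_FROUND_TO_NEAREST_INT would yield 2.0. (Standalone sketch.)
static inline float32x4_t trunc4(float32x4_t a) { return vrndq_f32(a); }
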
@@ -393,8 +449,10 @@ namespace embree
__forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
__forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
- __forceinline Vec3fx( const Vec3fa& other, const float w1) {
-#if defined (__SSE4_1__)
+ __forceinline Vec3fx( const Vec3fa& other, const float w1) {
+#if defined (__aarch64__)
+ m128 = other.m128; m128[3] = w1;
+#elif defined (__SSE4_1__)
m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
#else
const vint4 mask(-1,-1,-1,0);
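
The lane write m128[3] = w1 replaces _mm_insert_ps, whose immediate packs (source lane << 6) | (destination lane << 4) | zero-mask; 3 << 4 therefore copies lane 0 of _mm_set_ss(w1) into lane 3 and zeroes nothing. The SSE4.1 form in isolation (standalone sketch):

#include <smmintrin.h>

// Overwrite lane 3 of v with w, leaving x, y, z untouched —
// the same effect as the aarch64 lane write above.
static inline __m128 set_w(__m128 v, float w) {
  return _mm_insert_ps(v, _mm_set_ss(w), 3 << 4);
}
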
@@ -526,7 +584,7 @@ namespace embree
__forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -535,7 +593,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -626,7 +684,7 @@ namespace embree
}
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)