diff options
Diffstat (limited to 'thirdparty/embree/common/math/vec3fa.h')
-rw-r--r-- | thirdparty/embree/common/math/vec3fa.h | 92 |
1 files changed, 75 insertions, 17 deletions
diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h index 586039741d..8564cf6d10 100644 --- a/thirdparty/embree/common/math/vec3fa.h +++ b/thirdparty/embree/common/math/vec3fa.h @@ -55,7 +55,13 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// static __forceinline Vec3fa load( const void* const a ) { +#if defined(__aarch64__) + __m128 t = _mm_load_ps((float*)a); + t[3] = 0.0f; + return Vec3fa(t); +#else return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); +#endif } static __forceinline Vec3fa loadu( const void* const a ) { @@ -89,12 +95,20 @@ namespace embree __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } __forceinline Vec3fa operator -( const Vec3fa& a ) { +#if defined(__aarch64__) + return vnegq_f32(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); return _mm_xor_ps(a.m128, mask); +#endif } __forceinline Vec3fa abs ( const Vec3fa& a ) { +#if defined(__aarch64__) + return _mm_abs_ps(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return _mm_and_ps(a.m128, mask); +#endif } __forceinline Vec3fa sign ( const Vec3fa& a ) { return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); @@ -102,6 +116,10 @@ namespace embree __forceinline Vec3fa rcp ( const Vec3fa& a ) { +#if defined(__aarch64__) + return vdivq_f32(vdupq_n_f32(1.0f),a.m128); +#else + #if defined(__AVX512VL__) const Vec3fa r = _mm_rcp14_ps(a.m128); #else @@ -109,13 +127,15 @@ namespace embree #endif #if defined(__AVX2__) - const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); + const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) + const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n #else - const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0) + const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -123,12 +143,20 @@ namespace embree __forceinline Vec3fa rsqrt( const Vec3fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif } __forceinline Vec3fa zero_fix(const Vec3fa& a) { @@ -161,7 +189,7 @@ namespace embree __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -170,7 +198,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -187,16 +215,16 @@ namespace embree /// Ternary Operators //////////////////////////////////////////////////////////////////////////////// -#if defined(__AVX2__) +#if defined(__AVX2__) || defined(__ARM_NEON) __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } #else __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } #endif __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } @@ -218,8 +246,26 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline float reduce_add(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = 0.0f; + return vaddvq_f32(t); + } - __forceinline float reduce_add(const Vec3fa& v) { + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vminvq_f32(t); + } + __forceinline float reduce_max(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vmaxvq_f32(t); + } +#else + __forceinline float reduce_add(const Vec3fa& v) { const vfloat4 a(v.m128); const vfloat4 b = shuffle<1>(a); const vfloat4 c = shuffle<2>(a); @@ -229,6 +275,7 @@ namespace embree __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } +#endif //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators @@ -241,8 +288,13 @@ namespace embree __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } - __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + #if defined(__aarch64__) + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } +#else + __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } +#endif __forceinline bool isvalid ( const Vec3fa& v ) { return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); @@ -261,7 +313,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) @@ -335,7 +387,11 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined (__SSE4_1__) +#if defined(__aarch64__) + __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } + __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } +#elif defined (__SSE4_1__) __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } @@ -393,8 +449,10 @@ namespace embree __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } - __forceinline Vec3fx( const Vec3fa& other, const float w1) { -#if defined (__SSE4_1__) + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__aarch64__) + m128 = other.m128; m128[3] = w1; +#elif defined (__SSE4_1__) m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); #else const vint4 mask(-1,-1,-1,0); @@ -526,7 +584,7 @@ namespace embree __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -535,7 +593,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -626,7 +684,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) |