diff options
author | DeeJayLSP <djlsplays@gmail.com> | 2022-11-24 11:45:59 -0300 |
---|---|---|
committer | RĂ©mi Verschelde <rverschelde@gmail.com> | 2022-11-25 13:09:04 +0100 |
commit | 5e4158eb4869427ac13a0fe57e9b688ea4c3b0f1 (patch) | |
tree | 1a659e532fcba81af33d1f874c65311f093e4535 /thirdparty/embree/common/math | |
parent | f16c5b564b569497d04deb965a4fd63b3ea2ab2f (diff) |
Update embree to 3.13.5
Diffstat (limited to 'thirdparty/embree/common/math')
-rw-r--r-- | thirdparty/embree/common/math/bbox.h | 10 | ||||
-rw-r--r-- | thirdparty/embree/common/math/color.h | 19 | ||||
-rw-r--r-- | thirdparty/embree/common/math/constants.cpp | 19 | ||||
-rw-r--r-- | thirdparty/embree/common/math/constants.h | 54 | ||||
-rw-r--r-- | thirdparty/embree/common/math/math.h | 104 | ||||
-rw-r--r-- | thirdparty/embree/common/math/quaternion.h | 12 | ||||
-rw-r--r-- | thirdparty/embree/common/math/transcendental.h | 6 | ||||
-rw-r--r-- | thirdparty/embree/common/math/vec2.h | 8 | ||||
-rw-r--r-- | thirdparty/embree/common/math/vec2fa.h | 29 | ||||
-rw-r--r-- | thirdparty/embree/common/math/vec3.h | 11 | ||||
-rw-r--r-- | thirdparty/embree/common/math/vec3fa.h | 92 | ||||
-rw-r--r-- | thirdparty/embree/common/math/vec3ia.h | 46 | ||||
-rw-r--r-- | thirdparty/embree/common/math/vec4.h | 6 |
13 files changed, 300 insertions, 116 deletions
diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h index bc43155358..e4eb3df9a4 100644 --- a/thirdparty/embree/common/math/bbox.h +++ b/thirdparty/embree/common/math/bbox.h @@ -77,7 +77,7 @@ namespace embree return lower > upper; } -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline bool BBox<Vec3fa>::empty() const { return !all(le_mask(lower,upper)); } @@ -196,11 +196,11 @@ namespace embree } template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper)); } template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { - return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper)); + return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper)); } /*! blending */ @@ -228,11 +228,11 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined (__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined (__AVX__) #include "../simd/avx.h" #endif diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h index 529584ea16..e62e4ad2a4 100644 --- a/thirdparty/embree/common/math/color.h +++ b/thirdparty/embree/common/math/color.h @@ -152,21 +152,38 @@ namespace embree } __forceinline const Color rcp ( const Color& a ) { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Color)reciprocal; +#else #if defined(__AVX512VL__) const Color r = _mm_rcp14_ps(a.m128); #else const Color r = _mm_rcp_ps(a.m128); #endif - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r) + +#endif //defined(__aarch64__) } __forceinline const Color rsqrt( const Color& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif //defined(__aarch64__) } __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); } diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp index 03919ae20c..f51c642bfc 100644 --- a/thirdparty/embree/common/math/constants.cpp +++ b/thirdparty/embree/common/math/constants.cpp @@ -5,23 +5,4 @@ namespace embree { - TrueTy True; - FalseTy False; - ZeroTy zero; - OneTy one; - NegInfTy neg_inf; - PosInfTy inf; - PosInfTy pos_inf; - NaNTy nan; - UlpTy ulp; - PiTy pi; - OneOverPiTy one_over_pi; - TwoPiTy two_pi; - OneOverTwoPiTy one_over_two_pi; - FourPiTy four_pi; - OneOverFourPiTy one_over_four_pi; - StepTy step; - ReverseStepTy reverse_step; - EmptyTy empty; - UndefinedTy undefined; } diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h index 578473a8ab..07a1a868ba 100644 --- a/thirdparty/embree/common/math/constants.h +++ b/thirdparty/embree/common/math/constants.h @@ -24,13 +24,13 @@ namespace embree __forceinline operator bool( ) const { return true; } }; - extern MAYBE_UNUSED TrueTy True; + const constexpr TrueTy True = TrueTy(); struct FalseTy { __forceinline operator bool( ) const { return false; } }; - extern MAYBE_UNUSED FalseTy False; + const constexpr FalseTy False = FalseTy(); struct ZeroTy { @@ -48,7 +48,7 @@ namespace embree __forceinline operator unsigned char ( ) const { return 0; } }; - extern MAYBE_UNUSED ZeroTy zero; + const constexpr ZeroTy zero = ZeroTy(); struct OneTy { @@ -66,7 +66,7 @@ namespace embree __forceinline operator unsigned char ( ) const { return 1; } }; - extern MAYBE_UNUSED OneTy one; + const constexpr OneTy one = OneTy(); struct NegInfTy { @@ -85,7 +85,7 @@ namespace embree }; - extern MAYBE_UNUSED NegInfTy neg_inf; + const constexpr NegInfTy neg_inf = NegInfTy(); struct PosInfTy { @@ -103,8 +103,8 @@ namespace embree __forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); } }; - extern MAYBE_UNUSED PosInfTy inf; - extern MAYBE_UNUSED PosInfTy pos_inf; + const constexpr PosInfTy inf = PosInfTy(); + const constexpr PosInfTy pos_inf = PosInfTy(); struct NaNTy { @@ -112,15 +112,15 @@ namespace embree __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); } }; - extern MAYBE_UNUSED NaNTy nan; + const constexpr NaNTy nan = NaNTy(); struct UlpTy { __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); } __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); } }; - - extern MAYBE_UNUSED UlpTy ulp; + + const constexpr UlpTy ulp = UlpTy(); struct PiTy { @@ -128,7 +128,7 @@ namespace embree __forceinline operator float ( ) const { return float(M_PI); } }; - extern MAYBE_UNUSED PiTy pi; + const constexpr PiTy pi = PiTy(); struct OneOverPiTy { @@ -136,7 +136,7 @@ namespace embree __forceinline operator float ( ) const { return float(M_1_PI); } }; - extern MAYBE_UNUSED OneOverPiTy one_over_pi; + const constexpr OneOverPiTy one_over_pi = OneOverPiTy(); struct TwoPiTy { @@ -144,7 +144,7 @@ namespace embree __forceinline operator float ( ) const { return float(2.0*M_PI); } }; - extern MAYBE_UNUSED TwoPiTy two_pi; + const constexpr TwoPiTy two_pi = TwoPiTy(); struct OneOverTwoPiTy { @@ -152,7 +152,7 @@ namespace embree __forceinline operator float ( ) const { return float(0.5*M_1_PI); } }; - extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi; + const constexpr OneOverTwoPiTy one_over_two_pi = OneOverTwoPiTy(); struct FourPiTy { @@ -160,7 +160,7 @@ namespace embree __forceinline operator float ( ) const { return float(4.0*M_PI); } }; - extern MAYBE_UNUSED FourPiTy four_pi; + const constexpr FourPiTy four_pi = FourPiTy(); struct OneOverFourPiTy { @@ -168,30 +168,42 @@ namespace embree __forceinline operator float ( ) const { return float(0.25*M_1_PI); } }; - extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi; + const constexpr OneOverFourPiTy one_over_four_pi = OneOverFourPiTy(); struct StepTy { + __forceinline operator double ( ) const { return 0; } + __forceinline operator float ( ) const { return 0; } + __forceinline operator long long( ) const { return 0; } + __forceinline operator unsigned long long( ) const { return 0; } + __forceinline operator long ( ) const { return 0; } + __forceinline operator unsigned long ( ) const { return 0; } + __forceinline operator int ( ) const { return 0; } + __forceinline operator unsigned int ( ) const { return 0; } + __forceinline operator short ( ) const { return 0; } + __forceinline operator unsigned short ( ) const { return 0; } + __forceinline operator char ( ) const { return 0; } + __forceinline operator unsigned char ( ) const { return 0; } }; - extern MAYBE_UNUSED StepTy step; + const constexpr StepTy step = StepTy(); struct ReverseStepTy { }; - extern MAYBE_UNUSED ReverseStepTy reverse_step; + const constexpr ReverseStepTy reverse_step = ReverseStepTy(); struct EmptyTy { }; - extern MAYBE_UNUSED EmptyTy empty; + const constexpr EmptyTy empty = EmptyTy(); struct FullTy { }; - extern MAYBE_UNUSED FullTy full; + const constexpr FullTy full = FullTy(); struct UndefinedTy { }; - extern MAYBE_UNUSED UndefinedTy undefined; + const constexpr UndefinedTy undefined = UndefinedTy(); } diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h index 4bc54c1a6a..7930c17727 100644 --- a/thirdparty/embree/common/math/math.h +++ b/thirdparty/embree/common/math/math.h @@ -53,6 +53,16 @@ namespace embree __forceinline float rcp ( const float x ) { +#if defined(__aarch64__) + // Move scalar to vector register and do rcp. + __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) @@ -66,30 +76,71 @@ namespace embree #else return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); #endif + +#endif //defined(__aarch64__) } __forceinline float signmsk ( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = 0x80000000; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#endif } __forceinline float xorf( const float x, const float y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128 b; + a[0] = x; + b[0] = y; + a = _mm_xor_ps(a, b); + return a[0]; +#else return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); +#endif } __forceinline float andf( const float x, const unsigned y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = y; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); +#endif } __forceinline float rsqrt( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + a[0] = x; + __m128 value = _mm_rsqrt_ps(a); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + return value[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); #else __m128 r = _mm_rsqrt_ss(a); #endif - r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); -#if defined(__ARM_NEON) - r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), + _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + return _mm_cvtss_f32(c); #endif - return _mm_cvtss_f32(r); } #if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) @@ -146,7 +197,17 @@ namespace embree __forceinline double floor( const double x ) { return ::floor (x); } __forceinline double ceil ( const double x ) { return ::ceil (x); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + __forceinline float mini(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_min_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float mini(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -155,7 +216,17 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + __forceinline float maxi(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_max_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float maxi(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -172,9 +243,12 @@ namespace embree __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; } __forceinline float min(float a, float b) { return a<b ? a:b; } __forceinline double min(double a, double b) { return a<b ? a:b; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; } #endif +#if defined(__EMSCRIPTEN__) + __forceinline long min(long a, long b) { return a<b ? a:b; } +#endif template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } @@ -189,9 +263,12 @@ namespace embree __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; } __forceinline float max(float a, float b) { return a<b ? b:a; } __forceinline double max(double a, double b) { return a<b ? b:a; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; } #endif +#if defined(__EMSCRIPTEN__) + __forceinline long max(long a, long b) { return a<b ? b:a; } +#endif template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } @@ -231,6 +308,15 @@ namespace embree __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + +#elif defined (__aarch64__) && defined(__clang__) +#pragma clang fp contract(fast) +__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; } +__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; } +__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } +__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } +#pragma clang fp contract(on) + #else __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } @@ -326,7 +412,7 @@ namespace embree return x | (y << 1) | (z << 2); } -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) template<> __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h index 080800efcd..78efccda72 100644 --- a/thirdparty/embree/common/math/quaternion.h +++ b/thirdparty/embree/common/math/quaternion.h @@ -242,13 +242,17 @@ namespace embree T cosTheta = dot(q0, q1_); QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_); cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta); - if (unlikely(all(cosTheta > 0.9995f))) { - return normalize(lerp(q0, q1, t)); - } + + // spherical linear interpolation const T phi = t * fastapprox::acos(cosTheta); T sinPhi, cosPhi; fastapprox::sincos(phi, sinPhi, cosPhi); QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1)); - return msub(cosPhi, q0, qperp); + QuaternionT<T> qslerp = msub(cosPhi, q0, qperp); + + // regular linear interpolation as fallback + QuaternionT<T> qlerp = normalize(lerp(q0, q1, t)); + + return select(cosTheta > 0.9995f, qlerp, qslerp); } } diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h index fd16c26e81..daf9dd96d2 100644 --- a/thirdparty/embree/common/math/transcendental.h +++ b/thirdparty/embree/common/math/transcendental.h @@ -27,7 +27,7 @@ __forceinline T sin(const T &v) // Reduced range version of x auto x = v - kReal * piOverTwoVec; auto kMod4 = k & 3; - auto sinUseCos = (kMod4 == 1 | kMod4 == 3); + auto sinUseCos = (kMod4 == 1) | (kMod4 == 3); auto flipSign = (kMod4 > 1); // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2, @@ -76,8 +76,8 @@ __forceinline T cos(const T &v) auto x = v - kReal * piOverTwoVec; auto kMod4 = k & 3; - auto cosUseCos = (kMod4 == 0 | kMod4 == 2); - auto flipSign = (kMod4 == 1 | kMod4 == 2); + auto cosUseCos = (kMod4 == 0) | (kMod4 == 2); + auto flipSign = (kMod4 == 1) | (kMod4 == 2); const float sinC2 = -0.16666667163372039794921875; const float sinC4 = +8.333347737789154052734375e-3; diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h index d62aef51f3..f6d98ffa0d 100644 --- a/thirdparty/embree/common/math/vec2.h +++ b/thirdparty/embree/common/math/vec2.h @@ -144,7 +144,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); } @@ -205,11 +205,11 @@ namespace embree #include "vec2fa.h" -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined(__AVX__) #include "../simd/avx.h" #endif @@ -221,7 +221,7 @@ namespace embree { template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {} #endif diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h index a51fb68fd0..4f222894c2 100644 --- a/thirdparty/embree/common/math/vec2fa.h +++ b/thirdparty/embree/common/math/vec2fa.h @@ -97,6 +97,12 @@ namespace embree __forceinline Vec2fa rcp ( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 reciprocal = _mm_rcp_ps(a.m128); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); + return (const Vec2fa)reciprocal; +#else #if defined(__AVX512VL__) const Vec2fa r = _mm_rcp14_ps(a.m128); #else @@ -104,13 +110,15 @@ namespace embree #endif #if defined(__AVX2__) - const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); + const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) + const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n #else - const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0) + const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -118,12 +126,21 @@ namespace embree __forceinline Vec2fa rsqrt( const Vec2fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + +#endif } __forceinline Vec2fa zero_fix(const Vec2fa& a) { @@ -156,7 +173,7 @@ namespace embree __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -165,7 +182,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { const vint4 ai = _mm_castps_si128(a); const vint4 bi = _mm_castps_si128(b); @@ -227,7 +244,7 @@ namespace embree __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h index ce94eff327..254f6c4011 100644 --- a/thirdparty/embree/common/math/vec3.h +++ b/thirdparty/embree/common/math/vec3.h @@ -197,7 +197,7 @@ namespace embree template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); } @@ -207,7 +207,6 @@ namespace embree template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); } template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); } template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); } - template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c ) { const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x; @@ -266,11 +265,11 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif -#if defined __AVX__ +#if defined(__AVX__) #include "../simd/avx.h" #endif @@ -291,14 +290,14 @@ namespace embree template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } -#elif defined(__SSE__) +#elif defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) { const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); } #endif -#if defined(__SSE__) +#if defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) { return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k])); diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h index 586039741d..8564cf6d10 100644 --- a/thirdparty/embree/common/math/vec3fa.h +++ b/thirdparty/embree/common/math/vec3fa.h @@ -55,7 +55,13 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// static __forceinline Vec3fa load( const void* const a ) { +#if defined(__aarch64__) + __m128 t = _mm_load_ps((float*)a); + t[3] = 0.0f; + return Vec3fa(t); +#else return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); +#endif } static __forceinline Vec3fa loadu( const void* const a ) { @@ -89,12 +95,20 @@ namespace embree __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } __forceinline Vec3fa operator -( const Vec3fa& a ) { +#if defined(__aarch64__) + return vnegq_f32(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); return _mm_xor_ps(a.m128, mask); +#endif } __forceinline Vec3fa abs ( const Vec3fa& a ) { +#if defined(__aarch64__) + return _mm_abs_ps(a.m128); +#else const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return _mm_and_ps(a.m128, mask); +#endif } __forceinline Vec3fa sign ( const Vec3fa& a ) { return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); @@ -102,6 +116,10 @@ namespace embree __forceinline Vec3fa rcp ( const Vec3fa& a ) { +#if defined(__aarch64__) + return vdivq_f32(vdupq_n_f32(1.0f),a.m128); +#else + #if defined(__AVX512VL__) const Vec3fa r = _mm_rcp14_ps(a.m128); #else @@ -109,13 +127,15 @@ namespace embree #endif #if defined(__AVX2__) - const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); + const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) + const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n #else - const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); - //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); + const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0) + const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n #endif return res; +#endif //defined(__aarch64__) } __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } @@ -123,12 +143,20 @@ namespace embree __forceinline Vec3fa rsqrt( const Vec3fa& a ) { +#if defined(__aarch64__) + __m128 r = _mm_rsqrt_ps(a.m128); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); + return r; +#else + #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ps(a.m128); #else __m128 r = _mm_rsqrt_ps(a.m128); #endif return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); +#endif } __forceinline Vec3fa zero_fix(const Vec3fa& a) { @@ -161,7 +189,7 @@ namespace embree __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -170,7 +198,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -187,16 +215,16 @@ namespace embree /// Ternary Operators //////////////////////////////////////////////////////////////////////////////// -#if defined(__AVX2__) +#if defined(__AVX2__) || defined(__ARM_NEON) __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } #else __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } - __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } + __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } #endif __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } @@ -218,8 +246,26 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline float reduce_add(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = 0.0f; + return vaddvq_f32(t); + } - __forceinline float reduce_add(const Vec3fa& v) { + __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } + __forceinline float reduce_min(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vminvq_f32(t); + } + __forceinline float reduce_max(const Vec3fa& v) { + float32x4_t t = v.m128; + t[3] = t[2]; + return vmaxvq_f32(t); + } +#else + __forceinline float reduce_add(const Vec3fa& v) { const vfloat4 a(v.m128); const vfloat4 b = shuffle<1>(a); const vfloat4 c = shuffle<2>(a); @@ -229,6 +275,7 @@ namespace embree __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } +#endif //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators @@ -241,8 +288,13 @@ namespace embree __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } - __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + #if defined(__aarch64__) + __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); } + __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); } +#else + __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } + __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } +#endif __forceinline bool isvalid ( const Vec3fa& v ) { return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); @@ -261,7 +313,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) @@ -335,7 +387,11 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined (__SSE4_1__) +#if defined(__aarch64__) + __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } + __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } + __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } +#elif defined (__SSE4_1__) __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } @@ -393,8 +449,10 @@ namespace embree __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } - __forceinline Vec3fx( const Vec3fa& other, const float w1) { -#if defined (__SSE4_1__) + __forceinline Vec3fx( const Vec3fa& other, const float w1) { +#if defined (__aarch64__) + m128 = other.m128; m128[3] = w1; +#elif defined (__SSE4_1__) m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); #else const vint4 mask(-1,-1,-1,0); @@ -526,7 +584,7 @@ namespace embree __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -535,7 +593,7 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__SSE4_1__) || defined(__aarch64__) __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { const vint4 ai = _mm_castps_si128(a.m128); const vint4 bi = _mm_castps_si128(b.m128); @@ -626,7 +684,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// #if defined(__SSE4_1__) diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h index 694804c40d..d4cc3125cd 100644 --- a/thirdparty/embree/common/math/vec3ia.h +++ b/thirdparty/embree/common/math/vec3ia.h @@ -65,7 +65,9 @@ namespace embree __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } -#if defined(__SSSE3__) +#if (defined(__aarch64__)) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } +#elif defined(__SSSE3__) __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } #endif @@ -81,7 +83,7 @@ namespace embree __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } @@ -116,7 +118,7 @@ namespace embree __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } #endif @@ -127,18 +129,38 @@ namespace embree __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } +#if !defined(__ARM_NEON) __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } +#endif //////////////////////////////////////////////////////////////////////////////// - /// Reductions + /// Select //////////////////////////////////////////////////////////////////////////////// + __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { +#if defined(__aarch64__) || defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#else + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); +#endif + } + + //////////////////////////////////////////////////////////////////////////////// + /// Reductions + //////////////////////////////////////////////////////////////////////////////// +#if defined(__aarch64__) + __forceinline int reduce_add(const Vec3ia& v) { return vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0))); } + __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } + __forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF))); } + __forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000))); } +#else __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); } __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); } - +#endif + //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// @@ -156,19 +178,7 @@ namespace embree __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } - //////////////////////////////////////////////////////////////////////////////// - /// Select - //////////////////////////////////////////////////////////////////////////////// - - __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { -#if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); -#else - return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); -#endif - } - -#if defined(__SSE4_1__) +#if defined(__aarch64__) || defined(__SSE4_1__) __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } #else diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h index 0ed107928a..10c53f47b4 100644 --- a/thirdparty/embree/common/math/vec4.h +++ b/thirdparty/embree/common/math/vec4.h @@ -149,7 +149,7 @@ namespace embree } //////////////////////////////////////////////////////////////////////////////// - /// Euclidian Space Operators + /// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); } @@ -205,7 +205,7 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined __SSE__ +#if defined(__SSE__) || defined(__ARM_NEON) #include "../simd/sse.h" #endif @@ -225,7 +225,7 @@ namespace embree template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } -#elif defined(__SSE__) +#elif defined(__SSE__) || defined(__ARM_NEON) template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) { const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); } |