summaryrefslogtreecommitdiff
path: root/thirdparty/embree/common/math
diff options
context:
space:
mode:
authorDeeJayLSP <djlsplays@gmail.com>2022-11-24 11:45:59 -0300
committerRĂ©mi Verschelde <rverschelde@gmail.com>2022-11-25 13:09:04 +0100
commit5e4158eb4869427ac13a0fe57e9b688ea4c3b0f1 (patch)
tree1a659e532fcba81af33d1f874c65311f093e4535 /thirdparty/embree/common/math
parentf16c5b564b569497d04deb965a4fd63b3ea2ab2f (diff)
Update embree to 3.13.5
Diffstat (limited to 'thirdparty/embree/common/math')
-rw-r--r--thirdparty/embree/common/math/bbox.h10
-rw-r--r--thirdparty/embree/common/math/color.h19
-rw-r--r--thirdparty/embree/common/math/constants.cpp19
-rw-r--r--thirdparty/embree/common/math/constants.h54
-rw-r--r--thirdparty/embree/common/math/math.h104
-rw-r--r--thirdparty/embree/common/math/quaternion.h12
-rw-r--r--thirdparty/embree/common/math/transcendental.h6
-rw-r--r--thirdparty/embree/common/math/vec2.h8
-rw-r--r--thirdparty/embree/common/math/vec2fa.h29
-rw-r--r--thirdparty/embree/common/math/vec3.h11
-rw-r--r--thirdparty/embree/common/math/vec3fa.h92
-rw-r--r--thirdparty/embree/common/math/vec3ia.h46
-rw-r--r--thirdparty/embree/common/math/vec4.h6
13 files changed, 300 insertions, 116 deletions
diff --git a/thirdparty/embree/common/math/bbox.h b/thirdparty/embree/common/math/bbox.h
index bc43155358..e4eb3df9a4 100644
--- a/thirdparty/embree/common/math/bbox.h
+++ b/thirdparty/embree/common/math/bbox.h
@@ -77,7 +77,7 @@ namespace embree
return lower > upper;
}
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline bool BBox<Vec3fa>::empty() const {
return !all(le_mask(lower,upper));
}
@@ -196,11 +196,11 @@ namespace embree
}
template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
- return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+ return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
}
template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
- return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+ return all(ge_mask(a.lower,b.lower)) && all(le_mask(a.upper,b.upper));
}
/*! blending */
@@ -228,11 +228,11 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
-#if defined __SSE__
+#if defined (__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
-#if defined __AVX__
+#if defined (__AVX__)
#include "../simd/avx.h"
#endif
diff --git a/thirdparty/embree/common/math/color.h b/thirdparty/embree/common/math/color.h
index 529584ea16..e62e4ad2a4 100644
--- a/thirdparty/embree/common/math/color.h
+++ b/thirdparty/embree/common/math/color.h
@@ -152,21 +152,38 @@ namespace embree
}
__forceinline const Color rcp ( const Color& a )
{
+#if defined(__aarch64__)
+ __m128 reciprocal = _mm_rcp_ps(a.m128);
+ reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+ reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+ return (const Color)reciprocal;
+#else
#if defined(__AVX512VL__)
const Color r = _mm_rcp14_ps(a.m128);
#else
const Color r = _mm_rcp_ps(a.m128);
#endif
- return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+ return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r)
+
+#endif //defined(__aarch64__)
}
__forceinline const Color rsqrt( const Color& a )
{
+#if defined(__aarch64__)
+ __m128 r = _mm_rsqrt_ps(a.m128);
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ return r;
+#else
+
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+
+#endif //defined(__aarch64__)
}
__forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
diff --git a/thirdparty/embree/common/math/constants.cpp b/thirdparty/embree/common/math/constants.cpp
index 03919ae20c..f51c642bfc 100644
--- a/thirdparty/embree/common/math/constants.cpp
+++ b/thirdparty/embree/common/math/constants.cpp
@@ -5,23 +5,4 @@
namespace embree
{
- TrueTy True;
- FalseTy False;
- ZeroTy zero;
- OneTy one;
- NegInfTy neg_inf;
- PosInfTy inf;
- PosInfTy pos_inf;
- NaNTy nan;
- UlpTy ulp;
- PiTy pi;
- OneOverPiTy one_over_pi;
- TwoPiTy two_pi;
- OneOverTwoPiTy one_over_two_pi;
- FourPiTy four_pi;
- OneOverFourPiTy one_over_four_pi;
- StepTy step;
- ReverseStepTy reverse_step;
- EmptyTy empty;
- UndefinedTy undefined;
}
diff --git a/thirdparty/embree/common/math/constants.h b/thirdparty/embree/common/math/constants.h
index 578473a8ab..07a1a868ba 100644
--- a/thirdparty/embree/common/math/constants.h
+++ b/thirdparty/embree/common/math/constants.h
@@ -24,13 +24,13 @@ namespace embree
__forceinline operator bool( ) const { return true; }
};
- extern MAYBE_UNUSED TrueTy True;
+ const constexpr TrueTy True = TrueTy();
struct FalseTy {
__forceinline operator bool( ) const { return false; }
};
- extern MAYBE_UNUSED FalseTy False;
+ const constexpr FalseTy False = FalseTy();
struct ZeroTy
{
@@ -48,7 +48,7 @@ namespace embree
__forceinline operator unsigned char ( ) const { return 0; }
};
- extern MAYBE_UNUSED ZeroTy zero;
+ const constexpr ZeroTy zero = ZeroTy();
struct OneTy
{
@@ -66,7 +66,7 @@ namespace embree
__forceinline operator unsigned char ( ) const { return 1; }
};
- extern MAYBE_UNUSED OneTy one;
+ const constexpr OneTy one = OneTy();
struct NegInfTy
{
@@ -85,7 +85,7 @@ namespace embree
};
- extern MAYBE_UNUSED NegInfTy neg_inf;
+ const constexpr NegInfTy neg_inf = NegInfTy();
struct PosInfTy
{
@@ -103,8 +103,8 @@ namespace embree
__forceinline operator unsigned char ( ) const { return std::numeric_limits<unsigned char>::max(); }
};
- extern MAYBE_UNUSED PosInfTy inf;
- extern MAYBE_UNUSED PosInfTy pos_inf;
+ const constexpr PosInfTy inf = PosInfTy();
+ const constexpr PosInfTy pos_inf = PosInfTy();
struct NaNTy
{
@@ -112,15 +112,15 @@ namespace embree
__forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
};
- extern MAYBE_UNUSED NaNTy nan;
+ const constexpr NaNTy nan = NaNTy();
struct UlpTy
{
__forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
__forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
};
-
- extern MAYBE_UNUSED UlpTy ulp;
+
+ const constexpr UlpTy ulp = UlpTy();
struct PiTy
{
@@ -128,7 +128,7 @@ namespace embree
__forceinline operator float ( ) const { return float(M_PI); }
};
- extern MAYBE_UNUSED PiTy pi;
+ const constexpr PiTy pi = PiTy();
struct OneOverPiTy
{
@@ -136,7 +136,7 @@ namespace embree
__forceinline operator float ( ) const { return float(M_1_PI); }
};
- extern MAYBE_UNUSED OneOverPiTy one_over_pi;
+ const constexpr OneOverPiTy one_over_pi = OneOverPiTy();
struct TwoPiTy
{
@@ -144,7 +144,7 @@ namespace embree
__forceinline operator float ( ) const { return float(2.0*M_PI); }
};
- extern MAYBE_UNUSED TwoPiTy two_pi;
+ const constexpr TwoPiTy two_pi = TwoPiTy();
struct OneOverTwoPiTy
{
@@ -152,7 +152,7 @@ namespace embree
__forceinline operator float ( ) const { return float(0.5*M_1_PI); }
};
- extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
+ const constexpr OneOverTwoPiTy one_over_two_pi = OneOverTwoPiTy();
struct FourPiTy
{
@@ -160,7 +160,7 @@ namespace embree
__forceinline operator float ( ) const { return float(4.0*M_PI); }
};
- extern MAYBE_UNUSED FourPiTy four_pi;
+ const constexpr FourPiTy four_pi = FourPiTy();
struct OneOverFourPiTy
{
@@ -168,30 +168,42 @@ namespace embree
__forceinline operator float ( ) const { return float(0.25*M_1_PI); }
};
- extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
+ const constexpr OneOverFourPiTy one_over_four_pi = OneOverFourPiTy();
struct StepTy {
+ __forceinline operator double ( ) const { return 0; }
+ __forceinline operator float ( ) const { return 0; }
+ __forceinline operator long long( ) const { return 0; }
+ __forceinline operator unsigned long long( ) const { return 0; }
+ __forceinline operator long ( ) const { return 0; }
+ __forceinline operator unsigned long ( ) const { return 0; }
+ __forceinline operator int ( ) const { return 0; }
+ __forceinline operator unsigned int ( ) const { return 0; }
+ __forceinline operator short ( ) const { return 0; }
+ __forceinline operator unsigned short ( ) const { return 0; }
+ __forceinline operator char ( ) const { return 0; }
+ __forceinline operator unsigned char ( ) const { return 0; }
};
- extern MAYBE_UNUSED StepTy step;
+ const constexpr StepTy step = StepTy();
struct ReverseStepTy {
};
- extern MAYBE_UNUSED ReverseStepTy reverse_step;
+ const constexpr ReverseStepTy reverse_step = ReverseStepTy();
struct EmptyTy {
};
- extern MAYBE_UNUSED EmptyTy empty;
+ const constexpr EmptyTy empty = EmptyTy();
struct FullTy {
};
- extern MAYBE_UNUSED FullTy full;
+ const constexpr FullTy full = FullTy();
struct UndefinedTy {
};
- extern MAYBE_UNUSED UndefinedTy undefined;
+ const constexpr UndefinedTy undefined = UndefinedTy();
}
diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h
index 4bc54c1a6a..7930c17727 100644
--- a/thirdparty/embree/common/math/math.h
+++ b/thirdparty/embree/common/math/math.h
@@ -53,6 +53,16 @@ namespace embree
__forceinline float rcp ( const float x )
{
+#if defined(__aarch64__)
+ // Move scalar to vector register and do rcp.
+ __m128 a;
+ a[0] = x;
+ float32x4_t reciprocal = vrecpeq_f32(a);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ return reciprocal[0];
+#else
+
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
@@ -66,30 +76,71 @@ namespace embree
#else
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
#endif
+
+#endif //defined(__aarch64__)
}
__forceinline float signmsk ( const float x ) {
+#if defined(__aarch64__)
+ // FP and Neon shares same vector register in arm64
+ __m128 a;
+ __m128i b;
+ a[0] = x;
+ b[0] = 0x80000000;
+ a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+ return a[0];
+#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#endif
}
__forceinline float xorf( const float x, const float y ) {
+#if defined(__aarch64__)
+ // FP and Neon shares same vector register in arm64
+ __m128 a;
+ __m128 b;
+ a[0] = x;
+ b[0] = y;
+ a = _mm_xor_ps(a, b);
+ return a[0];
+#else
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+#endif
}
__forceinline float andf( const float x, const unsigned y ) {
+#if defined(__aarch64__)
+ // FP and Neon shares same vector register in arm64
+ __m128 a;
+ __m128i b;
+ a[0] = x;
+ b[0] = y;
+ a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+ return a[0];
+#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+#endif
}
__forceinline float rsqrt( const float x )
{
+#if defined(__aarch64__)
+ // FP and Neon shares same vector register in arm64
+ __m128 a;
+ a[0] = x;
+ __m128 value = _mm_rsqrt_ps(a);
+ value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+ value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+ return value[0];
+#else
+
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
#else
__m128 r = _mm_rsqrt_ss(a);
#endif
- r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
-#if defined(__ARM_NEON)
- r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+ const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
+ _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+ return _mm_cvtss_f32(c);
#endif
- return _mm_cvtss_f32(r);
}
#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
@@ -146,7 +197,17 @@ namespace embree
__forceinline double floor( const double x ) { return ::floor (x); }
__forceinline double ceil ( const double x ) { return ::ceil (x); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+ __forceinline float mini(float a, float b) {
+ // FP and Neon shares same vector register in arm64
+ __m128 x;
+ __m128 y;
+ x[0] = a;
+ y[0] = b;
+ x = _mm_min_ps(x, y);
+ return x[0];
+ }
+#elif defined(__SSE4_1__)
__forceinline float mini(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -155,7 +216,17 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+ __forceinline float maxi(float a, float b) {
+ // FP and Neon shares same vector register in arm64
+ __m128 x;
+ __m128 y;
+ x[0] = a;
+ y[0] = b;
+ x = _mm_max_ps(x, y);
+ return x[0];
+ }
+#elif defined(__SSE4_1__)
__forceinline float maxi(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -172,9 +243,12 @@ namespace embree
__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
__forceinline float min(float a, float b) { return a<b ? a:b; }
__forceinline double min(double a, double b) { return a<b ? a:b; }
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
#endif
+#if defined(__EMSCRIPTEN__)
+ __forceinline long min(long a, long b) { return a<b ? a:b; }
+#endif
template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
@@ -189,9 +263,12 @@ namespace embree
__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
__forceinline float max(float a, float b) { return a<b ? b:a; }
__forceinline double max(double a, double b) { return a<b ? b:a; }
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
#endif
+#if defined(__EMSCRIPTEN__)
+ __forceinline long max(long a, long b) { return a<b ? b:a; }
+#endif
template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
@@ -231,6 +308,15 @@ namespace embree
__forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+
+#elif defined (__aarch64__) && defined(__clang__)
+#pragma clang fp contract(fast)
+__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; }
+__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; }
+__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
+__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
+#pragma clang fp contract(on)
+
#else
__forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }
@@ -326,7 +412,7 @@ namespace embree
return x | (y << 1) | (z << 2);
}
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
template<>
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
diff --git a/thirdparty/embree/common/math/quaternion.h b/thirdparty/embree/common/math/quaternion.h
index 080800efcd..78efccda72 100644
--- a/thirdparty/embree/common/math/quaternion.h
+++ b/thirdparty/embree/common/math/quaternion.h
@@ -242,13 +242,17 @@ namespace embree
T cosTheta = dot(q0, q1_);
QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
cosTheta = select(cosTheta < 0.f, -cosTheta, cosTheta);
- if (unlikely(all(cosTheta > 0.9995f))) {
- return normalize(lerp(q0, q1, t));
- }
+
+ // spherical linear interpolation
const T phi = t * fastapprox::acos(cosTheta);
T sinPhi, cosPhi;
fastapprox::sincos(phi, sinPhi, cosPhi);
QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
- return msub(cosPhi, q0, qperp);
+ QuaternionT<T> qslerp = msub(cosPhi, q0, qperp);
+
+ // regular linear interpolation as fallback
+ QuaternionT<T> qlerp = normalize(lerp(q0, q1, t));
+
+ return select(cosTheta > 0.9995f, qlerp, qslerp);
}
}
diff --git a/thirdparty/embree/common/math/transcendental.h b/thirdparty/embree/common/math/transcendental.h
index fd16c26e81..daf9dd96d2 100644
--- a/thirdparty/embree/common/math/transcendental.h
+++ b/thirdparty/embree/common/math/transcendental.h
@@ -27,7 +27,7 @@ __forceinline T sin(const T &v)
// Reduced range version of x
auto x = v - kReal * piOverTwoVec;
auto kMod4 = k & 3;
- auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
+ auto sinUseCos = (kMod4 == 1) | (kMod4 == 3);
auto flipSign = (kMod4 > 1);
// These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
@@ -76,8 +76,8 @@ __forceinline T cos(const T &v)
auto x = v - kReal * piOverTwoVec;
auto kMod4 = k & 3;
- auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
- auto flipSign = (kMod4 == 1 | kMod4 == 2);
+ auto cosUseCos = (kMod4 == 0) | (kMod4 == 2);
+ auto flipSign = (kMod4 == 1) | (kMod4 == 2);
const float sinC2 = -0.16666667163372039794921875;
const float sinC4 = +8.333347737789154052734375e-3;
diff --git a/thirdparty/embree/common/math/vec2.h b/thirdparty/embree/common/math/vec2.h
index d62aef51f3..f6d98ffa0d 100644
--- a/thirdparty/embree/common/math/vec2.h
+++ b/thirdparty/embree/common/math/vec2.h
@@ -144,7 +144,7 @@ namespace embree
}
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline T dot ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
@@ -205,11 +205,11 @@ namespace embree
#include "vec2fa.h"
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
-#if defined __AVX__
+#if defined(__AVX__)
#include "../simd/avx.h"
#endif
@@ -221,7 +221,7 @@ namespace embree
{
template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif
diff --git a/thirdparty/embree/common/math/vec2fa.h b/thirdparty/embree/common/math/vec2fa.h
index a51fb68fd0..4f222894c2 100644
--- a/thirdparty/embree/common/math/vec2fa.h
+++ b/thirdparty/embree/common/math/vec2fa.h
@@ -97,6 +97,12 @@ namespace embree
__forceinline Vec2fa rcp ( const Vec2fa& a )
{
+#if defined(__aarch64__)
+ __m128 reciprocal = _mm_rcp_ps(a.m128);
+ reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+ reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+ return (const Vec2fa)reciprocal;
+#else
#if defined(__AVX512VL__)
const Vec2fa r = _mm_rcp14_ps(a.m128);
#else
@@ -104,13 +110,15 @@ namespace embree
#endif
#if defined(__AVX2__)
- const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+ const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n
#else
- const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
- //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+ const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n
#endif
return res;
+#endif //defined(__aarch64__)
}
__forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
@@ -118,12 +126,21 @@ namespace embree
__forceinline Vec2fa rsqrt( const Vec2fa& a )
{
+#if defined(__aarch64__)
+ __m128 r = _mm_rsqrt_ps(a.m128);
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ return r;
+#else
+
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+
+#endif
}
__forceinline Vec2fa zero_fix(const Vec2fa& a) {
@@ -156,7 +173,7 @@ namespace embree
__forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@@ -165,7 +182,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
const vint4 ai = _mm_castps_si128(a);
const vint4 bi = _mm_castps_si128(b);
@@ -227,7 +244,7 @@ namespace embree
__forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
diff --git a/thirdparty/embree/common/math/vec3.h b/thirdparty/embree/common/math/vec3.h
index ce94eff327..254f6c4011 100644
--- a/thirdparty/embree/common/math/vec3.h
+++ b/thirdparty/embree/common/math/vec3.h
@@ -197,7 +197,7 @@ namespace embree
template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline T sqr ( const Vec3<T>& a ) { return dot(a,a); }
@@ -207,7 +207,6 @@ namespace embree
template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a ) { return a*rsqrt(sqr(a)); }
template<typename T> __forceinline T distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
template<typename T> __forceinline Vec3<T> cross ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(msub(a.y,b.z,a.z*b.y), msub(a.z,b.x,a.x*b.z), msub(a.x,b.y,a.y*b.x)); }
-
template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
{
const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
@@ -266,11 +265,11 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
-#if defined __AVX__
+#if defined(__AVX__)
#include "../simd/avx.h"
#endif
@@ -291,14 +290,14 @@ namespace embree
template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
x = a.x; y = a.y; z = a.z;
}
-#elif defined(__SSE__)
+#elif defined(__SSE__) || defined(__ARM_NEON)
template<>
__forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
}
#endif
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
template<>
__forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
diff --git a/thirdparty/embree/common/math/vec3fa.h b/thirdparty/embree/common/math/vec3fa.h
index 586039741d..8564cf6d10 100644
--- a/thirdparty/embree/common/math/vec3fa.h
+++ b/thirdparty/embree/common/math/vec3fa.h
@@ -55,7 +55,13 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
static __forceinline Vec3fa load( const void* const a ) {
+#if defined(__aarch64__)
+ __m128 t = _mm_load_ps((float*)a);
+ t[3] = 0.0f;
+ return Vec3fa(t);
+#else
return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+#endif
}
static __forceinline Vec3fa loadu( const void* const a ) {
@@ -89,12 +95,20 @@ namespace embree
__forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
__forceinline Vec3fa operator -( const Vec3fa& a ) {
+#if defined(__aarch64__)
+ return vnegq_f32(a.m128);
+#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return _mm_xor_ps(a.m128, mask);
+#endif
}
__forceinline Vec3fa abs ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+ return _mm_abs_ps(a.m128);
+#else
const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return _mm_and_ps(a.m128, mask);
+#endif
}
__forceinline Vec3fa sign ( const Vec3fa& a ) {
return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
@@ -102,6 +116,10 @@ namespace embree
__forceinline Vec3fa rcp ( const Vec3fa& a )
{
+#if defined(__aarch64__)
+ return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
+#else
+
#if defined(__AVX512VL__)
const Vec3fa r = _mm_rcp14_ps(a.m128);
#else
@@ -109,13 +127,15 @@ namespace embree
#endif
#if defined(__AVX2__)
- const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+ const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n
#else
- const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
- //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+ const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0)
+ const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n
#endif
return res;
+#endif //defined(__aarch64__)
}
__forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
@@ -123,12 +143,20 @@ namespace embree
__forceinline Vec3fa rsqrt( const Vec3fa& a )
{
+#if defined(__aarch64__)
+ __m128 r = _mm_rsqrt_ps(a.m128);
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+ return r;
+#else
+
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ps(a.m128);
#else
__m128 r = _mm_rsqrt_ps(a.m128);
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
}
__forceinline Vec3fa zero_fix(const Vec3fa& a) {
@@ -161,7 +189,7 @@ namespace embree
__forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -170,7 +198,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -187,16 +215,16 @@ namespace embree
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
-#if defined(__AVX2__)
+#if defined(__AVX2__) || defined(__ARM_NEON)
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
#else
__forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
- __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
__forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
__forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+ __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
#endif
__forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
@@ -218,8 +246,26 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+ __forceinline float reduce_add(const Vec3fa& v) {
+ float32x4_t t = v.m128;
+ t[3] = 0.0f;
+ return vaddvq_f32(t);
+ }
- __forceinline float reduce_add(const Vec3fa& v) {
+ __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+ __forceinline float reduce_min(const Vec3fa& v) {
+ float32x4_t t = v.m128;
+ t[3] = t[2];
+ return vminvq_f32(t);
+ }
+ __forceinline float reduce_max(const Vec3fa& v) {
+ float32x4_t t = v.m128;
+ t[3] = t[2];
+ return vmaxvq_f32(t);
+ }
+#else
+ __forceinline float reduce_add(const Vec3fa& v) {
const vfloat4 a(v.m128);
const vfloat4 b = shuffle<1>(a);
const vfloat4 c = shuffle<2>(a);
@@ -229,6 +275,7 @@ namespace embree
__forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
__forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
__forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+#endif
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
@@ -241,8 +288,13 @@ namespace embree
__forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
__forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
__forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
- __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
- __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+ #if defined(__aarch64__)
+ __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
+ __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
+#else
+ __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
+ __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+#endif
__forceinline bool isvalid ( const Vec3fa& v ) {
return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
@@ -261,7 +313,7 @@ namespace embree
}
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
@@ -335,7 +387,11 @@ namespace embree
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
-#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+ __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
+ __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
+ __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
+#elif defined (__SSE4_1__)
__forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
__forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
__forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
@@ -393,8 +449,10 @@ namespace embree
__forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
__forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
- __forceinline Vec3fx( const Vec3fa& other, const float w1) {
-#if defined (__SSE4_1__)
+ __forceinline Vec3fx( const Vec3fa& other, const float w1) {
+#if defined (__aarch64__)
+ m128 = other.m128; m128[3] = w1;
+#elif defined (__SSE4_1__)
m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
#else
const vint4 mask(-1,-1,-1,0);
@@ -526,7 +584,7 @@ namespace embree
__forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
__forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -535,7 +593,7 @@ namespace embree
}
#endif
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
__forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
const vint4 ai = _mm_castps_si128(a.m128);
const vint4 bi = _mm_castps_si128(b.m128);
@@ -626,7 +684,7 @@ namespace embree
}
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
#if defined(__SSE4_1__)
diff --git a/thirdparty/embree/common/math/vec3ia.h b/thirdparty/embree/common/math/vec3ia.h
index 694804c40d..d4cc3125cd 100644
--- a/thirdparty/embree/common/math/vec3ia.h
+++ b/thirdparty/embree/common/math/vec3ia.h
@@ -65,7 +65,9 @@ namespace embree
__forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
__forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
-#if defined(__SSSE3__)
+#if (defined(__aarch64__))
+ __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
+#elif defined(__SSSE3__)
__forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
#endif
@@ -81,7 +83,7 @@ namespace embree
__forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); }
__forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
__forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); }
__forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; }
@@ -116,7 +118,7 @@ namespace embree
__forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
__forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
__forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; }
#endif
@@ -127,18 +129,38 @@ namespace embree
__forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
__forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; }
+#if !defined(__ARM_NEON)
__forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
__forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
+#endif
////////////////////////////////////////////////////////////////////////////////
- /// Reductions
+ /// Select
////////////////////////////////////////////////////////////////////////////////
+ __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+#if defined(__aarch64__) || defined(__SSE4_1__)
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
+#else
+ return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f));
+#endif
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Reductions
+ ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__)
+ __forceinline int reduce_add(const Vec3ia& v) { return vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0))); }
+ __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+ __forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF))); }
+ __forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000))); }
+#else
__forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
__forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
__forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
__forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
-
+#endif
+
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators
////////////////////////////////////////////////////////////////////////////////
@@ -156,19 +178,7 @@ namespace embree
__forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); }
__forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); }
- ////////////////////////////////////////////////////////////////////////////////
- /// Select
- ////////////////////////////////////////////////////////////////////////////////
-
- __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
-#if defined(__SSE4_1__)
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
-#else
- return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f));
-#endif
- }
-
-#if defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(__SSE4_1__)
__forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
__forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
#else
diff --git a/thirdparty/embree/common/math/vec4.h b/thirdparty/embree/common/math/vec4.h
index 0ed107928a..10c53f47b4 100644
--- a/thirdparty/embree/common/math/vec4.h
+++ b/thirdparty/embree/common/math/vec4.h
@@ -149,7 +149,7 @@ namespace embree
}
////////////////////////////////////////////////////////////////////////////////
- /// Euclidian Space Operators
+ /// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////
template<typename T> __forceinline T dot ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); }
@@ -205,7 +205,7 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////
-#if defined __SSE__
+#if defined(__SSE__) || defined(__ARM_NEON)
#include "../simd/sse.h"
#endif
@@ -225,7 +225,7 @@ namespace embree
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
x = a.x; y = a.y; z = a.z; w = a.w;
}
-#elif defined(__SSE__)
+#elif defined(__SSE__) || defined(__ARM_NEON)
template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
}