Diffstat (limited to 'thirdparty/embree/common/math/math.h')
-rw-r--r--  thirdparty/embree/common/math/math.h | 104
1 file changed, 95 insertions(+), 9 deletions(-)
diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h
index 4bc54c1a6a..7930c17727 100644
--- a/thirdparty/embree/common/math/math.h
+++ b/thirdparty/embree/common/math/math.h
@@ -53,6 +53,16 @@ namespace embree
__forceinline float rcp ( const float x )
{
+#if defined(__aarch64__)
+ // Move the scalar into a vector register and compute a Newton-refined reciprocal.
+ __m128 a;
+ a[0] = x;
+ float32x4_t reciprocal = vrecpeq_f32(a);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+ return reciprocal[0];
+#else
+
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
@@ -66,30 +76,71 @@ namespace embree
#else
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
#endif
+
+#endif //defined(__aarch64__)
}
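
The NEON path above follows the standard estimate-and-refine pattern: vrecpeq_f32 gives a rough reciprocal and each vrecpsq_f32(a, r) computes (2 - a*r), so multiplying it back into r performs one Newton-Raphson step r' = r*(2 - a*r). A minimal standalone sketch of the same refinement, assuming an aarch64 toolchain with NEON headers; rcp_neon is an illustrative name, not part of the patch:

#include <arm_neon.h>

// Reciprocal via NEON estimate plus two Newton-Raphson refinements.
// vrecpsq_f32(a, r) returns (2 - a*r), so r * vrecpsq_f32(a, r) is
// one iteration r' = r * (2 - a*r).
inline float rcp_neon(float x)
{
  float32x4_t a = vdupq_n_f32(x);
  float32x4_t r = vrecpeq_f32(a);        // ~8-bit initial estimate
  r = vmulq_f32(vrecpsq_f32(a, r), r);   // first refinement
  r = vmulq_f32(vrecpsq_f32(a, r), r);   // second refinement
  return vgetq_lane_f32(r, 0);
}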
__forceinline float signmsk ( const float x ) {
+#if defined(__aarch64__)
+ // FP and NEON share the same vector registers on arm64
+ __m128 a;
+ __m128i b;
+ a[0] = x;
+ b[0] = 0x80000000;
+ a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+ return a[0];
+#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#endif
}
__forceinline float xorf( const float x, const float y ) {
+#if defined(__aarch64__)
+ // FP and NEON share the same vector registers on arm64
+ __m128 a;
+ __m128 b;
+ a[0] = x;
+ b[0] = y;
+ a = _mm_xor_ps(a, b);
+ return a[0];
+#else
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+#endif
}
__forceinline float andf( const float x, const unsigned y ) {
+#if defined(__aarch64__)
+ // FP and NEON share the same vector registers on arm64
+ __m128 a;
+ __m128i b;
+ a[0] = x;
+ b[0] = y;
+ a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+ return a[0];
+#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+#endif
}
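
signmsk, xorf and andf all treat a float as its 32-bit pattern and apply a bitwise operation; the NEON branches exist only because moving a scalar through a vector register is cheap when FP and NEON share registers. A scalar sketch of the same bit tricks using C++20 std::bit_cast, with illustrative names:

#include <bit>
#include <cstdint>

// Keep only the sign bit of x.
inline float signmsk_scalar(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & 0x80000000u);
}

// XOR the bit patterns of two floats (e.g. to flip a sign by mask).
inline float xorf_scalar(float x, float y) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^
                              std::bit_cast<uint32_t>(y));
}

// AND a float's bit pattern with an integer mask.
inline float andf_scalar(float x, unsigned y) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & y);
}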
__forceinline float rsqrt( const float x )
{
+#if defined(__aarch64__)
+ // FP and NEON share the same vector registers on arm64
+ __m128 a;
+ a[0] = x;
+ __m128 value = _mm_rsqrt_ps(a);
+ value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+ value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+ return value[0];
+#else
+
const __m128 a = _mm_set_ss(x);
#if defined(__AVX512VL__)
__m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
#else
__m128 r = _mm_rsqrt_ss(a);
#endif
- r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
-#if defined(__ARM_NEON)
- r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+ const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
+ _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+ return _mm_cvtss_f32(c);
#endif
- return _mm_cvtss_f32(r);
}
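
Analogously to rcp, the aarch64 rsqrt path refines a hardware estimate: vrsqrtsq_f32(p, r) returns (3 - p*r)/2, so with p = a*r the product r * vrsqrtsq_f32(a*r, r) is one Newton-Raphson step r' = r*(3 - a*r*r)/2. A standalone sketch under the same NEON assumption; rsqrt_neon is an illustrative name:

#include <arm_neon.h>

// Reciprocal square root via NEON estimate plus two refinements.
inline float rsqrt_neon(float x)
{
  float32x4_t a = vdupq_n_f32(x);
  float32x4_t r = vrsqrteq_f32(a);                      // rough estimate
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));   // first refinement
  r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));   // second refinement
  return vgetq_lane_f32(r, 0);
}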
#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700)
@@ -146,7 +197,17 @@ namespace embree
__forceinline double floor( const double x ) { return ::floor (x); }
__forceinline double ceil ( const double x ) { return ::ceil (x); }
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+ __forceinline float mini(float a, float b) {
+ // FP and NEON share the same vector registers on arm64
+ __m128 x;
+ __m128 y;
+ x[0] = a;
+ y[0] = b;
+ x = _mm_min_ps(x, y);
+ return x[0];
+ }
+#elif defined(__SSE4_1__)
__forceinline float mini(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -155,7 +216,17 @@ namespace embree
}
#endif
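
The SSE4.1 branch of mini compares float bit patterns with a signed integer min, which matches float ordering except when both inputs are negative (non-negative IEEE-754 floats order the same as their bit patterns, and a mixed-sign pair still picks the negative one). A scalar sketch of the trick, with an illustrative name:

#include <bit>
#include <cstdint>

// Integer min on float bit patterns; valid unless both inputs are negative.
inline float mini_scalar(float a, float b) {
  const int32_t ai = std::bit_cast<int32_t>(a);
  const int32_t bi = std::bit_cast<int32_t>(b);
  return std::bit_cast<float>(ai < bi ? ai : bi);
}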
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+ __forceinline float maxi(float a, float b) {
+ // FP and NEON share the same vector registers on arm64
+ __m128 x;
+ __m128 y;
+ x[0] = a;
+ y[0] = b;
+ x = _mm_max_ps(x, y);
+ return x[0];
+ }
+#elif defined(__SSE4_1__)
__forceinline float maxi(float a, float b) {
const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
@@ -172,9 +243,12 @@ namespace embree
__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
__forceinline float min(float a, float b) { return a<b ? a:b; }
__forceinline double min(double a, double b) { return a<b ? a:b; }
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
#endif
+#if defined(__EMSCRIPTEN__)
+ __forceinline long min(long a, long b) { return a<b ? a:b; }
+#endif
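
The extra long overload guards against ambiguity under Emscripten: on wasm32, long is a distinct 32-bit type, size_t is unsigned long and int64_t is long long, so a call with long arguments converts equally well to several existing overloads. A reduced sketch of the failure mode, assuming a target where int64_t is long long rather than long (illustrative overload set, not the patch itself):

#include <cstddef>
#include <cstdint>

inline int      min(int a, int b)         { return a < b ? a : b; }
inline int64_t  min(int64_t a, int64_t b) { return a < b ? a : b; }
inline size_t   min(size_t a, size_t b)   { return a < b ? a : b; }
// Without this overload, min(1L, 2L) is ambiguous: long converts
// equally well to int, long long (int64_t) and unsigned long (size_t).
inline long     min(long a, long b)       { return a < b ? a : b; }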
template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
@@ -189,9 +263,12 @@ namespace embree
__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
__forceinline float max(float a, float b) { return a<b ? b:a; }
__forceinline double max(double a, double b) { return a<b ? b:a; }
-#if defined(__64BIT__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
#endif
+#if defined(__EMSCRIPTEN__)
+ __forceinline long max(long a, long b) { return a<b ? b:a; }
+#endif
template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
@@ -231,6 +308,15 @@ namespace embree
__forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
__forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
+
+#elif defined (__aarch64__) && defined(__clang__)
+#pragma clang fp contract(fast)
+__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; }
+__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; }
+__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
+__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
+#pragma clang fp contract(on)
+
#else
__forceinline float madd ( const float a, const float b, const float c) { return a*b+c; }
__forceinline float msub ( const float a, const float b, const float c) { return a*b-c; }
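
On aarch64 with clang there are no _mm_fmadd_ss-style scalar intrinsics to lean on, so the patch instead brackets plain expressions with #pragma clang fp contract(fast), allowing each a*b + c to compile to a single fused multiply-add without intermediate rounding. A standalone sketch contrasting contraction with an explicit std::fma call; clang-only because of the pragma:

#include <cmath>

#pragma clang fp contract(fast)
// May compile to one fmadd instruction on aarch64 (no intermediate rounding).
inline float madd_contracted(float a, float b, float c) { return a * b + c; }
#pragma clang fp contract(on)

// Portable explicit equivalent: always a fused multiply-add.
inline float madd_fma(float a, float b, float c) { return std::fma(a, b, c); }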
@@ -326,7 +412,7 @@ namespace embree
return x | (y << 1) | (z << 2);
}
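
The return statement above is the combining step of a 3-way bit interleave (a Morton code): each coordinate's bits are first spread three positions apart, then merged as x | (y << 1) | (z << 2). A self-contained sketch of that pattern for 10-bit inputs, with illustrative names:

#include <cstdint>

// Spread the low 10 bits of v so they land 3 positions apart.
inline uint32_t spreadBits10(uint32_t v) {
  v &= 0x3FF;
  v = (v | (v << 16)) & 0x030000FF;
  v = (v | (v <<  8)) & 0x0300F00F;
  v = (v | (v <<  4)) & 0x030C30C3;
  v = (v | (v <<  2)) & 0x09249249;
  return v;
}

// 30-bit Morton code from three 10-bit coordinates.
inline uint32_t morton3D(uint32_t x, uint32_t y, uint32_t z) {
  return spreadBits10(x) | (spreadBits10(y) << 1) | (spreadBits10(z) << 2);
}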
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
template<>
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)