diff options
Diffstat (limited to 'thirdparty/embree/common/math/math.h')
-rw-r--r-- | thirdparty/embree/common/math/math.h | 104 |
1 files changed, 95 insertions, 9 deletions
diff --git a/thirdparty/embree/common/math/math.h b/thirdparty/embree/common/math/math.h index 4bc54c1a6a..7930c17727 100644 --- a/thirdparty/embree/common/math/math.h +++ b/thirdparty/embree/common/math/math.h @@ -53,6 +53,16 @@ namespace embree __forceinline float rcp ( const float x ) { +#if defined(__aarch64__) + // Move scalar to vector register and do rcp. + __m128 a; + a[0] = x; + float32x4_t reciprocal = vrecpeq_f32(a); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); + return reciprocal[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) @@ -66,30 +76,71 @@ namespace embree #else return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); #endif + +#endif //defined(__aarch64__) } __forceinline float signmsk ( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = 0x80000000; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#endif } __forceinline float xorf( const float x, const float y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128 b; + a[0] = x; + b[0] = y; + a = _mm_xor_ps(a, b); + return a[0]; +#else return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y))); +#endif } __forceinline float andf( const float x, const unsigned y ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + __m128i b; + a[0] = x; + b[0] = y; + a = _mm_and_ps(a, vreinterpretq_f32_s32(b)); + return a[0]; +#else return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y)))); +#endif } __forceinline float rsqrt( const float x ) { +#if defined(__aarch64__) + // FP and Neon shares same vector register in arm64 + __m128 a; + a[0] = x; + __m128 value = _mm_rsqrt_ps(a); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); + return value[0]; +#else + const __m128 a = _mm_set_ss(x); #if defined(__AVX512VL__) __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a); #else __m128 r = _mm_rsqrt_ss(a); #endif - r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); -#if defined(__ARM_NEON) - r = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), + _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); + return _mm_cvtss_f32(c); #endif - return _mm_cvtss_f32(r); } #if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) @@ -146,7 +197,17 @@ namespace embree __forceinline double floor( const double x ) { return ::floor (x); } __forceinline double ceil ( const double x ) { return ::ceil (x); } -#if defined(__SSE4_1__) +#if defined(__aarch64__) + __forceinline float mini(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_min_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float mini(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -155,7 +216,17 @@ namespace embree } #endif -#if defined(__SSE4_1__) +#if defined(__aarch64__) + __forceinline float maxi(float a, float b) { + // FP and Neon shares same vector register in arm64 + __m128 x; + __m128 y; + x[0] = a; + y[0] = b; + x = _mm_max_ps(x, y); + return x[0]; + } +#elif defined(__SSE4_1__) __forceinline float maxi(float a, float b) { const __m128i ai = _mm_castps_si128(_mm_set_ss(a)); const __m128i bi = _mm_castps_si128(_mm_set_ss(b)); @@ -172,9 +243,12 @@ namespace embree __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; } __forceinline float min(float a, float b) { return a<b ? a:b; } __forceinline double min(double a, double b) { return a<b ? a:b; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; } #endif +#if defined(__EMSCRIPTEN__) + __forceinline long min(long a, long b) { return a<b ? a:b; } +#endif template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); } @@ -189,9 +263,12 @@ namespace embree __forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; } __forceinline float max(float a, float b) { return a<b ? b:a; } __forceinline double max(double a, double b) { return a<b ? b:a; } -#if defined(__64BIT__) +#if defined(__64BIT__) || defined(__EMSCRIPTEN__) __forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; } #endif +#if defined(__EMSCRIPTEN__) + __forceinline long max(long a, long b) { return a<b ? b:a; } +#endif template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } @@ -231,6 +308,15 @@ namespace embree __forceinline float msub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } + +#elif defined (__aarch64__) && defined(__clang__) +#pragma clang fp contract(fast) +__forceinline float madd ( const float a, const float b, const float c) { return a*b + c; } +__forceinline float msub ( const float a, const float b, const float c) { return a*b - c; } +__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; } +__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); } +#pragma clang fp contract(on) + #else __forceinline float madd ( const float a, const float b, const float c) { return a*b+c; } __forceinline float msub ( const float a, const float b, const float c) { return a*b-c; } @@ -326,7 +412,7 @@ namespace embree return x | (y << 1) | (z << 2); } -#if defined(__AVX2__) +#if defined(__AVX2__) && !defined(__aarch64__) template<> __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi) |