1 files changed, 108 insertions, 27 deletions
diff --git a/thirdparty/embree/common/simd/vfloat4_sse2.h b/thirdparty/embree/common/simd/vfloat4_sse2.h
index 5215bf9730..6d7e11fe72 100644
--- a/thirdparty/embree/common/simd/vfloat4_sse2.h
+++ b/thirdparty/embree/common/simd/vfloat4_sse2.h
@@ -42,6 +42,11 @@ namespace embree
     __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}
 
     __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
+#if defined(__aarch64__)
+    __forceinline explicit vfloat(const vuint4& x) {
+        v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
+    }
+#else
     __forceinline explicit vfloat(const vuint4& x) {
       const __m128i a   = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF));
       const __m128i b   = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 
@@ -49,7 +54,7 @@ namespace embree
       const __m128  bf  = _mm_castsi128_ps(b);  
       v  = _mm_add_ps(af,bf);
     }
-
+#endif
     ////////////////////////////////////////////////////////////////////////////////
     /// Constants
     ////////////////////////////////////////////////////////////////////////////////
@@ -107,7 +112,11 @@ namespace embree
 #endif
   }
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const char* ptr) {
+        return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vfloat4 load(const char* ptr) {
       return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
     }
@@ -117,7 +126,11 @@ namespace embree
     }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const unsigned char* ptr) {
+        return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vfloat4 load(const unsigned char* ptr) {
       return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
     }
@@ -128,7 +141,11 @@ namespace embree
     }
 #endif
 
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const short* ptr) {
+        return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
     static __forceinline vfloat4 load(const short* ptr) {
       return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
     }
@@ -145,15 +162,19 @@ namespace embree
     static __forceinline void store_nt(void* ptr, const vfloat4& v)
     {
 #if defined (__SSE4_1__)
+#if defined(__aarch64__)
       _mm_stream_ps((float*)ptr,v);
 #else
+      _mm_stream_ps((float*)ptr,v);
+#endif
+#else
       _mm_store_ps((float*)ptr,v);
 #endif
     }
 
     template<int scale = 4>
     static __forceinline vfloat4 gather(const float* ptr, const vint4& index) {
-#if defined(__AVX2__)
+#if defined(__AVX2__) && !defined(__aarch64__)
       return _mm_i32gather_ps(ptr, index, scale);
 #else
       return vfloat4(
@@ -169,7 +190,7 @@ namespace embree
       vfloat4 r = zero;
 #if defined(__AVX512VL__)
       return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
-#elif defined(__AVX2__)
+#elif defined(__AVX2__)  && !defined(__aarch64__)
       return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
 #else
       if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
@@ -223,8 +244,8 @@ namespace embree
     friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) {
 #if defined(__AVX512VL__)
       return _mm_mask_blend_ps(m, f, t);
-#elif defined(__SSE4_1__)
-      return _mm_blendv_ps(f, t, m); 
+#elif defined(__SSE4_1__) || (defined(__aarch64__))
+      return _mm_blendv_ps(f, t, m);
 #else
       return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); 
 #endif
@@ -256,18 +277,34 @@ namespace embree
   __forceinline vfloat4 toFloat(const vint4&   a) { return vfloat4(a); }
 
   __forceinline vfloat4 operator +(const vfloat4& a) { return a; }
+#if defined(__aarch64__)
+  __forceinline vfloat4 operator -(const vfloat4& a) {
+    return vnegq_f32(a);
+  }
+#else
   __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
+#endif
 
+#if defined(__aarch64__)
+  __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); }
+#else
   __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); }
+#endif
+
 #if defined(__AVX512VL__)
   __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); }
 #else
   __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); }
 #endif
+
   __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); }
-  
+
   __forceinline vfloat4 rcp(const vfloat4& a)
   {
+#if defined(__aarch64__)
+    return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v));
+#else
+
 #if defined(__AVX512VL__)
     const vfloat4 r = _mm_rcp14_ps(a);
 #else
@@ -275,30 +312,39 @@ namespace embree
 #endif
 
 #if defined(__AVX2__)
-    return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
+    return _mm_fmadd_ps(r, _mm_fnmadd_ps(a, r, vfloat4(1.0f)), r);                    // computes r + r * (1 - a * r)
 #else
-    return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
+    return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r))));  // computes r + r * (1 - a * r)
 #endif
+
+#endif  //defined(__aarch64__)
   }
   __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); }
   __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }
 
   __forceinline vfloat4 rsqrt(const vfloat4& a)
   {
+#if defined(__aarch64__)
+    vfloat4 r = _mm_rsqrt_ps(a);
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r));
+    return r;
+#else
+
 #if defined(__AVX512VL__)
     vfloat4 r = _mm_rsqrt14_ps(a);
 #else
     vfloat4 r = _mm_rsqrt_ps(a);
 #endif
 
-#if defined(__ARM_NEON)
-    r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-    r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-#elif defined(__AVX2__)
+#if defined(__AVX2__)
     r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
 #else
     r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
 #endif
+
+#endif
     return r;
   }
 
@@ -344,7 +390,8 @@ namespace embree
   __forceinline vfloat4 max(const vfloat4& a, float          b) { return _mm_max_ps(a,vfloat4(b)); }
   __forceinline vfloat4 max(float          a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(__aarch64__)
+
     __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) {
       const vint4 ai = _mm_castps_si128(a);
       const vint4 bi = _mm_castps_si128(b);
@@ -393,9 +440,10 @@ namespace embree
   __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); }
 #else
   __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; }
-  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
   __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;}
   __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; }
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; }
+
 #endif
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -429,8 +477,13 @@ namespace embree
   __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
   __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
   __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
+#if defined(__aarch64__)
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); }
+#else
   __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); }
   __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); }
+#endif
   __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
 #endif
 
@@ -484,7 +537,7 @@ namespace embree
     return select(vboolf4(mask), t, f);
 #endif
   }
-  
+
   __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) {
     return madd(t,b-a,a);
   }
@@ -506,10 +559,10 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__aarch64__)
-  __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); }
-  __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); }
-  __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); }
-  __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); }
+  __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0
+  __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
 #elif defined (__SSE4_1__)
   __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
   __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
@@ -524,7 +577,9 @@ namespace embree
   __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
 
   __forceinline vint4 floori(const vfloat4& a) {
-#if defined(__SSE4_1__)
+#if defined(__aarch64__)
+    return vcvtq_s32_f32(floor(a));
+#elif defined(__SSE4_1__)
     return vint4(floor(a));
 #else
     return vint4(a-vfloat4(0.5f));
@@ -538,6 +593,16 @@ namespace embree
   __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
   __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
 
+#if defined(__aarch64__)
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& v) {
+          return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+      }
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+          return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+      }
+#else
   template<int i0, int i1, int i2, int i3>
   __forceinline vfloat4 shuffle(const vfloat4& v) {
     return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
@@ -547,8 +612,9 @@ namespace embree
   __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
     return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
   }
+#endif
 
-#if defined(__SSE3__)
+#if defined(__SSE3__) && !defined(__aarch64__)
   template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
   template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
   template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
@@ -559,10 +625,14 @@ namespace embree
     return shuffle<i,i,i,i>(v);
   }
 
+#if defined(__aarch64__)
+  template<int i> __forceinline float extract(const vfloat4& a) { return a[i]; }
+#else
   template<int i> __forceinline float extract   (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i>(a)); }
   template<>      __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
+#endif
 
-#if defined (__SSE4_1__)
+#if defined (__SSE4_1__) && !defined(__aarch64__)
   template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
   template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
   template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
@@ -664,14 +734,25 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////
   /// Reductions
   ////////////////////////////////////////////////////////////////////////////////
-
+#if defined(__aarch64__)
+  __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
+  __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
+  __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
+#else
   __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
   __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
   __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+#endif
 
+#if defined(__aarch64__)
+  __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
+  __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
+  __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
+#else
   __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
   __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
   __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+#endif
 
   __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) 
   { 
@@ -687,7 +768,7 @@ namespace embree
   }
   
   ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
+  /// Euclidean Space Operators
   ////////////////////////////////////////////////////////////////////////////////
 
   __forceinline float dot(const vfloat4& a, const vfloat4& b) {