25 files changed, 0 insertions, 8598 deletions
diff --git a/thirdparty/embree-aarch64/common/math/AVX2NEON.h b/thirdparty/embree-aarch64/common/math/AVX2NEON.h
deleted file mode 100644
index e8698ac56d..0000000000
--- a/thirdparty/embree-aarch64/common/math/AVX2NEON.h
+++ /dev/null
@@ -1,986 +0,0 @@
-#pragma once
-
-#include "SSE2NEON.h"
-
-
-#define AVX2NEON_ABI static inline  __attribute__((always_inline))
-
-
-struct __m256d;
-
-struct __m256 {
-    __m128 lo,hi;
-    __m256() {}
-};
-
-
-
-
-struct __m256i {
-    __m128i lo,hi;
-    explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
-    operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
-    __m256i() {}
-};
- 
-
-
-
-struct __m256d {
-    float64x2_t lo,hi;
-    __m256d() {}
-    __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
-    __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
-};
-
-#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}
-
-
-#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
-#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}
-
-#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}
-
-
-#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}
-
-
-
-#define _mm_stream_load_si128 _mm_load_si128
-#define _mm256_stream_load_si256 _mm256_load_si256
-
-
-AVX2NEON_ABI
-__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
-{
-    __m128 res;
-    for (int i=0;i<4;i++)
-    {
-        if (imm8 & (1<<i))
-        {
-            res[i] = b[i];
-        }
-        else{
-            res[i] = a[i];
-        }
-    }
-    
-    return res;
-}
-
-AVX2NEON_ABI
-__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
-{
-    __m128i res;
-    for (int i=0;i<4;i++)
-    {
-        if (imm8 & (1<<i))
-        {
-            res[i] = b[i];
-        }
-        else{
-            res[i] = a[i];
-        }
-    }
-    return res;
-}
-
-AVX2NEON_ABI
-__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
-{
-    return __m128(vmvnq_s32(__m128i(_mm_cmpgt_ps(a,b))));
-}
-
-
-AVX2NEON_ABI
-__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
-{
-    int64x2_t y;
-    y[0] = *(int64_t *)mem_addr;
-    y[1] = 0;
-    return __m128i(y);
-}
-
-AVX2NEON_ABI
-int _mm_movemask_popcnt(__m128 a)
-{
-    return __builtin_popcount(_mm_movemask_ps(a));
-}
-
-AVX2NEON_ABI
-__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
-{
-    __m128 res;
-    for (int i=0;i<4;i++) {
-        if (mask[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0;
-    }
-    return res;
-}
-
-AVX2NEON_ABI
-void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
-{
-    for (int i=0;i<4;i++) {
-        if (mask[i] & 0x80000000) mem_addr[i] = a[i];
-    }
-}
-
-AVX2NEON_ABI
-void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
-{
-    for (int i=0;i<4;i++) {
-        if (mask[i] & 0x80000000) mem_addr[i] = a[i];
-    }
-}
-
-AVX2NEON_ABI
-__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
-{
-    return vnegq_f32(vfmaq_f32(c,a,b));
-}
-
-#define _mm_fnmsub_ss _mm_fnmsub_ps
-
-AVX2NEON_ABI
-__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
-{
-    return vfmsq_f32(c,a,b);
-}
-
-#define _mm_fnmadd_ss _mm_fnmadd_ps
-
-
-AVX2NEON_ABI
-__m128 _mm_broadcast_ss (float const * mem_addr)
-{
-    return vdupq_n_f32(*mem_addr);
-}
-
-
-AVX2NEON_ABI
-__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
-{
-    return vfmaq_f32(vnegq_f32(c),a,b);
-}
-
-#define _mm_fmsub_ss _mm_fmsub_ps
-#define _mm_fmadd_ps _mm_madd_ps
-#define _mm_fmadd_ss _mm_madd_ps
-
-
-
-template<int code>
-AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
-{
-    float v;
-    v = 0;
-    v += (code & 0x10) ? a[0]*b[0] : 0;
-    v += (code & 0x20) ? a[1]*b[1] : 0;
-    v += (code & 0x40) ? a[2]*b[2] : 0;
-    v += (code & 0x80) ? a[3]*b[3] : 0;
-    float32x4_t res;
-    res[0] = (code & 0x1) ? v : 0;
-    res[1] = (code & 0x2) ? v : 0;
-    res[2] = (code & 0x4) ? v : 0;
-    res[3] = (code & 0x8) ? v : 0;
-    return res;
-}
-
-template<>
-inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
-{
-    float v;
-    float32x4_t m = _mm_mul_ps(a,b);
-    m[3] = 0;
-    v = vaddvq_f32(m);
-    return _mm_set1_ps(v);
-}
-
-template<>
-inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
-{
-    float v;
-    float32x4_t m = _mm_mul_ps(a,b);
-    v = vaddvq_f32(m);
-    return _mm_set1_ps(v);
-}
-
-#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))
-
-
-
-AVX2NEON_ABI
-__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
-{
-    return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b))));
-}
-
-
-AVX2NEON_ABI
-__m128 _mm_permutevar_ps (__m128 a, __m128i b)
-{
-    __m128 x;
-    for (int i=0;i<4;i++)
-    {
-        x[i] = a[b[i&3]];
-    }
-    return x;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_setzero_si256()
-{
-    __m256i res;
-    res.lo = res.hi = vdupq_n_s32(0);
-    return res;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_setzero_ps()
-{
-    __m256 res;
-    res.lo = res.hi = vdupq_n_f32(0.0f);
-    return res;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_undefined_si256()
-{
-    return _mm256_setzero_si256();
-}
-
-AVX2NEON_ABI
-__m256 _mm256_undefined_ps()
-{
-    return _mm256_setzero_ps();
-}
-
-CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t)
-CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i)
-CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128)
-CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128)
-CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t)
-CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i)
-
-
-
-
-AVX2NEON_ABI
-__m128 _mm256_castps256_ps128 (__m256 a)
-{
-    return a.lo;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_castsi128_si256 (__m128i a)
-{
-    __m256i res;
-    res.lo = a ;
-    res.hi = vdupq_n_s32(0);
-    return res;
-}
-
-AVX2NEON_ABI
-__m128i _mm256_castsi256_si128 (__m256i a)
-{
-    return a.lo;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_castps128_ps256 (__m128 a)
-{
-    __m256 res;
-    res.lo = a;
-    res.hi = vdupq_n_f32(0);
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m256 _mm256_broadcast_ss (float const * mem_addr)
-{
-    __m256 res;
-    res.lo = res.hi = vdupq_n_f32(*mem_addr);
-    return res;
-}
-
-
-
-AVX2NEON_ABI
-__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    __m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7};
-    __m256i res;
-    res.lo = lo; res.hi = hi;
-    return res;
-    
-}
-
-AVX2NEON_ABI
-__m256i _mm256_set1_epi32 (int a)
-{
-    __m256i res;
-    res.lo = res.hi = vdupq_n_s32(a);
-    return res;
-}
-
-
-
-
-AVX2NEON_ABI
-int _mm256_movemask_ps(const __m256& v)
-{
-    return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo);
-}
-
-template<int imm8>
-AVX2NEON_ABI
-__m256 __mm256_permute_ps (const __m256& a)
-{
-    __m256 res;
-    res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
-    res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
-    return res;
-
-}
-
-#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)
-
-
-template<int imm8>
-AVX2NEON_ABI
-__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
-{
-    __m256 res;
-    res.lo = _mm_shuffle_ps(a.lo,b.lo,imm8);
-    res.hi = _mm_shuffle_ps(a.hi,b.hi,imm8);
-    return res;
-
-}
-
-#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
-
-AVX2NEON_ABI
-__m256i _mm256_set1_epi64x (long long a)
-{
-    __m256i res;
-    int64x2_t t = vdupq_n_s64(a);
-    res.lo = res.hi = __m128i(t);
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
-{
-    __m256 res;
-    __m128 tmp;
-    switch (imm8 & 0x7)
-    {
-        case 0: tmp = a.lo; break;
-        case 1: tmp = a.hi; break;
-        case 2: tmp = b.lo; break;
-        case 3: tmp = b.hi; break;
-    }
-    if (imm8 & 0x8)
-        tmp = _mm_setzero_ps();
-
-    
-    
-    res.lo = tmp;
-    imm8 >>= 4;
-    
-    switch (imm8 & 0x7)
-    {
-        case 0: tmp = a.lo; break;
-        case 1: tmp = a.hi; break;
-        case 2: tmp = b.lo; break;
-        case 3: tmp = b.hi; break;
-    }
-    if (imm8 & 0x8)
-        tmp = _mm_setzero_ps();
-    
-    res.hi = tmp;
-    
-    return res;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_moveldup_ps (__m256 a)
-{
-    __m256 res;
-    res.lo[0] = res.lo[1] = a.lo[0];
-    res.lo[2] = res.lo[3] = a.lo[2];
-    res.hi[0] = res.hi[1] = a.hi[0];
-    res.hi[2] = res.hi[3] = a.hi[2];
-    return res;
-
-}
-
-AVX2NEON_ABI
-__m256 _mm256_movehdup_ps (__m256 a)
-{
-    __m256 res;
-    res.lo[0] = res.lo[1] = a.lo[1];
-    res.lo[2] = res.lo[3] = a.lo[3];
-    res.hi[0] = res.hi[1] = a.hi[1];
-    res.hi[2] = res.hi[3] = a.hi[3];
-    return res;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
-{
-    __m256 res = a;
-    if (imm8 & 1) res.hi = b;
-    else res.lo = b;
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
-{
-    if (imm8 & 1) return a.hi;
-    return a.lo;
-}
-
-
-AVX2NEON_ABI
-__m256d _mm256_movedup_pd (__m256d a)
-{
-    __m256d res;
-    res.hi = a.hi;
-    res.lo[0] = res.lo[1] = a.lo[0];
-    return res;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_abs_epi32(__m256i a)
-{
-   __m256i res;
-   res.lo = vabsq_s32(a.lo);
-   res.hi = vabsq_s32(a.hi);
-   return res;
-}
-
-UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
-UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
-UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
-UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
-UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
-
-
-BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
-BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
-BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
-
-BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
-BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
-BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
-BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)
-
-BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
-BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)
-
-BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
-BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
-BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
-BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)
-
-BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
-BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
-BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
-BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)
-
-BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t)
-BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
-BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)
-
-
-
-BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
-BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
-BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)
-
-
-BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)
-BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
-TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
-
-
-TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)
-TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
-TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
-TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)
-
-
-BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
-BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
-
-
-BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)
-BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)
-BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
-BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
-BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
-BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
-BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
-BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
-BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
-BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
-BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
-BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)
-
-
-AVX2NEON_ABI
-__m256i _mm256_cvtps_epi32 (__m256 a)
-{
-    __m256i res;
-    res.lo = _mm_cvtps_epi32(a.lo);
-    res.hi = _mm_cvtps_epi32(a.hi);
-    return res;
-    
-}
-
-AVX2NEON_ABI
-__m256i _mm256_cvttps_epi32 (__m256 a)
-{
-    __m256i res;
-    res.lo = _mm_cvttps_epi32(a.lo);
-    res.hi = _mm_cvttps_epi32(a.hi);
-    return res;
-    
-}
-
-AVX2NEON_ABI
-__m256 _mm256_loadu_ps (float const * mem_addr)
-{
-    __m256 res;
-    res.lo = *(__m128 *)(mem_addr + 0);
-    res.hi = *(__m128 *)(mem_addr + 4);
-    return res;
-}
-#define _mm256_load_ps _mm256_loadu_ps
-
-
-AVX2NEON_ABI
-int _mm256_testz_ps (const __m256& a, const __m256& b)
-{
-    __m256 t = a;
-    if (&a != &b)
-        t = _mm256_and_ps(a,b);
-
-    __m128i l  = vshrq_n_s32(__m128i(t.lo),31);
-    __m128i h  = vshrq_n_s32(__m128i(t.hi),31);
-    return vaddvq_s32(vaddq_s32(l,h)) == 0;
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
-{
-    __m256i res;
-    int64x2_t t0 = {e0,e1};
-    int64x2_t t1 = {e2,e3};
-    res.lo = __m128i(t0);
-    res.hi = __m128i(t1);
-    return res;
-}
-
-AVX2NEON_ABI
-__m256d _mm256_setzero_pd ()
-{
-    __m256d res;
-    res.lo = res.hi = vdupq_n_f64(0);
-    return res;
-}
-
-AVX2NEON_ABI
-int _mm256_movemask_pd (__m256d a)
-{
-    int res = 0;
-    uint64x2_t x;
-    x = uint64x2_t(a.lo);
-    res |= (x[0] >> 63) ? 1 : 0;
-    res |= (x[0] >> 63) ? 2 : 0;
-    x = uint64x2_t(a.hi);
-    res |= (x[0] >> 63) ? 4 : 0;
-    res |= (x[0] >> 63) ? 8 : 0;
-    return res;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
-{
-    __m256i res;
-    res.lo = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo)));
-    res.hi = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi)));
-    return res;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_cmpeq_pd (__m256d a, __m256d b)
-{
-    __m256i res;
-    res.lo = __m128i(vceqq_f64(a.lo,b.lo));
-    res.hi = __m128i(vceqq_f64(a.hi,b.hi));
-    return res;
-}
-
-
-AVX2NEON_ABI
-int _mm256_testz_pd (const __m256d& a, const __m256d& b)
-{
-    __m256d t = a;
-
-    if (&a != &b)
-        t = _mm256_and_pd(a,b);
-
-    return _mm256_movemask_pd(t) == 0;
-}
-
-AVX2NEON_ABI
-__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
-{
-    __m256d res;
-    uint64x2_t t = uint64x2_t(mask.lo);
-    res.lo[0] = (t[0] >> 63) ? b.lo[0] : a.lo[0];
-    res.lo[1] = (t[1] >> 63) ? b.lo[1] : a.lo[1];
-    t = uint64x2_t(mask.hi);
-    res.hi[0] = (t[0] >> 63) ? b.hi[0] : a.hi[0];
-    res.hi[1] = (t[1] >> 63) ? b.hi[1] : a.hi[1];
-    return res;
-}
-
-template<int imm8>
-__m256 __mm256_dp_ps (__m256 a, __m256 b)
-{
-    __m256 res;
-    res.lo = _mm_dp_ps(a.lo,b.lo,imm8);
-    res.hi = _mm_dp_ps(a.hi,b.hi,imm8);
-    return res;
-}
-
-#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)
-
-AVX2NEON_ABI
-double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
-{
-    switch (imm8 & 3) {
-        case 0:
-            return a.lo[0];
-        case 1:
-            return a.lo[1];
-        case 2:
-            return a.hi[0];
-        case 3:
-            return a.hi[1];
-    }
-    __builtin_unreachable();
-    return 0;
-}
-
-AVX2NEON_ABI
-__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
-{
-    __m256d res;
-    res.lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0);
-    res.lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2);
-    res.hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4);
-    res.hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6);
-    
-    return res;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
-{
-    return __m256i(_mm256_insertf128_ps((__m256)a,(__m128)b,imm8));
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
-{
-    __m256i res;
-    res.lo = *(__m128i *)((int32_t *)mem_addr + 0);
-    res.hi = *(__m128i *)((int32_t *)mem_addr + 4);
-    return res;
-}
-
-#define _mm256_load_si256 _mm256_loadu_si256
-
-AVX2NEON_ABI
-void _mm256_storeu_ps (float * mem_addr, __m256 a)
-{
-    *(__m128 *)(mem_addr + 0) = a.lo;
-    *(__m128 *)(mem_addr + 4) = a.hi;
-
-}
-
-#define _mm256_store_ps _mm256_storeu_ps
-#define _mm256_stream_ps _mm256_storeu_ps
-
-
-AVX2NEON_ABI
-void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
-{
-    *(__m128i *)((int *)mem_addr + 0) = a.lo;
-    *(__m128i *)((int *)mem_addr + 4) = a.hi;
-
-}
-
-#define _mm256_store_si256 _mm256_storeu_si256
-
-
-
-AVX2NEON_ABI
-__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
-{
-    __m256 res;
-    res.lo = _mm_maskload_ps(mem_addr,mask.lo);
-    res.hi = _mm_maskload_ps(mem_addr + 4,mask.hi);
-    return res;
-
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_cvtepu8_epi32 (__m128i a)
-{
-    __m256i res;
-    uint8x16_t x = uint8x16_t(a);
-    for (int i=0;i<4;i++)
-    {
-        res.lo[i] = x[i];
-        res.hi[i] = x[i+4];
-    }
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_cvtepi8_epi32 (__m128i a)
-{
-    __m256i res;
-    int8x16_t x = int8x16_t(a);
-    for (int i=0;i<4;i++)
-    {
-        res.lo[i] = x[i];
-        res.hi[i] = x[i+4];
-    }
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_cvtepu16_epi32 (__m128i a)
-{
-    __m256i res;
-    uint16x8_t x = uint16x8_t(a);
-    for (int i=0;i<4;i++)
-    {
-        res.lo[i] = x[i];
-        res.hi[i] = x[i+4];
-    }
-    return res;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_cvtepi16_epi32 (__m128i a)
-{
-    __m256i res;
-    int16x8_t x = int16x8_t(a);
-    for (int i=0;i<4;i++)
-    {
-        res.lo[i] = x[i];
-        res.hi[i] = x[i+4];
-    }
-    return res;
-}
-
-
-
-AVX2NEON_ABI
-void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
-{
-    _mm_maskstore_epi32(mem_addr,mask.lo,a.lo);
-    _mm_maskstore_epi32(mem_addr + 4,mask.hi,a.hi);
-}
-
-AVX2NEON_ABI
-__m256i _mm256_slli_epi32 (__m256i a, int imm8)
-{
-    __m256i res;
-    res.lo = _mm_slli_epi32(a.lo,imm8);
-    res.hi = _mm_slli_epi32(a.hi,imm8);
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_srli_epi32 (__m256i a, int imm8)
-{
-    __m256i res;
-    res.lo = _mm_srli_epi32(a.lo,imm8);
-    res.hi = _mm_srli_epi32(a.hi,imm8);
-    return res;
-}
-
-AVX2NEON_ABI
-__m256i _mm256_srai_epi32 (__m256i a, int imm8)
-{
-    __m256i res;
-    res.lo = _mm_srai_epi32(a.lo,imm8);
-    res.hi = _mm_srai_epi32(a.hi,imm8);
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
-{
-    __m256i res;
-    res.lo = vshlq_s32(a.lo,count.lo);
-    res.hi = vshlq_s32(a.hi,count.hi);
-    return res;
-
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
-{
-    __m256i res;
-    res.lo = vshlq_s32(a.lo,vnegq_s32(count.lo));
-    res.hi = vshlq_s32(a.hi,vnegq_s32(count.hi));
-    return res;
-
-}
-
-AVX2NEON_ABI
-__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
-{
-    __m256i res;
-    res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),vnegq_s32(count.lo)));
-    res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),vnegq_s32(count.hi)));
-    return res;
-
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
-{
-    return __m256i(_mm256_permute2f128_ps(__m256(a),__m256(b),imm8));
-}
-
-
-AVX2NEON_ABI
-__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
-{
-    if (imm8 & 1) return a.hi;
-    return a.lo;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_set1_ps(float x)
-{
-    __m256 res;
-    res.lo = res.hi = vdupq_n_f32(x);
-    return res;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    __m256 res;
-    res.lo = _mm_set_ps(e3,e2,e1,e0);
-    res.hi = _mm_set_ps(e7,e6,e5,e4);
-    return res;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
-{
-    __m256 res;
-    res.lo = res.hi = *mem_addr;
-    return res;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_cvtepi32_ps (__m256i a)
-{
-    __m256 res;
-    res.lo = _mm_cvtepi32_ps(a.lo);
-    res.hi = _mm_cvtepi32_ps(a.hi);
-    return res;
-}
-AVX2NEON_ABI
-void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
-{
-    for (int i=0;i<4;i++) {
-        if (mask.lo[i] & 0x80000000) mem_addr[i] = a.lo[i];
-        if (mask.hi[i] & 0x80000000) mem_addr[i+4] = a.hi[i];
-    }
-}
-
-AVX2NEON_ABI
-__m256d _mm256_andnot_pd (__m256d a, __m256d b)
-{
-    __m256d res;
-    res.lo = float64x2_t(_mm_andnot_ps(__m128(a.lo),__m128(b.lo)));
-    res.hi = float64x2_t(_mm_andnot_ps(__m128(a.hi),__m128(b.hi)));
-    return res;
-}
-
-AVX2NEON_ABI
-__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
-{
-    __m256 res;
-    res.lo = _mm_blend_ps(a.lo,b.lo,imm8 & 0xf);
-    res.hi = _mm_blend_ps(a.hi,b.hi,imm8 >> 4);
-    return res;
-
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
-{
-    __m256i res;
-    res.lo = _mm_blend_epi32(a.lo,b.lo,imm8 & 0xf);
-    res.hi = _mm_blend_epi32(a.hi,b.hi,imm8 >> 4);
-    return res;
-
-}
-
-AVX2NEON_ABI
-__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
-{
-    __m256i res;
-    for (int i=0;i<4;i++)
-    {
-        res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
-        res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
-    }
-    return res;
-}
-
-
-AVX2NEON_ABI
-__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
-{
-    __m256i res = _mm256_setzero_si256();
-    for (int i=0;i<4;i++)
-    {
-        if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
-        if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
-    }
-    
-    return res;
-
-}
-
-
diff --git a/thirdparty/embree-aarch64/common/math/SSE2NEON.h b/thirdparty/embree-aarch64/common/math/SSE2NEON.h
deleted file mode 100644
index 2013151d31..0000000000
--- a/thirdparty/embree-aarch64/common/math/SSE2NEON.h
+++ /dev/null
@@ -1,1753 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics to their corresponding ARM NEON versions
-//
-// This header file does not (yet) translate *all* of the SSE intrinsics.
-// Since this is in support of a specific porting effort, I have only
-// included the intrinsics I needed to get my port to work.
-//
-// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com
-//
-// If you want to improve or add to this project, send me an
-// email and I will probably approve your access to the depot.
-//
-// Project is located here:
-//
-//	https://github.com/jratcliff63367/sse2neon
-//
-// Show your appreciation for open source by sending me a bitcoin tip to the following
-// address.
-//
-// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p :
-// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p
-//
-//
-// Contributors to this project are:
-//
-// John W. Ratcliff : jratcliffscarab@gmail.com
-// Brandon Rowlett  : browlett@nvidia.com
-// Ken Fast         : kfast@gdeb.com
-// Eric van Beurden : evanbeurden@nvidia.com
-//
-//
-// *********************************************************************************************************************
-// Release notes for January 20, 2017 version:
-//
-// The unit tests have been refactored.  They no longer assert on an error, instead they return a pass/fail condition
-// The unit-tests now test 10,000 random float and int values against each intrinsic.
-//
-// SSE2NEON now supports 95 SSE intrinsics.  39 of them have formal unit tests which have been implemented and
-// fully tested on NEON/ARM.  The remaining 56 still need unit tests implemented.
-//
-// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which
-// attempt to access the contents of an _m128 struct directly.  It is important to note that accessing the __m128
-// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
-//
-// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer
-// can use the SIMDVec as an alias for it.  Any casting must be done manually by the developer, as you cannot
-// cast or otherwise alias the base NEON data type for intrinsic operations.
-//
-// A bug was found with the _mm_shuffle_ps intrinsic.  If the shuffle permutation was not one of the ones with
-// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing
-// to return the correct value.  This is now fixed.
-//
-// A bug was found with the _mm_cvtps_epi32 intrinsic.  This converts floating point values to integers.
-// It was not honoring the correct rounding mode.  In SSE the default rounding mode when converting from float to int
-// is to use 'round to even' otherwise known as 'bankers rounding'.  ARMv7 did not support this feature but ARMv8 does.
-// As it stands today, this header file assumes ARMv8.  If you are trying to target really old ARM devices, you may get
-// a build error.
-//
-// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are
-// producing the correct results on NEON.  These unit tests will be added as soon as possible.
-//
-// Here is the list of new instrinsics which have been added:
-//
-// _mm_cvtss_f32     :  extracts the lower order floating point value from the parameter
-// _mm_add_ss        : adds the scalar single - precision floating point values of a and b
-// _mm_div_ps        : Divides the four single - precision, floating - point values of a and b.
-// _mm_div_ss        : Divides the scalar single - precision floating point value of a by b.
-// _mm_sqrt_ss       : Computes the approximation of the square root of the scalar single - precision floating point value of in.
-// _mm_rsqrt_ps      : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in.
-// _mm_comilt_ss     : Compares the lower single - precision floating point scalar values of a and b using a less than operation
-// _mm_comigt_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than operation.
-// _mm_comile_ss     :  Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation.
-// _mm_comige_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation.
-// _mm_comieq_ss     :  Compares the lower single - precision floating point scalar values of a and b using an equality operation.
-// _mm_comineq_s     :  Compares the lower single - precision floating point scalar values of a and b using an inequality operation
-// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b.
-// _mm_unpackhi_epi16:  Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b.
-//
-// *********************************************************************************************************************
-/*
-** The MIT license:
-**
-** Permission is hereby granted, free of charge, to any person obtaining a copy
-** of this software and associated documentation files (the "Software"), to deal
-** in the Software without restriction, including without limitation the rights
-** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-** copies of the Software, and to permit persons to whom the Software is furnished
-** to do so, subject to the following conditions:
-**
-** The above copyright notice and this permission notice shall be included in all
-** copies or substantial portions of the Software.
-
-** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#pragma once
-
-#define GCC 1
-#define ENABLE_CPP_VERSION 0
-
-// enable precise emulation of _mm_min_ps and _mm_max_ps?
-// This would slow down the computation a bit, but gives consistent result with x86 SSE2.
-// (e.g. would solve a hole or NaN pixel in the rendering result)
-#define USE_PRECISE_MINMAX_IMPLEMENTATION (1)
-
-#if GCC
-#define FORCE_INLINE					inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x)					__attribute__((aligned(x)))
-#else
-#define FORCE_INLINE					inline
-#define ALIGN_STRUCT(x)					__declspec(align(x))
-#endif
-
-#include <stdint.h>
-#include "arm_neon.h"
-#if defined(__aarch64__)
-#include "constants.h"
-#endif
-
-
-#if !defined(__has_builtin)
-#define __has_builtin(x) (0)
-#endif
-
-/*******************************************************/
-/* MACRO for shuffle parameter for _mm_shuffle_ps().   */
-/* Argument fp3 is a digit[0123] that represents the fp*/
-/* from argument "b" of mm_shuffle_ps that will be     */
-/* placed in fp3 of result. fp2 is the same for fp2 in */
-/* result. fp1 is a digit[0123] that represents the fp */
-/* from argument "a" of mm_shuffle_ps that will be     */
-/* places in fp1 of result. fp0 is the same for fp0 of */
-/* result                                              */
-/*******************************************************/
-#if defined(__aarch64__)
-#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3),  (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3),  (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } )
-#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3),  (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3),  (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } )
-#endif
-
-#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \
-  ((fp1) << 2) | ((fp0)))
-
-typedef float32x4_t __m128;
-typedef int32x4_t __m128i;
-
-// union intended to allow direct access to an __m128 variable using the names that the MSVC
-// compiler provides.  This union should really only be used when trying to access the members
-// of the vector as integer values.  GCC/clang allow native access to the float members through
-// a simple array access operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance
-// hit.  If it really is needed however, the original __m128 variable can be aliased with a
-// pointer to this union and used to access individual components.  The use of this union should
-// be hidden behind a macro that is used throughout the codebase to access the members instead
-// of always declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec
-{
-  float       m128_f32[4];    // as floats - do not to use this.  Added for convenience.
-  int8_t      m128_i8[16];    // as signed 8-bit integers.
-  int16_t     m128_i16[8];    // as signed 16-bit integers.
-  int32_t     m128_i32[4];    // as signed 32-bit integers.
-  int64_t     m128_i64[2];    // as signed 64-bit integers.
-  uint8_t     m128_u8[16];    // as unsigned 8-bit integers.
-  uint16_t    m128_u16[8];    // as unsigned 16-bit integers.
-  uint32_t    m128_u32[4];    // as unsigned 32-bit integers.
-  uint64_t    m128_u64[2];    // as unsigned 64-bit integers.
-  double	    m128_f64[2];    // as signed double
-} SIMDVec;
-
-// ******************************************
-// CPU stuff
-// ******************************************
-
-typedef SIMDVec __m128d;
-
-#include <stdlib.h>
-
-#ifndef _MM_MASK_MASK
-#define _MM_MASK_MASK 0x1f80
-#define _MM_MASK_DIV_ZERO 0x200
-#define _MM_FLUSH_ZERO_ON 0x8000
-#define _MM_DENORMALS_ZERO_ON 0x40
-#define _MM_MASK_DENORM 0x100
-#endif
-#define _MM_SET_EXCEPTION_MASK(x)
-#define _MM_SET_FLUSH_ZERO_MODE(x)
-#define _MM_SET_DENORMALS_ZERO_MODE(x)
-
-FORCE_INLINE void _mm_pause()
-{
-}
-
-FORCE_INLINE void _mm_mfence()
-{
-    __sync_synchronize();
-}
-
-#define _MM_HINT_T0 3
-#define _MM_HINT_T1 2
-#define _MM_HINT_T2 1
-#define _MM_HINT_NTA 0
-
-FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level)
-{
-   __builtin_prefetch(ptr);
- 
-}
-
-FORCE_INLINE void* _mm_malloc(int size, int align)
-{
-    void *ptr;
-    // align must be multiple of sizeof(void *) for posix_memalign.
-    if (align < sizeof(void *)) {
-        align = sizeof(void *);
-    }
-
-    if ((align % sizeof(void *)) != 0) {
-        // fallback to malloc
-        ptr = malloc(size);
-    } else {
-        if (posix_memalign(&ptr, align, size)) {
-          return 0;
-        }
-    }
-
-    return ptr;
-}
-
-FORCE_INLINE void _mm_free(void* ptr)
-{
-        free(ptr);
-}
-
-FORCE_INLINE int _mm_getcsr()
-{
-        return 0;
-}
-
-FORCE_INLINE void _mm_setcsr(int val)
-{
-        return;
-}
-
-// ******************************************
-// Set/get methods
-// ******************************************
-
-// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
-#if defined(__aarch64__)
-FORCE_INLINE float _mm_cvtss_f32(const __m128& x)
-{
-    return x[0];
-}
-#else
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-    return vgetq_lane_f32(a, 0);
-}
-#endif
-
-// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128()
-{
-  return vdupq_n_s32(0);
-}
-
-// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-  return vdupq_n_f32(0);
-}
-
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-  return vdupq_n_f32(_w);
-}
-
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-  return vdupq_n_f32(_w);
-}
-
-// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-#if defined(__aarch64__) 
-FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x)
-{
-    float32x4_t t = { x, y, z, w };
-    return t;
-}
-
-// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x )
-{
-    float32x4_t t = { w, z, y, x };
-    return t;
-}
-#else
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-    float __attribute__((aligned(16))) data[4] = { x, y, z, w };
-    return vld1q_f32(data);
-}
-
-// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x )
-{
-    float __attribute__ ((aligned (16))) data[4] = { w, z, y, x };
-    return vld1q_f32(data);
-}
-#endif
-
-// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-  return vdupq_n_s32(_i);
-}
-
-//Set the first lane to of 4 signed single-position, floating-point number to w
-#if defined(__aarch64__)
-FORCE_INLINE __m128 _mm_set_ss(float _w)
-{
-    float32x4_t res = {_w, 0, 0, 0};
-    return res;
-}
-
-// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32x4_t t = {i0,i1,i2,i3};
-    return t;
-}
-#else
-FORCE_INLINE __m128 _mm_set_ss(float _w)
-{
-    __m128 val = _mm_setzero_ps();
-    return vsetq_lane_f32(_w, val, 0);
-}
-
-// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 };
-    return vld1q_s32(data);
-}
-#endif
-
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-  vst1q_f32(p, a);
-}
-
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-  vst1q_f32(p, a);
-}
-
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-  vst1q_s32((int32_t*) p,a);
-}
-
-// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a )
-{
-  vst1q_s32((int32_t*) p,a);
-}
-
-// Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-  vst1q_lane_f32(p, a, 0);
-}
-
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.  https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
-{
-  *a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0);
-}
-
-// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float * p)
-{
-  return vld1q_dup_f32(p);
-}
-
-// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float * p)
-{
-  return vld1q_f32(p);
-}
-
-// Loads four single-precision, floating-point values.  https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float * p)
-{
-  // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon
-  return vld1q_f32(p);
-}
-
-// Loads an single - precision, floating - point value into the low word and clears the upper three words.  https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float * p)
-{
-  __m128 result = vdupq_n_f32(0);
-  return vsetq_lane_f32(*p, result, 0);
-}
-
-FORCE_INLINE __m128i _mm_loadu_si128(__m128i *p)
-{
-  return (__m128i)vld1q_s32((const int32_t*) p);
-}
-
-
-// ******************************************
-// Logic/Binary operations
-// ******************************************
-
-// Compares for inequality.  https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-  return (__m128)vmvnq_s32((__m128i)vceqq_f32(a, b));
-}
-
-// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-  return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-  return (__m128i)vbicq_s32(b, a); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-  return (__m128i)vandq_s32(a, b);
-}
-
-// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-  return (__m128)vandq_s32((__m128i)a, (__m128i)b);
-}
-
-// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-  return (__m128)vorrq_s32((__m128i)a, (__m128i)b);
-}
-
-// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-  return (__m128)veorq_s32((__m128i)a, (__m128i)b);
-}
-
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-  return (__m128i)vorrq_s32(a, b);
-}
-
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-  return veorq_s32(a, b);
-}
-
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this
-  uint32x4_t &ia = *(uint32x4_t *)&a;
-  return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8);
-#else
-    
-#if defined(__aarch64__)
-    uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask);
-    return vaddvq_u32(t2);
-#else
-  static const uint32x4_t movemask = { 1, 2, 4, 8 };
-  static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
-  uint32x4_t t0 = vreinterpretq_u32_f32(a);
-  uint32x4_t t1 = vtstq_u32(t0, highbit);
-  uint32x4_t t2 = vandq_u32(t1, movemask);
-  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
-  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
-#endif
-    
-#endif
-}
-
-#if defined(__aarch64__)
-FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a)
-{
-    uint32x4_t t2 = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask);
-    t2 = vreinterpretq_u32_u8(vcntq_u8(vreinterpretq_u8_u32(t2)));
-    return vaddvq_u32(t2);
-    
-}
-#endif
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-  return vcombine_f32(vget_high_f32(a), vget_low_f32(b));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high end of result
-// takes the higher two 32 bit values from b and swaps them and places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-  return vcombine_f32(vrev64_f32(vget_low_f32(a)), vrev64_f32(vget_high_f32(b)));
-}
-
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-  return vcombine_f32(vget_low_f32(a), vget_high_f32(b));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-  return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 1)), vdup_n_f32(vgetq_lane_f32(b, 0)));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-  return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 2)), vdup_n_f32(vgetq_lane_f32(b, 0)));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-  return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 0)), vdup_n_f32(vgetq_lane_f32(b, 2)));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-  float32_t a0 = vgetq_lane_f32(a, 0);
-  float32_t a2 = vgetq_lane_f32(a, 2);
-  float32x2_t aVal = vdup_n_f32(a2);
-  aVal = vset_lane_f32(a0, aVal, 1);
-  return vcombine_f32(aVal, vget_high_f32(b));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-  return vcombine_f32(vdup_n_f32(vgetq_lane_f32(a, 3)), vdup_n_f32(vgetq_lane_f32(b, 1)));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-  float32_t b0 = vgetq_lane_f32(b, 0);
-  float32_t b2 = vgetq_lane_f32(b, 2);
-  float32x2_t bVal = vdup_n_f32(b0);
-  bVal = vset_lane_f32(b2, bVal, 1);
-  return vcombine_f32(vget_low_f32(a), bVal);
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-  float32_t b0 = vgetq_lane_f32(b, 0);
-  float32_t b2 = vgetq_lane_f32(b, 2);
-  float32x2_t bVal = vdup_n_f32(b0);
-  bVal = vset_lane_f32(b2, bVal, 1);
-  return vcombine_f32(vrev64_f32(vget_low_f32(a)), bVal);
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-  float32_t b0 = vgetq_lane_f32(b, 0);
-  float32_t b2 = vgetq_lane_f32(b, 2);
-  float32x2_t bVal = vdup_n_f32(b0);
-  bVal = vset_lane_f32(b2, bVal, 1);
-  return vcombine_f32(vget_high_f32(a), bVal);
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-  float32x2_t a21 = vget_high_f32(vextq_f32(a, a, 3));
-  float32x2_t b03 = vget_low_f32(vextq_f32(b, b, 3));
-  return vcombine_f32(a21, b03);
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-  float32x2_t a03 = vget_low_f32(vextq_f32(a, a, 3));
-  float32x2_t b21 = vget_high_f32(vextq_f32(b, b, 3));
-  return vcombine_f32(a03, b21);
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-  float32x2_t a10 = vget_low_f32(a);
-  float32x2_t b10 = vget_low_f32(b);
-  return vcombine_f32(a10, b10);
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-  float32x2_t a01 = vrev64_f32(vget_low_f32(a));
-  float32x2_t b10 = vget_low_f32(b);
-  return vcombine_f32(a01, b10);
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-  float32x2_t a01 = vrev64_f32(vget_low_f32(a));
-  float32x2_t b01 = vrev64_f32(vget_low_f32(b));
-  return vcombine_f32(a01, b01);
-}
-
-// NEON does not support a general purpose permute intrinsic
-// Currently I am not sure whether the C implementation is faster or slower than the NEON version.
-// Note, this has to be expanded as a template because the shuffle value must be an immediate value.
-// The same is true on SSE as well.
-// Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-template <int i>
-FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b)
-{
-#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet.
-  __m128 ret;
-  ret[0] = a[i & 0x3];
-  ret[1] = a[(i >> 2) & 0x3];
-  ret[2] = b[(i >> 4) & 0x03];
-  ret[3] = b[(i >> 6) & 0x03];
-  return ret;
-#else
-# if __has_builtin(__builtin_shufflevector)
-    return __builtin_shufflevector(             \
-        a, b, (i) & (0x3), ((i) >> 2) & 0x3,
-        (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4);
-# else
-    const int i0 = (i >> 0)&0x3;
-    const int i1 = (i >> 2)&0x3;
-    const int i2 = (i >> 4)&0x3;
-    const int i3 = (i >> 6)&0x3;
-
-    if (&a == &b)
-     {
-         if (i0 == i1 && i0 == i2 && i0 == i3)
-         {
-             return (float32x4_t)vdupq_laneq_f32(a,i0);
-         }
-         static const uint8_t tbl[16] = {
-             (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3,
-             (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3,
-             (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3,
-             (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3
-         };
-         
-         return (float32x4_t)vqtbl1q_s8(int8x16_t(b),*(uint8x16_t *)tbl);
-         
-     }
-     else
-     {
-         
-         static const uint8_t tbl[16] = {
-             (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3,
-             (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3,
-             (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16,
-             (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16
-         };
-         
-         return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},*(uint8x16_t *)tbl));
-     }
-# endif //builtin(shufflevector)
-#endif
-}
-
-template <int i >
-FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b)
-{
-  switch (i)
-  {
-    case _MM_SHUFFLE(1, 0, 3, 2):
-      return _mm_shuffle_ps_1032(a, b);
-      break;
-    case _MM_SHUFFLE(2, 3, 0, 1):
-      return _mm_shuffle_ps_2301(a, b);
-      break;
-    case _MM_SHUFFLE(3, 2, 1, 0):
-      return _mm_shuffle_ps_3210(a, b);
-      break;
-    case _MM_SHUFFLE(0, 0, 1, 1):
-      return _mm_shuffle_ps_0011(a, b);
-      break;
-    case _MM_SHUFFLE(0, 0, 2, 2):
-      return _mm_shuffle_ps_0022(a, b);
-      break;
-    case _MM_SHUFFLE(2, 2, 0, 0):
-      return _mm_shuffle_ps_2200(a, b);
-      break;
-    case _MM_SHUFFLE(3, 2, 0, 2):
-      return _mm_shuffle_ps_3202(a, b);
-      break;
-    case _MM_SHUFFLE(1, 1, 3, 3):
-      return _mm_shuffle_ps_1133(a, b);
-      break;
-    case _MM_SHUFFLE(2, 0, 1, 0):
-      return _mm_shuffle_ps_2010(a, b);
-      break;
-    case _MM_SHUFFLE(2, 0, 0, 1):
-      return _mm_shuffle_ps_2001(a, b);
-      break;
-    case _MM_SHUFFLE(2, 0, 3, 2):
-      return _mm_shuffle_ps_2032(a, b);
-      break;
-    case _MM_SHUFFLE(0, 3, 2, 1):
-      return _mm_shuffle_ps_0321(a, b);
-      break;
-    case _MM_SHUFFLE(2, 1, 0, 3):
-      return _mm_shuffle_ps_2103(a, b);
-      break;
-    case _MM_SHUFFLE(1, 0, 1, 0):
-      return _mm_shuffle_ps_1010(a, b);
-      break;
-    case _MM_SHUFFLE(1, 0, 0, 1):
-      return _mm_shuffle_ps_1001(a, b);
-      break;
-    case _MM_SHUFFLE(0, 1, 0, 1):
-      return _mm_shuffle_ps_0101(a, b);
-      break;
-  }
-  return _mm_shuffle_ps_default<i>(a, b);
-}
-
-# if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default<i>(a,b)
-# else
-#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function<i>(a,b)
-#endif
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b)
-{
-  return vcombine_s32(vget_high_s32(a), vget_low_s32(b));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in low end of result
-// takes the higher two 32 bit values from b and swaps them and places in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b)
-{
-  return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_high_s32(b)));
-}
-
-// shift a right by 32 bits, and put the lower 32 bits of a into the upper 32 bits of b
-// when a and b are the same, rotates the least significant 32 bits into the most signficant 32 bits, and shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b)
-{
-  return vextq_s32(a, b, 1);
-}
-
-// shift a left by 32 bits, and put the upper 32 bits of b into the lower 32 bits of a
-// when a and b are the same, rotates the most significant 32 bits into the least signficant 32 bits, and shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b)
-{
-  return vextq_s32(a, b, 3);
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of b and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b)
-{
-  return vcombine_s32(vget_low_s32(a), vget_low_s32(a));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b)
-{
-  return vcombine_s32(vrev64_s32(vget_low_s32(a)), vget_low_s32(b));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits
-// gets the lower 64 bits of b, swaps the 0 and 1 elements, and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b)
-{
-  return vcombine_s32(vrev64_s32(vget_low_s32(a)), vrev64_s32(vget_low_s32(b)));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b)
-{
-  return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 1)), vdup_n_s32(vgetq_lane_s32(b, 2)));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b)
-{
-  return vcombine_s32(vdup_n_s32(vgetq_lane_s32(a, 2)), vrev64_s32(vget_low_s32(b)));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b)
-{
-  return vcombine_s32(vget_high_s32(a), vdup_n_s32(vgetq_lane_s32(b, 3)));
-}
-
-template <int i >
-FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b)
-{
-#if ENABLE_CPP_VERSION
-  __m128i ret;
-  ret[0] = a[i & 0x3];
-  ret[1] = a[(i >> 2) & 0x3];
-  ret[2] = b[(i >> 4) & 0x03];
-  ret[3] = b[(i >> 6) & 0x03];
-  return ret;
-#else
-  __m128i ret = vmovq_n_s32(vgetq_lane_s32(a, i & 0x3));
-  ret = vsetq_lane_s32(vgetq_lane_s32(a, (i >> 2) & 0x3), ret, 1);
-  ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 4) & 0x3), ret, 2);
-  ret = vsetq_lane_s32(vgetq_lane_s32(b, (i >> 6) & 0x3), ret, 3);
-  return ret;
-#endif
-}
-
-template <int i >
-FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b)
-{
-  switch (i)
-  {
-    case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_epi_1032(a, b); break;
-    case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_epi_2301(a, b); break;
-    case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_epi_0321(a, b); break;
-    case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_epi_2103(a, b); break;
-    case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_epi_1010(a, b); break;
-    case _MM_SHUFFLE(1, 0, 0, 1): return _mm_shuffle_epi_1001(a, b); break;
-    case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_epi_0101(a, b); break;
-    case _MM_SHUFFLE(2, 2, 1, 1): return _mm_shuffle_epi_2211(a, b); break;
-    case _MM_SHUFFLE(0, 1, 2, 2): return _mm_shuffle_epi_0122(a, b); break;
-    case _MM_SHUFFLE(3, 3, 3, 2): return _mm_shuffle_epi_3332(a, b); break;
-    default: return _mm_shuffle_epi32_default<i>(a, b);
-  }
-}
-
-template <int i >
-FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a)
-{
-  return vdupq_n_s32(vgetq_lane_s32(a, i));
-}
-
-template <int i>
-FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a)
-{
-  switch (i)
-  {
-    case _MM_SHUFFLE(0, 0, 0, 0): return _mm_shuffle_epi32_splat<0>(a); break;
-    case _MM_SHUFFLE(1, 1, 1, 1): return _mm_shuffle_epi32_splat<1>(a); break;
-    case _MM_SHUFFLE(2, 2, 2, 2): return _mm_shuffle_epi32_splat<2>(a); break;
-    case _MM_SHUFFLE(3, 3, 3, 3): return _mm_shuffle_epi32_splat<3>(a); break;
-    default: return _mm_shuffle_epi32_function<i>(a, a);
-  }
-}
-
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.	https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single<i>(a)
-
-template <int i>
-FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a)
-{
-  int16x8_t ret = (int16x8_t)a;
-  int16x4_t highBits = vget_high_s16(ret);
-  ret = vsetq_lane_s16(vget_lane_s16(highBits, i & 0x3), ret, 4);
-  ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 2) & 0x3), ret, 5);
-  ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 4) & 0x3), ret, 6);
-  ret = vsetq_lane_s16(vget_lane_s16(highBits, (i >> 6) & 0x3), ret, 7);
-  return (__m128i)ret;
-}
-
-// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm.  https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function<i>(a)
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm)
-
-// Based on SIMDe
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8)
-{
-#if defined(__aarch64__)
-    const int32x4_t s = vdupq_n_s32(imm8);
-    return vshlq_s32(a, s);
-#else
-  int32_t __attribute__((aligned(16))) data[4];
-  vst1q_s32(data, a);
-  const int s = (imm8 > 31) ? 0 : imm8;
-  data[0] = data[0] << s;
-  data[1] = data[1] << s;
-  data[2] = data[2] << s;
-  data[3] = data[3] << s;
-
-  return vld1q_s32(data);
-#endif
-}
-
-
-//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros.  https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
-//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm)
-
-// Based on SIMDe
-FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8)
-{
-#if defined(__aarch64__)
-    const int shift = (imm8 > 31) ? 0 : imm8;  // Unfortunately, we need to check for this case for embree.
-    const int32x4_t s = vdupq_n_s32(-shift);
-    return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s));
-#else
-  int32_t __attribute__((aligned(16))) data[4];
-  vst1q_s32(data, a);
-
-  const int s = (imm8 > 31) ? 0 : imm8;
-
-  data[0] = data[0] >> s;
-  data[1] = data[1] >> s;
-  data[2] = data[2] >> s;
-  data[3] = data[3] >> s;
-
-  return vld1q_s32(data);
-#endif
-}
-
-
-// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit.  https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
-//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm)
-
-// Based on SIMDe
-FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8)
-{
-#if defined(__aarch64__)
-    const int32x4_t s = vdupq_n_s32(-imm8);
-    return vshlq_s32(a, s);
-#else
-  int32_t __attribute__((aligned(16))) data[4];
-  vst1q_s32(data, a);
-  const uint32_t m = (uint32_t) ((~0U) << (32 - imm8));
-
-  for (int i = 0; i < 4; i++) {
-    uint32_t is_neg = ((uint32_t) (((data[i]) >> 31)));
-    data[i] = (data[i] >> imm8) | (m * is_neg);
-  }
-
-  return vld1q_s32(data);
-#endif
-}
-
-// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm))
-#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm))
-
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate.  https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm))
-
-// NEON does not provide a version of this function, here is an article about some ways to repro the results.
-// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
-// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i _a)
-{
-  uint8x16_t input = (uint8x16_t)_a;
-  const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
-  uint8x8_t mask_and = vdup_n_u8(0x80);
-  int8x8_t mask_shift = vld1_s8(xr);
-
-  uint8x8_t lo = vget_low_u8(input);
-  uint8x8_t hi = vget_high_u8(input);
-
-  lo = vand_u8(lo, mask_and);
-  lo = vshl_u8(lo, mask_shift);
-
-  hi = vand_u8(hi, mask_and);
-  hi = vshl_u8(hi, mask_shift);
-
-  lo = vpadd_u8(lo, lo);
-  lo = vpadd_u8(lo, lo);
-  lo = vpadd_u8(lo, lo);
-
-  hi = vpadd_u8(hi, hi);
-  hi = vpadd_u8(hi, hi);
-  hi = vpadd_u8(hi, hi);
-
-  return ((hi[0] << 8) | (lo[0] & 0xFF));
-}
-
-
-// ******************************************
-// Math operations
-// ******************************************
-
-// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-  return vsubq_f32(a, b);
-}
-
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
-{
-  return vsubq_f32(a, b);
-}
-
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-  return vsubq_s32(a, b);
-}
-
-// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-  return vaddq_f32(a, b);
-}
-
-// adds the scalar single-precision floating point values of a and b.  https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-  const float32_t     b0 = vgetq_lane_f32(b, 0);
-  float32x4_t         value = vdupq_n_f32(0);
-
-  //the upper values in the result must be the remnants of <a>.
-  value = vsetq_lane_f32(b0, value, 0);
-  return vaddq_f32(a, value);
-}
-
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-  return vaddq_s32(a, b);
-}
-
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-  return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b);
-}
-
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-  return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b);
-}
-
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b)
-{
-  return (__m128i)vmulq_s32((int32x4_t)a,(int32x4_t)b);
-}
-
-// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-  return vmulq_f32(a, b);
-}
-
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
-{
-  return vmulq_f32(a, b);
-}
-
-// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-#if defined(BUILD_IOS)
-  return vdivq_f32(vdupq_n_f32(1.0f),in);
-    
-#endif
-    // Get an initial estimate of 1/in.
-  float32x4_t reciprocal = vrecpeq_f32(in);
-
-  // We only return estimated 1/in.
-  // Newton-Raphon iteration shold be done in the outside of _mm_rcp_ps().
-
-  // TODO(LTE): We could delete these ifdef?
-  reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal);
-  reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal);
-  return reciprocal;
-
-}
-
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 in)
-{
-  float32x4_t value;
-  float32x4_t result = in;
-
-  value = _mm_rcp_ps(in);
-  return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
-}
-
-// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-#if defined(BUILD_IOS) 
-  return vdivq_f32(a,b);
-#else
-  float32x4_t reciprocal = _mm_rcp_ps(b);
-    
-  reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
-  reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
-
-  // Add one more round of newton-raphson since NEON's reciprocal estimation has less accuracy compared to SSE2's rcp.
-  reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
-
-  // Another round for safety
-  reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
-
-    
-  return vmulq_f32(a, reciprocal);
-#endif
-}
-
-// Divides the scalar single-precision floating point value of a by b.  https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-  float32x4_t value;
-  float32x4_t result = a;
-  value = _mm_div_ps(a, b);
-  return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
-}
-
-// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in.  https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-	
-  float32x4_t value = vrsqrteq_f32(in);
-  
-  // TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values.
-  // Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html
-  // If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps
-  
-  value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
-  value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
-
-  // one more round to get better precision
-  value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
-
-  // another round for safety
-  value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(in, value), value));
-
-  return value;
-}
-
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
-  float32x4_t result = in;
-  
-  __m128 value = _mm_rsqrt_ps(in);
-
-  return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
-}
-
-
-// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if defined(BUILD_IOS)
-  return vsqrtq_f32(in);
-#else
-  __m128 reciprocal = _mm_rsqrt_ps(in);
-  
-  // We must treat sqrt(in == 0) in a special way. At this point reciprocal contains gargabe due to vrsqrteq_f32(0) returning +inf.
-  // We assign 0 to reciprocal wherever required.
-  const float32x4_t vzero = vdupq_n_f32(0.0f);
-  const uint32x4_t mask = vceqq_f32(in, vzero);
-  reciprocal = vbslq_f32(mask, vzero, reciprocal);
-  
-  // sqrt(x) = x * (1 / sqrt(x))
-  return vmulq_f32(in, reciprocal);
-#endif
-}
-
-// Computes the approximation of the square root of the scalar single-precision floating point value of in.  https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-  float32x4_t value;
-  float32x4_t result = in;
-
-  value = _mm_sqrt_ps(in);
-  return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
-}
-
-
-// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-#if USE_PRECISE_MINMAX_IMPLEMENTATION
-  return vbslq_f32(vcltq_f32(b,a),a,b);
-#else
-  // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels)
-  return vmaxq_f32(a, b);
-#endif
-}
-
-// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-#if USE_PRECISE_MINMAX_IMPLEMENTATION
-  return vbslq_f32(vcltq_f32(a,b),a,b);
-#else
-  // Faster, but would give inconsitent rendering(e.g. holes, NaN pixels)
-  return vminq_f32(a, b);
-#endif
-}
-
-// Computes the maximum of the two lower scalar single-precision floating point values of a and b.  https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-  float32x4_t value;
-  float32x4_t result = a;
- 
-  value = _mm_max_ps(a, b);
-  return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
-}
-
-// Computes the minimum of the two lower scalar single-precision floating point values of a and b.  https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-  float32x4_t value;
-  float32x4_t result = a;
-
-    
-  value = _mm_min_ps(a, b);
-  return vsetq_lane_f32(vgetq_lane_f32(value, 0), result, 0);
-}
-
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-  return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b);
-}
-
-// epi versions of min/max
-// Computes the pariwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b )
-{
-  return vmaxq_s32(a,b);
-}
-
-// Computes the pariwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b )
-{
-  return vminq_s32(a,b);
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-  int16x8_t ret = vqdmulhq_s16((int16x8_t)a, (int16x8_t)b);
-  ret = vshrq_n_s16(ret, 1);
-  return (__m128i)ret;
-}
-
-// Computes pairwise add of each argument as single-precision, floating-point values a and b.
-//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b )
-{
-#if defined(__aarch64__)
-    return vpaddq_f32(a,b);
-#else
-// This does not work, no vpaddq...
-//	return (__m128) vpaddq_f32(a,b);
-        //
-        // get two f32x2_t values from a
-        // do vpadd
-        // put result in low half of f32x4 result
-        //
-        // get two f32x2_t values from b
-        // do vpadd
-        // put result in high half of f32x4 result
-        //
-        // combine
-        return vcombine_f32( vpadd_f32( vget_low_f32(a), vget_high_f32(a) ), vpadd_f32( vget_low_f32(b), vget_high_f32(b) ) );
-#endif
-}
-
-// ******************************************
-// Compare operations
-// ******************************************
-
-// Compares for less than https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-  return (__m128)vcltq_f32(a, b);
-}
-
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
-{
-  return (__m128) vmvnq_s32((__m128i)_mm_cmplt_ps(a,b));
-}
-
-// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-  return (__m128)vcgtq_f32(a, b);
-}
-
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
-{
-  return (__m128) _mm_cmpgt_ps(a,b);
-}
-
-
-// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-  return (__m128)vcgeq_f32(a, b);
-}
-
-// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-  return (__m128)vcleq_f32(a, b);
-}
-
-// Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-  return (__m128)vceqq_f32(a, b);
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
-  return (__m128i)vcltq_s32(a, b);
-}
-
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
-  return (__m128i) vceqq_s32(a,b);
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
-  return (__m128i)vcgtq_s32(a, b);
-}
-
-// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
-// see also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b )
-{
-  // Note: NEON does not have ordered compare builtin
-  // Need to compare a eq a and b eq b to check for NaN
-  // Do AND of results to get final
-  return (__m128) vreinterpretq_f32_u32( vandq_u32( vceqq_f32(a,a), vceqq_f32(b,b) ) );
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
-  uint32x4_t value;
-
-  value = vcltq_f32(a, b);
-  return vgetq_lane_u32(value, 0);
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
-  uint32x4_t value;
-
-  value = vcgtq_f32(a, b);
-  return vgetq_lane_u32(value, 0);
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
-  uint32x4_t value;
-
-  value = vcleq_f32(a, b);
-  return vgetq_lane_u32(value, 0);
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
-  uint32x4_t value;
-
-  value = vcgeq_f32(a, b);
-  return vgetq_lane_u32(value, 0);
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
-  uint32x4_t value;
-
-  value = vceqq_f32(a, b);
-  return vgetq_lane_u32(value, 0);
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
-  uint32x4_t value;
-
-  value = vceqq_f32(a, b);
-  return !vgetq_lane_u32(value, 0);
-}
-
-// according to the documentation, these intrinsics behave the same as the non-'u' versions.  We'll just alias them here.
-#define _mm_ucomilt_ss      _mm_comilt_ss
-#define _mm_ucomile_ss      _mm_comile_ss
-#define _mm_ucomigt_ss      _mm_comigt_ss
-#define _mm_ucomige_ss      _mm_comige_ss
-#define _mm_ucomieq_ss      _mm_comieq_ss
-#define _mm_ucomineq_ss     _mm_comineq_ss
-
-// ******************************************
-// Conversions
-// ******************************************
-
-// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
-  return vcvtq_s32_f32(a);
-}
-
-// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
-  return vcvtq_f32_s32(a);
-}
-
-// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support!
-// It is supported on ARMv8 however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if 1
-  return vcvtnq_s32_f32(a);
-#else
-  __m128 half = vdupq_n_f32(0.5f);
-  const __m128 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31)));
-  const __m128 aPlusHalf = vaddq_f32(a, half);
-  const __m128 aRound = vsubq_f32(aPlusHalf, sign);
-  return vcvtq_s32_f32(aRound);
-#endif
-}
-
-// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
-  return vgetq_lane_s32(a, 0);
-}
-
-// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
-  __m128i result = vdupq_n_s32(0);
-  return vsetq_lane_s32(a, result, 0);
-}
-
-
-// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
-#if defined(__aarch64__)
-    return (__m128i)a;
-#else
-  return *(const __m128i *)&a;
-#endif
-}
-
-// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
-#if defined(__aarch64__)
-    return (__m128)a;
-#else
-  return *(const __m128 *)&a;
-#endif
-}
-
-// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
-  return vld1q_s32((int32_t *)p);
-}
-
-FORCE_INLINE __m128d _mm_castps_pd(const __m128 a)
-{
-  return *(const __m128d *)&a;
-}
-
-FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
-{
-  return *(const __m128d *)&a;
-}
-// ******************************************
-// Miscellaneous Operations
-// ******************************************
-
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
-  return (__m128i)vcombine_s8(vqmovn_s16((int16x8_t)a), vqmovn_s16((int16x8_t)b));
-}
-
-// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
-  return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b));
-}
-
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
-  return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b));
-}
-
-// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b.  https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-  int8x8_t a1 = (int8x8_t)vget_low_s16((int16x8_t)a);
-  int8x8_t b1 = (int8x8_t)vget_low_s16((int16x8_t)b);
-
-  int8x8x2_t result = vzip_s8(a1, b1);
-
-  return (__m128i)vcombine_s8(result.val[0], result.val[1]);
-}
-
-// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.  https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-  int16x4_t a1 = vget_low_s16((int16x8_t)a);
-  int16x4_t b1 = vget_low_s16((int16x8_t)b);
-
-  int16x4x2_t result = vzip_s16(a1, b1);
-
-  return (__m128i)vcombine_s16(result.val[0], result.val[1]);
-}
-
-// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b.  https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-  int32x2_t a1 = vget_low_s32(a);
-  int32x2_t b1 = vget_low_s32(b);
-
-  int32x2x2_t result = vzip_s32(a1, b1);
-
-  return vcombine_s32(result.val[0], result.val[1]);
-}
-
-// Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-  float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b));
-  return vcombine_f32(result.val[0], result.val[1]);
-}
-
-// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-  float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b));
-  return vcombine_f32(result.val[0], result.val[1]);
-}
-
-// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b.  https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-  int8x8_t a1 = (int8x8_t)vget_high_s16((int16x8_t)a);
-  int8x8_t b1 = (int8x8_t)vget_high_s16((int16x8_t)b);
-
-  int8x8x2_t result = vzip_s8(a1, b1);
-
-  return (__m128i)vcombine_s8(result.val[0], result.val[1]);
-}
-
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b.  https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-  int16x4_t a1 = vget_high_s16((int16x8_t)a);
-  int16x4_t b1 = vget_high_s16((int16x8_t)b);
-
-  int16x4x2_t result = vzip_s16(a1, b1);
-
-  return (__m128i)vcombine_s16(result.val[0], result.val[1]);
-}
-
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b.  https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-  int32x2_t a1 = vget_high_s32(a);
-  int32x2_t b1 = vget_high_s32(b);
-
-  int32x2x2_t result = vzip_s32(a1, b1);
-
-  return vcombine_s32(result.val[0], result.val[1]);
-}
-
-// Extracts the selected signed or unsigned 16-bit integer from a and zero extends.  https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-#define _mm_extract_epi16( a, imm ) vgetq_lane_s16((int16x8_t)a, imm)
-
-// ******************************************
-// Streaming Extensions
-// ******************************************
-
-// Guarantees that every preceding store is globally visible before any subsequent store.  https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
-{
-  __sync_synchronize();
-}
-
-// Stores the data in a to the address p without polluting the caches.  If the cache line containing address p is already in the cache, the cache will be updated.Address p must be 16 - byte aligned.  https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-  *p = a;
-}
-
-// Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-FORCE_INLINE void _mm_clflush(void const*p)
-{
-  // no corollary for Neon?
-}
-
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b)
-{
-  // Stick to the flipped behavior of x86.
-  int64_t __attribute__((aligned(16))) data[2] = { b, a };
-  return (__m128i)vld1q_s64(data);
-}
-
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
-  return (__m128i)vmovq_n_s64(_i);
-}
-
-#if defined(__aarch64__)
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c)
-{
-    int32x4_t mask = vshrq_n_s32(__m128i(c),31);
-    return vbslq_f32( uint32x4_t(mask), b, a);
-}
-
-FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr)
-{
-    uint8x8_t  t0 = vld1_u8((uint8_t*)ptr);
-    uint16x8_t t1 = vmovl_u8(t0);
-    uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
-    return vreinterpretq_s32_u32(t2);
-}
-
-FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr)
-{
-    uint16x8_t t0 = vld1q_u16((uint16_t*)ptr);
-    uint32x4_t t1 = vmovl_u16(vget_low_u16(t0));
-    return vreinterpretq_s32_u32(t1);
-}
-
-FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr)
-{
-    int8x8_t    t0 = vld1_s8((int8_t*)ptr);
-    int16x8_t   t1 = vmovl_s8(t0);
-    int32x4_t   t2 = vmovl_s16(vget_low_s16(t1));
-    float32x4_t t3 = vcvtq_f32_s32(t2);
-    return vreinterpretq_s32_f32(t3);
-}
-
-FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr)
-{
-    uint8x8_t   t0 = vld1_u8((uint8_t*)ptr);
-    uint16x8_t  t1 = vmovl_u8(t0);
-    uint32x4_t  t2 = vmovl_u16(vget_low_u16(t1));
-    return vreinterpretq_s32_u32(t2);
-}
-
-FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr)
-{
-    int16x8_t   t0 = vld1q_s16((int16_t*)ptr);
-    int32x4_t   t1 = vmovl_s16(vget_low_s16(t0));
-    float32x4_t t2 = vcvtq_f32_s32(t1);
-    return vreinterpretq_s32_f32(t2);
-}
-
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
-    return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b));
-}
-
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr)
-{
-    // No non-temporal load on a single register on ARM.
-    return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr));
-}
-
-FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a)
-{
-    // No non-temporal store on a single register on ARM.
-    vst1q_f32((float*)ptr, vreinterpretq_f32_s32(a));
-}
-
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b)));
-}
-
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b)));
-}
-
-FORCE_INLINE __m128 _mm_abs_ps(__m128 a)
-{
-    return vabsq_f32(a);
-}
-
-FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c)
-{
-    return vmlaq_f32(c, a, b);
-}
-
-FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c)
-{
-    return vmlsq_f32(c, a, b);
-}
-
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
-  return vabsq_s32(a);
-}
-#endif  //defined(__aarch64__)
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-  return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a)));
-}
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-  return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-}
-
-#endif
diff --git a/thirdparty/embree-aarch64/common/math/affinespace.h b/thirdparty/embree-aarch64/common/math/affinespace.h
deleted file mode 100644
index 32452fbe72..0000000000
--- a/thirdparty/embree-aarch64/common/math/affinespace.h
+++ /dev/null
@@ -1,361 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "linearspace2.h"
-#include "linearspace3.h"
-#include "quaternion.h"
-#include "bbox.h"
-#include "vec4.h"
-
-namespace embree
-{
-  #define VectorT typename L::Vector
-  #define ScalarT typename L::Vector::Scalar
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Affine Space
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename L>
-    struct AffineSpaceT
-    {
-      L l;           /*< linear part of affine space */
-      VectorT p;     /*< affine part of affine space */
-
-      ////////////////////////////////////////////////////////////////////////////////
-      // Constructors, Assignment, Cast, Copy Operations
-      ////////////////////////////////////////////////////////////////////////////////
-
-      __forceinline AffineSpaceT           ( )                           { }
-      __forceinline AffineSpaceT           ( const AffineSpaceT& other ) { l = other.l; p = other.p; }
-      __forceinline AffineSpaceT           ( const L           & other ) { l = other  ; p = VectorT(zero); }
-      __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; }
-
-      __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {}
-      __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {}
-
-      template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {}
-
-      ////////////////////////////////////////////////////////////////////////////////
-      // Constants
-      ////////////////////////////////////////////////////////////////////////////////
-
-      __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {}
-      __forceinline AffineSpaceT( OneTy )  : l(one),  p(zero) {}
-
-      /*! return matrix for scaling */
-      static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); }
-
-      /*! return matrix for translation */
-      static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); }
-
-      /*! return matrix for rotation, only in 2D */
-      static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); }
-
-      /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */
-      static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); }
-
-      /*! return matrix for rotation around arbitrary axis and point, only in 3D */
-      static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p);  }
-
-      /*! return matrix for looking at given point, only in 3D */
-      static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) {
-        VectorT Z = normalize(point-eye);
-        VectorT U = normalize(cross(up,Z));
-        VectorT V = normalize(cross(Z,U));
-        return AffineSpaceT(L(U,V,Z),eye);
-      }
-
-    };
-  
-  // template specialization to get correct identity matrix for type AffineSpace3fa
-  template<>
-    __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy )  : l(one),  p(0.f, 0.f, 0.f, 1.f) {}
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); }
-  template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); }
-  template<typename L> __forceinline AffineSpaceT<L>        rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); }
-  template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); }
-
-  template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT        & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); }
-  template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); }
-  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); }
-  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT        & b ) { return a * rcp(b); }
-
-  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; }
-  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = a * b; }
-  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; }
-  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = a / b; }
-
-  template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); }
-  template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); }
-  template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); }
-
-  __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b) 
-  { 
-    BBox3fa dst = empty;
-    const Vec3fa p0(b.lower.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p0));
-    const Vec3fa p1(b.lower.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p1));
-    const Vec3fa p2(b.lower.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p2));
-    const Vec3fa p3(b.lower.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p3));
-    const Vec3fa p4(b.upper.x,b.lower.y,b.lower.z); dst.extend(xfmPoint(m,p4));
-    const Vec3fa p5(b.upper.x,b.lower.y,b.upper.z); dst.extend(xfmPoint(m,p5));
-    const Vec3fa p6(b.upper.x,b.upper.y,b.lower.z); dst.extend(xfmPoint(m,p6));
-    const Vec3fa p7(b.upper.x,b.upper.y,b.upper.z); dst.extend(xfmPoint(m,p7));
-    return dst;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; }
-  template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) {
-    return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) {
-    return cout << "{ l = " << m.l << ", p = " << m.p << " }";
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Template Instantiations
-  ////////////////////////////////////////////////////////////////////////////////
-
-  typedef AffineSpaceT<LinearSpace2f> AffineSpace2f;
-  typedef AffineSpaceT<LinearSpace3f> AffineSpace3f;
-  typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa;
-  typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx;
-  typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff;
-  typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f;
-
-  template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>;
-  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>>  AffineSpace3vf4;
-  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>>  AffineSpace3vf8;
-  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16;
-
-  template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>;
-  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>>  AffineSpace3vfa4;
-  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>>  AffineSpace3vfa8;
-  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16;
-
-  //////////////////////////////////////////////////////////////////////////////
-  /// Interpolation
-  //////////////////////////////////////////////////////////////////////////////
-  template<typename T, typename R>
-  __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0,
-                                     const AffineSpaceT<T>& M1,
-                                     const R& t)
-  {
-    return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t));
-  }
-
-  // slerp interprets the 16 floats of the matrix M = D * R * S as components of
-  // three matrizes (D, R, S) that are interpolated individually.
-  template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>>
-  slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0,
-        const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1,
-        const T& t)
-  {
-    QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
-    QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
-    QuaternionT<T> q = slerp(q0, q1, t);
-
-    AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t);
-    AffineSpaceT<LinearSpace3<Vec3<T>>> D(one);
-    D.p.x = S.l.vx.y;
-    D.p.y = S.l.vx.z;
-    D.p.z = S.l.vy.z;
-    S.l.vx.y = 0;
-    S.l.vx.z = 0;
-    S.l.vy.z = 0;
-
-    AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q);
-    return D * R * S;
-  }
-
-  // this is a specialized version for Vec3fa because that does
-  // not play along nicely with the other templated Vec3/Vec4 types
-  __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0,
-                                     const AffineSpace3ff& M1,
-                                     const float& t)
-  {
-    Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w);
-    Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
-    Quaternion3f q = slerp(q0, q1, t);
-
-    AffineSpace3fa S = lerp(M0, M1, t);
-    AffineSpace3fa D(one);
-    D.p.x = S.l.vx.y;
-    D.p.y = S.l.vx.z;
-    D.p.z = S.l.vy.z;
-    S.l.vx.y = 0;
-    S.l.vx.z = 0;
-    S.l.vy.z = 0;
-
-    AffineSpace3fa R = LinearSpace3fa(q);
-    return D * R * S;
-  }
-  
-  __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd)
-  {
-    // compute affine transform from quaternion decomposition
-    Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
-    AffineSpace3fa M = qd;
-    AffineSpace3fa D(one);
-    D.p.x = M.l.vx.y;
-    D.p.y = M.l.vx.z;
-    D.p.z = M.l.vy.z;
-    M.l.vx.y = 0;
-    M.l.vx.z = 0;
-    M.l.vy.z = 0;
-    AffineSpace3fa R = LinearSpace3fa(q);
-    return D * R * M;
-  }
-  
-  __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S)
-  {
-    q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w);
-    S = qd;
-    T.x = qd.l.vx.y;
-    T.y = qd.l.vx.z;
-    T.z = qd.l.vy.z;
-    S.l.vx.y = 0;
-    S.l.vx.z = 0;
-    S.l.vy.z = 0;
-  }
-
-  __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S)
-  {
-    AffineSpace3ff M = S;
-    M.l.vx.w = q.i;
-    M.l.vy.w = q.j;
-    M.l.vz.w = q.k;
-    M.p.w    = q.r;
-    M.l.vx.y = T.x;
-    M.l.vx.z = T.y;
-    M.l.vy.z = T.z;
-    return M;
-  }
-
-  struct __aligned(16) QuaternionDecomposition
-  {
-    float scale_x = 1.f;
-    float scale_y = 1.f;
-    float scale_z = 1.f;
-    float skew_xy = 0.f;
-    float skew_xz = 0.f;
-    float skew_yz = 0.f;
-    float shift_x = 0.f;
-    float shift_y = 0.f;
-    float shift_z = 0.f;
-    float quaternion_r = 1.f;
-    float quaternion_i = 0.f;
-    float quaternion_j = 0.f;
-    float quaternion_k = 0.f;
-    float translation_x = 0.f;
-    float translation_y = 0.f;
-    float translation_z = 0.f;
-  };
-
-  __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M)
-  {
-    QuaternionDecomposition qd;
-    qd.scale_x       = M.l.vx.x;
-    qd.scale_y       = M.l.vy.y;
-    qd.scale_z       = M.l.vz.z;
-    qd.shift_x       = M.p.x;
-    qd.shift_y       = M.p.y;
-    qd.shift_z       = M.p.z;
-    qd.translation_x = M.l.vx.y;
-    qd.translation_y = M.l.vx.z;
-    qd.translation_z = M.l.vy.z;
-    qd.skew_xy       = M.l.vy.x;
-    qd.skew_xz       = M.l.vz.x;
-    qd.skew_yz       = M.l.vz.y;
-    qd.quaternion_r  = M.p.w;
-    qd.quaternion_i  = M.l.vx.w;
-    qd.quaternion_j  = M.l.vy.w;
-    qd.quaternion_k  = M.l.vz.w;
-    return qd;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /*
-   * ! Template Specialization for 2D: return matrix for rotation around point
-   * (rotation around arbitrarty vector is not meaningful in 2D)
-   */
-  template<> __forceinline
-  AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) {
-    return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Similarity Transform
-  //
-  // checks, if M is a similarity transformation, i.e if there exists a factor D
-  // such that for all x,y: distance(Mx, My) = D * distance(x, y)
-  ////////////////////////////////////////////////////////////////////////////////
-  __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D)
-  {
-    if (D) *D = 0.f;
-    if (abs(dot(M.l.vx, M.l.vy)) > 1e-5f) return false;
-    if (abs(dot(M.l.vx, M.l.vz)) > 1e-5f) return false;
-    if (abs(dot(M.l.vy, M.l.vz)) > 1e-5f) return false;
-
-    const float D_x = dot(M.l.vx, M.l.vx);
-    const float D_y = dot(M.l.vy, M.l.vy);
-    const float D_z = dot(M.l.vz, M.l.vz);
-
-    if (abs(D_x - D_y) > 1e-5f ||
-        abs(D_x - D_z) > 1e-5f ||
-        abs(D_y - D_z) > 1e-5f)
-      return false;
-
-    if (D) *D = sqrtf(D_x);
-    return true;
-  }
-
-  __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr)
-  {
-    Vec3fa::storeu(&ptr->l.vx, source.l.vx);
-    Vec3fa::storeu(&ptr->l.vy, source.l.vy);
-    Vec3fa::storeu(&ptr->l.vz, source.l.vz);
-    Vec3fa::storeu(&ptr->p, source.p);
-  }
-
-  __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr)
-  {
-    AffineSpace3fa space;
-    space.l.vx = Vec3fa::loadu(&ptr->l.vx);
-    space.l.vy = Vec3fa::loadu(&ptr->l.vy);
-    space.l.vz = Vec3fa::loadu(&ptr->l.vz);
-    space.p    = Vec3fa::loadu(&ptr->p);
-    return space;
-  }
-
-  #undef VectorT
-  #undef ScalarT
-}
diff --git a/thirdparty/embree-aarch64/common/math/bbox.h b/thirdparty/embree-aarch64/common/math/bbox.h
deleted file mode 100644
index 29bb13912b..0000000000
--- a/thirdparty/embree-aarch64/common/math/bbox.h
+++ /dev/null
@@ -1,331 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "vec2.h"
-#include "vec3.h"
-
-namespace embree
-{
-  namespace internal {
-
-    template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); }
-    template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; }
-    template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; }
-
-  } // namespace internal
-  template<typename T>
-  struct BBox
-  {
-    T lower, upper;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline BBox           ( )                   { }
-    template<typename T1>
-    __forceinline BBox           ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {}
-    __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; }
-
-    __forceinline BBox ( const T& v                     ) : lower(v), upper(v) {}
-    __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Extending Bounds
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
-    __forceinline const BBox& extend(const T   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
-
-    /*! tests if box is empty */
-    __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; }
-
-    /*! computes the size of the box */
-    __forceinline T size() const { return upper - lower; }
-
-    /*! computes the center of the box */
-    __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); }
-
-    /*! computes twice the center of the box */
-    __forceinline T center2() const { return lower+upper; }
-
-    /*! merges two boxes */
-    __forceinline static const BBox merge (const BBox& a, const BBox& b) {
-      return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
-    }
-
-     /*! enlarge box by some scaling factor */
-    __forceinline BBox enlarge_by(const float a) const {
-      return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
-    }
-    
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
-    __forceinline BBox( FullTy  ) : lower(neg_inf), upper(pos_inf) {}
-    __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {}
-    __forceinline BBox( TrueTy  ) : lower(neg_inf), upper(pos_inf) {}
-    __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {}
-    __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {}
-  };
-
-  template<> __forceinline bool BBox<float>::empty() const {
-    return lower > upper;
-  }
-
-#if defined(__SSE__) || defined(__ARM_NEON)
-  template<> __forceinline bool BBox<Vec3fa>::empty() const {
-    return !all(le_mask(lower,upper));
-  }
-  template<> __forceinline bool BBox<Vec3fx>::empty() const {
-    return !all(le_mask(lower,upper));
-  }
-#endif
-
-  /*! tests if box is finite */
-  __forceinline bool isvalid( const BBox<Vec3fa>& v ) {
-    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)));
-  }
-
-  /*! tests if box is finite and non-empty*/
-  __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) {
-    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper));
-  }
-  
-  /*! tests if box has finite entries */
-  __forceinline bool is_finite( const BBox<Vec3fa>& b) {
-    return is_finite(b.lower) && is_finite(b.upper);
-  }
-
-  /*! test if point contained in box */
-  __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); }
-
-  /*! computes the center of the box */
-  template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; }
-  template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); }
-
-  /*! computes the volume of a bounding box */
-  __forceinline float volume    ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); }
-  __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); }
-
-  /*! computes the volume of a bounding box */
-  __forceinline float volume( const BBox<Vec3f>& b )  { return reduce_mul(b.size()); }
-
-  /*! computes the surface area of a bounding box */
-  template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; }
-
-  template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); }
-  template<typename T> __forceinline const T     area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); }
-
-  __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); }
-  __forceinline float     area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); }
-
-  __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); }
-  __forceinline float     area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); }
-
-  template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); }
-
-  template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) {
-    return halfArea(box);
-  }
-
-  /*! merges bounding boxes and points */
-  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const       T& b ) { return BBox<T>(min(a.lower, b    ), max(a.upper, b    )); }
-  template<typename T> __forceinline const BBox<T> merge( const       T& a, const BBox<T>& b ) { return BBox<T>(min(a    , b.lower), max(a    , b.upper)); }
-  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); }
-
-  /*! Merges three boxes. */
-  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); }
-
-  /*! Merges four boxes. */
-  template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) {
-    return merge(merge(a,b),merge(c,d));
-  }
-
-  /*! Comparison Operators */
-  template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; }
-  template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; }
-
-  /*! scaling */
-  template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
-  template<typename T> __forceinline BBox<T> operator *( const     T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
-
-  /*! translations */
-  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); }
-  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); }
-  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower+b      ,a.upper+b      ); }
-  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower-b      ,a.upper-b      ); }
-
-  /*! extension */
-  template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); }
-
-  /*! intersect bounding boxes */
-  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); }
-  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); }
-  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); }
-
-  /*! subtract bounds from each other */
-  template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d)
-  {
-    c.lower = a.lower;
-    c.upper = min(a.upper,b.lower);
-    d.lower = max(a.lower,b.upper);
-    d.upper = a.upper;
-  }
-
-  /*! tests if bounding boxes (and points) are disjoint (empty intersection) */
-  template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); }
-  template<typename T> __inline bool disjoint( const BBox<T>& a, const       T& b ) { return disjoint(a,BBox<T>(b)); }
-  template<typename T> __inline bool disjoint( const       T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); }
-
-  /*! tests if bounding boxes (and points) are conjoint (non-empty intersection) */
-  template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); }
-  template<typename T> __inline bool conjoint( const BBox<T>& a, const       T& b ) { return conjoint(a,BBox<T>(b)); }
-  template<typename T> __inline bool conjoint( const       T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); }
-
-  /*! subset relation */
-  template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b )
-  { 
-    for ( size_t i = 0; i < T::N; i++ ) if ( a.lower[i] < b.lower[i] ) return false;
-    for ( size_t i = 0; i < T::N; i++ ) if ( a.upper[i] > b.upper[i] ) return false;
-    return true; 
-  }
-
-  template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) {
-    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
-  }
-
-  template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) {
-    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
-  }
-  
-  /*! blending */
-  template<typename T>
-    __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) {
-    return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t));
-  }
-
-  /*! output operator */
-  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) {
-    return cout << "[" << box.lower << "; " << box.upper << "]";
-  }
-
-  /*! default template instantiations */
-  typedef BBox<float> BBox1f;
-  typedef BBox<Vec2f> BBox2f;
-  typedef BBox<Vec2fa> BBox2fa;
-  typedef BBox<Vec3f> BBox3f;
-  typedef BBox<Vec3fa> BBox3fa;
-  typedef BBox<Vec3fx> BBox3fx;
-  typedef BBox<Vec3ff> BBox3ff;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// SSE / AVX / MIC specializations
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined (__SSE__) || defined(__ARM_NEON)
-#include "../simd/sse.h"
-#endif
-
-#if defined (__AVX__)
-#include "../simd/avx.h"
-#endif
-
-#if defined(__AVX512F__)
-#include "../simd/avx512.h"
-#endif
-
-namespace embree
-{
-  template<int N>
-    __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds);
-  
-  template<>
-    __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds)
-  {
-    BBox<Vec3<vfloat4>> dest;
-    
-    transpose((vfloat4&)bounds[0].lower,
-              (vfloat4&)bounds[1].lower,
-              (vfloat4&)bounds[2].lower,
-              (vfloat4&)bounds[3].lower,
-              dest.lower.x,
-              dest.lower.y,
-              dest.lower.z);
-    
-    transpose((vfloat4&)bounds[0].upper,
-              (vfloat4&)bounds[1].upper,
-              (vfloat4&)bounds[2].upper,
-              (vfloat4&)bounds[3].upper,
-              dest.upper.x,
-              dest.upper.y,
-              dest.upper.z);
-    
-    return dest;
-  }
-  
-#if defined(__AVX__)
-  template<>
-    __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds)
-  {
-    BBox<Vec3<vfloat8>> dest;
-    
-    transpose((vfloat4&)bounds[0].lower,
-              (vfloat4&)bounds[1].lower,
-              (vfloat4&)bounds[2].lower,
-              (vfloat4&)bounds[3].lower,
-              (vfloat4&)bounds[4].lower,
-              (vfloat4&)bounds[5].lower,
-              (vfloat4&)bounds[6].lower,
-              (vfloat4&)bounds[7].lower,
-              dest.lower.x,
-              dest.lower.y,
-              dest.lower.z);
-    
-    transpose((vfloat4&)bounds[0].upper,
-              (vfloat4&)bounds[1].upper,
-              (vfloat4&)bounds[2].upper,
-              (vfloat4&)bounds[3].upper,
-              (vfloat4&)bounds[4].upper,
-              (vfloat4&)bounds[5].upper,
-              (vfloat4&)bounds[6].upper,
-              (vfloat4&)bounds[7].upper,
-              dest.upper.x,
-              dest.upper.y,
-              dest.upper.z);
-    
-    return dest;
-  }
-#endif
-  
-  template<int N>
-    __forceinline BBox3fa merge(const BBox3fa* bounds);
-  
-  template<>
-    __forceinline BBox3fa merge<4>(const BBox3fa* bounds)
-  {
-    const Vec3fa lower = min(min(bounds[0].lower,bounds[1].lower),
-                             min(bounds[2].lower,bounds[3].lower));
-    const Vec3fa upper = max(max(bounds[0].upper,bounds[1].upper),
-                             max(bounds[2].upper,bounds[3].upper));
-    return BBox3fa(lower,upper);
-  }
-  
-#if defined(__AVX__)
-  template<>
-    __forceinline BBox3fa merge<8>(const BBox3fa* bounds)
-  {
-    const Vec3fa lower = min(min(min(bounds[0].lower,bounds[1].lower),min(bounds[2].lower,bounds[3].lower)),
-                             min(min(bounds[4].lower,bounds[5].lower),min(bounds[6].lower,bounds[7].lower)));
-    const Vec3fa upper = max(max(max(bounds[0].upper,bounds[1].upper),max(bounds[2].upper,bounds[3].upper)),
-                             max(max(bounds[4].upper,bounds[5].upper),max(bounds[6].upper,bounds[7].upper)));
-    return BBox3fa(lower,upper);
-  }
-#endif
-}
-
diff --git a/thirdparty/embree-aarch64/common/math/col3.h b/thirdparty/embree-aarch64/common/math/col3.h
deleted file mode 100644
index f52015fb88..0000000000
--- a/thirdparty/embree-aarch64/common/math/col3.h
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "math.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// RGB Color Class
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> struct Col3
-  {
-    T r, g, b;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Col3           ( )                   { }
-    __forceinline Col3           ( const Col3& other ) { r = other.r; g = other.g; b = other.b; }
-    __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; }
-
-    __forceinline explicit Col3 (const T& v)                         : r(v), g(v), b(v) {}
-    __forceinline          Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Col3 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)    {}
-    __forceinline Col3 (OneTy)    : r(one)    , g(one)    , b(one)     {}
-    __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {}
-    __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {}
-  };
-
-  /*! output operator */
-  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) {
-    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
-  }
-
-  /*! default template instantiations */
-  typedef Col3<uint8_t      > Col3uc;
-  typedef Col3<float        > Col3f;
-}
diff --git a/thirdparty/embree-aarch64/common/math/col4.h b/thirdparty/embree-aarch64/common/math/col4.h
deleted file mode 100644
index 90df293f8e..0000000000
--- a/thirdparty/embree-aarch64/common/math/col4.h
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "math.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// RGBA Color Class
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> struct Col4
-  {
-    T r, g, b, a;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Col4           ( )                   { }
-    __forceinline Col4           ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; }
-    __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; }
-
-    __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {}
-    __forceinline          Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Col4 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)   , a(zero) {}
-    __forceinline Col4 (OneTy)    : r(one)    , g(one)    , b(one)    , a(one) {}
-    __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {}
-    __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {}
-  };
-
-  /*! output operator */
-  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) {
-    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")";
-  }
-
-  /*! default template instantiations */
-  typedef Col4<uint8_t      > Col4uc;
-  typedef Col4<float        > Col4f;
-}
diff --git a/thirdparty/embree-aarch64/common/math/color.h b/thirdparty/embree-aarch64/common/math/color.h
deleted file mode 100644
index c3083e4fc0..0000000000
--- a/thirdparty/embree-aarch64/common/math/color.h
+++ /dev/null
@@ -1,257 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "constants.h"
-#include "col3.h"
-#include "col4.h"
-
-#include "../simd/sse.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// SSE RGBA Color Class
-  ////////////////////////////////////////////////////////////////////////////////
-
-  struct Color4
-  {
-    union {
-      __m128 m128;
-      struct { float r,g,b,a; };
-    };
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Color4 () {}
-    __forceinline Color4 ( const __m128 a ) : m128(a) {}
-
-    __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {}
-    __forceinline          Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {}
-
-    __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
-    __forceinline explicit Color4 ( const Col3f&  other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); }
-    __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); }
-    __forceinline explicit Color4 ( const Col4f&  other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); }
-
-    __forceinline Color4           ( const Color4& other ) : m128(other.m128) {}
-    __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
-
-    __forceinline operator const __m128&() const { return m128; }
-    __forceinline operator       __m128&()       { return m128; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Set
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
-    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
-    __forceinline void set(Col3uc& d) const 
-    {
-      vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (uint8_t)(s[0]); 
-      d.g = (uint8_t)(s[1]); 
-      d.b = (uint8_t)(s[2]); 
-    }
-    __forceinline void set(Col4uc& d) const 
-    {
-      vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (uint8_t)(s[0]); 
-      d.g = (uint8_t)(s[1]); 
-      d.b = (uint8_t)(s[2]); 
-      d.a = (uint8_t)(s[3]); 
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Color4( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
-    __forceinline Color4( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// SSE RGB Color Class
-  ////////////////////////////////////////////////////////////////////////////////
-
-  struct Color
-  {
-    union {
-      __m128 m128;
-      struct { float r,g,b; };
-    };
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Color () {}
-    __forceinline Color ( const __m128 a ) : m128(a) {}
-
-    __forceinline explicit Color  (const float v)                               : m128(_mm_set1_ps(v)) {}
-    __forceinline          Color  (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {}
-
-    __forceinline Color           ( const Color& other ) : m128(other.m128) {}
-    __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
-
-    __forceinline Color           ( const Color4& other ) : m128(other.m128) {}
-    __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
-
-    __forceinline operator const __m128&() const { return m128; }
-    __forceinline operator       __m128&()       { return m128; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Set
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
-    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; }
-    __forceinline void set(Col3uc& d) const 
-    { 
-      vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (uint8_t)(s[0]); 
-      d.g = (uint8_t)(s[1]); 
-      d.b = (uint8_t)(s[2]); 
-    }
-    __forceinline void set(Col4uc& d) const 
-    { 
-      vfloat4 s = clamp(vfloat4(m128))*255.0f;
-      d.r = (uint8_t)(s[0]); 
-      d.g = (uint8_t)(s[1]); 
-      d.b = (uint8_t)(s[2]); 
-      d.a = 255; 
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Color( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
-    __forceinline Color( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline const Color operator +( const Color& a ) { return a; }
-  __forceinline const Color operator -( const Color& a ) {
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-    return _mm_xor_ps(a.m128, mask);
-  }
-  __forceinline const Color abs  ( const Color& a ) {
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-    return _mm_and_ps(a.m128, mask);
-  }
-  __forceinline const Color rcp  ( const Color& a )
-  {
-#if defined(__aarch64__) && defined(BUILD_IOS)
-    __m128 reciprocal = _mm_rcp_ps(a.m128);
-    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
-    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
-    return (const Color)reciprocal;
-#else
-#if defined(__AVX512VL__)
-    const Color r = _mm_rcp14_ps(a.m128);
-#else
-    const Color r = _mm_rcp_ps(a.m128);
-#endif
-    return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#endif  //defined(__aarch64__) && defined(BUILD_IOS)
-  }
-  __forceinline const Color rsqrt( const Color& a )
-  {
-#if defined(__aarch64__) && defined(BUILD_IOS)
-    __m128 r = _mm_rsqrt_ps(a.m128);
-    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-    return r;
-#else
-      
-#if defined(__AVX512VL__)
-    __m128 r = _mm_rsqrt14_ps(a.m128);
-#else
-    __m128 r = _mm_rsqrt_ps(a.m128);
-#endif
-    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-      
-#endif  //defined(__aarch64__) && defined(BUILD_IOS)
-  }
-  __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); }
-  __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); }
-  __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); }
-  __forceinline const Color operator *( const Color& a, const float  b ) { return a * Color(b); }
-  __forceinline const Color operator *( const float  a, const Color& b ) { return Color(a) * b; }
-  __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); }
-  __forceinline const Color operator /( const Color& a, const float  b ) { return a * rcp(b); }
-
-  __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); }
-  __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; }
-  __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; }
-  __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; }
-  __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; }
-  __forceinline const Color operator*=(Color& a, const float b      ) { return a = a * b; }
-  __forceinline const Color operator/=(Color& a, const float b      ) { return a = a / b; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reductions
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; }
-  __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; }
-  __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); }
-  __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline bool operator ==( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
-  __forceinline bool operator !=( const Color& a, const Color& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
-  __forceinline bool operator < ( const Color& a, const Color& b ) {
-    if (a.r != b.r) return a.r < b.r;
-    if (a.g != b.g) return a.g < b.g;
-    if (a.b != b.b) return a.b < b.b;
-    return false;
-  }
-
-   ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline const Color select( bool s, const Color& t, const Color& f ) {
-    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
-    return blendv_ps(f, t, mask);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Special Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  /*! computes luminance of a color */
-  __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
-
-  /*! output operator */
-  __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) {
-    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/math/constants.cpp b/thirdparty/embree-aarch64/common/math/constants.cpp
deleted file mode 100644
index eeff131664..0000000000
--- a/thirdparty/embree-aarch64/common/math/constants.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#if defined(__aarch64__)
-#include <arm_neon.h>
-#endif
-
-#include "constants.h"
-
-namespace embree
-{
-  TrueTy True;
-  FalseTy False;
-  ZeroTy zero;
-  OneTy one;
-  NegInfTy neg_inf;
-  PosInfTy inf;
-  PosInfTy pos_inf;
-  NaNTy nan;
-  UlpTy ulp;
-  PiTy pi;
-  OneOverPiTy one_over_pi;
-  TwoPiTy two_pi;
-  OneOverTwoPiTy one_over_two_pi;
-  FourPiTy four_pi;
-  OneOverFourPiTy one_over_four_pi;
-  StepTy step;
-  ReverseStepTy reverse_step;
-  EmptyTy empty;
-  UndefinedTy undefined;
-
-#if defined(__aarch64__)
-const uint32x4_t movemask_mask = { 1, 2, 4, 8 };
-const uint32x4_t vzero = { 0, 0, 0, 0 };
-const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
-const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
-const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
-const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 };
-const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
-const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
-const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
-const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
-const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
-const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
-const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF };
-const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
-const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
-const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
-const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
-const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
-const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
-const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11};
-const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15};
-const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
-const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f };
-const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f };
-const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY };
-const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
-#endif
-
-}
diff --git a/thirdparty/embree-aarch64/common/math/constants.h b/thirdparty/embree-aarch64/common/math/constants.h
deleted file mode 100644
index e80abec80f..0000000000
--- a/thirdparty/embree-aarch64/common/math/constants.h
+++ /dev/null
@@ -1,239 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../sys/platform.h"
-
-#include <limits>
-
-#define _USE_MATH_DEFINES
-#include <math.h> // using cmath causes issues under Windows
-#include <cfloat>
-#include <climits>
-
-// Math constants may not be defined in libcxx + mingw + strict C++ standard
-#if defined(__MINGW32__)
-
-// TODO(LTE): use constexpr
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-#ifndef M_1_PI
-#define M_1_PI 0.31830988618379067154
-#endif
-
-#endif // __MINGW32__
-
-namespace embree
-{
-  static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;
-  static MAYBE_UNUSED const float min_rcp_input = 1E-18f;  // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail
-
-  /* we consider floating point numbers in that range as valid input numbers */
-  static MAYBE_UNUSED float FLT_LARGE = 1.844E18f;
-
-  struct TrueTy {
-    __forceinline operator bool( ) const { return true; }
-  };
-
-  extern MAYBE_UNUSED TrueTy True;
-
-  struct FalseTy {
-    __forceinline operator bool( ) const { return false; }
-  };
-
-  extern MAYBE_UNUSED FalseTy False;
-  
-  struct ZeroTy
-  {
-    __forceinline operator          double   ( ) const { return 0; }
-    __forceinline operator          float    ( ) const { return 0; }
-    __forceinline operator          long long( ) const { return 0; }
-    __forceinline operator unsigned long long( ) const { return 0; }
-    __forceinline operator          long     ( ) const { return 0; }
-    __forceinline operator unsigned long     ( ) const { return 0; }
-    __forceinline operator          int      ( ) const { return 0; }
-    __forceinline operator unsigned int      ( ) const { return 0; }
-    __forceinline operator          short    ( ) const { return 0; }
-    __forceinline operator unsigned short    ( ) const { return 0; }
-    __forceinline operator          int8_t     ( ) const { return 0; }
-    __forceinline operator uint8_t     ( ) const { return 0; }
-  }; 
-
-  extern MAYBE_UNUSED ZeroTy zero;
-
-  struct OneTy
-  {
-    __forceinline operator          double   ( ) const { return 1; }
-    __forceinline operator          float    ( ) const { return 1; }
-    __forceinline operator          long long( ) const { return 1; }
-    __forceinline operator unsigned long long( ) const { return 1; }
-    __forceinline operator          long     ( ) const { return 1; }
-    __forceinline operator unsigned long     ( ) const { return 1; }
-    __forceinline operator          int      ( ) const { return 1; }
-    __forceinline operator unsigned int      ( ) const { return 1; }
-    __forceinline operator          short    ( ) const { return 1; }
-    __forceinline operator unsigned short    ( ) const { return 1; }
-    __forceinline operator          int8_t     ( ) const { return 1; }
-    __forceinline operator uint8_t     ( ) const { return 1; }
-  };
-
-  extern MAYBE_UNUSED OneTy one;
-
-  struct NegInfTy
-  {
-    __forceinline operator          double   ( ) const { return -std::numeric_limits<double>::infinity(); }
-    __forceinline operator          float    ( ) const { return -std::numeric_limits<float>::infinity(); }
-    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::min(); }
-    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); }
-    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::min(); }
-    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::min(); }
-    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::min(); }
-    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::min(); }
-    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::min(); }
-    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::min(); }
-    __forceinline operator          int8_t     ( ) const { return std::numeric_limits<int8_t>::min(); }
-    __forceinline operator uint8_t     ( ) const { return std::numeric_limits<uint8_t>::min(); }
-
-  };
-
-  extern MAYBE_UNUSED NegInfTy neg_inf;
-
-  struct PosInfTy
-  {
-    __forceinline operator          double   ( ) const { return std::numeric_limits<double>::infinity(); }
-    __forceinline operator          float    ( ) const { return std::numeric_limits<float>::infinity(); }
-    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::max(); }
-    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); }
-    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::max(); }
-    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::max(); }
-    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::max(); }
-    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::max(); }
-    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::max(); }
-    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::max(); }
-    __forceinline operator          int8_t     ( ) const { return std::numeric_limits<int8_t>::max(); }
-    __forceinline operator uint8_t     ( ) const { return std::numeric_limits<uint8_t>::max(); }
-  };
-
-  extern MAYBE_UNUSED PosInfTy inf;
-  extern MAYBE_UNUSED PosInfTy pos_inf;
-
-  struct NaNTy
-  {
-    __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); }
-    __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
-  };
-
-  extern MAYBE_UNUSED NaNTy nan;
-
-  struct UlpTy
-  {
-    __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
-    __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
-  };
-
-  extern MAYBE_UNUSED UlpTy ulp;
-
-  struct PiTy
-  {
-    __forceinline operator double( ) const { return double(M_PI); }
-    __forceinline operator float ( ) const { return float(M_PI); }
-  };
-
-  extern MAYBE_UNUSED PiTy pi;
-
-  struct OneOverPiTy
-  {
-    __forceinline operator double( ) const { return double(M_1_PI); }
-    __forceinline operator float ( ) const { return float(M_1_PI); }
-  };
-
-  extern MAYBE_UNUSED OneOverPiTy one_over_pi;
-
-  struct TwoPiTy
-  {
-    __forceinline operator double( ) const { return double(2.0*M_PI); }
-    __forceinline operator float ( ) const { return float(2.0*M_PI); }
-  };
-
-  extern MAYBE_UNUSED TwoPiTy two_pi;
-
-  struct OneOverTwoPiTy
-  {
-    __forceinline operator double( ) const { return double(0.5*M_1_PI); }
-    __forceinline operator float ( ) const { return float(0.5*M_1_PI); }
-  };
-
-  extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
-
-  struct FourPiTy
-  {
-    __forceinline operator double( ) const { return double(4.0*M_PI); } 
-    __forceinline operator float ( ) const { return float(4.0*M_PI); }
-  };
-
-  extern MAYBE_UNUSED FourPiTy four_pi;
-
-  struct OneOverFourPiTy
-  {
-    __forceinline operator double( ) const { return double(0.25*M_1_PI); }
-    __forceinline operator float ( ) const { return float(0.25*M_1_PI); }
-  };
-
-  extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
-
-  struct StepTy {
-  };
-
-  extern MAYBE_UNUSED StepTy step;
-
-  struct ReverseStepTy {
-  };
-
-  extern MAYBE_UNUSED ReverseStepTy reverse_step;
-
-  struct EmptyTy {
-  };
-
-  extern MAYBE_UNUSED EmptyTy empty;
-
-  struct FullTy {
-  };
-
-  extern MAYBE_UNUSED FullTy full;
-
-  struct UndefinedTy {
-  };
-
-  extern MAYBE_UNUSED UndefinedTy undefined;
-    
-#if defined(__aarch64__)
-  extern const uint32x4_t movemask_mask;
-  extern const uint32x4_t vzero;
-  extern const uint32x4_t v0x80000000;
-  extern const uint32x4_t v0x7fffffff;
-  extern const uint32x4_t v000F;
-  extern const uint32x4_t v00F0;
-  extern const uint32x4_t v00FF;
-  extern const uint32x4_t v0F00;
-  extern const uint32x4_t v0F0F;
-  extern const uint32x4_t v0FF0;
-  extern const uint32x4_t v0FFF;
-  extern const uint32x4_t vF000;
-  extern const uint32x4_t vF00F;
-  extern const uint32x4_t vF0F0;
-  extern const uint32x4_t vF0FF;
-  extern const uint32x4_t vFF00;
-  extern const uint32x4_t vFF0F;
-  extern const uint32x4_t vFFF0;
-  extern const uint32x4_t vFFFF;
-  extern const uint8x16_t v0022;
-  extern const uint8x16_t v1133;
-  extern const uint8x16_t v0101;
-  extern const float32x4_t vOne;
-  extern const float32x4_t vmOne;
-  extern const float32x4_t vInf;
-  extern const float32x4_t vmInf;
-#endif
-}
diff --git a/thirdparty/embree-aarch64/common/math/interval.h b/thirdparty/embree-aarch64/common/math/interval.h
deleted file mode 100644
index f06478e881..0000000000
--- a/thirdparty/embree-aarch64/common/math/interval.h
+++ /dev/null
@@ -1,161 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "vec2.h"
-#include "vec3.h"
-#include "bbox.h"
-
-namespace embree
-{
-  template<typename V>
-    struct Interval
-    {
-      V lower, upper;
-      
-      __forceinline Interval() {}
-      __forceinline Interval           ( const Interval& other ) { lower = other.lower; upper = other.upper; }
-      __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; }
-
-      __forceinline Interval(const V& a) : lower(a), upper(a) {}
-      __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {}
-      __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {}
-          
-      /*! tests if box is empty */
-      //__forceinline bool empty() const { return lower > upper; }
-      
-      /*! computes the size of the interval */
-      __forceinline V size() const { return upper - lower; }
-      
-      __forceinline V center() const { return 0.5f*(lower+upper); }
-      
-      __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
-      __forceinline const Interval& extend(const V   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
-      
-      __forceinline friend Interval operator +( const Interval& a, const Interval& b ) {
-        return Interval(a.lower+b.lower,a.upper+b.upper);
-      }
-      
-      __forceinline friend Interval operator -( const Interval& a, const Interval& b ) {
-        return Interval(a.lower-b.upper,a.upper-b.lower);
-      }
-      
-      __forceinline friend Interval operator -( const Interval& a, const V& b ) {
-        return Interval(a.lower-b,a.upper-b);
-      }
-      
-      __forceinline friend Interval operator *( const Interval& a, const Interval& b )
-      {
-        const V ll = a.lower*b.lower;
-        const V lu = a.lower*b.upper;
-        const V ul = a.upper*b.lower;
-        const V uu = a.upper*b.upper;
-        return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu));
-      }
-      
-      __forceinline friend Interval merge( const Interval& a, const Interval& b) {
-        return Interval(min(a.lower,b.lower),max(a.upper,b.upper));
-      }
-      
-      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) {
-        return merge(merge(a,b),c);
-      }
-      
-      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) {
-        return merge(merge(a,b),merge(c,d));
-      }
-      
-      /*! intersect bounding boxes */
-      __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); }
-      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); }
-      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); }       
-      
-      friend embree_ostream operator<<(embree_ostream cout, const Interval& a) {
-        return cout << "[" << a.lower << ", " << a.upper << "]";
-      }
-      
-      ////////////////////////////////////////////////////////////////////////////////
-      /// Constants
-      ////////////////////////////////////////////////////////////////////////////////
-      
-      __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}
-      __forceinline Interval( FullTy  ) : lower(neg_inf), upper(pos_inf) {}
-    };
-
-  __forceinline bool isEmpty(const Interval<float>& v) { 
-    return v.lower > v.upper;
-  }
-
-  __forceinline vboolx isEmpty(const Interval<vfloatx>& v) {
-    return v.lower > v.upper;
-  }
-  
-  /*! subset relation */
-  template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { 
-    return (a.lower > b.lower) && (a.upper < b.upper);
-  }
-
-  template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { 
-    return subset(a.x,b.x) && subset(a.y,b.y);
-  }
-
-  template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
-    return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f ) {
-    return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
-  }
-
-  template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f ) {
-    return Interval<T>(select(s,t.lower,f.lower),select(s,t.upper,f.upper));
-  }
-
-  __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1)
-  {
-    float eps = 1E-4f;
-    bool neg0 = p0.lower < eps; bool pos0 = p0.upper > -eps;
-    bool neg1 = p1.lower < eps; bool pos1 = p1.upper > -eps;
-    return (neg0 && pos1) || (pos0 && neg1) || (neg0 && pos0) || (neg1 && pos1);
-  }
-  
-  typedef Interval<float> Interval1f;
-  typedef Vec2<Interval<float>> Interval2f;
-  typedef Vec3<Interval<float>> Interval3f;
-
-inline void swap(float& a, float& b) { float tmp = a; a = b; b = tmp; }
-
-inline Interval1f shift(const Interval1f& v, float shift) { return Interval1f(v.lower + shift, v.upper + shift); }
-
-#define TWO_PI (2.0*M_PI)
-inline Interval1f sin(Interval1f interval)
-{
-  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
-  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
-  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
-  float sinLower = sin(interval.lower);
-  float sinUpper = sin(interval.upper);
-  if (sinLower > sinUpper) swap(sinLower, sinUpper);
-  if (interval.lower <       M_PI / 2.0 && interval.upper >       M_PI / 2.0) sinUpper =  1.0;
-  if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0;
-  return Interval1f(sinLower, sinUpper);
-}
-
-inline Interval1f cos(Interval1f interval)
-{
-  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
-  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
-  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
-  float cosLower = cos(interval.lower);
-  float cosUpper = cos(interval.upper);
-  if (cosLower > cosUpper) swap(cosLower, cosUpper);
-  if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0;
-  return Interval1f(cosLower, cosUpper);
-}
-#undef TWO_PI
-}
diff --git a/thirdparty/embree-aarch64/common/math/lbbox.h b/thirdparty/embree-aarch64/common/math/lbbox.h
deleted file mode 100644
index 95df4a918d..0000000000
--- a/thirdparty/embree-aarch64/common/math/lbbox.h
+++ /dev/null
@@ -1,289 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "bbox.h"
-#include "range.h"
-
-namespace embree
-{
-  template<typename T>
-    __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt)
-  {
-    const float rcp_dt_size = float(1.0f)/dt.size();
-    const T g0 = lerp(v.first,v.second,-dt.lower*rcp_dt_size);
-    const T g1 = lerp(v.first,v.second,(1.0f-dt.lower)*rcp_dt_size);
-    return std::make_pair(g0,g1);
-  }
-
-  template<typename T>
-  struct LBBox
-  {
-  public:
-    __forceinline LBBox () {}
-
-    template<typename T1>
-    __forceinline LBBox ( const LBBox<T1>& other )
-    : bounds0(other.bounds0), bounds1(other.bounds1) {} 
-
-    __forceinline LBBox& operator= ( const LBBox& other ) { 
-      bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; 
-    }
-
-    __forceinline LBBox (EmptyTy) 
-      : bounds0(EmptyTy()), bounds1(EmptyTy()) {}
-    
-    __forceinline explicit LBBox ( const BBox<T>& bounds) 
-      : bounds0(bounds), bounds1(bounds) { }
-    
-    __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) 
-      : bounds0(bounds0), bounds1(bounds1) { }
-
-    LBBox ( const avector<BBox<T>>& bounds ) 
-    {
-      assert(bounds.size());
-      BBox<T> b0 = bounds.front();
-      BBox<T> b1 = bounds.back();
-      for (size_t i=1; i<bounds.size()-1; i++) {
-        const float f = float(i)/float(bounds.size()-1);
-        const BBox<T> bt = lerp(b0,b1,f);
-        const T dlower = min(bounds[i].lower-bt.lower,T(zero));
-        const T dupper = max(bounds[i].upper-bt.upper,T(zero));
-        b0.lower += dlower; b1.lower += dlower;
-        b0.upper += dupper; b1.upper += dupper;
-      }
-      bounds0 = b0;
-      bounds1 = b1;
-    }
-
-    /*! calculates the linear bounds of a primitive for the specified time range */
-    template<typename BoundsFunc>
-    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments)
-    {
-      const float lower = time_range.lower*numTimeSegments;
-      const float upper = time_range.upper*numTimeSegments;
-      const float ilowerf = floor(lower);
-      const float iupperf = ceil(upper);
-      const int ilower = (int)ilowerf;
-      const int iupper = (int)iupperf;
-
-      const BBox<T> blower0 = bounds(ilower);
-      const BBox<T> bupper1 = bounds(iupper);
-
-      if (iupper-ilower == 1) {
-        bounds0 = lerp(blower0, bupper1, lower-ilowerf);
-        bounds1 = lerp(bupper1, blower0, iupperf-upper);
-        return;
-      }
-
-      const BBox<T> blower1 = bounds(ilower+1);
-      const BBox<T> bupper0 = bounds(iupper-1);
-      BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf);
-      BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper);
-
-      for (int i = ilower+1; i < iupper; i++)
-      {
-        const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size();
-        const BBox<T> bt = lerp(b0, b1, f);
-        const BBox<T> bi = bounds(i);
-        const T dlower = min(bi.lower-bt.lower, T(zero));
-        const T dupper = max(bi.upper-bt.upper, T(zero));
-        b0.lower += dlower; b1.lower += dlower;
-        b0.upper += dupper; b1.upper += dupper;
-      }
-
-      bounds0 = b0;
-      bounds1 = b1;
-    }
-
-    /*! calculates the linear bounds of a primitive for the specified time range */
-    template<typename BoundsFunc>
-    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments)
-    {
-      /* normalize global time_range_in to local geom_time_range */
-      const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(),
-                              (time_range_in.upper-geom_time_range.lower)/geom_time_range.size());
-        
-      const float lower = time_range.lower*geom_time_segments;
-      const float upper = time_range.upper*geom_time_segments;
-      const float ilowerf = floor(lower);
-      const float iupperf = ceil(upper);
-      const float ilowerfc = max(0.0f,ilowerf);
-      const float iupperfc = min(iupperf,geom_time_segments);
-      const int   ilowerc = (int)ilowerfc;
-      const int   iupperc = (int)iupperfc;
-      assert(iupperc-ilowerc > 0);
-
-      /* this larger iteration range guarantees that we process borders of geom_time_range is (partially) inside time_range_in */
-      const int ilower_iter = max(-1,(int)ilowerf);
-      const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1);
-        
-      const BBox<T> blower0 = bounds(ilowerc);
-      const BBox<T> bupper1 = bounds(iupperc);
-      if (iupper_iter-ilower_iter == 1) {
-        bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc));
-        bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper));
-        return;
-      }
-
-      const BBox<T> blower1 = bounds(ilowerc+1);
-      const BBox<T> bupper0 = bounds(iupperc-1);
-      BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc));
-      BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper));
-
-      for (int i = ilower_iter+1; i < iupper_iter; i++)
-      {
-        const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size();
-        const BBox<T> bt = lerp(b0, b1, f);
-        const BBox<T> bi = bounds(i);
-        const T dlower = min(bi.lower-bt.lower, T(zero));
-        const T dupper = max(bi.upper-bt.upper, T(zero));
-        b0.lower += dlower; b1.lower += dlower;
-        b0.upper += dupper; b1.upper += dupper;
-      }
-
-      bounds0 = b0;
-      bounds1 = b1;
-    }
-
-    /*! calculates the linear bounds of a primitive for the specified time range */
-    template<typename BoundsFunc>
-    __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments)
-    {
-      const int ilower = time_range.begin();
-      const int iupper = time_range.end();
-
-      BBox<T> b0 = bounds(ilower);
-      BBox<T> b1 = bounds(iupper);
-
-      if (iupper-ilower == 1)
-      {
-        bounds0 = b0;
-        bounds1 = b1;
-        return;
-      }
-  
-      for (int i = ilower+1; i<iupper; i++)
-      {
-        const float f = float(i - time_range.begin()) / float(time_range.size());
-        const BBox<T> bt = lerp(b0, b1, f);
-        const BBox<T> bi = bounds(i);
-        const T dlower = min(bi.lower-bt.lower, T(zero));
-        const T dupper = max(bi.upper-bt.upper, T(zero));
-        b0.lower += dlower; b1.lower += dlower;
-        b0.upper += dupper; b1.upper += dupper;
-      }
-
-      bounds0 = b0;
-      bounds1 = b1;
-    }
-
-  public:
-
-    __forceinline bool empty() const {
-      return bounds().empty();
-    }
-
-    __forceinline BBox<T> bounds () const {
-      return merge(bounds0,bounds1);
-    }
-
-    __forceinline BBox<T> interpolate( const float t ) const {
-      return lerp(bounds0,bounds1,t);
-    }
-
-    __forceinline LBBox<T> interpolate( const BBox1f& dt ) const {
-      return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper));
-    }
-
-    __forceinline void extend( const LBBox& other ) {
-      bounds0.extend(other.bounds0);
-      bounds1.extend(other.bounds1);
-    }
-
-    __forceinline float expectedHalfArea() const;
-
-    __forceinline float expectedHalfArea(const BBox1f& dt) const {
-      return interpolate(dt).expectedHalfArea();
-    }
-
-    __forceinline float expectedApproxHalfArea() const {
-      return 0.5f*(halfArea(bounds0) + halfArea(bounds1));
-    }
-
-    /* calculates bounds for [0,1] time range from bounds in dt time range */
-    __forceinline LBBox global(const BBox1f& dt) const 
-    {
-      const float rcp_dt_size = 1.0f/dt.size();
-      const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size);
-      const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size);
-      return LBBox(b0,b1);
-    }
-
-    /*! Comparison Operators */
-    //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
-    //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
-    friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
-    friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
-    
-    /*! output operator */
-    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) {
-      return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }";
-    }
-
-  public:
-    BBox<T> bounds0, bounds1;
-  };
-
-  /*! tests if box is finite */
-  template<typename T>
-    __forceinline bool isvalid( const LBBox<T>& v ) {
-    return isvalid(v.bounds0) && isvalid(v.bounds1);
-  }
-
-  template<typename T>
-    __forceinline bool isvalid_non_empty( const LBBox<T>& v ) {
-    return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1);
-  }
-  
-  template<typename T>
-    __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1)
-  {
-    const T da = a1-a0;
-    const T db = b1-b0;
-    return a0*b0+(a0*db+da*b0)*T(0.5f) + da*db*T(1.0f/3.0f);
-  }
-  
-  template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const 
-  {
-    const Vec3fa d0 = bounds0.size();
-    const Vec3fa d1 = bounds1.size();
-    return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z),
-                                   Vec3fa(d1.x,d1.y,d1.z),
-                                   Vec3fa(d0.y,d0.z,d0.x),
-                                   Vec3fa(d1.y,d1.z,d1.x)));
-  }
-
-  template<typename T>
-  __forceinline float expectedApproxHalfArea(const LBBox<T>& box) {
-    return box.expectedApproxHalfArea(); 
-  }
-
-  template<typename T>
-  __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) {
-    return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1));
-  }
-
-   /*! subset relation */
-  template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) {
-    return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1);
-  }
-
-  /*! default template instantiations */
-  typedef LBBox<float> LBBox1f;
-  typedef LBBox<Vec2f> LBBox2f;
-  typedef LBBox<Vec3f> LBBox3f;
-  typedef LBBox<Vec3fa> LBBox3fa;
-  typedef LBBox<Vec3fx> LBBox3fx;
-}
diff --git a/thirdparty/embree-aarch64/common/math/linearspace2.h b/thirdparty/embree-aarch64/common/math/linearspace2.h
deleted file mode 100644
index b9a382962c..0000000000
--- a/thirdparty/embree-aarch64/common/math/linearspace2.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "vec2.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// 2D Linear Transform (2x2 Matrix)
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> struct LinearSpace2
-  {
-    typedef T Vector;
-    typedef typename T::Scalar Scalar;
-
-    /*! default matrix constructor */
-    __forceinline LinearSpace2           ( ) {}
-    __forceinline LinearSpace2           ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; }
-    __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
-
-    template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {}
-
-    /*! matrix construction from column vectors */
-    __forceinline LinearSpace2(const Vector& vx, const Vector& vy)
-      : vx(vx), vy(vy) {}
-
-    /*! matrix construction from row mayor data */
-    __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, 
-                               const Scalar& m10, const Scalar& m11)
-      : vx(m00,m10), vy(m01,m11) {}
-
-    /*! compute the determinant of the matrix */
-    __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; }
-
-    /*! compute adjoint matrix */
-    __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); }
-
-    /*! compute inverse matrix */
-    __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); }
-
-    /*! compute transposed matrix */
-    __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); }
-
-    /*! returns first row of matrix */
-    __forceinline Vector row0() const { return Vector(vx.x,vy.x); }
-
-    /*! returns second row of matrix */
-    __forceinline Vector row1() const { return Vector(vx.y,vy.y); }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {}
-    __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {}
-
-    /*! return matrix for scaling */
-    static __forceinline LinearSpace2 scale(const Vector& s) {
-      return LinearSpace2(s.x,   0,
-                          0  , s.y);
-    }
-
-    /*! return matrix for rotation */
-    static __forceinline LinearSpace2 rotate(const Scalar& r) {
-      Scalar s = sin(r), c = cos(r);
-      return LinearSpace2(c, -s,
-                          s,  c);
-    }
-
-    /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */
-    LinearSpace2 orthogonal() const 
-    {
-      LinearSpace2 m = *this;
-
-      // mirrored?
-      Scalar mirror(one);
-      if (m.det() < Scalar(zero)) {
-        m.vx = -m.vx;
-        mirror = -mirror;
-      }
-
-      // rotation
-      for (int i = 0; i < 99; i++) {
-        const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse());
-        const LinearSpace2 d = m_next - m;
-        m = m_next;
-        // norm^2 of difference small enough?
-        if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8)
-          break;
-      }
-
-      // rotation * mirror_x
-      return LinearSpace2(mirror*m.vx, m.vy);
-    }
-
-  public:
-
-    /*! the column vectors of the matrix */
-    Vector vx,vy;
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); }
-  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); }
-  template<typename T> __forceinline LinearSpace2<T> rcp       ( const LinearSpace2<T>& a ) { return a.inverse(); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); }
-  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); }
-
-  template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
-  template<typename T> __forceinline T               operator*(const LinearSpace2<T>& a, const T              & b) { return b.x*a.vx + b.y*a.vy; }
-  template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
-
-  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); }
-  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); }
-
-  template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; }
-  template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; }
-  template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) {
-    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}";
-  }
-
-  /*! Shortcuts for common linear spaces. */
-  typedef LinearSpace2<Vec2f> LinearSpace2f;
-  typedef LinearSpace2<Vec2fa> LinearSpace2fa;
-}
diff --git a/thirdparty/embree-aarch64/common/math/linearspace3.h b/thirdparty/embree-aarch64/common/math/linearspace3.h
deleted file mode 100644
index 12b5bb776b..0000000000
--- a/thirdparty/embree-aarch64/common/math/linearspace3.h
+++ /dev/null
@@ -1,213 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "vec3.h"
-#include "quaternion.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// 3D Linear Transform (3x3 Matrix)
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> struct LinearSpace3
-  {
-    typedef T Vector;
-    typedef typename T::Scalar Scalar;
-
-    /*! default matrix constructor */
-    __forceinline LinearSpace3           ( ) {}
-    __forceinline LinearSpace3           ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; }
-    __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
-
-    template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {}
-
-    /*! matrix construction from column vectors */
-    __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz)
-      : vx(vx), vy(vy), vz(vz) {}
-
-    /*! construction from quaternion */
-    __forceinline LinearSpace3( const QuaternionT<Scalar>& q )
-      : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
-      , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
-      , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
-
-    /*! matrix construction from row mayor data */
-    __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02,
-                               const Scalar& m10, const Scalar& m11, const Scalar& m12,
-                               const Scalar& m20, const Scalar& m21, const Scalar& m22)
-      : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
-
-    /*! compute the determinant of the matrix */
-    __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); }
-
-    /*! compute adjoint matrix */
-    __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
-
-    /*! compute inverse matrix */
-    __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); }
-
-    /*! compute transposed matrix */
-    __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
-
-    /*! returns first row of matrix */
-    __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); }
-
-    /*! returns second row of matrix */
-    __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); }
-
-    /*! returns third row of matrix */
-    __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {}
-    __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {}
-
-    /*! return matrix for scaling */
-    static __forceinline LinearSpace3 scale(const Vector& s) {
-      return LinearSpace3(s.x,   0,   0,
-                          0  , s.y,   0,
-                          0  ,   0, s.z);
-    }
-
-    /*! return matrix for rotation around arbitrary axis */
-    static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) {
-      Vector u = normalize(_u);
-      Scalar s = sin(r), c = cos(r);
-      return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c,  u.x*u.y*(1-c)-u.z*s,    u.x*u.z*(1-c)+u.y*s,
-                          u.x*u.y*(1-c)+u.z*s,    u.y*u.y+(1-u.y*u.y)*c,  u.y*u.z*(1-c)-u.x*s,
-                          u.x*u.z*(1-c)-u.y*s,    u.y*u.z*(1-c)+u.x*s,    u.z*u.z+(1-u.z*u.z)*c);
-    }
-
-  public:
-
-    /*! the column vectors of the matrix */
-    Vector vx,vy,vz;
-  };
-
-  /*! compute transposed matrix */
-  template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { 
-    vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
-    return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); 
-  }
-
-  template<typename T>
-    __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { 
-    return xfm.transposed();
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); }
-  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); }
-  template<typename T> __forceinline LinearSpace3<T> rcp       ( const LinearSpace3<T>& a ) { return a.inverse(); }
-
-  /* constructs a coordinate frame form a normalized normal */
-  template<typename T> __forceinline LinearSpace3<T> frame(const T& N) 
-  {
-    const T dx0(0,N.z,-N.y);
-    const T dx1(-N.z,0,N.x);
-    const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1));
-    const T dy = normalize(cross(N,dx));
-    return LinearSpace3<T>(dx,dy,N);
-  }
-
-  /* constructs a coordinate frame from a normal and approximate x-direction */
-  template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi)
-  {
-    if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel
-    const T dx = normalize(cross(dxi,N));
-    const T dy = normalize(cross(N,dx));
-    return LinearSpace3<T>(dx,dy,N);
-  }
-  
-  /* clamps linear space to range -1 to +1 */
-  template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) {
-    return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)),
-                           clamp(space.vy,T(-1.0f),T(1.0f)),
-                           clamp(space.vz,T(-1.0f),T(1.0f)));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  // Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); }
-  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); }
-
-  template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
-  template<typename T> __forceinline T               operator*(const LinearSpace3<T>& a, const T              & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); }
-  template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); }
-
-  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); }
-  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); }
-
-  template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; }
-  template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; }
-
-  template<typename T> __forceinline T       xfmPoint (const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
-  template<typename T> __forceinline T       xfmVector(const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); }
-  template<typename T> __forceinline T       xfmNormal(const LinearSpace3<T>& s, const T      & a) { return xfmVector(s.inverse().transposed(),a); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; }
-  template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) {
-    return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz));
-  }
-
-  /*! blending */
-  template<typename T>
-    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) 
-  {
-    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
-                           lerp(l0.vy,l1.vy,t),
-                           lerp(l0.vz,l1.vz,t));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) {
-    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}";
-  }
-
-  /*! Shortcuts for common linear spaces. */
-  typedef LinearSpace3<Vec3f> LinearSpace3f;
-  typedef LinearSpace3<Vec3fa> LinearSpace3fa;
-  typedef LinearSpace3<Vec3fx> LinearSpace3fx;
-  typedef LinearSpace3<Vec3ff> LinearSpace3ff;
-
-  template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>;
-  typedef LinearSpace3<Vec3<vfloat<4>>>  LinearSpace3vf4;
-  typedef LinearSpace3<Vec3<vfloat<8>>>  LinearSpace3vf8;
-  typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16;
-
-  /*! blending */
-  template<typename T, typename S>
-    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0,
-                                       const LinearSpace3<T>& l1,
-                                       const S& t)
-  {
-    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
-                           lerp(l0.vy,l1.vy,t),
-                           lerp(l0.vz,l1.vz,t));
-  }
-
-}
diff --git a/thirdparty/embree-aarch64/common/math/math.h b/thirdparty/embree-aarch64/common/math/math.h
deleted file mode 100644
index 6d54abd44d..0000000000
--- a/thirdparty/embree-aarch64/common/math/math.h
+++ /dev/null
@@ -1,451 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../sys/platform.h"
-#include "../sys/intrinsics.h"
-#include "constants.h"
-#include <cmath>
-
-#if defined(__ARM_NEON)
-#include "SSE2NEON.h"
-#if defined(NEON_AVX2_EMULATION)
-#include "AVX2NEON.h"
-#endif
-#else
-#include <emmintrin.h>
-#include <xmmintrin.h>
-#include <immintrin.h>
-#endif
-
-#if defined(__WIN32__) && !defined(__MINGW32__)
-#if (__MSV_VER <= 1700)
-namespace std
-{
-  __forceinline bool isinf ( const float x ) { return _finite(x) == 0; }
-  __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }
-  __forceinline bool isfinite (const float x) { return _finite(x) != 0; }
-}
-#endif
-#endif
-
-namespace embree
-{
-  __forceinline bool isvalid ( const float& v ) {
-    return (v > -FLT_LARGE) & (v < +FLT_LARGE);
-  }
-
-  __forceinline int cast_f2i(float f) {
-    union { float f; int i; } v; v.f = f; return v.i;
-  }
-
-  __forceinline float cast_i2f(int i) {
-    union { float f; int i; } v; v.i = i; return v.f;
-  }
-
-  __forceinline int   toInt  (const float& a) { return int(a); }
-  __forceinline float toFloat(const int&   a) { return float(a); }
-
-#if defined(__WIN32__) && !defined(__MINGW32__)
-  __forceinline bool finite ( const float x ) { return _finite(x) != 0; }
-#endif
-
-  __forceinline float sign ( const float x ) { return x<0?-1.0f:1.0f; }
-  __forceinline float sqr  ( const float x ) { return x*x; }
-
-  __forceinline float rcp  ( const float x )
-  {
-#if defined(__aarch64__)
-      // Move scalar to vector register and do rcp.
-      __m128 a;
-      a[0] = x;
-      float32x4_t reciprocal = vrecpeq_f32(a);
-      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
-      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
-      return reciprocal[0];
-#else
-
-    const __m128 a = _mm_set_ss(x);
-
-#if defined(__AVX512VL__)
-    const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);
-#else
-    const __m128 r = _mm_rcp_ss(a);
-#endif
-
-#if defined(__AVX2__)
-    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f))));
-#else
-    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
-#endif
-
-#endif  //defined(__aarch64__)
-  }
-
-  __forceinline float signmsk ( const float x ) {
-#if defined(__aarch64__)
-      // FP and Neon shares same vector register in arm64
-      __m128 a;
-      __m128i b;
-      a[0] = x;
-      b[0] = 0x80000000;
-      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
-      return a[0];
-#else
-    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-#endif
-  }
-  __forceinline float xorf( const float x, const float y ) {
-#if defined(__aarch64__)
-      // FP and Neon shares same vector register in arm64
-      __m128 a;
-      __m128 b;
-      a[0] = x;
-      b[0] = y;
-      a = _mm_xor_ps(a, b);
-      return a[0];
-#else
-    return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
-#endif
-  }
-  __forceinline float andf( const float x, const unsigned y ) {
-#if defined(__aarch64__) 
-      // FP and Neon shares same vector register in arm64
-      __m128 a;
-      __m128i b;
-      a[0] = x;
-      b[0] = y;
-      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
-      return a[0];
-#else
-    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
-#endif
-  }
-  __forceinline float rsqrt( const float x )
-  {
-#if defined(__aarch64__)
-      // FP and Neon shares same vector register in arm64
-      __m128 a;
-      a[0] = x;
-      __m128 value = _mm_rsqrt_ps(a);
-      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
-      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
-      return value[0];
-#else
-
-    const __m128 a = _mm_set_ss(x);
-#if defined(__AVX512VL__)
-    const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
-#else
-    const __m128 r = _mm_rsqrt_ss(a);
-#endif
-    const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
-                                _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
-    return _mm_cvtss_f32(c);
-#endif
-  }
-
-#if defined(__WIN32__) && (__MSC_VER <= 1700) && !defined(__MINGW32__)
-  __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); }
-  __forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
-  __forceinline int roundf(float f) { return (int)(f + 0.5f); }
-#else
-  __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); }
-  __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }
-#endif
-
-  __forceinline float abs  ( const float x ) { return ::fabsf(x); }
-  __forceinline float acos ( const float x ) { return ::acosf (x); }
-  __forceinline float asin ( const float x ) { return ::asinf (x); }
-  __forceinline float atan ( const float x ) { return ::atanf (x); }
-  __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }
-  __forceinline float cos  ( const float x ) { return ::cosf  (x); }
-  __forceinline float cosh ( const float x ) { return ::coshf (x); }
-  __forceinline float exp  ( const float x ) { return ::expf  (x); }
-  __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }
-  __forceinline float log  ( const float x ) { return ::logf  (x); }
-  __forceinline float log10( const float x ) { return ::log10f(x); }
-  __forceinline float pow  ( const float x, const float y ) { return ::powf  (x, y); }
-  __forceinline float sin  ( const float x ) { return ::sinf  (x); }
-  __forceinline float sinh ( const float x ) { return ::sinhf (x); }
-  __forceinline float sqrt ( const float x ) { return ::sqrtf (x); }
-  __forceinline float tan  ( const float x ) { return ::tanf  (x); }
-  __forceinline float tanh ( const float x ) { return ::tanhf (x); }
-  __forceinline float floor( const float x ) { return ::floorf (x); }
-  __forceinline float ceil ( const float x ) { return ::ceilf (x); }
-  __forceinline float frac ( const float x ) { return x-floor(x); }
-
-  __forceinline double abs  ( const double x ) { return ::fabs(x); }
-  __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; }
-  __forceinline double acos ( const double x ) { return ::acos (x); }
-  __forceinline double asin ( const double x ) { return ::asin (x); }
-  __forceinline double atan ( const double x ) { return ::atan (x); }
-  __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
-  __forceinline double cos  ( const double x ) { return ::cos  (x); }
-  __forceinline double cosh ( const double x ) { return ::cosh (x); }
-  __forceinline double exp  ( const double x ) { return ::exp  (x); }
-  __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
-  __forceinline double log  ( const double x ) { return ::log  (x); }
-  __forceinline double log10( const double x ) { return ::log10(x); }
-  __forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); }
-  __forceinline double rcp  ( const double x ) { return 1.0/x; }
-  __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); }
-  __forceinline double sin  ( const double x ) { return ::sin  (x); }
-  __forceinline double sinh ( const double x ) { return ::sinh (x); }
-  __forceinline double sqr  ( const double x ) { return x*x; }
-  __forceinline double sqrt ( const double x ) { return ::sqrt (x); }
-  __forceinline double tan  ( const double x ) { return ::tan  (x); }
-  __forceinline double tanh ( const double x ) { return ::tanh (x); }
-  __forceinline double floor( const double x ) { return ::floor (x); }
-  __forceinline double ceil ( const double x ) { return ::ceil (x); }
-
-#if defined(__aarch64__) 
-    __forceinline float mini(float a, float b) {
-        // FP and Neon shares same vector register in arm64
-        __m128 x;
-        __m128 y;
-        x[0] = a;
-        y[0] = b;
-        x = _mm_min_ps(x, y);
-        return x[0];
-    }
-#elif defined(__SSE4_1__)
-  __forceinline float mini(float a, float b) {
-    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
-    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
-    const __m128i ci = _mm_min_epi32(ai,bi);
-    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
-  }
-#endif
-
-#if defined(__aarch64__) 
-    __forceinline float maxi(float a, float b) {
-        // FP and Neon shares same vector register in arm64
-        __m128 x;
-        __m128 y;
-        x[0] = a;
-        y[0] = b;
-        x = _mm_max_ps(x, y);
-        return x[0];
-    }
-#elif defined(__SSE4_1__)
-  __forceinline float maxi(float a, float b) {
-    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
-    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
-    const __m128i ci = _mm_max_epi32(ai,bi);
-    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
-  }
-#endif
-
-  template<typename T>
-    __forceinline T twice(const T& a) { return a+a; }
-
-  __forceinline      int min(int      a, int      b) { return a<b ? a:b; }
-  __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }
-  __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; }
-  __forceinline    float min(float    a, float    b) { return a<b ? a:b; }
-  __forceinline   double min(double   a, double   b) { return a<b ? a:b; }
-#if defined(__X86_64__) || defined(__aarch64__)
-  __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; }
-#endif
-
-  template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); }
-  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
-  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
-
-  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); }
-  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
-  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
-
-  __forceinline      int max(int      a, int      b) { return a<b ? b:a; }
-  __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }
-  __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; }
-  __forceinline    float max(float    a, float    b) { return a<b ? b:a; }
-  __forceinline   double max(double   a, double   b) { return a<b ? b:a; }
-#if defined(__X86_64__) || defined(__aarch64__)
-  __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; }
-#endif
-
-  template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); }
-  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
-  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
-
-  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); }
-  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
-  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
-
-#if defined(__MACOSX__)
-  __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; }
-  __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; }
-#endif
-
-#if defined(__MACOSX__) && !defined(__INTEL_COMPILER)
-  __forceinline void sincosf(float x, float *sin, float *cos) {
-    __sincosf(x,sin,cos);
-  }
-#endif
-
-#if defined(__WIN32__) || defined(__FreeBSD__)
-  __forceinline void sincosf(float x, float *s, float *c) {
-    *s = sinf(x); *c = cosf(x);
-  }
-#endif
-
-  template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); }
-  template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); }
-
-  template<typename T> __forceinline T  deg2rad ( const T& x )  { return x * T(1.74532925199432957692e-2f); }
-  template<typename T> __forceinline T  rad2deg ( const T& x )  { return x * T(5.72957795130823208768e1f); }
-  template<typename T> __forceinline T  sin2cos ( const T& x )  { return sqrt(max(T(zero),T(one)-x*x)); }
-  template<typename T> __forceinline T  cos2sin ( const T& x )  { return sin2cos(x); }
-
-#if defined(__AVX2__)
-  __forceinline float madd  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
-  __forceinline float msub  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
-  __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
-  __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); }
-#elif defined (__aarch64__) && defined(__clang__)
-#pragma clang fp contract(fast)
-
-
-__forceinline float madd  ( const float a, const float b, const float c) { return a*b + c; }
-__forceinline float msub  ( const float a, const float b, const float c) { return a*b - c; }
-__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
-__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
-
-#pragma clang fp contract(on)
-#else
-  __forceinline float madd  ( const float a, const float b, const float c) { return a*b+c; }
-  __forceinline float msub  ( const float a, const float b, const float c) { return a*b-c; }
-  __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}
-  __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }
-#endif
-
-  /*! random functions */
-  template<typename T> T random() { return T(0); }
-#if defined(_WIN32)
-  template<> __forceinline int      random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); }
-  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }
-#else
-  template<> __forceinline int      random() { return int(rand()); }
-  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); }
-#endif
-  template<> __forceinline float  random() { return rand()/float(RAND_MAX); }
-  template<> __forceinline double random() { return rand()/double(RAND_MAX); }
-
-#if _WIN32
-  __forceinline double drand48() {
-    return double(rand())/double(RAND_MAX);
-  }
-
-  __forceinline void srand48(long seed) {
-    return srand(seed);
-  }
-#endif
-
-  /*! selects */
-  __forceinline bool  select(bool s, bool  t , bool f) { return s ? t : f; }
-  __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; }
-  __forceinline float select(bool s, float t, float f) { return s ? t : f; }
-
-  __forceinline bool all(bool s) { return s; }
-
-  __forceinline float lerp(const float v0, const float v1, const float t) {
-    return madd(1.0f-t,v0,t*v1);
-  }
-
-  template<typename T>
-    __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) {
-    return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));
-  }
-
-  /*! exchange */
-  template<typename T> __forceinline void xchg ( T& a, T& b ) { const T tmp = a; a = b; b = tmp; }
-
-
-  template<typename T> __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) {
-#if 1//!defined(__aarch64__)
-      return msub(a,b,c*d);
-#else
-      return nmadd(c,d,a*b);
-#endif
-  }
-
-  /*! bit reverse operation */
-  template<class T>
-    __forceinline T bitReverse(const T& vin)
-  {
-    T v = vin;
-    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-    v = ( v >> 16             ) | ( v               << 16);
-    return v;
-  }
-
-  /*! bit interleave operation */
-  template<class T>
-    __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
-  {
-	  T x = xin, y = yin, z = zin;
-    x = (x | (x << 16)) & 0x030000FF;
-    x = (x | (x <<  8)) & 0x0300F00F;
-    x = (x | (x <<  4)) & 0x030C30C3;
-    x = (x | (x <<  2)) & 0x09249249;
-
-    y = (y | (y << 16)) & 0x030000FF;
-    y = (y | (y <<  8)) & 0x0300F00F;
-    y = (y | (y <<  4)) & 0x030C30C3;
-    y = (y | (y <<  2)) & 0x09249249;
-
-    z = (z | (z << 16)) & 0x030000FF;
-    z = (z | (z <<  8)) & 0x0300F00F;
-    z = (z | (z <<  4)) & 0x030C30C3;
-    z = (z | (z <<  2)) & 0x09249249;
-
-    return x | (y << 1) | (z << 2);
-  }
-
-#if defined(__AVX2__) && !defined(__aarch64__)
-
-  template<>
-    __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
-  {
-    const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );
-    const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);
-    const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);
-    return xx | yy | zz;
-  }
-
-#endif
-
-  /*! bit interleave operation for 64bit data types*/
-  template<class T>
-    __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
-    T x = xin & 0x1fffff;
-    T y = yin & 0x1fffff;
-    T z = zin & 0x1fffff;
-
-    x = (x | x << 32) & 0x1f00000000ffff;
-    x = (x | x << 16) & 0x1f0000ff0000ff;
-    x = (x | x << 8) & 0x100f00f00f00f00f;
-    x = (x | x << 4) & 0x10c30c30c30c30c3;
-    x = (x | x << 2) & 0x1249249249249249;
-
-    y = (y | y << 32) & 0x1f00000000ffff;
-    y = (y | y << 16) & 0x1f0000ff0000ff;
-    y = (y | y << 8) & 0x100f00f00f00f00f;
-    y = (y | y << 4) & 0x10c30c30c30c30c3;
-    y = (y | y << 2) & 0x1249249249249249;
-
-    z = (z | z << 32) & 0x1f00000000ffff;
-    z = (z | z << 16) & 0x1f0000ff0000ff;
-    z = (z | z << 8) & 0x100f00f00f00f00f;
-    z = (z | z << 4) & 0x10c30c30c30c30c3;
-    z = (z | z << 2) & 0x1249249249249249;
-
-    return x | (y << 1) | (z << 2);
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/math/obbox.h b/thirdparty/embree-aarch64/common/math/obbox.h
deleted file mode 100644
index 032b56904e..0000000000
--- a/thirdparty/embree-aarch64/common/math/obbox.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "bbox.h"
-#include "linearspace3.h"
-
-namespace embree
-{
-  /*! Oriented bounding box */
-  template<typename T>
-    struct OBBox 
-  {
-  public:
-    
-    __forceinline OBBox () {}
-    
-    __forceinline OBBox (EmptyTy) 
-      : space(one), bounds(empty) {}
-    
-    __forceinline OBBox (const BBox<T>& bounds) 
-      : space(one), bounds(bounds) {}
-      
-    __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) 
-      : space(space), bounds(bounds) {}
-    
-    friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) {
-      return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}";
-    }
-    
-  public:
-    LinearSpace3<T> space; //!< orthonormal transformation
-    BBox<T> bounds;        //!< bounds in transformed space
-  };
-
-  typedef OBBox<Vec3f> OBBox3f;
-  typedef OBBox<Vec3fa> OBBox3fa;
-}
diff --git a/thirdparty/embree-aarch64/common/math/quaternion.h b/thirdparty/embree-aarch64/common/math/quaternion.h
deleted file mode 100644
index 20c69bc62f..0000000000
--- a/thirdparty/embree-aarch64/common/math/quaternion.h
+++ /dev/null
@@ -1,254 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "vec3.h"
-#include "vec4.h"
-
-#include "transcendental.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////
-  // Quaternion Struct
-  ////////////////////////////////////////////////////////////////
-
-  template<typename T>
-  struct QuaternionT
-  {
-    typedef Vec3<T> Vector;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline QuaternionT           ()                     { }
-    __forceinline QuaternionT           ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; }
-    __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; }
-
-    __forceinline          QuaternionT( const T& r       ) : r(r), i(zero), j(zero), k(zero) {}
-    __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {}
-    __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {}
-    __forceinline          QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {}
-    __forceinline          QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {}
-
-    __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz );
-    __inline QuaternionT( const T& yaw, const T& pitch, const T& roll );
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {}
-    __forceinline QuaternionT( OneTy  ) : r( one), i(zero), j(zero), k(zero) {}
-
-    /*! return quaternion for rotation around arbitrary axis */
-    static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) {
-      return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u));
-    }
-
-    /*! returns the rotation axis of the quaternion as a vector */
-    __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); }
-
-  public:
-    T r, i, j, k;
-  };
-
-  template<typename T> __forceinline QuaternionT<T> operator *( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); }
-  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); }
-
-  ////////////////////////////////////////////////////////////////
-  // Unary Operators
-  ////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); }
-  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); }
-  template<typename T> __forceinline QuaternionT<T> conj      ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); }
-  template<typename T> __forceinline T              abs       ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
-  template<typename T> __forceinline QuaternionT<T> rcp       ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
-  template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
-
-  // evaluates a*q-r
-  template<typename T> __forceinline QuaternionT<T>
-  msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
-  {
-    return QuaternionT<T>(msub(a, q.r, p.r),
-                          msub(a, q.i, p.i),
-                          msub(a, q.j, p.j),
-                          msub(a, q.k, p.k));
-  }
-  // evaluates a*q-r
-  template<typename T> __forceinline QuaternionT<T>
-  madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
-  {
-    return QuaternionT<T>(madd(a, q.r, p.r),
-                          madd(a, q.i, p.i),
-                          madd(a, q.j, p.j),
-                          madd(a, q.k, p.k));
-  }
-
-  ////////////////////////////////////////////////////////////////
-  // Binary Operators
-  ////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline QuaternionT<T> operator +( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r,  b.i,  b.j,  b.k); }
-  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); }
-  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); }
-  template<typename T> __forceinline QuaternionT<T> operator -( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); }
-  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); }
-  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); }
-
-  template<typename T> __forceinline Vec3<T>       operator *( const QuaternionT<T>& a, const Vec3<T>      & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
-  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) {
-    return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
-                          a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
-                          a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
-                          a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r);
-  }
-  template<typename T> __forceinline QuaternionT<T> operator /( const T             & a, const QuaternionT<T>& b ) { return a*rcp(b); }
-  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T             & b ) { return a*rcp(b); }
-  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); }
-
-  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T             & b ) { return a = a+b; }
-  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; }
-  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T             & b ) { return a = a-b; }
-  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; }
-  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T             & b ) { return a = a*b; }
-  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; }
-  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T             & b ) { return a = a*rcp(b); }
-  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); }
-
-  template<typename T, typename M> __forceinline QuaternionT<T>
-  select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p)
-  {
-    return QuaternionT<T>(select(m, q.r, p.r),
-                          select(m, q.i, p.i),
-                          select(m, q.j, p.j),
-                          select(m, q.k, p.k));
-  }
-
-
-  template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
-  template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
-  template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
-
-  template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; }
-  template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; }
-
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Orientation Functions
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz )
-  {
-    if ( vx.x + vy.y + vz.z >= T(zero) )
-    {
-      const T t = T(one) + (vx.x + vy.y + vz.z);
-      const T s = rsqrt(t)*T(0.5f);
-      r = t*s;
-      i = (vy.z - vz.y)*s;
-      j = (vz.x - vx.z)*s;
-      k = (vx.y - vy.x)*s;
-    }
-    else if ( vx.x >= max(vy.y, vz.z) )
-    {
-      const T t = (T(one) + vx.x) - (vy.y + vz.z);
-      const T s = rsqrt(t)*T(0.5f);
-      r = (vy.z - vz.y)*s;
-      i = t*s;
-      j = (vx.y + vy.x)*s;
-      k = (vz.x + vx.z)*s;
-    }
-    else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) )
-    {
-      const T t = (T(one) + vy.y) - (vz.z + vx.x);
-      const T s = rsqrt(t)*T(0.5f);
-      r = (vz.x - vx.z)*s;
-      i = (vx.y + vy.x)*s;
-      j = t*s;
-      k = (vy.z + vz.y)*s;
-    }
-    else //if ( vz.z >= max(vy.y, vx.x) )
-    {
-      const T t = (T(one) + vz.z) - (vx.x + vy.y);
-      const T s = rsqrt(t)*T(0.5f);
-      r = (vx.y - vy.x)*s;
-      i = (vz.x + vx.z)*s;
-      j = (vy.z + vz.y)*s;
-      k = t*s;
-    }
-  }
-
-  template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll )
-  {
-    const T cya = cos(yaw  *T(0.5f));
-    const T cpi = cos(pitch*T(0.5f));
-    const T cro = cos(roll *T(0.5f));
-    const T sya = sin(yaw  *T(0.5f));
-    const T spi = sin(pitch*T(0.5f));
-    const T sro = sin(roll *T(0.5f));
-    r = cro*cya*cpi + sro*sya*spi;
-    i = cro*cya*spi + sro*sya*cpi;
-    j = cro*sya*cpi - sro*cya*spi;
-    k = sro*cya*cpi - cro*sya*spi;
-  }
-
-  //////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  //////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) {
-    return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }";
-  }
-
-  /*! default template instantiations */
-  typedef QuaternionT<float>  Quaternion3f;
-  typedef QuaternionT<double> Quaternion3d;
-
-  template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>;
-  typedef QuaternionT<vfloat<4>>  Quaternion3vf4;
-  typedef QuaternionT<vfloat<8>>  Quaternion3vf8;
-  typedef QuaternionT<vfloat<16>> Quaternion3vf16;
-
-  //////////////////////////////////////////////////////////////////////////////
-  /// Interpolation
-  //////////////////////////////////////////////////////////////////////////////
-  template<typename T>
-  __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0,
-                                   const QuaternionT<T>& q1,
-                                   const T& factor)
-  {
-    QuaternionT<T> q;
-    q.r = lerp(q0.r, q1.r, factor);
-    q.i = lerp(q0.i, q1.i, factor);
-    q.j = lerp(q0.j, q1.j, factor);
-    q.k = lerp(q0.k, q1.k, factor);
-    return q;
-  }
-
-  template<typename T>
-  __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0,
-                                     const QuaternionT<T>& q1_,
-                                     const T& t)
-  {
-    T cosTheta = dot(q0, q1_);
-    QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
-    cosTheta          = select(cosTheta < 0.f, -cosTheta, cosTheta);
-    if (unlikely(all(cosTheta > 0.9995f))) {
-      return normalize(lerp(q0, q1, t));
-    }
-    const T phi = t * fastapprox::acos(cosTheta);
-    T sinPhi, cosPhi;
-    fastapprox::sincos(phi, sinPhi, cosPhi);
-    QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
-    return msub(cosPhi, q0, qperp);
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/math/range.h b/thirdparty/embree-aarch64/common/math/range.h
deleted file mode 100644
index 762d9cd9ea..0000000000
--- a/thirdparty/embree-aarch64/common/math/range.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../sys/platform.h"
-#include "../math/math.h"
-
-namespace embree
-{
-  template<typename Ty>
-    struct range 
-    {
-      __forceinline range() {}
-
-      __forceinline range(const Ty& begin)
-        : _begin(begin), _end(begin+1) {}
-      
-      __forceinline range(const Ty& begin, const Ty& end)
-        : _begin(begin), _end(end) {}
- 
-      __forceinline range(const range& other)
-        : _begin(other._begin), _end(other._end) {}
-
-      template<typename T1>
-      __forceinline range(const range<T1>& other)
-        : _begin(Ty(other._begin)), _end(Ty(other._end)) {}
-
-      template<typename T1>
-      __forceinline range& operator =(const range<T1>& other) {
-        _begin = other._begin;
-        _end = other._end;
-        return *this;
-      }
-      
-      __forceinline Ty begin() const {
-        return _begin;
-      }
-      
-      __forceinline Ty end() const {
-	return _end;
-      }
-
-      __forceinline range intersect(const range& r) const {
-        return range (max(_begin,r._begin),min(_end,r._end));
-      }
-
-      __forceinline Ty size() const {
-        return _end - _begin;
-      }
-
-      __forceinline bool empty() const { 
-        return _end <= _begin; 
-      }
-
-      __forceinline Ty center() const {
-        return (_begin + _end)/2;
-      }
-
-      __forceinline std::pair<range,range> split() const 
-      {
-        const Ty _center = center();
-        return std::make_pair(range(_begin,_center),range(_center,_end));
-      }
-
-      __forceinline void split(range& left_o, range& right_o) const 
-      {
-        const Ty _center = center();
-        left_o = range(_begin,_center);
-        right_o = range(_center,_end);
-      }
-
-      __forceinline friend bool operator< (const range& r0, const range& r1) {
-        return r0.size() < r1.size();
-      }
-	
-      friend embree_ostream operator<<(embree_ostream cout, const range& r) {
-        return cout << "range [" << r.begin() << ", " << r.end() << "]";
-      }
-      
-      Ty _begin, _end;
-    };
-
-  template<typename Ty>
-    range<Ty> make_range(const Ty& begin, const Ty& end) {
-    return range<Ty>(begin,end);
-  }
-
-  template<typename Ty>
-    struct extended_range : public range<Ty>
-    {
-      __forceinline extended_range () {}
-
-      __forceinline extended_range (const Ty& begin)
-        : range<Ty>(begin), _ext_end(begin+1) {}
-      
-      __forceinline extended_range (const Ty& begin, const Ty& end)
-        : range<Ty>(begin,end), _ext_end(end) {}
-
-      __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end)
-        : range<Ty>(begin,end), _ext_end(ext_end) {}
-      
-      __forceinline Ty ext_end() const {
-	return _ext_end;
-      }
-
-      __forceinline Ty ext_size() const {
-        return _ext_end - range<Ty>::_begin;
-      }
-
-      __forceinline Ty ext_range_size() const {
-        return _ext_end - range<Ty>::_end;
-      }
-
-      __forceinline bool has_ext_range() const {
-        assert(_ext_end >= range<Ty>::_end);
-        return (_ext_end - range<Ty>::_end) > 0;
-      }
-
-      __forceinline void set_ext_range(const size_t ext_end){
-        assert(ext_end >= range<Ty>::_end);
-        _ext_end = ext_end;
-      }
-
-      __forceinline void move_right(const size_t plus){
-        range<Ty>::_begin   += plus;
-        range<Ty>::_end     += plus;
-        _ext_end += plus;
-      }
-
-      friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) {
-        return cout << "extended_range [" << r.begin() << ", " << r.end() <<  " (" << r.ext_end() << ")]";
-      }
-      
-      Ty _ext_end;
-    };
-}
diff --git a/thirdparty/embree-aarch64/common/math/transcendental.h b/thirdparty/embree-aarch64/common/math/transcendental.h
deleted file mode 100644
index 6855d82b53..0000000000
--- a/thirdparty/embree-aarch64/common/math/transcendental.h
+++ /dev/null
@@ -1,525 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-// Transcendental functions from "ispc": https://github.com/ispc/ispc/
-// Most of the transcendental implementations in ispc code come from
-// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
-
-#include "../simd/simd.h"
-
-namespace embree
-{
-
-namespace fastapprox
-{
-
-template <typename T>
-__forceinline T sin(const T &v)
-{
-  static const float piOverTwoVec = 1.57079637050628662109375;
-  static const float twoOverPiVec = 0.636619746685028076171875;
-  auto scaled = v * twoOverPiVec;
-  auto kReal = floor(scaled);
-  auto k = toInt(kReal);
-
-  // Reduced range version of x
-  auto x = v - kReal * piOverTwoVec;
-  auto kMod4 = k & 3;
-  auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
-  auto flipSign = (kMod4 > 1);
-
-  // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
-  // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
-  static const float sinC2  = -0.16666667163372039794921875;
-  static const float sinC4  = +8.333347737789154052734375e-3;
-  static const float sinC6  = -1.9842604524455964565277099609375e-4;
-  static const float sinC8  = +2.760012648650445044040679931640625e-6;
-  static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
-
-  static const float cosC2  = -0.5;
-  static const float cosC4  = +4.166664183139801025390625e-2;
-  static const float cosC6  = -1.388833043165504932403564453125e-3;
-  static const float cosC8  = +2.47562347794882953166961669921875e-5;
-  static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
-
-  auto outside = select(sinUseCos, 1., x);
-  auto c2  = select(sinUseCos, T(cosC2),  T(sinC2));
-  auto c4  = select(sinUseCos, T(cosC4),  T(sinC4));
-  auto c6  = select(sinUseCos, T(cosC6),  T(sinC6));
-  auto c8  = select(sinUseCos, T(cosC8),  T(sinC8));
-  auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
-
-  auto x2 = x * x;
-  auto formula = x2 * c10 + c8;
-  formula = x2 * formula + c6;
-  formula = x2 * formula + c4;
-  formula = x2 * formula + c2;
-  formula = x2 * formula + 1.;
-  formula *= outside;
-
-  formula = select(flipSign, -formula, formula);
-  return formula;
-}
-
-template <typename T>
-__forceinline T cos(const T &v)
-{
-  static const float piOverTwoVec = 1.57079637050628662109375;
-  static const float twoOverPiVec = 0.636619746685028076171875;
-  auto scaled = v * twoOverPiVec;
-  auto kReal = floor(scaled);
-  auto k = toInt(kReal);
-
-  // Reduced range version of x
-  auto x = v - kReal * piOverTwoVec;
-
-  auto kMod4 = k & 3;
-  auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
-  auto flipSign = (kMod4 == 1 | kMod4 == 2);
-
-  const float sinC2  = -0.16666667163372039794921875;
-  const float sinC4  = +8.333347737789154052734375e-3;
-  const float sinC6  = -1.9842604524455964565277099609375e-4;
-  const float sinC8  = +2.760012648650445044040679931640625e-6;
-  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
-
-  const float cosC2  = -0.5;
-  const float cosC4  = +4.166664183139801025390625e-2;
-  const float cosC6  = -1.388833043165504932403564453125e-3;
-  const float cosC8  = +2.47562347794882953166961669921875e-5;
-  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
-
-  auto outside = select(cosUseCos, 1., x);
-  auto c2  = select(cosUseCos, T(cosC2),  T(sinC2));
-  auto c4  = select(cosUseCos, T(cosC4),  T(sinC4));
-  auto c6  = select(cosUseCos, T(cosC6),  T(sinC6));
-  auto c8  = select(cosUseCos, T(cosC8),  T(sinC8));
-  auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
-
-  auto x2 = x * x;
-  auto formula = x2 * c10 + c8;
-  formula = x2 * formula + c6;
-  formula = x2 * formula + c4;
-  formula = x2 * formula + c2;
-  formula = x2 * formula + 1.;
-  formula *= outside;
-
-  formula = select(flipSign, -formula, formula);
-  return formula;
-}
-
-template <typename T>
-__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
-{
-  const float piOverTwoVec = 1.57079637050628662109375;
-  const float twoOverPiVec = 0.636619746685028076171875;
-  auto scaled = v * twoOverPiVec;
-  auto kReal = floor(scaled);
-  auto k = toInt(kReal);
-
-  // Reduced range version of x
-  auto x = v - kReal * piOverTwoVec;
-  auto kMod4 = k & 3;
-  auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
-  auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
-  auto sinFlipSign = (kMod4 > 1);
-  auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
-
-  const float oneVec = +1.;
-  const float sinC2  = -0.16666667163372039794921875;
-  const float sinC4  = +8.333347737789154052734375e-3;
-  const float sinC6  = -1.9842604524455964565277099609375e-4;
-  const float sinC8  = +2.760012648650445044040679931640625e-6;
-  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
-
-  const float cosC2  = -0.5;
-  const float cosC4  = +4.166664183139801025390625e-2;
-  const float cosC6  = -1.388833043165504932403564453125e-3;
-  const float cosC8  = +2.47562347794882953166961669921875e-5;
-  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
-
-  auto x2 = x * x;
-
-  auto sinFormula = x2 * sinC10 + sinC8;
-  auto cosFormula = x2 * cosC10 + cosC8;
-  sinFormula = x2 * sinFormula + sinC6;
-  cosFormula = x2 * cosFormula + cosC6;
-
-  sinFormula = x2 * sinFormula + sinC4;
-  cosFormula = x2 * cosFormula + cosC4;
-
-  sinFormula = x2 * sinFormula + sinC2;
-  cosFormula = x2 * cosFormula + cosC2;
-
-  sinFormula = x2 * sinFormula + oneVec;
-  cosFormula = x2 * cosFormula + oneVec;
-
-  sinFormula *= x;
-
-  sinResult = select(sinUseCos, cosFormula, sinFormula);
-  cosResult = select(cosUseCos, cosFormula, sinFormula);
-
-  sinResult = select(sinFlipSign, -sinResult, sinResult);
-  cosResult = select(cosFlipSign, -cosResult, cosResult);
-}
-
-template <typename T>
-__forceinline T tan(const T &v)
-{
-  const float piOverFourVec = 0.785398185253143310546875;
-  const float fourOverPiVec = 1.27323949337005615234375;
-
-  auto xLt0 = v < 0.;
-  auto y = select(xLt0, -v, v);
-  auto scaled = y * fourOverPiVec;
-
-  auto kReal = floor(scaled);
-  auto k = toInt(kReal);
-
-  auto x = y - kReal * piOverFourVec;
-
-  // If k & 1, x -= Pi/4
-  auto needOffset = (k & 1) != 0;
-  x = select(needOffset, x - piOverFourVec, x);
-
-  // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
-  auto kMod4 = k & 3;
-  auto useCotan = (kMod4 == 1) | (kMod4 == 2);
-
-  const float oneVec = 1.0;
-
-  const float tanC2  = +0.33333075046539306640625;
-  const float tanC4  = +0.13339905440807342529296875;
-  const float tanC6  = +5.3348250687122344970703125e-2;
-  const float tanC8  = +2.46033705770969390869140625e-2;
-  const float tanC10 = +2.892402000725269317626953125e-3;
-  const float tanC12 = +9.500005282461643218994140625e-3;
-
-  const float cotC2  = -0.3333333432674407958984375;
-  const float cotC4  = -2.222204394638538360595703125e-2;
-  const float cotC6  = -2.11752182804048061370849609375e-3;
-  const float cotC8  = -2.0846328698098659515380859375e-4;
-  const float cotC10 = -2.548247357481159269809722900390625e-5;
-  const float cotC12 = -3.5257363606433500535786151885986328125e-7;
-
-  auto x2 = x * x;
-  T z;
-  if (any(useCotan))
-  {
-    auto cotVal = x2 * cotC12 + cotC10;
-    cotVal = x2 * cotVal + cotC8;
-    cotVal = x2 * cotVal + cotC6;
-    cotVal = x2 * cotVal + cotC4;
-    cotVal = x2 * cotVal + cotC2;
-    cotVal = x2 * cotVal + oneVec;
-    // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
-    cotVal /= -x;
-    z = cotVal;
-  }
-  auto useTan = !useCotan;
-  if (any(useTan))
-  {
-    auto tanVal = x2 * tanC12 + tanC10;
-    tanVal = x2 * tanVal + tanC8;
-    tanVal = x2 * tanVal + tanC6;
-    tanVal = x2 * tanVal + tanC4;
-    tanVal = x2 * tanVal + tanC2;
-    tanVal = x2 * tanVal + oneVec;
-    // Equation was for tan(x)/x
-    tanVal *= x;
-    z = select(useTan, tanVal, z);
-  }
-  return select(xLt0, -z, z);
-}
-
-template <typename T>
-__forceinline T asin(const T &x0)
-{
-  auto isneg = (x0 < 0.f);
-  auto x = abs(x0);
-  auto isnan = (x > 1.f);
-
-  // sollya
-  // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
-  //           [1e-20;.9999999999999999]);
-  // avg error: 1.1105439e-06, max error 1.3187528e-06
-  auto v = 1.57079517841339111328125f +
-           x * (-0.21450997889041900634765625f +
-                x * (8.78556668758392333984375e-2f +
-                     x * (-4.489909112453460693359375e-2f +
-                          x * (1.928029954433441162109375e-2f +
-                               x * (-4.3095736764371395111083984375e-3f)))));
-
-  v *= -sqrt(1.f - x);
-  v = v + 1.57079637050628662109375f;
-
-  v = select(v < 0.f, T(0.f), v);
-  v = select(isneg, -v, v);
-  v = select(isnan, T(cast_i2f(0x7fc00000)), v);
-
-  return v;
-}
-
-template <typename T>
-__forceinline T acos(const T &v)
-{
-  return 1.57079637050628662109375f - asin(v);
-}
-
-template <typename T>
-__forceinline T atan(const T &v)
-{
-  const float piOverTwoVec = 1.57079637050628662109375;
-  // atan(-x) = -atan(x) (so flip from negative to positive first)
-  // If x > 1 -> atan(x) = Pi/2 - atan(1/x)
-  auto xNeg = v < 0.f;
-  auto xFlipped = select(xNeg, -v, v);
-
-  auto xGt1 = xFlipped > 1.;
-  auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
-
-  // These coefficients approximate atan(x)/x
-  const float atanC0  = +0.99999988079071044921875;
-  const float atanC2  = -0.3333191573619842529296875;
-  const float atanC4  = +0.199689209461212158203125;
-  const float atanC6  = -0.14015688002109527587890625;
-  const float atanC8  = +9.905083477497100830078125e-2;
-  const float atanC10 = -5.93664981424808502197265625e-2;
-  const float atanC12 = +2.417283318936824798583984375e-2;
-  const float atanC14 = -4.6721356920897960662841796875e-3;
-
-  auto x2 = x * x;
-  auto result = x2 * atanC14 + atanC12;
-  result = x2 * result + atanC10;
-  result = x2 * result + atanC8;
-  result = x2 * result + atanC6;
-  result = x2 * result + atanC4;
-  result = x2 * result + atanC2;
-  result = x2 * result + atanC0;
-  result *= x;
-
-  result = select(xGt1, piOverTwoVec - result, result);
-  result = select(xNeg, -result, result);
-  return result;
-}
-
-template <typename T>
-__forceinline T atan2(const T &y, const T &x)
-{
-  const float piVec = 3.1415926536;
-  // atan2(y, x) =
-  //
-  // atan2(y > 0, x = +-0) ->  Pi/2
-  // atan2(y < 0, x = +-0) -> -Pi/2
-  // atan2(y = +-0, x < +0) -> +-Pi
-  // atan2(y = +-0, x >= +0) -> +-0
-  //
-  // atan2(y >= 0, x < 0) ->  Pi + atan(y/x)
-  // atan2(y <  0, x < 0) -> -Pi + atan(y/x)
-  // atan2(y, x > 0) -> atan(y/x)
-  //
-  // and then a bunch of code for dealing with infinities.
-  auto yOverX = y * rcpSafe(x);
-  auto atanArg = atan(yOverX);
-  auto xLt0 = x < 0.f;
-  auto yLt0 = y < 0.f;
-  auto offset = select(xLt0,
-                select(yLt0, T(-piVec), T(piVec)), 0.f);
-  return offset + atanArg;
-}
-
-template <typename T>
-__forceinline T exp(const T &v)
-{
-  const float ln2Part1 = 0.6931457519;
-  const float ln2Part2 = 1.4286067653e-6;
-  const float oneOverLn2 = 1.44269502162933349609375;
-
-  auto scaled = v * oneOverLn2;
-  auto kReal = floor(scaled);
-  auto k = toInt(kReal);
-
-  // Reduced range version of x
-  auto x = v - kReal * ln2Part1;
-  x -= kReal * ln2Part2;
-
-  // These coefficients are for e^x in [0, ln(2)]
-  const float one = 1.;
-  const float c2 = 0.4999999105930328369140625;
-  const float c3 = 0.166668415069580078125;
-  const float c4 = 4.16539050638675689697265625e-2;
-  const float c5 = 8.378830738365650177001953125e-3;
-  const float c6 = 1.304379315115511417388916015625e-3;
-  const float c7 = 2.7555381529964506626129150390625e-4;
-
-  auto result = x * c7 + c6;
-  result = x * result + c5;
-  result = x * result + c4;
-  result = x * result + c3;
-  result = x * result + c2;
-  result = x * result + one;
-  result = x * result + one;
-
-  // Compute 2^k (should differ for float and double, but I'll avoid
-  // it for now and just do floats)
-  const int fpbias = 127;
-  auto biasedN = k + fpbias;
-  auto overflow = kReal > fpbias;
-  // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
-  // we've got underflow. -127 * ln(2) -> -88.02. So the most
-  // negative float input that doesn't result in zero is like -88.
-  auto underflow = kReal <= -fpbias;
-  const int infBits = 0x7f800000;
-  biasedN <<= 23;
-  // Reinterpret this thing as float
-  auto twoToTheN = asFloat(biasedN);
-  // Handle both doubles and floats (hopefully eliding the copy for float)
-  auto elemtype2n = twoToTheN;
-  result *= elemtype2n;
-  result = select(overflow, cast_i2f(infBits), result);
-  result = select(underflow, 0., result);
-  return result;
-}
-
-// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
-// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
-template <typename T, typename R>
-__forceinline void __rangeReduceLog(const T &input,
-                                    T &reduced,
-                                    R &exponent)
-{
-  auto intVersion = asInt(input);
-  // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
-  // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
-  //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0
-  // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111
-  //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF
-
-  //const int exponentMask(0x7F800000)
-  static const int nonexponentMask = 0x807FFFFF;
-
-  // We want the reduced version to have an exponent of -1 which is
-  // -1 + 127 after biasing or 126
-  static const int exponentNeg1 = (126l << 23);
-  // NOTE(boulos): We don't need to mask anything out since we know
-  // the sign bit has to be 0. If it's 1, we need to return infinity/nan
-  // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
-  auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
-
-  auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
-  exponent = offsetExponent - 127;          // get the real value
-
-  // Blend the offset_exponent with the original input (do this in
-  // int for now, until I decide if float can have & and &not)
-  auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
-  reduced = asFloat(blended);
-}
-
-template <typename T> struct ExponentType            { };
-template <int N>      struct ExponentType<vfloat<N>> { typedef vint<N> Ty; };
-template <>           struct ExponentType<float>     { typedef int     Ty; };
-
-template <typename T>
-__forceinline T log(const T &v)
-{
-  T reduced;
-  typename ExponentType<T>::Ty exponent;
-
-  const int nanBits = 0x7fc00000;
-  const int negInfBits = 0xFF800000;
-  const float nan = cast_i2f(nanBits);
-  const float negInf = cast_i2f(negInfBits);
-  auto useNan = v < 0.;
-  auto useInf = v == 0.;
-  auto exceptional = useNan | useInf;
-  const float one = 1.0;
-
-  auto patched = select(exceptional, one, v);
-  __rangeReduceLog(patched, reduced, exponent);
-
-  const float ln2 = 0.693147182464599609375;
-
-  auto x1 = one - reduced;
-  const float c1 = +0.50000095367431640625;
-  const float c2 = +0.33326041698455810546875;
-  const float c3 = +0.2519190013408660888671875;
-  const float c4 = +0.17541764676570892333984375;
-  const float c5 = +0.3424419462680816650390625;
-  const float c6 = -0.599632322788238525390625;
-  const float c7 = +1.98442304134368896484375;
-  const float c8 = -2.4899270534515380859375;
-  const float c9 = +1.7491014003753662109375;
-
-  auto result = x1 * c9 + c8;
-  result = x1 * result + c7;
-  result = x1 * result + c6;
-  result = x1 * result + c5;
-  result = x1 * result + c4;
-  result = x1 * result + c3;
-  result = x1 * result + c2;
-  result = x1 * result + c1;
-  result = x1 * result + one;
-
-  // Equation was for -(ln(red)/(1-red))
-  result *= -x1;
-  result += toFloat(exponent) * ln2;
-
-  return select(exceptional,
-                select(useNan, T(nan), T(negInf)),
-                result);
-}
-
-template <typename T>
-__forceinline T pow(const T &x, const T &y)
-{
-  auto x1 = abs(x);
-  auto z = exp(y * log(x1));
-
-  // Handle special cases
-  const float twoOver23 = 8388608.0f;
-  auto yInt = y == round(y);
-  auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
-
-  // x == 0
-  z = select(x == 0.0f,
-      select(y < 0.0f, T(inf) | signmsk(x),
-      select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
-
-  // x < 0
-  auto xNegative = x < 0.0f;
-  if (any(xNegative))
-  {
-    auto z1 = z | asFloat(yOddInt);
-    z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
-    z = select(xNegative, z1, z);
-  }
-
-  auto xFinite = isfinite(x);
-  auto yFinite = isfinite(y);
-  if (all(xFinite & yFinite))
-    return z;
-
-  // x finite and y infinite
-  z = select(andn(xFinite, yFinite),
-      select(x1 == 1.0f, 1.0f,
-      select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
-
-  // x infinite
-  z = select(xFinite, z,
-      select(y == 0.0f, 1.0f,
-      select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
-
-  return z;
-}
-
-template <typename T>
-__forceinline T pow(const T &x, float y)
-{
-  return pow(x, T(y));
-}
-
-} // namespace fastapprox
-
-} // namespace embree
diff --git a/thirdparty/embree-aarch64/common/math/vec2.h b/thirdparty/embree-aarch64/common/math/vec2.h
deleted file mode 100644
index a619459e9c..0000000000
--- a/thirdparty/embree-aarch64/common/math/vec2.h
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "math.h"
-
-namespace embree
-{
-  struct Vec2fa;
-  
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Generic 2D vector Class
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> struct Vec2
-  {
-    enum { N = 2 };
-    union {
-      struct { T x, y; };
-#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
-      T components[N];
-#endif
-    };
-
-    typedef T Scalar;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec2( ) {}
-    __forceinline explicit Vec2( const T& a             ) : x(a), y(a) {}
-    __forceinline          Vec2( const T& x, const T& y ) : x(x), y(y) {}
-
-    __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
-    __forceinline Vec2( const Vec2fa& other );
-
-    template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {}
-    template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
-
-    __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec2( ZeroTy   ) : x(zero), y(zero) {}
-    __forceinline Vec2( OneTy    ) : x(one),  y(one) {}
-    __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {}
-    __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {}
-
-#if defined(__WIN32__) && _MSC_VER == 1800 // workaround for older VS 2013 compiler
-	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; }
-	__forceinline       T& operator [](const size_t axis)       { assert(axis < 2); return (&x)[axis]; }
-#else
-	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; }
-	__forceinline       T& operator [](const size_t axis )      { assert(axis < 2); return components[axis]; }
-#endif
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); }
-  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); }
-  template<typename T> __forceinline Vec2<T> abs       ( const Vec2<T>& a ) { return Vec2<T>(abs  (a.x), abs  (a.y)); }
-  template<typename T> __forceinline Vec2<T> rcp       ( const Vec2<T>& a ) { return Vec2<T>(rcp  (a.x), rcp  (a.y)); }
-  template<typename T> __forceinline Vec2<T> rsqrt     ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); }
-  template<typename T> __forceinline Vec2<T> sqrt      ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); }
-  template<typename T> __forceinline Vec2<T> frac      ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); }
-  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x + b  , a.y + b  ); }
-  template<typename T> __forceinline Vec2<T> operator +( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   + b.x, a   + b.y); }
-  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); }
-  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x - b  , a.y - b  ); }
-  template<typename T> __forceinline Vec2<T> operator -( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   - b.x, a   - b.y); }
-  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); }
-  template<typename T> __forceinline Vec2<T> operator *( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   * b.x, a   * b.y); }
-  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x * b  , a.y * b  ); }
-  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); }
-  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x / b  , a.y / b  ); }
-  template<typename T> __forceinline Vec2<T> operator /( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   / b.x, a   / b.y); }
-
-  template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); }
-  template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Ternary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec2<T> madd  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); }
-  template<typename T> __forceinline Vec2<T> msub  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); }
-  template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); }
-  template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); }
-
-  template<typename T> __forceinline Vec2<T> madd  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); }
-  template<typename T> __forceinline Vec2<T> msub  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); }
-  template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); }
-  template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; }
-  template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; }
-  template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; return a; }
-  template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; return a; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reduction Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; }
-  template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; }
-  template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); }
-  template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; }
-  template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; }
-  template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) {
-    if (a.x != b.x) return a.x < b.x;
-    if (a.y != b.y) return a.y < b.y;
-    return false;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Shift Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) {
-    return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y));
-  }
-  
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline T       dot      ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
-  template<typename T> __forceinline Vec2<T> cross    ( const Vec2<T>& a )                   { return Vec2<T>(-a.y,a.x); } 
-  template<typename T> __forceinline T       length   ( const Vec2<T>& a )                   { return sqrt(dot(a,a)); }
-  template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a )                   { return a*rsqrt(dot(a,a)); }
-  template<typename T> __forceinline T       distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); }
-  template<typename T> __forceinline T       det      ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; }
-
-  template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) {
-    const T d = dot(a,a); return select(d == T( zero ),a, a*rsqrt(d) );
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) {
-    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
-  }
-
-  template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) {
-    return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y));
-  }
-
-  template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) {
-    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
-  }
-
-  template<typename T>
-    __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) {
-    return madd(Vec2<T>(T(1.0f)-t),v0,t*v1);
-  }
-
-  template<typename T> __forceinline int maxDim ( const Vec2<T>& a )
-  {
-    const Vec2<T> b = abs(a);
-    if (b.x > b.y) return 0;
-    else return 1;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) {
-    return cout << "(" << a.x << ", " << a.y << ")";
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Default template instantiations
-  ////////////////////////////////////////////////////////////////////////////////
-
-  typedef Vec2<bool > Vec2b;
-  typedef Vec2<int  > Vec2i;
-  typedef Vec2<float> Vec2f;
-}
-
-#include "vec2fa.h"
-
-#if defined(__SSE__) || defined(__ARM_NEON)
-#include "../simd/sse.h"
-#endif
-
-#if defined(__AVX__)
-#include "../simd/avx.h"
-#endif
-
-#if defined(__AVX512F__)
-#include "../simd/avx512.h"
-#endif
-
-namespace embree
-{
-  template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
-
-#if defined(__SSE__) || defined(__ARM_NEON)
-  template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
-#endif
-
-#if defined(__AVX__)
-  template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
-#endif
-
-#if defined(__AVX512F__)
-  template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
-#endif
-}
diff --git a/thirdparty/embree-aarch64/common/math/vec2fa.h b/thirdparty/embree-aarch64/common/math/vec2fa.h
deleted file mode 100644
index 451ecd556c..0000000000
--- a/thirdparty/embree-aarch64/common/math/vec2fa.h
+++ /dev/null
@@ -1,317 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../sys/alloc.h"
-#include "math.h"
-#include "../simd/sse.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// SSE Vec2fa Type
-  ////////////////////////////////////////////////////////////////////////////////
-
-  struct __aligned(16) Vec2fa
-  {
-    ALIGNED_STRUCT_(16);
-
-    typedef float Scalar;
-    enum { N = 2 };
-    union {
-      __m128 m128;
-      struct { float x,y,az,aw; };
-    };
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constructors, Assignment & Cast Operators
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec2fa( ) {}
-    __forceinline Vec2fa( const __m128 a ) : m128(a) {}
-
-    __forceinline Vec2fa            ( const Vec2<float>& other  ) { x = other.x; y = other.y; }
-    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
-
-    __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; }
-    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
-
-    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
-    __forceinline          Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}
-
-    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
-
-    __forceinline operator const __m128&() const { return m128; }
-    __forceinline operator       __m128&()       { return m128; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Loads and Stores
-    ////////////////////////////////////////////////////////////////////////////////
-
-    static __forceinline Vec2fa load( const void* const a ) {
-      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
-    }
-
-    static __forceinline Vec2fa loadu( const void* const a ) {
-      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
-    }
-
-    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
-      _mm_storeu_ps((float*)ptr,v);
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Array Access
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
-    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
-  __forceinline Vec2fa operator -( const Vec2fa& a ) {
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-    return _mm_xor_ps(a.m128, mask);
-  }
-  __forceinline Vec2fa abs  ( const Vec2fa& a ) {
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-    return _mm_and_ps(a.m128, mask);
-  }
-  __forceinline Vec2fa sign ( const Vec2fa& a ) {
-    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
-  }
-
-  __forceinline Vec2fa rcp  ( const Vec2fa& a )
-  {
-#if defined(__aarch64__)
-        __m128 reciprocal = _mm_rcp_ps(a.m128);
-        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
-        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
-        return (const Vec2fa)reciprocal;
-#else
-#if defined(__AVX512VL__)
-    const Vec2fa r = _mm_rcp14_ps(a.m128);
-#else
-    const Vec2fa r = _mm_rcp_ps(a.m128);
-#endif
-
-#if defined(__AVX2__)
-    const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f)));
-#else
-    const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a)));
-    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#endif
-
-    return res;
-#endif  //defined(__aarch64__) 
-  }
-
-  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
-  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
-
-  __forceinline Vec2fa rsqrt( const Vec2fa& a )
-  {
-#if defined(__aarch64__)
-        __m128 r = _mm_rsqrt_ps(a.m128);
-        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-        return r;
-#else
-        
-#if defined(__AVX512VL__)
-    __m128 r = _mm_rsqrt14_ps(a.m128);
-#else
-    __m128 r = _mm_rsqrt_ps(a.m128);
-#endif
-    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-        
-#endif
-  }
-
-  __forceinline Vec2fa zero_fix(const Vec2fa& a) {
-    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
-  }
-  __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
-    return rcp(zero_fix(a));
-  }
-  __forceinline Vec2fa log ( const Vec2fa& a ) {
-    return Vec2fa(logf(a.x),logf(a.y));
-  }
-
-  __forceinline Vec2fa exp ( const Vec2fa& a ) {
-    return Vec2fa(expf(a.x),expf(a.y));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
-  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
-  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
-  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
-  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
-  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
-  __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
-  __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
-
-  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
-  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-
-#if defined(__aarch64__) || defined(__SSE4_1__)
-    __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
-      const vint4 ai = _mm_castps_si128(a);
-      const vint4 bi = _mm_castps_si128(b);
-      const vint4 ci = _mm_min_epi32(ai,bi);
-      return _mm_castsi128_ps(ci);
-    }
-#endif
-
-#if defined(__aarch64__) || defined(__SSE4_1__)
-    __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
-      const vint4 ai = _mm_castps_si128(a);
-      const vint4 bi = _mm_castps_si128(b);
-      const vint4 ci = _mm_max_epi32(ai,bi);
-      return _mm_castsi128_ps(ci);
-    }
-#endif
-
-    __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) {
-      return Vec2fa(powf(a.x,b),powf(a.y,b));
-    }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Ternary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__AVX2__)
-  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
-  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
-  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
-  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
-#else
-  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
-  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
-  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
-  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
-#endif
-
-  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); }
-  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
-  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
-  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
-  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
-  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
-  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; }
-  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
-  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reductions
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
-  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
-  __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
-  __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
-  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__SSE4_1__)
-  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
-    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
-  }
-#else
-  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
-    return reduce_add(a*b);
-  }
-#endif
-
-  __forceinline Vec2fa cross ( const Vec2fa& a ) {
-    return Vec2fa(-a.y,a.x);
-  }
-
-  __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,a); }
-  __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(dot(a,a)); }
-  __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(dot(a,a)); }
-  __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }
-  __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); }
-  __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
-    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
-    return blendv_ps(f, t, mask);
-  }
-
-  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
-    return madd(1.0f-t,v0,t*v1);
-  }
-
-  __forceinline int maxDim ( const Vec2fa& a )
-  {
-    const Vec2fa b = abs(a);
-    if (b.x > b.y) return 0;
-    else return 1;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Rounding Functions
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__aarch64__)
-__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
-__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
-//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
-#elif defined (__SSE4_1__)
-  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
-  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
-  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
-#else
-  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
-  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
-  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
-#endif
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
-    return cout << "(" << a.x << ", " << a.y << ")";
-  }
-
-  typedef Vec2fa Vec2fa_t;
-}
diff --git a/thirdparty/embree-aarch64/common/math/vec3.h b/thirdparty/embree-aarch64/common/math/vec3.h
deleted file mode 100644
index 1870321715..0000000000
--- a/thirdparty/embree-aarch64/common/math/vec3.h
+++ /dev/null
@@ -1,349 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "math.h"
-
-namespace embree
-{
-  struct Vec3fa;
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Generic 3D vector Class
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> struct Vec3
-  {
-    enum { N  = 3 };
-
-    union {
-      struct {
-	T x, y, z;
-      };
-#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
-      T components[N];
-#endif
-    };
-
-    typedef T Scalar;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3( ) {}
-    __forceinline explicit Vec3( const T& a                         ) : x(a), y(a), z(a) {}
-    __forceinline          Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {}
-
-    __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; }
-    __forceinline Vec3( const Vec3fa& other );
-
-    template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {}
-    template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; }
-
-    __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; }
-	
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3( ZeroTy   ) : x(zero), y(zero), z(zero) {}
-    __forceinline Vec3( OneTy    ) : x(one),  y(one),  z(one) {}
-    __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {}
-    __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {}
-
-#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
-    __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; }
-    __forceinline       T& operator []( const size_t axis )       { assert(axis < 3); return (&x)[axis]; }
-#else
-	__forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; }
-	__forceinline       T& operator [](const size_t axis)       { assert(axis < 3); return components[axis]; }
-#endif
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); }
-  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); }
-  template<typename T> __forceinline Vec3<T> abs       ( const Vec3<T>& a ) { return Vec3<T>(abs  (a.x), abs  (a.y), abs  (a.z)); }
-  template<typename T> __forceinline Vec3<T> rcp       ( const Vec3<T>& a ) { return Vec3<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z)); }
-  template<typename T> __forceinline Vec3<T> rsqrt     ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); }
-  template<typename T> __forceinline Vec3<T> sqrt      ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); }
-
-  template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a )
-  {
-    return Vec3<T>(select(abs(a.x)<min_rcp_input,T(min_rcp_input),a.x),
-                   select(abs(a.y)<min_rcp_input,T(min_rcp_input),a.y),
-                   select(abs(a.z)<min_rcp_input,T(min_rcp_input),a.z));
-  }
-  template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); }
-  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); }
-  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); }
-  template<typename T> __forceinline Vec3<T> operator *( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   * b.x, a   * b.y, a   * b.z); }
-  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x * b  , a.y * b  , a.z * b  ); }
-  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x / b  , a.y / b  , a.z / b  ); }
-  template<typename T> __forceinline Vec3<T> operator /( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   / b.x, a   / b.y, a   / b.z); }
-  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); }
-
-  template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
-  template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); }
-
-  template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); }
-  template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Ternary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec3<T> madd  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); }
-  template<typename T> __forceinline Vec3<T> msub  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
-  template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));}
-  template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); }
-
-  template<typename T> __forceinline Vec3<T> madd  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); }
-  template<typename T> __forceinline Vec3<T> msub  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); }
-  template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));}
-  template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T        b ) { a.x += b;   a.y += b;   a.z += b;   return a; }
-  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
-  template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
-  template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; return a; }
-  template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; return a; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reduction Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; }
-  template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; }
-  template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); }
-  template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; }
-  template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; }
-  template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) {
-    if (a.x != b.x) return a.x < b.x;
-    if (a.y != b.y) return a.y < b.y;
-    if (a.z != b.z) return a.z < b.z;
-    return false;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Shift Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) {
-    return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) {
-    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
-  }
-
-  template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) {
-    return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z));
-  }
-
-  template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) {
-    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
-  }
-
-  template<typename T>
-    __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) {
-    return madd(Vec3<T>(T(1.0f)-t),v0,t*v1);
-  }
-
-  template<typename T> __forceinline int maxDim ( const Vec3<T>& a )
-  {
-    const Vec3<T> b = abs(a);
-    if (b.x > b.y) {
-      if (b.x > b.z) return 0; else return 2;
-    } else {
-      if (b.y > b.z) return 1; else return 2;
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); }
-  template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); }
-  template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); }
-  template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); }
-  template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); }
-  template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline T       sqr      ( const Vec3<T>& a )                   { return dot(a,a); }
-  template<typename T> __forceinline T       dot      ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); }
-  template<typename T> __forceinline T       length   ( const Vec3<T>& a )                   { return sqrt(sqr(a)); }
-  template<typename T> __forceinline T       rcp_length( const Vec3<T>& a )                  { return rsqrt(sqr(a)); }
-  template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a )                   { return a*rsqrt(sqr(a)); }
-  template<typename T> __forceinline T       distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
-  template<typename T> __forceinline Vec3<T> cross    ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); }
-  template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
-  {
-    const T ab_x = a.z*b.y, ab_y = a.x*b.z, ab_z = a.y*b.x;
-    const T bc_x = b.z*c.y, bc_y = b.x*c.z, bc_z = b.y*c.x;
-    const Vec3<T> cross_ab(msub(a.y,b.z,ab_x), msub(a.z,b.x,ab_y), msub(a.x,b.y,ab_z));
-    const Vec3<T> cross_bc(msub(b.y,c.z,bc_x), msub(b.z,c.x,bc_y), msub(b.x,c.y,bc_z));
-    const auto sx = abs(ab_x) < abs(bc_x);
-    const auto sy = abs(ab_y) < abs(bc_y);
-    const auto sz = abs(ab_z) < abs(bc_z);
-    return Vec3<T>(select(sx,cross_ab.x,cross_bc.x),
-                   select(sy,cross_ab.y,cross_bc.y),
-                   select(sz,cross_ab.z,cross_bc.z));
-  }
-
-  template<typename T> __forceinline T       sum      ( const Vec3<T>& a )                   { return a.x+a.y+a.z; }
-
-  template<typename T> __forceinline      T  halfArea ( const Vec3<T>& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
-  template<typename T> __forceinline      T  area     ( const Vec3<T>& d )                  { return 2.0f*halfArea(d); }
-
-  template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) {
-    const T d = dot(a,a); return select(d == T( zero ), a ,  a*rsqrt(d) );
-  }
-
-  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1)
-  {
-    const Vec3<T> N = cross(P-Q0,Q1-Q0);
-    const Vec3<T> D = Q1-Q0;
-    return dot(N,N)*rcp(dot(D,D));
-  }
-
-  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0)
-  {
-    const Vec3<T> N = cross(PmQ0,Q1mQ0);
-    const Vec3<T> D = Q1mQ0;
-    return dot(N,N)*rcp(dot(D,D));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) {
-    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
-  }
-
-  typedef Vec3<bool > Vec3b;
-  typedef Vec3<int  > Vec3i;
-  typedef Vec3<float> Vec3f;
-}
-
-#include "vec3ba.h"
-#include "vec3ia.h"
-#include "vec3fa.h"
-
-////////////////////////////////////////////////////////////////////////////////
-/// SSE / AVX / MIC specializations
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__SSE__) || defined(__ARM_NEON)
-#include "../simd/sse.h"
-#endif
-
-#if defined(__AVX__)
-#include "../simd/avx.h"
-#endif
-
-#if defined(__AVX512F__)
-#include "../simd/avx512.h"
-#endif
-
-namespace embree
-{
-  template<typename Out, typename In>
-  __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) {
-    return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k]));
-  }
-
-  template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; }
-
-#if defined(__AVX__)
-  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
-    x = a.x; y = a.y; z = a.z;
-  }
-#elif defined(__SSE__) || defined(__ARM_NEON)
-  template<>
-  __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
-    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
-  }
-#endif
-
-#if defined(__SSE__) || defined(__ARM_NEON)
-  __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat4>& a, const size_t k) {
-    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
-  }
-
-  template<>
-  __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
-    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
-  }
-
-  template<int i0, int i1, int i2, int i3>
-  __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) {
-    return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
-  }
-#endif
-
-#if defined(__AVX__)
-  template<>
-  __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
-    x = a.x; y = a.y; z = a.z;
-  }
-  __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat8>& a, const size_t k) {
-    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
-  }
-  __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat4>& a, const size_t k) {
-    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
-  }
-  __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat8>& a, const size_t k) {
-    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
-  }
-
-  template<>
-  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
-    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
-  }
-  template<>
-  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) {
-    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
-  }
-
-  template<int i0, int i1, int i2, int i3>
-  __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) {
-    return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
-  }
-#endif
-
-#if defined(__AVX512F__)
-  template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
-#endif
-}
diff --git a/thirdparty/embree-aarch64/common/math/vec3ba.h b/thirdparty/embree-aarch64/common/math/vec3ba.h
deleted file mode 100644
index 90f31739c2..0000000000
--- a/thirdparty/embree-aarch64/common/math/vec3ba.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../sys/alloc.h"
-#include "math.h"
-#include "../simd/sse.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// SSE Vec3ba Type
-  ////////////////////////////////////////////////////////////////////////////////
-
-  struct __aligned(16) Vec3ba
-  {
-    ALIGNED_STRUCT_(16);
-    
-    union {
-      __m128 m128;
-      struct { int x,y,z; };
-    };
-
-    typedef int Scalar;
-    enum { N = 3 };
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constructors, Assignment & Cast Operators
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3ba( ) {}
-    __forceinline Vec3ba( const __m128  input ) : m128(input) {}
-    __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {}
-    __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; }
-
-    __forceinline explicit Vec3ba( bool a )
-      : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
-    __forceinline Vec3ba( bool a, bool b, bool c)
-      : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
-
-    __forceinline operator const __m128&() const { return m128; }
-    __forceinline operator       __m128&()       { return m128; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec3ba( TrueTy  ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Array Access
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
-    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
-  };
-
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); }
-  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); }
-  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); }
-  
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-  
-  __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; }
-  __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; }
-  __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; }
-  
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators + Select
-  ////////////////////////////////////////////////////////////////////////////////
-  
-  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { 
-    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; 
-  }
-  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { 
-    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; 
-  }
-  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) {
-    if (a.x != b.x) return a.x < b.x;
-    if (a.y != b.y) return a.y < b.y;
-    if (a.z != b.z) return a.z < b.z;
-    return false;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reduction Operations
-  ////////////////////////////////////////////////////////////////////////////////
-    
-  __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; }
-  __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; }
-
-  __forceinline bool all       ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; }
-  __forceinline bool any       ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; }
-  __forceinline bool none      ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; }
-
-  __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) {
-    return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/math/vec3fa.h b/thirdparty/embree-aarch64/common/math/vec3fa.h
deleted file mode 100644
index 6163cfb596..0000000000
--- a/thirdparty/embree-aarch64/common/math/vec3fa.h
+++ /dev/null
@@ -1,810 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../sys/alloc.h"
-#include "math.h"
-#include "../simd/sse.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// SSE Vec3fa Type
-  ////////////////////////////////////////////////////////////////////////////////
-
-  struct __aligned(16) Vec3fa
-  {
-    ALIGNED_STRUCT_(16);
-
-    typedef float Scalar;
-    enum { N = 3 };
-    union {
-      __m128 m128;
-      struct { float x,y,z; };
-    };
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constructors, Assignment & Cast Operators
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3fa( ) {}
-    __forceinline Vec3fa( const __m128 a ) : m128(a) {}
-
-    __forceinline Vec3fa            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); }
-    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
-
-    __forceinline Vec3fa            ( const Vec3fa& other ) { m128 = other.m128; }
-    __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
-
-    __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
-    __forceinline          Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
-
-    __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
-
-    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
-    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
-    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
-    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
-    
-    //__forceinline operator const __m128&() const { return m128; }
-    //__forceinline operator       __m128&()       { return m128; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Loads and Stores
-    ////////////////////////////////////////////////////////////////////////////////
-
-    static __forceinline Vec3fa load( const void* const a ) {
-#if defined(__aarch64__)
-        __m128 t = _mm_load_ps((float*)a);
-        t[3] = 0.0f;
-        return Vec3fa(t);
-#else
-      return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
-#endif
-    }
-
-    static __forceinline Vec3fa loadu( const void* const a ) {
-      return Vec3fa(_mm_loadu_ps((float*)a));
-    }
-
-    static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
-      _mm_storeu_ps((float*)ptr,v.m128);
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec3fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Array Access
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
-    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
-  __forceinline Vec3fa operator -( const Vec3fa& a ) {
-#if defined(__aarch64__)
-    return vnegq_f32(a.m128);
-#else
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-
-    return _mm_xor_ps(a.m128, mask);
-#endif
-  }
-  __forceinline Vec3fa abs  ( const Vec3fa& a ) {
-#if defined(__aarch64__)
-    return _mm_abs_ps(a.m128);
-#else
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-    return _mm_and_ps(a.m128, mask);
-#endif
-  }
-  __forceinline Vec3fa sign ( const Vec3fa& a ) {
-#if defined(__aarch64__)
-    Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f)));
-    return r;
-#else
-    return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
-#endif
-  }
-
-  __forceinline Vec3fa rcp  ( const Vec3fa& a )
-  {
-#if defined(__aarch64__) && defined(BUILD_IOS)
-  return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
-#elif defined(__aarch64__)
-  __m128 reciprocal = _mm_rcp_ps(a.m128);
-  reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
-  reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
-  return (const Vec3fa)reciprocal;
-#else
-        
-#if defined(__AVX512VL__)
-    const Vec3fa r = _mm_rcp14_ps(a.m128);
-#else
-    const Vec3fa r = _mm_rcp_ps(a.m128);
-#endif
-
-#if defined(__AVX2__)
-    const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
-#else
-    const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
-    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#endif
-
-    return res;
-#endif  //defined(__aarch64__)
-  }
-
-  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
-  __forceinline Vec3fa sqr  ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
-
-  __forceinline Vec3fa rsqrt( const Vec3fa& a )
-  {
-#if defined(__aarch64__)
-        __m128 r = _mm_rsqrt_ps(a.m128);
-        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-        return r;
-#else
-        
-#if defined(__AVX512VL__)
-    __m128 r = _mm_rsqrt14_ps(a.m128);
-#else
-    __m128 r = _mm_rsqrt_ps(a.m128);
-#endif
-    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-#endif
-  }
-
-  __forceinline Vec3fa zero_fix(const Vec3fa& a) {
-    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
-  }
-  __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
-    return rcp(zero_fix(a));
-  }
-  __forceinline Vec3fa log ( const Vec3fa& a ) {
-    return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
-  }
-
-  __forceinline Vec3fa exp ( const Vec3fa& a ) {
-    return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
-  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
-  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
-  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
-  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
-  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
-  __forceinline Vec3fa operator /( const Vec3fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
-  __forceinline Vec3fa operator /( const        float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
-
-  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
-  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
-
-#if defined(__aarch64__) || defined(__SSE4_1__)
-    __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
-      const vint4 ai = _mm_castps_si128(a.m128);
-      const vint4 bi = _mm_castps_si128(b.m128);
-      const vint4 ci = _mm_min_epi32(ai,bi);
-      return _mm_castsi128_ps(ci);
-    }
-#endif
-
-#if defined(__aarch64__) || defined(__SSE4_1__)
-    __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
-      const vint4 ai = _mm_castps_si128(a.m128);
-      const vint4 bi = _mm_castps_si128(b.m128);
-      const vint4 ci = _mm_max_epi32(ai,bi);
-      return _mm_castsi128_ps(ci);
-    }
-#endif
-
-    __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
-      return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
-    }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Ternary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__AVX2__)
-  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
-#else
-                                                                                
-#if defined(__aarch64__)
-  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
-        return _mm_madd_ps(a.m128, b.m128, c.m128);  //a*b+c;
-    }
-  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
-        return _mm_msub_ps(a.m128, b.m128, c.m128);  //-a*b+c;
-    }
-  __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
-        Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128);
-        return -t;
-    }
-  __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
-        return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); //a*b-c
-    }
-
-#else
-  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
-  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
-  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
-  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
-#endif
-
-#endif
-
-  __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
-  __forceinline Vec3fa msub  ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
-  __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
-  __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; }
-  __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
-  __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
-  __forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { return a = a * b; }
-  __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
-  __forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { return a = a / b; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reductions
-  ////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(BUILD_IOS)
-  __forceinline float reduce_add(const Vec3fa& v) {
-    float32x4_t t = v.m128;
-    t[3] = 0.0f;
-    return vaddvq_f32(t);
-  }
-                                                                                
-  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
-  __forceinline float reduce_min(const Vec3fa& v) {
-    float32x4_t t = v.m128;
-      t[3] = t[2];
-    return vminvq_f32(t);
-  }
-  __forceinline float reduce_max(const Vec3fa& v) {
-    float32x4_t t = v.m128;
-      t[3] = t[2];
-    return vmaxvq_f32(t);
-  }
-#else
-  __forceinline float reduce_add(const Vec3fa& v) {
-    const vfloat4 a(v.m128);
-    const vfloat4 b = shuffle<1>(a);
-    const vfloat4 c = shuffle<2>(a);
-    return _mm_cvtss_f32(a+b+c);
-  }
-
-  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
-  __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
-  __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
-#endif
-                                                                                
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
-  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
-
-  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
-  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
-  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
-  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
- #if defined(__aarch64__)
-  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
-  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
-#else
-  __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
-  __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
-#endif
-
-  __forceinline bool isvalid ( const Vec3fa& v ) {
-    return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
-  }
-
-  __forceinline bool is_finite ( const Vec3fa& a ) {
-    return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
-  }
-
-  __forceinline bool isvalid4 ( const Vec3fa& v ) {
-    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
-  }
-
-  __forceinline bool is_finite4 ( const Vec3fa& a ) {
-    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__SSE4_1__)
-  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
-    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
-  }
-#else
-  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
-    return reduce_add(a*b);
-  }
-#endif
-
-  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
-  {
-    vfloat4 a0 = vfloat4(a.m128);
-    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
-    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
-    vfloat4 b1 = vfloat4(b.m128);
-    return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)));
-  }
-
-  __forceinline float  sqr_length ( const Vec3fa& a )                { return dot(a,a); }
-  __forceinline float  rcp_length ( const Vec3fa& a )                { return rsqrt(dot(a,a)); }
-  __forceinline float  rcp_length2( const Vec3fa& a )                { return rcp(dot(a,a)); }
-  __forceinline float  length   ( const Vec3fa& a )                  { return sqrt(dot(a,a)); }
-  __forceinline Vec3fa normalize( const Vec3fa& a )                  { return a*rsqrt(dot(a,a)); }
-  __forceinline float  distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
-  __forceinline float  halfArea ( const Vec3fa& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
-  __forceinline float  area     ( const Vec3fa& d )                  { return 2.0f*halfArea(d); }
-
-  __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
-    const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
-  }
-
-  /*! differentiated normalization */
-  __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
-  {
-    const float pp  = dot(p,p);
-    const float pdp = dot(p,dp);
-    return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
-    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
-    return blendv_ps(f.m128, t.m128, mask);
-  }
-
-  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
-    return blendv_ps(f.m128, t.m128, s);
-  }
-
-  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
-    return madd(1.0f-t,v0,t*v1);
-  }
-
-  __forceinline int maxDim ( const Vec3fa& a )
-  {
-    const Vec3fa b = abs(a);
-    if (b.x > b.y) {
-      if (b.x > b.z) return 0; else return 2;
-    } else {
-      if (b.y > b.z) return 1; else return 2;
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Rounding Functions
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__aarch64__)
-  __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
-  __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
-  __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
-#elif defined (__SSE4_1__)
-  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
-  __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
-  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
-#else
-  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
-  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
-  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
-#endif
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) {
-    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
-  }
-
-  typedef Vec3fa Vec3fa_t;
-
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// SSE Vec3fx Type
-  ////////////////////////////////////////////////////////////////////////////////
-
-  struct __aligned(16) Vec3fx
-  {
-    ALIGNED_STRUCT_(16);
-
-    typedef float Scalar;
-    enum { N = 3 };
-    union {
-      __m128 m128;
-      struct { float x,y,z; union { int a; unsigned u; float w; }; };
-    };
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constructors, Assignment & Cast Operators
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3fx( ) {}
-    __forceinline Vec3fx( const __m128 a ) : m128(a) {}
-
-    __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
-    __forceinline operator Vec3fa () const { return Vec3fa(m128); }
-        
-    __forceinline explicit Vec3fx            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); }
-    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
-
-    __forceinline Vec3fx            ( const Vec3fx& other ) { m128 = other.m128; }
-
-    __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
-
-    __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
-    __forceinline          Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
-
-    __forceinline Vec3fx( const Vec3fa& other, const int      a1) { m128 = other.m128; a = a1; }
-    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
-    __forceinline Vec3fx( const Vec3fa& other, const float    w1) {
-#if defined (__aarch64__)
-      m128 = other.m128; m128[3] = w1;
-#elif defined (__SSE4_1__)
-      m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
-#else
-      const vint4 mask(-1,-1,-1,0);
-      m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
-#endif
-    }
-    //__forceinline Vec3fx( const float x, const float y, const float z, const int      a) : x(x), y(y), z(z), a(a) {} // not working properly!
-    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
-    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
-    
-    //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
-
-    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
-    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
-    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
-    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
-    
-    //__forceinline operator const __m128&() const { return m128; }
-    //__forceinline operator       __m128&()       { return m128; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Loads and Stores
-    ////////////////////////////////////////////////////////////////////////////////
-
-    static __forceinline Vec3fx load( const void* const a ) {
-      return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
-    }
-
-    static __forceinline Vec3fx loadu( const void* const a ) {
-      return Vec3fx(_mm_loadu_ps((float*)a));
-    }
-
-    static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
-      _mm_storeu_ps((float*)ptr,v.m128);
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3fx( ZeroTy   ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec3fx( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Array Access
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
-    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
-  __forceinline Vec3fx operator -( const Vec3fx& a ) {
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-    return _mm_xor_ps(a.m128, mask);
-  }
-  __forceinline Vec3fx abs  ( const Vec3fx& a ) {
-    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-    return _mm_and_ps(a.m128, mask);
-  }
-  __forceinline Vec3fx sign ( const Vec3fx& a ) {
-    return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
-  }
-
-  __forceinline Vec3fx rcp  ( const Vec3fx& a )
-  {
-#if defined(__AVX512VL__)
-    const Vec3fx r = _mm_rcp14_ps(a.m128);
-#else
-    const Vec3fx r = _mm_rcp_ps(a.m128);
-#endif
-
-#if defined(__AVX2__)
-    const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
-#else
-    const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
-    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
-#endif
-
-    return res;
-  }
-
-  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }
-  __forceinline Vec3fx sqr  ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }
-
-  __forceinline Vec3fx rsqrt( const Vec3fx& a )
-  {
-#if defined(__AVX512VL__)
-    __m128 r = _mm_rsqrt14_ps(a.m128);
-#else
-    __m128 r = _mm_rsqrt_ps(a.m128);
-#endif
-    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-  }
-
-  __forceinline Vec3fx zero_fix(const Vec3fx& a) {
-    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
-  }
-  __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
-    return rcp(zero_fix(a));
-  }
-  __forceinline Vec3fx log ( const Vec3fx& a ) {
-    return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
-  }
-
-  __forceinline Vec3fx exp ( const Vec3fx& a ) {
-    return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
-  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
-  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
-  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
-  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
-  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
-  __forceinline Vec3fx operator /( const Vec3fx& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
-  __forceinline Vec3fx operator /( const        float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
-
-  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
-  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
-
-#if defined(__SSE4_1__) || defined(__aarch64__)
-    __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
-      const vint4 ai = _mm_castps_si128(a.m128);
-      const vint4 bi = _mm_castps_si128(b.m128);
-      const vint4 ci = _mm_min_epi32(ai,bi);
-      return _mm_castsi128_ps(ci);
-    }
-#endif
-
-#if defined(__SSE4_1__) || defined(__aarch64__)
-    __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
-      const vint4 ai = _mm_castps_si128(a.m128);
-      const vint4 bi = _mm_castps_si128(b.m128);
-      const vint4 ci = _mm_max_epi32(ai,bi);
-      return _mm_castsi128_ps(ci);
-    }
-#endif
-
-    __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
-      return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
-    }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Ternary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__AVX2__)
-  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
-#else
-  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
-  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
-  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
-  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
-#endif
-
-  __forceinline Vec3fx madd  ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
-  __forceinline Vec3fx msub  ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
-  __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
-  __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; }
-  __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
-  __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
-  __forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { return a = a * b; }
-  __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
-  __forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { return a = a / b; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reductions
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline float reduce_add(const Vec3fx& v) {
-    const vfloat4 a(v.m128);
-    const vfloat4 b = shuffle<1>(a);
-    const vfloat4 c = shuffle<2>(a);
-    return _mm_cvtss_f32(a+b+c);
-  }
-
-  __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
-  __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
-  __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
-  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
-
-  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
-  __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
-  __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
-  __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }
-  __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); }
-  __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); }
-
-  __forceinline bool isvalid ( const Vec3fx& v ) {
-    return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
-  }
-
-  __forceinline bool is_finite ( const Vec3fx& a ) {
-    return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
-  }
-
-  __forceinline bool isvalid4 ( const Vec3fx& v ) {
-    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
-  }
-
-  __forceinline bool is_finite4 ( const Vec3fx& a ) {
-    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__SSE4_1__)
-  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
-    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
-  }
-#else
-  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
-    return reduce_add(a*b);
-  }
-#endif
-
-  __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
-  {
-    vfloat4 a0 = vfloat4(a.m128);
-    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
-    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
-    vfloat4 b1 = vfloat4(b.m128);
-    return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
-  }
-
-  __forceinline float  sqr_length ( const Vec3fx& a )                { return dot(a,a); }
-  __forceinline float  rcp_length ( const Vec3fx& a )                { return rsqrt(dot(a,a)); }
-  __forceinline float  rcp_length2( const Vec3fx& a )                { return rcp(dot(a,a)); }
-  __forceinline float  length   ( const Vec3fx& a )                  { return sqrt(dot(a,a)); }
-  __forceinline Vec3fx normalize( const Vec3fx& a )                  { return a*rsqrt(dot(a,a)); }
-  __forceinline float  distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
-  __forceinline float  halfArea ( const Vec3fx& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
-  __forceinline float  area     ( const Vec3fx& d )                  { return 2.0f*halfArea(d); }
-
-  __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
-    const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
-  }
-
-  /*! differentiated normalization */
-  __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
-  {
-    const float pp  = dot(p,p);
-    const float pdp = dot(p,dp);
-    return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) {
-    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
-    return blendv_ps(f.m128, t.m128, mask);
-  }
-
-  __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
-    return blendv_ps(f.m128, t.m128, s);
-  }
-
-  __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
-    return madd(1.0f-t,v0,t*v1);
-  }
-
-  __forceinline int maxDim ( const Vec3fx& a )
-  {
-    const Vec3fx b = abs(a);
-    if (b.x > b.y) {
-      if (b.x > b.z) return 0; else return 2;
-    } else {
-      if (b.y > b.z) return 1; else return 2;
-    }
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Rounding Functions
-  ////////////////////////////////////////////////////////////////////////////////
-
-#if defined (__SSE4_1__) && !defined(__aarch64__)
-  __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
-  __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
-  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
-#else
-  __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); }
-  __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
-  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
-#endif
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) {
-    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
-  }
-
-  
-  typedef Vec3fx Vec3ff;
-}
diff --git a/thirdparty/embree-aarch64/common/math/vec3ia.h b/thirdparty/embree-aarch64/common/math/vec3ia.h
deleted file mode 100644
index 737f67fd72..0000000000
--- a/thirdparty/embree-aarch64/common/math/vec3ia.h
+++ /dev/null
@@ -1,210 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "../sys/alloc.h"
-#include "math.h"
-#include "../simd/sse.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// SSE Vec3ia Type
-  ////////////////////////////////////////////////////////////////////////////////
-
-  struct __aligned(16) Vec3ia
-  {
-    ALIGNED_STRUCT_(16);
-
-    union {
-      __m128i m128;
-      struct { int x,y,z; };
-    };
-
-    typedef int Scalar;
-    enum { N = 3 };
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constructors, Assignment & Cast Operators
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3ia( ) {}
-    __forceinline Vec3ia( const __m128i a ) : m128(a) {}
-    __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {}
-    __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; }
-
-    __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {}
-    __forceinline          Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {}
-    __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {}
-
-    __forceinline operator const __m128i&() const { return m128; }
-    __forceinline operator       __m128i&()       { return m128; }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3ia( ZeroTy   ) : m128(_mm_setzero_si128()) {}
-    __forceinline Vec3ia( OneTy    ) : m128(_mm_set1_epi32(1)) {}
-    __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {}
-    __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {}
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Array Access
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
-    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
-  };
-
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
-  __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
-#if (defined(__aarch64__)) 
-  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
-#elif defined(__SSSE3__)
-  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
-#endif
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); }
-  __forceinline Vec3ia operator +( const Vec3ia& a, const int     b ) { return a+Vec3ia(b); }
-  __forceinline Vec3ia operator +( const int     a, const Vec3ia& b ) { return Vec3ia(a)+b; }
-
-  __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); }
-  __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) { return a-Vec3ia(b); }
-  __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) { return Vec3ia(a)-b; }
-
-#if defined(__aarch64__) || defined(__SSE4_1__)
-  __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); }
-  __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) { return a * Vec3ia(b); }
-  __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) { return Vec3ia(a) * b; }
-#endif
-
-  __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); }
-  __forceinline Vec3ia operator &( const Vec3ia& a, const int     b ) { return a & Vec3ia(b); }
-  __forceinline Vec3ia operator &( const int     a, const Vec3ia& b ) { return Vec3ia(a) & b; }
-
-  __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); }
-  __forceinline Vec3ia operator |( const Vec3ia& a, const int     b ) { return a | Vec3ia(b); }
-  __forceinline Vec3ia operator |( const int     a, const Vec3ia& b ) { return Vec3ia(a) | b; }
-
-  __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); }
-  __forceinline Vec3ia operator ^( const Vec3ia& a, const int     b ) { return a ^ Vec3ia(b); }
-  __forceinline Vec3ia operator ^( const int     a, const Vec3ia& b ) { return Vec3ia(a) ^ b; }
-
-#if !defined(__ARM_NEON)
-  __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); }
-  __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); }
-
-  __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); }
-  __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); }
-  __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); }
-#endif
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { return a = a + b; }
-  __forceinline Vec3ia& operator +=( Vec3ia& a, const int&   b ) { return a = a + b; }
-  
-  __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; }
-  __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { return a = a - b; }
-  
-#if defined(__aarch64__) || defined(__SSE4_1__)
-  __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; }
-  __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { return a = a * b; }
-#endif
-  
-  __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { return a = a & b; }
-  __forceinline Vec3ia& operator &=( Vec3ia& a, const int&    b ) { return a = a & b; }
-  
-  __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; }
-  __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { return a = a | b; }
-  
-#if !defined(__ARM_NEON)
-  __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; }
-  __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; }
-#endif
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reductions
-  ////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__)
-  __forceinline int reduce_add(const Vec3ia& v) {
-    int32x4_t t = v.m128;
-    t[3] = 0;
-    return vaddvq_s32(t);
-        
-  }
-  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
-  __forceinline int reduce_min(const Vec3ia& v) {
-    int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0);
-    return vminvq_s32(t);
-        
-  }
-  __forceinline int reduce_max(const Vec3ia& v) {
-    int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0);
-    return vmaxvq_s32(t);
-        
-  }
-#else
-  __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
-  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
-  __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
-  __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
-#endif
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; }
-  __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; }
-  __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
-    if (a.x != b.x) return a.x < b.x;
-    if (a.y != b.y) return a.y < b.y;
-    if (a.z != b.z) return a.z < b.z;
-    return false;
-  }
-
-  __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); }
-  __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); }
-  __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
-#if defined(__aarch64__) || defined(__SSE4_1__)
-    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m));
-#else
-    return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), _mm_andnot_si128(_mm_castps_si128(m), f)); 
-#endif
-  }
-
-#if defined(__aarch64__) || defined(__SSE4_1__)
-  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
-  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
-#else
-  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); }
-  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); }
-#endif
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a) {
-    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
-  }
-}
diff --git a/thirdparty/embree-aarch64/common/math/vec4.h b/thirdparty/embree-aarch64/common/math/vec4.h
deleted file mode 100644
index d16542f507..0000000000
--- a/thirdparty/embree-aarch64/common/math/vec4.h
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright 2009-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#pragma once
-
-#include "math.h"
-#include "vec3.h"
-
-namespace embree
-{
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Generic 4D vector Class
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> struct Vec4
-  {
-    enum { N = 4 };    
-    union {
-      struct { T x, y, z, w; };
-#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
-      T components[N];
-#endif
-    };
-
-    typedef T Scalar;
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Construction
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec4( ) {}
-    __forceinline explicit Vec4( const T& a                                     ) : x(a), y(a), z(a), w(a) {}
-    __forceinline          Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {}
-    __forceinline          Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
-
-    __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; }
-    __forceinline Vec4( const Vec3fx& other );
-
-    template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {}
-    template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
-
-    __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
-
-    __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); }
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Constants
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec4( ZeroTy   ) : x(zero), y(zero), z(zero), w(zero) {}
-    __forceinline Vec4( OneTy    ) : x(one),  y(one),  z(one),  w(one) {}
-    __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {}
-    __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {}
-
-#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
-	__forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; }
-	__forceinline       T& operator [](const size_t axis)       { assert(axis < 4); return (&x)[axis]; }
-#else
-	__forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; }
-	__forceinline       T& operator [](const size_t axis)        { assert(axis < 4); return components[axis]; }
-#endif
-
-    ////////////////////////////////////////////////////////////////////////////////
-    /// Swizzles
-    ////////////////////////////////////////////////////////////////////////////////
-
-    __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); }
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Unary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); }
-  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); }
-  template<typename T> __forceinline Vec4<T> abs       ( const Vec4<T>& a ) { return Vec4<T>(abs  (a.x), abs  (a.y), abs  (a.z), abs  (a.w)); }
-  template<typename T> __forceinline Vec4<T> rcp       ( const Vec4<T>& a ) { return Vec4<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z), rcp  (a.w)); }
-  template<typename T> __forceinline Vec4<T> rsqrt     ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); }
-  template<typename T> __forceinline Vec4<T> sqrt      ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Binary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-  template<typename T> __forceinline Vec4<T> operator *( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   * b.x, a   * b.y, a   * b.z, a   * b.w); }
-  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x * b  , a.y * b  , a.z * b  , a.w * b  ); }
-  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
-  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x / b  , a.y / b  , a.z / b  , a.w / b  ); }
-  template<typename T> __forceinline Vec4<T> operator /( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   / b.x, a   / b.y, a   / b.z, a   / b.w); }
-
-  template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); }
-  template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Ternary Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec4<T> madd  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); }
-  template<typename T> __forceinline Vec4<T> msub  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); }
-  template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); }
-  template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); }
-
-  template<typename T> __forceinline Vec4<T> madd  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); }
-  template<typename T> __forceinline Vec4<T> msub  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); }
-  template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); }
-  template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Assignment Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
-  template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
-  template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; a.w *= b  ; return a; }
-  template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; a.w /= b  ; return a; }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Reduction Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; }
-  template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; }
-  template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); }
-  template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Comparison Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; }
-  template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; }
-  template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) {
-    if (a.x != b.x) return a.x < b.x;
-    if (a.y != b.y) return a.y < b.y;
-    if (a.z != b.z) return a.z < b.z;
-    if (a.w != b.w) return a.w < b.w;
-    return false;
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Shift Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) {
-    return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w));
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Euclidian Space Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline T       dot      ( const Vec4<T>& a, const Vec4<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,madd(a.z,b.z,a.w*b.w))); }
-
-  template<typename T> __forceinline T       length   ( const Vec4<T>& a )                   { return sqrt(dot(a,a)); }
-  template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a )                   { return a*rsqrt(dot(a,a)); }
-  template<typename T> __forceinline T       distance ( const Vec4<T>& a, const Vec4<T>& b ) { return length(a-b); }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Select
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) {
-    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
-  }
-
-  template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) {
-    return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w));
-  }
-
-  template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) {
-    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
-  }
-
-  template<typename T>
-    __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t) {
-    return madd(Vec4<T>(T(1.0f)-t),v0,t*v1);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Output Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a) {
-    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ", " << a.w << ")";
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Default template instantiations
-  ////////////////////////////////////////////////////////////////////////////////
-
-  typedef Vec4<bool         > Vec4b;
-  typedef Vec4<uint8_t      > Vec4uc;
-  typedef Vec4<int          > Vec4i;
-  typedef Vec4<float        > Vec4f;
-}
-
-#include "vec3ba.h"
-#include "vec3ia.h"
-#include "vec3fa.h"
-
-////////////////////////////////////////////////////////////////////////////////
-/// SSE / AVX / MIC specializations
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__SSE__) || defined(__ARM_NEON)
-#include "../simd/sse.h"
-#endif
-
-#if defined __AVX__
-#include "../simd/avx.h"
-#endif
-
-#if defined __AVX512F__
-#include "../simd/avx512.h"
-#endif
-
-namespace embree
-{
-  template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; }
-
-#if defined(__AVX__)
-  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
-    x = a.x; y = a.y; z = a.z; w = a.w;
-  }
-#elif defined(__SSE__) || defined(__ARM_NEON)
-  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
-    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
-  }
-#endif
-
-#if defined(__SSE__) || defined(__ARM_NEON)
-  __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat4>& a, const size_t k ) {
-    return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
-  }
-#endif
-
-#if defined(__AVX__)
-  template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) {
-    x = a.x; y = a.y; z = a.z; w = a.w;
-  }
-  __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat8>& a, const size_t k ) {
-    return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
-  }
-  __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat4>& a, const size_t k ) {
-    return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k]));
-  }
-  __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat8>& a, const size_t k ) {
-    return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k]));
-  }
-#endif
-
-#if defined(__AVX512F__)
-  template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {}
-#endif
-}