46 files changed, 15110 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/common/accel.h b/thirdparty/embree-aarch64/kernels/common/accel.h
new file mode 100644
index 0000000000..c038d3cf21
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accel.h
@@ -0,0 +1,556 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "point_query.h"
+#include "context.h"
+
+namespace embree
+{
+  class Scene;
+
+  /*! Base class for the acceleration structure data: stores its linear bounds and a type tag. */
+  class AccelData : public RefCount 
+  {
+    ALIGNED_CLASS_(16);
+  public: // type tag, kept in the 'type' member below
+    enum Type { TY_UNKNOWN = 0, TY_ACCELN = 1, TY_ACCEL_INSTANCE = 2, TY_BVH4 = 3, TY_BVH8 = 4 };
+
+  public: // construction and virtual interface
+    AccelData (const Type type) 
+      : bounds(empty), type(type) {} // bounds start out as 'empty'
+
+    /*! notifies the acceleration structure about the deletion of some geometry (default: does nothing) */
+    virtual void deleteGeometry(size_t geomID) {};
+   
+    /*! clears the acceleration structure data (pure virtual: subclasses must implement it) */
+    virtual void clear() = 0;
+
+    /*! returns normal bounds, obtained from the linear bounds via bounds.bounds() */
+    __forceinline BBox3fa getBounds() const {
+      return bounds.bounds();
+    }
+
+    /*! returns bounds for some time t, obtained via bounds.interpolate(t) */
+    __forceinline BBox3fa getBounds(float t) const {
+      return bounds.interpolate(t);
+    }
+
+    /*! returns linear bounds (a copy of the stored member) */
+    __forceinline LBBox3fa getLinearBounds() const {
+      return bounds;
+    }
+
+    /*! checks if acceleration structure is empty: true while bounds0.lower.x equals +inf */
+    __forceinline bool isEmpty() const {
+      return bounds.bounds0.lower.x == float(pos_inf); // presumably the value left by bounds(empty) - TODO confirm
+    }
+
+  public:
+    LBBox3fa bounds; // linear bounds
+    Type type;       // kind of structure, see enum Type
+  };
+
+  /*! Base class for all intersectable and buildable acceleration structures. */
+  class Accel : public AccelData
+  {
+     ALIGNED_CLASS_(16);
+  public:
+
+    struct Intersectors;
+
+    /*! Type of collide function pointer: takes two untyped BVH pointers, the user callback and its user pointer */
+    typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr);
+
+    /*! Type of point query function pointer (returns a bool result) */
+    typedef bool(*PointQueryFunc)(Intersectors* This,          /*!< this pointer to accel */
+                                  PointQuery* query,        /*!< point query for lookup */
+                                  PointQueryContext* context); /*!< point query context */
+
+    /*! Type of intersect function pointer for single rays. */
+    typedef void (*IntersectFunc)(Intersectors* This,  /*!< this pointer to accel */
+                                  RTCRayHit& ray,      /*!< ray to intersect */
+                                  IntersectContext* context); /*!< intersection context */
+    
+    /*! Type of intersect function pointer for ray packets of size 4. */
+    typedef void (*IntersectFunc4)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit4& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context); /*!< intersection context */
+    
+    /*! Type of intersect function pointer for ray packets of size 8. */
+    typedef void (*IntersectFunc8)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit8& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context); /*!< intersection context */
+    
+    /*! Type of intersect function pointer for ray packets of size 16. */
+    typedef void (*IntersectFunc16)(const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRayHit16& ray,   /*!< ray packet to intersect */
+                                    IntersectContext* context); /*!< intersection context */
+
+    /*! Type of intersect function pointer for ray streams of N rays. */
+    typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHitN** ray,   /*!< ray stream to intersect */
+                                   const size_t N,     /*!< number of rays in stream */
+                                   IntersectContext* context /*!< intersection context */);
+    
+    
+    /*! Type of occlusion function pointer for single rays. */
+    typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */
+                                  RTCRay& ray,        /*!< ray to test occlusion */
+                                  IntersectContext* context); /*!< intersection context */
+    
+    /*! Type of occlusion function pointer for ray packets of size 4. */
+    typedef void (*OccludedFunc4) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay4& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context); /*!< intersection context */
+    
+    /*! Type of occlusion function pointer for ray packets of size 8. */
+    typedef void (*OccludedFunc8) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay8& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context); /*!< intersection context */
+    
+    /*! Type of occlusion function pointer for ray packets of size 16. */
+    typedef void (*OccludedFunc16) (const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRay16& ray,      /*!< ray packet to test occlusion. */
+                                    IntersectContext* context); /*!< intersection context */
+
+    /*! Type of occlusion function pointer for ray streams of N rays. */
+    typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                  RTCRayN** ray,      /*!< ray stream to test occlusion */
+                                  const size_t N,     /*!< number of rays in stream */
+                                  IntersectContext* context /*!< intersection context */);
+    typedef void (*ErrorFunc) (); /*!< parameterless placeholder; the ErrorFunc constructors below cast it into the function slots */
+
+    struct Collider // collide entry point together with a printable name
+    {
+      Collider (ErrorFunc error = nullptr) // unnamed: 'error' is cast into the collide slot
+      : collide((CollideFunc)error), name(nullptr) {}
+      /*! registers a collide function under the given name */
+      Collider (CollideFunc collide, const char* name)
+      : collide(collide), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is present
+
+    public:
+      CollideFunc collide;  
+      const char* name;
+    };
+    
+    struct Intersector1 // function table for single rays: intersect, occluded and an optional point query
+    {
+      /*! unnamed table: 'error' is cast into the two ray slots, the point query slot is explicitly null */
+      Intersector1 (ErrorFunc error = nullptr)
+      : intersect((IntersectFunc)error), occluded((OccludedFunc)error), pointQuery(nullptr), name(nullptr) {}
+      /*! named table without a point query function */
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name)
+      : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {}
+      /*! named table including a point query function */
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name)
+      : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is present
+
+    public:
+      static const char* type; // only declared here
+      IntersectFunc intersect;
+      OccludedFunc occluded;
+      PointQueryFunc pointQuery; // null unless the four-argument constructor was used
+      const char* name;
+    };
+    
+    struct Intersector4 // function table for ray packets of size 4
+    {
+      Intersector4 (ErrorFunc error = nullptr) // unnamed table: 'error' is cast into both slots
+      : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {}
+      /*! named table built from an intersect and an occluded function */
+      Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is present
+      
+    public:
+      static const char* type; // only declared here
+      IntersectFunc4 intersect;
+      OccludedFunc4 occluded;
+      const char* name;
+    };
+    
+    struct Intersector8 // function table for ray packets of size 8
+    {
+      Intersector8 (ErrorFunc error = nullptr) // unnamed table: 'error' is cast into both slots
+      : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {}
+      /*! named table built from an intersect and an occluded function */
+      Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is present
+      
+    public:
+      static const char* type; // only declared here
+      IntersectFunc8 intersect;
+      OccludedFunc8 occluded;
+      const char* name;
+    };
+    
+    struct Intersector16 // function table for ray packets of size 16
+    {
+      Intersector16 (ErrorFunc error = nullptr) // unnamed table: 'error' is cast into both slots
+      : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {}
+      /*! named table built from an intersect and an occluded function */
+      Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is present
+      
+    public:
+      static const char* type; // only declared here
+      IntersectFunc16 intersect;
+      OccludedFunc16 occluded;
+      const char* name;
+    };
+
+    struct IntersectorN // function table for ray streams
+    {
+      IntersectorN (ErrorFunc error = nullptr) // unnamed table: 'error' is cast into both slots
+      : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+      /*! named table built from an intersect and an occluded function */
+      IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
+      : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name; } // true only when a name is present
+      
+    public:
+      static const char* type; // only declared here
+      IntersectFuncN intersect;
+      OccludedFuncN occluded;
+      const char* name;
+    };
+   
+    struct Intersectors 
+    {
+      Intersectors() // everything unset: null data pointer, every listed table built from nullptr (unnamed)
+      : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr), intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {}
+      /*! seeds the active tables with 'error'; the *_filter / *_nofilter tables are not listed and are default-constructed */
+      Intersectors (ErrorFunc error) 
+      : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error), intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {}
+
+      void print(size_t ident) 
+      {
+        /* Writes "<label><name>" on a line of its own, preceded by 'ident'
+           blanks. A slot whose name is null is skipped entirely. */
+        auto emitSlot = [ident] (const char* label, const char* slotName)
+        {
+          if (slotName == nullptr)
+            return;
+
+          for (size_t pad=ident; pad>0; pad--)
+            std::cout << " ";
+          std::cout << label << slotName << std::endl;
+        };
+
+        /* the slots are reported in a fixed order: collider first */
+        emitSlot("collider  = ",     collider.name);
+
+        /* single-ray table */
+        emitSlot("intersector1  = ", intersector1.name);
+
+        /* currently active packet tables */
+        emitSlot("intersector4  = ", intersector4.name);
+        emitSlot("intersector8  = ", intersector8.name);
+        emitSlot("intersector16 = ", intersector16.name);
+
+        emitSlot("intersectorN = ",  intersectorN.name);
+      }
+
+      void select(bool filter)
+      {
+        /* For every width that has a named filter variant, make either the
+           filter or the nofilter variant the active table. Widths without a
+           named filter variant keep whatever table is currently active. */
+        if (intersector4_filter)
+          intersector4 = filter ? intersector4_filter : intersector4_nofilter;
+
+        if (intersector8_filter)
+          intersector8 = filter ? intersector8_filter : intersector8_nofilter;
+
+        if (intersector16_filter)
+          intersector16 = filter ? intersector16_filter : intersector16_nofilter;
+
+        if (intersectorN_filter)
+          intersectorN = filter ? intersectorN_filter : intersectorN_nofilter;
+
+        /* intersector1 and the collider are not touched by this function */
+      }
+
+      __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) { // dispatches through intersector1.pointQuery
+        assert(intersector1.pointQuery); // the slot must have been set
+        return intersector1.pointQuery(this,query,context);
+      }
+
+      /*! collides two scenes: hands the data pointers of both accels to the registered collide function */
+      __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) {
+        assert(collider.collide); // the slot must have been set
+        collider.collide(scene0->intersectors.ptr,scene1->intersectors.ptr,callback,userPtr);
+      }
+
+      /*! Intersects a single ray with the scene (dispatches through intersector1, whose slot must be set). */
+      __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) {
+        assert(intersector1.intersect);
+        intersector1.intersect(this,ray,context);
+      }
+
+      /*! Intersects a packet of 4 rays with the scene (dispatches through intersector4). */
+      __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) {
+        assert(intersector4.intersect);
+        intersector4.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a packet of 8 rays with the scene (dispatches through intersector8). */
+      __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) {
+        assert(intersector8.intersect);
+        intersector8.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a packet of 16 rays with the scene (dispatches through intersector16). */
+      __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) {
+        assert(intersector16.intersect);
+        intersector16.intersect(valid,this,ray,context);
+      }
+      
+      /*! Intersects a stream of N rays in SOA layout with the scene (dispatches through intersectorN). */
+      __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.intersect);
+        intersectorN.intersect(this,rayN,N,context);
+      }
+      /* typed overloads: take the integer mask from valid.mask32(), cast the packet to the RTC type and forward */
+#if defined(__SSE__) || defined(__ARM_NEON)
+      __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
+        const vint<4> mask = valid.mask32();
+        intersect4(&mask,(RTCRayHit4&)ray,context);
+      }
+#endif
+#if defined(__AVX__)
+      __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) {
+        const vint<8> mask = valid.mask32();
+        intersect8(&mask,(RTCRayHit8&)ray,context);
+      }
+#endif
+#if defined(__AVX512F__)
+      __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) {
+        const vint<16> mask = valid.mask32();
+        intersect16(&mask,(RTCRayHit16&)ray,context);
+      }
+#endif
+      /*! stream overload for RayHitK<K> pointers: casts them to RTCRayHitN** and forwards */
+      template<int K>
+      __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        intersectN((RTCRayHitN**)rayN,N,context);
+      }
+
+      /*! Tests if single ray is occluded by the scene (dispatches through intersector1, whose slot must be set). */
+      __forceinline void occluded (RTCRay& ray, IntersectContext* context) {
+        assert(intersector1.occluded);
+        intersector1.occluded(this,ray,context);
+      }
+      
+      /*! Tests if a packet of 4 rays is occluded by the scene (dispatches through intersector4). */
+      __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) {
+        assert(intersector4.occluded);
+        intersector4.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a packet of 8 rays is occluded by the scene (dispatches through intersector8). */
+      __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) {
+        assert(intersector8.occluded);
+        intersector8.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a packet of 16 rays is occluded by the scene (dispatches through intersector16). */
+      __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) {
+        assert(intersector16.occluded);
+        intersector16.occluded(valid,this,ray,context);
+      }
+      
+      /*! Tests if a stream of N rays in SOA layout is occluded by the scene (dispatches through intersectorN). */
+      __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.occluded);
+        intersectorN.occluded(this,rayN,N,context);
+      }
+      /* typed overloads: take the integer mask from valid.mask32(), cast the packet to the RTC type and forward */
+#if defined(__SSE__) || defined(__ARM_NEON)
+      __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
+        const vint<4> mask = valid.mask32();
+        occluded4(&mask,(RTCRay4&)ray,context);
+      }
+#endif
+#if defined(__AVX__)
+      __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) {
+        const vint<8> mask = valid.mask32();
+        occluded8(&mask,(RTCRay8&)ray,context);
+      }
+#endif
+#if defined(__AVX512F__)
+      __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) {
+        const vint<16> mask = valid.mask32();
+        occluded16(&mask,(RTCRay16&)ray,context);
+      }
+#endif
+      /*! stream overload for RayK<K> pointers: casts them to RTCRayN** and forwards */
+      template<int K>
+      __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        occludedN((RTCRayN**)rayN,N,context);
+      }
+
+      /*! Overload taking an RTCRay instead of an RTCRayHit: forwards to occluded(), i.e. performs an occlusion test. */
+      __forceinline void intersect(RTCRay& ray, IntersectContext* context) {
+        occluded(ray, context);
+      }
+
+      /*! Packet overload taking a RayK<K>: forwards to the occluded() overload for the same mask and packet types. */
+      template<int K>
+      __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) {
+        occluded(valid, ray, context);
+      }
+
+      /*! Stream overload taking RayK<K> pointers: forwards to occludedN(). */
+      template<int K>
+      __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) {
+        occludedN(rayN, N, context);
+      }
+      
+    public:
+      AccelData* ptr;                       // data pointer; collide() passes it to the collide function
+      void* leafIntersector;                // set to null by both constructors; not used elsewhere in this header
+      Collider collider;                    // used by collide()
+      Intersector1 intersector1;            // used by intersect(), occluded() and pointQuery()
+      Intersector4 intersector4;            // active table for packets of 4
+      Intersector4 intersector4_filter;     // candidate chosen by select(true)
+      Intersector4 intersector4_nofilter;   // candidate chosen by select(false)
+      Intersector8 intersector8;            // active table for packets of 8
+      Intersector8 intersector8_filter;     // candidate chosen by select(true)
+      Intersector8 intersector8_nofilter;   // candidate chosen by select(false)
+      Intersector16 intersector16;          // active table for packets of 16
+      Intersector16 intersector16_filter;   // candidate chosen by select(true)
+      Intersector16 intersector16_nofilter; // candidate chosen by select(false)
+      IntersectorN intersectorN;            // active table for ray streams
+      IntersectorN intersectorN_filter;     // candidate chosen by select(true)
+      IntersectorN intersectorN_nofilter;   // candidate chosen by select(false)
+    };
+  
+  public:
+
+    /*! Construction from a type tag only; 'intersectors' is default-constructed */
+    Accel (const AccelData::Type type) 
+      : AccelData(type) {}
+    
+    /*! Construction from a type tag and a set of intersectors, which is copied */
+    Accel (const AccelData::Type type, const Intersectors& intersectors) 
+      : AccelData(type), intersectors(intersectors) {}
+
+    /*! Virtual destructor */
+    virtual ~Accel() {}
+
+    /*! makes the acceleration structure immutable (default: does nothing) */
+    virtual void immutable () {}
+    
+    /*! build acceleration structure (pure virtual: subclasses must implement it) */
+    virtual void build () = 0;
+
+  public:
+    Intersectors intersectors; // function tables used to traverse this structure
+  };
+  /* defines symbol(), returning an Accel::Collider for collider::collide named TOSTRING(isa) "::" TOSTRING(symbol); 'isa' must be provided by the including code */
+#define DEFINE_COLLIDER(symbol,collider)                                \
+  Accel::Collider symbol() {                                            \
+    return Accel::Collider((Accel::CollideFunc)collider::collide,       \
+                           TOSTRING(isa) "::" TOSTRING(symbol));        \
+  }
+  /* defines symbol(), returning an Accel::Intersector1 for intersector::intersect, ::occluded and ::pointQuery */
+#define DEFINE_INTERSECTOR1(symbol,intersector)                               \
+  Accel::Intersector1 symbol() {                                              \
+    return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \
+                               (Accel::OccludedFunc  )intersector::occluded,  \
+                               (Accel::PointQueryFunc)intersector::pointQuery,\
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  /* defines symbol(), returning an Accel::Intersector4 for intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTOR4(symbol,intersector)                               \
+  Accel::Intersector4 symbol() {                                              \
+    return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \
+                               (Accel::OccludedFunc4)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  /* defines symbol(), returning an Accel::Intersector8 for intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTOR8(symbol,intersector)                               \
+  Accel::Intersector8 symbol() {                                              \
+    return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \
+                               (Accel::OccludedFunc8)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  /* defines symbol(), returning an Accel::Intersector16 for intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTOR16(symbol,intersector)                                \
+  Accel::Intersector16 symbol() {                                               \
+    return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \
+                                (Accel::OccludedFunc16)intersector::occluded,   \
+                                TOSTRING(isa) "::" TOSTRING(symbol));           \
+  }
+  /* defines symbol(), returning an Accel::IntersectorN for intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTORN(symbol,intersector)                               \
+  Accel::IntersectorN symbol() {                                              \
+    return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \
+                               (Accel::OccludedFuncN)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+
+  /* ray stream filter interface: intersect entry points, one per stream layout suffix (AOS, AOP, SOA, SOP) */
+  typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context);
+  /* occlusion counterparts: same parameter lists, with RTCRay types in place of the RTCRayHit types */
+  typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context);
+
+  struct RayStreamFilterFuncs // table of the eight ray stream entry points
+  {
+    RayStreamFilterFuncs() // all eight slots null
+    : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr),
+      occludedAOS(nullptr),  occludedAOP(nullptr),  occludedSOA(nullptr),  occludedSOP(nullptr) {}
+    /* fills all eight slots with the same parameterless function, cast to each slot type */
+    RayStreamFilterFuncs(void (*ptr) ())
+    : intersectAOS((intersectStreamAOS_func) ptr), intersectAOP((intersectStreamAOP_func) ptr), intersectSOA((intersectStreamSOA_func) ptr), intersectSOP((intersectStreamSOP_func) ptr),
+      occludedAOS((occludedStreamAOS_func) ptr),   occludedAOP((occludedStreamAOP_func) ptr),   occludedSOA((occludedStreamSOA_func) ptr),   occludedSOP((occludedStreamSOP_func) ptr) {}
+    /* fills every slot individually */
+    RayStreamFilterFuncs(intersectStreamAOS_func intersectAOS, intersectStreamAOP_func intersectAOP, intersectStreamSOA_func intersectSOA, intersectStreamSOP_func intersectSOP,
+                         occludedStreamAOS_func  occludedAOS,  occludedStreamAOP_func  occludedAOP,  occludedStreamSOA_func  occludedSOA,  occludedStreamSOP_func  occludedSOP)
+    : intersectAOS(intersectAOS), intersectAOP(intersectAOP), intersectSOA(intersectSOA), intersectSOP(intersectSOP),
+      occludedAOS(occludedAOS),   occludedAOP(occludedAOP),   occludedSOA(occludedSOA),   occludedSOP(occludedSOP) {}
+
+  public:
+    intersectStreamAOS_func intersectAOS;
+    intersectStreamAOP_func intersectAOP;
+    intersectStreamSOA_func intersectSOA;
+    intersectStreamSOP_func intersectSOP;
+
+    occludedStreamAOS_func occludedAOS;
+    occludedStreamAOP_func occludedAOP;
+    occludedStreamSOA_func occludedSOA;
+    occludedStreamSOP_func occludedSOP;
+  }; 
+  /* type of a parameterless function that returns a RayStreamFilterFuncs table by value */
+  typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)();
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelinstance.h b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
new file mode 100644
index 0000000000..d74b96df3f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+#include "builder.h"
+
+namespace embree
+{
+  /*! Accel that pairs acceleration structure data with the builder filling it; owns both through unique_ptr. */
+  class AccelInstance : public Accel
+  {
+  public:
+    AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors)
+      : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {}
+    /*! releases the builder; the other methods then skip their builder step */
+    void immutable () override { builder.reset(nullptr); }
+
+    /*! runs the builder if there still is one, then copies the bounds of the data */
+    void build () override {
+      if (builder) builder->build();
+      if (accel) bounds = accel->bounds; // null-guarded like deleteGeometry() and clear()
+    }
+
+    /*! forwards the deletion notification to data and builder, whichever exist */
+    void deleteGeometry(size_t geomID) override {
+      if (accel  ) accel->deleteGeometry(geomID);
+      if (builder) builder->deleteGeometry(geomID);
+    }
+
+    /*! clears data and builder, whichever exist */
+    void clear() override {
+      if (accel) accel->clear();
+      if (builder) builder->clear();
+    }
+  private:
+    std::unique_ptr<AccelData> accel;
+    std::unique_ptr<Builder> builder;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.cpp b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
new file mode 100644
index 0000000000..aadb4a64ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
@@ -0,0 +1,232 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "acceln.h"
+#include "ray.h"
+#include "../../include/embree3/rtcore_ray.h"
+#include "../../common/algorithms/parallel_for.h"
+
+namespace embree
+{
+  AccelN::AccelN() // tagged TY_ACCELN; starts without any child structure
+    : Accel(AccelData::TY_ACCELN), accels() {}
+
+  AccelN::~AccelN() 
+  {
+    for (Accel* child : accels) // the registered children are released here
+      delete child;
+  }
+
+  void AccelN::accels_add(Accel* accel) 
+  {
+    assert(accel); // a null child is a caller error
+    accels.push_back(accel); // stored as raw pointer; deleted later by accels_init() or the destructor
+  }
+
+  void AccelN::accels_init() 
+  {
+    /* destroy every registered child, then drop the stale pointers */
+    for (Accel* child : accels)
+      delete child;
+    accels.clear();
+  }
+
+  bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context)
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    bool anyChanged = false;
+    for (Accel* child : self->accels) { // every non-empty child is queried, there is no early out
+      if (!child->isEmpty()) anyChanged |= child->intersectors.pointQuery(query,context);
+    }
+    return anyChanged; // OR of the children's results
+  }
+
+  void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context) 
+  {
+    /* hand the ray to each non-empty child in turn, in the order the children are stored */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect(ray,context);
+  }
+
+  void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context) 
+  {
+    /* hand the packet of 4 to each non-empty child in turn, always with the same valid mask */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect4(valid,ray,context);
+  }
+
+  void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context) 
+  {
+    /* hand the packet of 8 to each non-empty child in turn, always with the same valid mask */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect8(valid,ray,context);
+  }
+
+  void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context) 
+  {
+    /* hand the packet of 16 to each non-empty child in turn, always with the same valid mask */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect16(valid,ray,context);
+  }
+
+  void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context)
+  {
+    /* hand the whole stream of N rays to each non-empty child in turn */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersectN(ray,N,context);
+  }
+
+  void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context) 
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded(ray,context);
+      if (ray.tfar < 0.0f) return; // tfar went negative: the remaining children are skipped
+    }
+  }
+
+  void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context) 
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded4(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON)
+      vbool4 active0  = asBool(((vint4*)valid)[0]);
+      vbool4 pending0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); // lanes whose tfar is still non-negative
+      if (unlikely(none(active0 & pending0))) return; // no valid lane is pending any more
+#endif
+    }
+  }
+
+  void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) 
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded8(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+      vbool4 active0  = asBool(((vint4*)valid)[0]); // the 8 lanes are examined as two groups of 4
+      vbool4 pending0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); // lanes whose tfar is still non-negative
+      vbool4 active1  = asBool(((vint4*)valid)[1]);
+      vbool4 pending1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      if (unlikely((none((active0 & pending0) | (active1 & pending1))))) return; // no valid lane is pending any more
+#endif
+    }
+  }
+
+  void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context) 
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded16(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+      vbool4 active0  = asBool(((vint4*)valid)[0]); // the 16 lanes are examined as four groups of 4
+      vbool4 pending0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); // lanes whose tfar is still non-negative
+      vbool4 active1  = asBool(((vint4*)valid)[1]);
+      vbool4 pending1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      vbool4 active2  = asBool(((vint4*)valid)[2]);
+      vbool4 pending2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero);
+      vbool4 active3  = asBool(((vint4*)valid)[3]);
+      vbool4 pending3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero);
+      if (unlikely((none((active0 & pending0) | (active1 & pending1) | (active2 & pending2) | (active3 & pending3))))) return;
+#endif
+    }
+  }
+
+  void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context)
+  {
+    /* no early out in this variant: every non-empty child receives the full stream of N rays */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty())
+        child->intersectors.occludedN(ray,N,context);
+  }
+
+  void AccelN::accels_print(size_t ident)
+  {
+    size_t slot = 0;
+    for (Accel* child : accels) {
+      for (size_t pad=ident; pad>0; pad--) std::cout << " ";
+      std::cout << "accels[" << slot++ << "]" << std::endl;
+      child->intersectors.print(ident+2); // the child's tables are indented two blanks deeper
+    }
+  }
+
+  void AccelN::accels_immutable()
+  {
+    for (Accel* child : accels) // forwarded to every child, empty or not
+      child->immutable();
+  }
+  
+  void AccelN::accels_build () 
+  { // builds all children, then sets up this object either as the sole child or as the union of all children
+    /* reduce memory consumption */
+    accels.shrink_to_fit();
+    
+    /* build all acceleration structures in parallel */
+    parallel_for (accels.size(), [&] (size_t i) { 
+        accels[i]->build();
+      });
+
+    /* a width stays valid only if every child has a named intersector for it (operator bool tests the name) */
+    bool valid1 = true;
+    bool valid4 = true;
+    bool valid8 = true;
+    bool valid16 = true;
+    for (size_t i=0; i<accels.size(); i++) {
+      valid1 &= (bool) accels[i]->intersectors.intersector1;
+      valid4 &= (bool) accels[i]->intersectors.intersector4;
+      valid8 &= (bool) accels[i]->intersectors.intersector8;
+      valid16 &= (bool) accels[i]->intersectors.intersector16;
+    }
+    /* exactly one child: take over its type, bounds and intersectors (the valid flags above go unused) */
+    if (accels.size() == 1) {
+      type = accels[0]->type; // FIXME: should just assign entire Accel
+      bounds = accels[0]->bounds;
+      intersectors = accels[0]->intersectors;
+    }
+    else // zero or several children: route all calls through the static AccelN entry points
+    {
+      type = AccelData::TY_ACCELN;
+      intersectors.ptr = this;
+      intersectors.intersector1  = Intersector1(&intersect,&occluded,&pointQuery,valid1 ? "AccelN::intersector1": nullptr);
+      intersectors.intersector4  = Intersector4(&intersect4,&occluded4,valid4 ? "AccelN::intersector4" : nullptr);
+      intersectors.intersector8  = Intersector8(&intersect8,&occluded8,valid8 ? "AccelN::intersector8" : nullptr);
+      intersectors.intersector16 = Intersector16(&intersect16,&occluded16,valid16 ? "AccelN::intersector16": nullptr);
+      intersectors.intersectorN  = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN");
+      /* a null name leaves the table unnamed, so it converts to false; the stream table is always named */
+      /*! calculate bounds */
+      bounds = empty;
+      for (size_t i=0; i<accels.size(); i++) 
+        bounds.extend(accels[i]->bounds);
+    }
+  }
+
+  void AccelN::accels_select(bool filter)
+  {
+    for (Accel* child : accels) // each child picks between its own filter / nofilter tables
+      child->intersectors.select(filter);
+  }
+
+  void AccelN::accels_deleteGeometry(size_t geomID) 
+  {
+    for (Accel* child : accels) // every child is notified about the deleted geometry
+      child->deleteGeometry(geomID);
+  }
+
+  void AccelN::accels_clear()
+  {
+    /* calls clear() on every child; the children themselves stay registered */
+    for (Accel* child : accels)
+      child->clear();
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.h b/thirdparty/embree-aarch64/kernels/common/acceln.h
new file mode 100644
index 0000000000..2edd98f647
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+
+namespace embree
+{
+  /*! merges N acceleration structures together, by processing them in order */
+  class AccelN : public Accel
+  {
+  public:
+    AccelN ();
+    ~AccelN();
+
+  public:
+    /*! adds a child acceleration structure to the 'accels' list */
+    void accels_add(Accel* accel);
+
+    /*! sets up type, bounds and intersectors of this object from the current children */
+    void accels_init();
+
+  public:
+    /*! point query entry point; 'This' is the Intersectors object of the AccelN */
+    static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+
+  public:
+    /*! intersect entry points for single rays, packets of 4/8/16 rays, and N ray pointers;
+     *  for the packet variants 'valid' is the packet's active mask */
+    static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context);
+    static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context);
+    static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context);
+    static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context);
+    static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context);
+
+  public:
+    /*! occlusion entry points, mirroring the intersect variants above */
+    static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context);
+    static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context);
+    static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context);
+    static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context);
+    static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context);
+
+  public:
+    /*! helpers that apply the respective operation to all children */
+    void accels_print(size_t ident);
+    void accels_immutable();
+    void accels_build ();
+    void accels_select(bool filter);
+    void accels_deleteGeometry(size_t geomID);
+    void accels_clear ();
+
+  public:
+    /*! child acceleration structures, processed in order.
+     *  NOTE(review): raw pointers; ownership is presumably handled in the destructor -- confirm in acceln.cpp */
+    std::vector<Accel*> accels;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.cpp b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
new file mode 100644
index 0000000000..79be1c4301
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
@@ -0,0 +1,17 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "accelset.h"
+#include "scene.h"
+
+namespace embree
+{
+  /*! forwards device, geometry type and the item/time step counts (narrowed to unsigned int) to Geometry; no bounds callback is set yet */
+  AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps)
+    : Geometry(device,
+               gtype,
+               static_cast<unsigned int>(numItems),
+               static_cast<unsigned int>(numTimeSteps)),
+      boundsFunc(nullptr)
+  {
+  }
+
+  AccelSet::IntersectorN::IntersectorN (ErrorFunc error) 
+    : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+  
+  /*! creates an intersector from the two callbacks and a name; the name pointer is stored, not copied */
+  AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
+    : intersect(intersect),
+      occluded(occluded),
+      name(name)
+  {
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.h b/thirdparty/embree-aarch64/kernels/common/accelset.h
new file mode 100644
index 0000000000..3774b2accb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.h
@@ -0,0 +1,248 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "builder.h"
+#include "geometry.h"
+#include "ray.h"
+#include "hit.h"
+
+namespace embree
+{
+  /* forward declarations, needed by the callback typedefs below */
+  struct IntersectFunctionNArguments;
+  struct OccludedFunctionNArguments;
+  
+  /*! callback types stored in the 'report' member of the extended argument structs;
+   *  they receive the extended arguments plus the filter function arguments */
+  typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  
+  /*! Public intersect callback arguments extended with internal state.
+   *  The extra members are appended after the public base struct. */
+  struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
+  {
+    IntersectContext* internal_context; //!< internal intersection context of the current query
+    Geometry* geometry;                 //!< geometry the callback is invoked for
+    ReportIntersectionFunc report;      //!< function used to report an intersection
+  };
+
+  /*! Public occlusion callback arguments extended with internal state.
+   *  The extra members are appended after the public base struct. */
+  struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
+  {
+    IntersectContext* internal_context; //!< internal intersection context of the current query
+    Geometry* geometry;                 //!< geometry the callback is invoked for
+    ReportOcclusionFunc report;         //!< function used to report an occlusion
+  };
+
+  /*! Base class for set of acceleration structures. */
+  class AccelSet : public Geometry
+  {
+  public:
+    /* callback signatures: the public RTC callback types plus a generic error function type */
+    typedef RTCIntersectFunctionN IntersectFuncN;  
+    typedef RTCOccludedFunctionN OccludedFuncN;
+    typedef void (*ErrorFunc) ();
+
+      /*! Pair of intersect/occluded callbacks together with a descriptive name. */
+      struct IntersectorN
+      {
+        IntersectorN (ErrorFunc error = nullptr) ;
+        IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name);
+        
+        /*! an intersector counts as set only if it has a name */
+        operator bool() const { return name; }
+        
+      public:
+        static const char* type;
+        IntersectFuncN intersect;
+        OccludedFuncN occluded; 
+        const char* name;
+      };
+      
+    public:
+      
+      /*! construction */
+      AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps);
+      
+      /*! makes the acceleration structure immutable */
+      virtual void immutable () {}
+      
+      /*! build accel */
+      virtual void build () = 0;
+
+      /*! check if the i'th primitive is valid between the specified time range */
+      __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+      {
+        /* note: itime_range.end() is included in the iteration */
+        for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+          if (!isvalid_non_empty(bounds(i,itime))) return false;
+        
+        return true;
+      }
+
+      /*! Calculates the bounds of an item */
+      __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const
+      {
+        /* 'box' is filled in by the bounds callback through args.bounds_o */
+        BBox3fa box;
+        assert(i < size());
+        RTCBoundsFunctionArguments args;
+        args.geometryUserPtr = userPtr;
+        args.primID = (unsigned int)i;
+        args.timeStep = (unsigned int)itime;
+        args.bounds_o = (RTCBounds*)&box;
+        boundsFunc(&args);
+        return box;
+      }
+
+      /*! calculates the linear bounds of the i'th item at the itime'th time segment */
+      __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const
+      {
+        /* the callback is invoked twice, for time steps itime and itime+1, which delimit the segment */
+        BBox3fa box[2];
+        assert(i < size());
+        RTCBoundsFunctionArguments args;
+        args.geometryUserPtr = userPtr;
+        args.primID = (unsigned int)i;
+        args.timeStep = (unsigned int)(itime+0);
+        args.bounds_o = (RTCBounds*)&box[0];
+        boundsFunc(&args);
+        args.timeStep = (unsigned int)(itime+1);
+        args.bounds_o = (RTCBounds*)&box[1];
+        boundsFunc(&args);
+        return LBBox3fa(box[0],box[1]);
+      }
+
+      /*! calculates the build bounds of the i'th item, if it's valid */
+      __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+      {
+        /* the bounds are written to *bbox (when provided) even if they turn out to be invalid */
+        const BBox3fa b = bounds(i);
+        if (bbox) *bbox = b;
+        return isvalid_non_empty(b);
+      }
+
+      /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid */
+      __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+      {
+        const LBBox3fa bounds = linearBounds(i,itime);
+        bbox = bounds.bounds0; // use bounding box of first timestep to build BVH
+        return isvalid_non_empty(bounds);
+      }
+
+      /*! calculates the linear bounds of the i'th primitive for the specified time range */
+      __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+        return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+      }
+      
+      /*! calculates the linear bounds of the i'th primitive for the specified time range */
+      __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const  {
+        /* note: the 'time_range' parameter hides the member of the same name inside this function;
+         * bbox is left untouched when the primitive is not valid */
+        if (!valid(i, timeSegmentRange(time_range))) return false;
+        bbox = linearBounds(i, time_range);
+        return true;
+      }
+
+      /* gets version info of topology */
+      unsigned int getTopologyVersion() const {
+        return numPrimitives;
+      }
+    
+      /* returns true if topology changed */
+      bool topologyChanged(unsigned int otherVersion) const {
+        return numPrimitives != otherVersion;
+      }
+
+  public:
+
+      /*! Intersects a single ray with the scene. */
+      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+        
+        /* a single ray is passed to the callback as a packet of size 1 with an all-ones valid mask */
+        int mask = -1;
+        IntersectFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.rayhit = (RTCRayHitN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.intersect(&args);
+      }
+
+      /*! Tests if single ray is occluded by the scene. */
+      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+        
+        /* a single ray is passed to the callback as a packet of size 1 with an all-ones valid mask */
+        int mask = -1;
+        OccludedFunctionNArguments args;
+        args.valid = &mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.ray = (RTCRayN*)&ray;
+        args.N = 1;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.occluded(&args);
+      }
+   
+      /*! Intersects a packet of K rays with the scene. */
+      template<int K>
+        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+        
+        /* the boolean packet mask is converted to one 32 bit integer per ray for the callback */
+        vint<K> mask = valid.mask32();
+        IntersectFunctionNArguments args;
+        args.valid = (int*)&mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.rayhit = (RTCRayHitN*)&ray;
+        args.N = K;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+         
+        intersectorN.intersect(&args);
+      }
+
+      /*! Tests if a packet of K rays is occluded by the scene. */
+      template<int K>
+        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+        
+        /* the boolean packet mask is converted to one 32 bit integer per ray for the callback */
+        vint<K> mask = valid.mask32();
+        OccludedFunctionNArguments args;
+        args.valid = (int*)&mask;
+        args.geometryUserPtr = userPtr;
+        args.context = context->user;
+        args.ray = (RTCRayN*)&ray;
+        args.N = K;
+        args.geomID = geomID;
+        args.primID = primID;
+        args.internal_context = context;
+        args.geometry = this;
+        args.report = report;
+        
+        intersectorN.occluded(&args);
+      }
+
+    public:
+      RTCBoundsFunction boundsFunc; //!< callback that computes the bounds of an item (nullptr after construction)
+      IntersectorN intersectorN;    //!< intersect/occluded callbacks invoked by the methods above
+  };
+  
+  /*! Defines a function 'symbol' that returns an AccelSet::IntersectorN built from the
+   *  static intersect/occluded functions of 'intersector'. The intersector name is the
+   *  stringified 'isa' token followed by "::" and the stringified symbol. */
+#define DEFINE_SET_INTERSECTORN(symbol,intersector)                     \
+  AccelSet::IntersectorN symbol() {                                     \
+    return AccelSet::IntersectorN(intersector::intersect, \
+                                  intersector::occluded, \
+                                  TOSTRING(isa) "::" TOSTRING(symbol)); \
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.cpp b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
new file mode 100644
index 0000000000..6fa406f03a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
@@ -0,0 +1,82 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "../../common/sys/thread.h"
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include "../../common/sys/barrier.h"
+#endif
+
+namespace embree
+{
+  /*! per thread pointer to the ThreadLocal2 of the calling thread, null until the thread first requests one */
+  __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr;
+  /*! lock taken while appending to s_thread_local_allocators */
+  SpinLock FastAllocator::s_thread_local_allocators_lock;
+  /*! owning list of the ThreadLocal2 objects created so far, shared by all FastAllocator instances */
+  std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators;
+   
+  /*! Regression test: one worker per logical thread allocates from a shared FastAllocator,
+   *  tags each allocation and verifies afterwards that no tag got overwritten. The main
+   *  thread resets the allocator between rounds; two barrier waits per round keep all
+   *  threads in lock step. */
+  struct fast_allocator_regression_test : public RegressionTest
+  {
+    BarrierSys barrier;
+    std::atomic<size_t> numFailed;
+    std::unique_ptr<FastAllocator> alloc;
+
+    fast_allocator_regression_test()
+      : RegressionTest("fast_allocator_regression_test"), numFailed(0)
+    {
+      registerRegressionTest(this);
+    }
+
+    /*! worker body: per round, perform 1000 small allocations and validate their contents */
+    static void thread_alloc(fast_allocator_regression_test* This)
+    {
+      FastAllocator::CachedAllocator cached = This->alloc->getCachedAllocator();
+
+      /* value written into allocation k is tag+k, which makes it unique per thread and slot */
+      const size_t tag = size_t(cached.talloc0);
+
+      size_t* ptrs[1000];
+      for (size_t round=0; round<1000; round++)
+      {
+        /* wait until the main thread has reset the allocator */
+        This->barrier.wait();
+
+        for (size_t k=0; k<1000; k++) {
+          ptrs[k] = (size_t*) cached.malloc0(sizeof(size_t)+(k%32));
+          *ptrs[k] = tag + k;
+        }
+
+        for (size_t k=0; k<1000; k++) {
+          const bool intact = (*ptrs[k] == tag + k);
+          if (!intact)
+            This->numFailed++;
+        }
+
+        /* signal the main thread that this round is finished */
+        This->barrier.wait();
+      }
+    }
+
+    /*! runs the test; returns true if no corrupted allocation was detected */
+    bool run ()
+    {
+      alloc = make_unique(new FastAllocator(nullptr,false));
+      numFailed.store(0);
+
+      /* all workers plus the main thread take part in the barrier */
+      const size_t numWorkers = getNumberOfLogicalThreads();
+      barrier.init(numWorkers+1);
+
+      /* create threads */
+      std::vector<thread_t> workers;
+      for (size_t t=0; t<numWorkers; t++) {
+        workers.push_back(createThread((thread_func)thread_alloc,this));
+      }
+
+      /* run test: reset, release the workers, then wait for them to finish the round */
+      for (size_t round=0; round<1000; round++)
+      {
+        alloc->reset();
+        barrier.wait();
+        barrier.wait();
+      }
+
+      /* destroy threads */
+      for (size_t t=0; t<numWorkers; t++) {
+        join(workers[t]);
+      }
+
+      alloc.reset();
+
+      return numFailed.load() == 0;
+    }
+  };
+
+  /* global instance; its constructor registers the test via registerRegressionTest */
+  fast_allocator_regression_test fast_allocator_regression;
+}
+
+
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.h b/thirdparty/embree-aarch64/kernels/common/alloc.h
new file mode 100644
index 0000000000..488fa707ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.h
@@ -0,0 +1,1006 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "primref.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+namespace embree
+{
+  class FastAllocator
+  {
+    /*! maximum supported alignment */
+    static const size_t maxAlignment = 64;
+
+    /* default settings */
+    //static const size_t defaultBlockSize = 4096;
+
+    /*! maximum allocation size (2MB minus maxAlignment); defined as a macro, not a class constant */
+#define maxAllocationSize size_t(2*1024*1024-maxAlignment)
+
+    /*! number of block slots; threads are mapped to a slot through threadID & slotMask */
+    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8;
+
+  public:
+
+    struct ThreadLocal2;
+
+    /*! how the memory of a block is obtained; the allocator itself is configured with
+     *  either EMBREE_OS_MALLOC or ALIGNED_MALLOC */
+    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
+
+    /*! Per thread structure holding the current memory block. */
+    /*! Per thread bump allocator that serves requests from its current memory block and
+     *  fetches new blocks from the parent FastAllocator when the block is exhausted. */
+    struct __aligned(64) ThreadLocal
+    {
+      ALIGNED_CLASS_(64);
+    public:
+
+      /*! Constructor for usage with ThreadLocalData */
+      __forceinline ThreadLocal (ThreadLocal2* parent)
+        : parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
+
+      /*! forgets the current block and all statistics; the block size is taken from
+       *  'alloc', or set to 0 if no allocator is given */
+      void init(FastAllocator* alloc)
+      {
+        ptr = nullptr;
+        cur = 0;
+        end = 0;
+        bytesUsed = 0;
+        bytesWasted = 0;
+        allocBlockSize = 0;
+        if (alloc) allocBlockSize = alloc->defaultBlockSize;
+      }
+
+      /*! Allocates 'bytes' bytes aligned to 'align' (at most maxAlignment). Order of attempts:
+       *  current block, direct allocation from the parent for large requests, a fresh
+       *  partial block, and finally a fresh full block. */
+      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16)
+      {
+        /* bind the thread local allocator to the proper FastAllocator*/
+        parent->bind(alloc);
+
+        assert(align <= maxAlignment);
+        bytesUsed += bytes;
+
+        /* try to allocate in local block */
+        void* result = nullptr;
+        if (allocateFromBlock(bytes,align,result))
+          return result;
+
+        /* if allocation is too large allocate with parent allocator */
+        if (4*bytes > allocBlockSize)
+          return alloc->malloc(bytes,maxAlignment,false);
+
+        /* get new partial block if allocation failed, then retry */
+        fetchBlock(alloc,true);
+        if (allocateFromBlock(bytes,align,result))
+          return result;
+
+        /* get new full block if allocation failed, then retry */
+        fetchBlock(alloc,false);
+        if (allocateFromBlock(bytes,align,result))
+          return result;
+
+        /* should never happen as large allocations get handled specially above */
+        assert(false);
+        return nullptr;
+      }
+
+      /*! returns amount of used bytes */
+      __forceinline size_t getUsedBytes() const { return bytesUsed; }
+
+      /*! returns amount of free bytes */
+      __forceinline size_t getFreeBytes() const { return end-cur; }
+
+      /*! returns amount of wasted bytes */
+      __forceinline size_t getWastedBytes() const { return bytesWasted; }
+
+    private:
+
+      /*! tries to carve an aligned allocation out of the current block; on success the
+       *  address is stored in 'result', on failure the allocator state is unchanged */
+      __forceinline bool allocateFromBlock(size_t bytes, size_t align, void*& result)
+      {
+        /* padding needed to bring 'cur' to the requested alignment */
+        const size_t padding = (align - cur) & (align-1);
+        const size_t next = cur + bytes + padding;
+        if (likely(next <= end)) {
+          cur = next;
+          bytesWasted += padding;
+          result = &ptr[next - bytes];
+          return true;
+        }
+        return false;
+      }
+
+      /*! replaces the current block by a new one from the parent allocator; the
+       *  unused rest of the old block is counted as wasted */
+      __forceinline void fetchBlock(FastAllocator* alloc, bool partial)
+      {
+        /* passed by reference, the parent allocator may adjust the size */
+        size_t blockSize = allocBlockSize;
+        ptr = (char*) alloc->malloc(blockSize,maxAlignment,partial);
+        bytesWasted += end-cur;
+        cur = 0;
+        end = blockSize;
+      }
+
+    private:
+      ThreadLocal2* parent;
+      char*  ptr;            //!< pointer to memory block
+      size_t cur;            //!< current location of the allocator
+      size_t end;            //!< end of the memory block
+      size_t allocBlockSize; //!< block size for allocations
+      size_t bytesUsed;      //!< number of total bytes allocated
+      size_t bytesWasted;    //!< number of bytes wasted
+    };
+
+    /*! Two thread local structures. */
+    /*! Pair of thread local allocators that is bound to at most one FastAllocator at a time. */
+    struct __aligned(64) ThreadLocal2
+    {
+      ALIGNED_CLASS_(64);
+    public:
+
+      __forceinline ThreadLocal2()
+        : alloc(nullptr), alloc0(this), alloc1(this) {}
+
+      /*! binds both allocators to 'alloc_i'; the statistics gathered for a previously
+       *  bound allocator are handed over to it first */
+      __forceinline void bind(FastAllocator* alloc_i)
+      {
+        assert(alloc_i);
+        if (alloc.load() == alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+        std::scoped_lock lock(mutex);
+#else
+        Lock<SpinLock> lock(mutex);
+#endif
+        /* no second check of 'alloc' needed here as only one thread calls bind */
+        FastAllocator* const previous = alloc.load();
+        if (previous)
+          flushStatistics(previous);
+
+        alloc0.init(alloc_i);
+        alloc1.init(alloc_i);
+        alloc.store(alloc_i);
+        alloc_i->join(this);
+      }
+
+      /*! detaches from 'alloc_i' if currently bound to it, handing over the statistics */
+      void unbind(FastAllocator* alloc_i)
+      {
+        assert(alloc_i);
+        if (alloc.load() != alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+        std::scoped_lock lock(mutex);
+#else
+        Lock<SpinLock> lock(mutex);
+#endif
+        /* second check under the lock is required as a different thread calls unbind */
+        FastAllocator* const current = alloc.load();
+        if (current != alloc_i) return;
+
+        flushStatistics(current);
+        alloc0.init(nullptr);
+        alloc1.init(nullptr);
+        alloc.store(nullptr);
+      }
+
+    private:
+
+      /*! adds used/free/wasted byte counts of both thread local allocators to 'target' */
+      __forceinline void flushStatistics(FastAllocator* target)
+      {
+        target->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
+        target->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
+        target->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+      }
+
+    public:
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::mutex mutex;
+#else
+      SpinLock mutex;        //!< required as unbind is called from other threads
+#endif
+      std::atomic<FastAllocator*> alloc;  //!< parent allocator
+      ThreadLocal alloc0;
+      ThreadLocal alloc1;
+    };
+
+    /*! Creates an empty allocator without any blocks. 'osAllocation' selects EMBREE_OS_MALLOC
+     *  over ALIGNED_MALLOC as allocation type; block and grow size start at PAGE_SIZE. */
+    FastAllocator (Device* device, bool osAllocation) 
+      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
+        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
+        primrefarray(device,0)
+    {
+      /* all block slots start out empty and unlocked */
+      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
+      {
+        threadUsedBlocks[i] = nullptr;
+        threadBlocks[i] = nullptr;
+        assert(!slotMutex[i].isLocked());
+      }
+    }
+
+    /*! releases all memory held by the allocator through clear() */
+    ~FastAllocator ()
+    {
+      clear();
+    }
+
+    /*! returns the device attached to this allocator */
+    Device* getDevice()
+    {
+      /* plain accessor, the pointer passed at construction is returned as is */
+      return device;
+    }
+
+    /*! takes over the given primref array by moving it into the allocator */
+    void share(mvector<PrimRef>& primrefarray_i)
+    {
+      primrefarray = std::move(primrefarray_i);
+    }
+
+    /*! hands the primref array taken by share() back to the caller */
+    void unshare(mvector<PrimRef>& primrefarray_o)
+    {
+      /* reset first: this removes blocks that are allocated inside the shared primref array */
+      reset();
+
+      primrefarray_o = std::move(primrefarray);
+    }
+
+    /*! returns first fast thread local allocator */
+    __forceinline ThreadLocal* _threadLocal()
+    {
+      /* the first allocator of the calling thread's allocator pair */
+      ThreadLocal2* const pair = threadLocal2();
+      return &pair->alloc0;
+    }
+
+    /*! selects the allocation type: EMBREE_OS_MALLOC if 'flag' is set, ALIGNED_MALLOC otherwise */
+    void setOSallocation(bool flag)
+    {
+      if (flag)
+        atype = EMBREE_OS_MALLOC;
+      else
+        atype = ALIGNED_MALLOC;
+    }
+
+  private:
+
+    /*! returns both fast thread local allocators */
+    __forceinline ThreadLocal2* threadLocal2()
+    {
+      /* fast path: this thread already has its allocator pair */
+      ThreadLocal2* const existing = thread_local_allocator2;
+      if (existing != nullptr)
+        return existing;
+
+      /* first use on this thread: create the pair and hand ownership to the static list */
+      ThreadLocal2* const created = new ThreadLocal2;
+      thread_local_allocator2 = created;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
+      Lock<SpinLock> lock(s_thread_local_allocators_lock);
+#endif
+      s_thread_local_allocators.push_back(make_unique(created));
+      return created;
+    }
+
+  public:
+
+    /*! registers a thread local allocator pair with this allocator, so that cleanup() and
+     *  reset() can unbind it later; called from ThreadLocal2::bind */
+    __forceinline void join(ThreadLocal2* alloc)
+    {
+      /* NOTE(review): the iOS path takes the static s_thread_local_allocators_lock, the other
+       * path takes thread_local_allocators_lock -- confirm that this difference is intended */
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
+      Lock<SpinLock> lock(thread_local_allocators_lock);
+#endif
+      thread_local_allocators.push_back(alloc);
+    }
+
+  public:
+
+    /*! Lightweight handle bundling a FastAllocator with the two thread local allocators
+     *  of one thread. */
+    struct CachedAllocator
+    {
+      /*! creates an invalid handle; only a null pointer is accepted as argument */
+      __forceinline CachedAllocator(void* ptr)
+        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
+      {
+        assert(ptr == nullptr);
+      }
+
+      /*! creates a handle for the given allocator; in single mode both slots refer to alloc0 */
+      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
+        : alloc(alloc),
+          talloc0(&talloc->alloc0),
+          talloc1(alloc->use_single_mode ? &talloc->alloc0 : &talloc->alloc1) {}
+
+      /*! true if the handle refers to an allocator */
+      __forceinline operator bool () const {
+        return alloc != nullptr;
+      }
+
+      /*! shorthand for malloc0 */
+      __forceinline void* operator() (size_t bytes, size_t align = 16) const {
+        return malloc0(bytes,align);
+      }
+
+      /*! allocates through the first thread local allocator */
+      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const {
+        return talloc0->malloc(alloc,bytes,align);
+      }
+
+      /*! allocates through the second thread local allocator */
+      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const {
+        return talloc1->malloc(alloc,bytes,align);
+      }
+
+    public:
+      FastAllocator* alloc;
+      ThreadLocal* talloc0;
+      ThreadLocal* talloc1;
+    };
+
+    __forceinline CachedAllocator getCachedAllocator()
+    {
+      /* combine this allocator with the allocator pair of the calling thread */
+      ThreadLocal2* const pair = threadLocal2();
+      return CachedAllocator(this,pair);
+    }
+
+    /*! Builder interface to create thread local allocator */
+    /*! Functor returning a CachedAllocator of the wrapped allocator for the calling thread. */
+    struct Create
+    {
+    public:
+      __forceinline Create (FastAllocator* allocator)
+        : allocator(allocator)
+      {
+      }
+
+      __forceinline CachedAllocator operator() () const
+      {
+        return allocator->getCachedAllocator();
+      }
+
+    private:
+      FastAllocator* allocator;
+    };
+
+    /*! moves the blocks of all thread block slots to the global used block list */
+    void internal_fix_used_blocks()
+    {
+      for (size_t slot = 0; slot < MAX_THREAD_USED_BLOCK_SLOTS; slot++)
+      {
+        /* pop the slot's blocks one by one and push each to the front of usedBlocks */
+        for (Block* block = threadBlocks[slot].load(); block != nullptr; block = threadBlocks[slot].load())
+        {
+          Block* const remaining = block->next;
+          block->next = usedBlocks.load();
+          usedBlocks = block;
+          threadBlocks[slot] = remaining;
+        }
+        threadBlocks[slot] = nullptr;
+      }
+    }
+
+    /* The constants below are divisors: a value of N allows 1/N of the estimated size as overhead. */
+    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
+#if defined(__AVX512ER__) // KNL
+    static const size_t mainAllocOverheadStatic  = 15;  //! 15 means about 6.7% (1/15) allocation overhead through unfilled main alloc blocks
+#else
+    static const size_t mainAllocOverheadStatic  = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
+#endif
+    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
+
+    /* calculates a single threaded threshold for the builders such
+     * that for small scenes the overhead of partly allocated blocks
+     * per thread is low */
+    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
+    {
+      /* without primitives or a size estimate there is nothing to tune */
+      if (numPrimitives == 0 || bytesEstimated == 0)
+        return defaultThreshold;
+
+      /* bytes one thread has to allocate to fulfill the threadLocalAllocOverhead constraint */
+      const size_t allocatorsPerThread = use_single_mode ? 1 : 2;
+      const size_t bytesPerThread = allocatorsPerThread*threadLocalAllocOverhead*defaultBlockSize;
+      const size_t numThreads = TaskScheduler::threadCount();
+
+      /* if we do not have to limit the number of threads use the optimal threshold */
+      const size_t threadsSupported = (bytesEstimated+(bytesPerThread-1))/bytesPerThread;
+      if (threadsSupported >= numThreads)
+        return defaultThreshold;
+
+      /* otherwise limit the number of threads by calculating a proper single thread threshold */
+      const double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
+      return size_t(ceil(branchingFactor*bytesPerThread/bytesPerPrimitive));
+    }
+
+    /*! rounds i up to the next multiple of 128 */
+    __forceinline size_t alignSize(size_t i)
+    {
+      const size_t granularity = 128;
+      const size_t numUnits = (i+(granularity-1))/granularity;
+      return numUnits*granularity;
+    }
+
+    /*! initializes the grow size */
+    /* Derives growSize, maxGrowSize, slotMask and defaultBlockSize from the estimated
+     * number of bytes; 'fast' selects the dynamic instead of the static overhead divisor.
+     * Non-default device settings override the computed values at the end. */
+    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) 
+    {
+      /* we do not need single thread local allocator mode */
+      use_single_mode = false;
+     
+      /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */
+      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
+      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
+      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);
+
+      /* if we reached the maxAllocationSize for growSize, we can
+       * increase the number of allocation slots by still guaranteeing
+       * the mainAllocationOverhead */
+      slotMask = 0x0;
+
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */
+
+      /* set the thread local alloc block size */
+      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;
+      
+      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
+#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
+      const size_t threadCount = TaskScheduler::threadCount();
+      const size_t single_mode_factor = use_single_mode ? 1 : 2;
+      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
+      if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
+        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);
+
+      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
+        else
+#endif
+        /* with the block above disabled, this assignment is executed unconditionally */
+        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);
+
+      if (bytesEstimated == 0) {
+        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
+        defaultBlockSize = defaultBlockSizeSwitch;
+      }
+      log2_grow_size_scale = 0;
+      
+      /* device settings take precedence; for the slot count the largest matching threshold wins */
+      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
+      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
+      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
+      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
+      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
+      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
+      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
+    }
+
+    /*! initializes the allocator */
+    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+
+      /* distribute the allocation to multiple thread block slots */
+      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // FIXME: remove
+
+      /* an allocator that already owns blocks only gets reset and keeps its settings */
+      if (usedBlocks.load() || freeBlocks.load()) {
+        reset();
+        return;
+      }
+
+      /* a reserve size of 0 means: reserve exactly what gets allocated */
+      if (bytesReserve == 0)
+        bytesReserve = bytesAllocate;
+
+      freeBlocks = Block::create(device,bytesAllocate,bytesReserve,nullptr,atype);
+      estimatedSize = bytesEstimate;
+      initGrowSizeAndNumSlots(bytesEstimate,true);
+    }
+
+    /*! initializes the allocator */
+    void init_estimate(size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+
+      /* an allocator that already owns blocks only gets reset and keeps its settings */
+      if (usedBlocks.load() || freeBlocks.load()) {
+        reset();
+        return;
+      }
+
+      /* remember the estimate and derive block and grow sizes using the static overhead */
+      estimatedSize = bytesEstimate;
+      initGrowSizeAndNumSlots(bytesEstimate,false);
+    }
+
+    /*! frees state not required after build */
+    __forceinline void cleanup()
+    {
+      internal_fix_used_blocks();
+
+      /* unbind all thread local allocators that joined this allocator, then forget them */
+      for (auto joined : thread_local_allocators) {
+        joined->unbind(this);
+      }
+      thread_local_allocators.clear();
+    }
+
+    /*! resets the allocator, memory blocks get reused */
+    void reset ()
+    {
+      internal_fix_used_blocks();
+
+      /* statistics start from zero again */
+      bytesUsed.store(0);
+      bytesFree.store(0);
+      bytesWasted.store(0);
+
+      /* reset all used blocks and move them to begin of free block list */
+      for (Block* block = usedBlocks.load(); block != nullptr; block = usedBlocks.load())
+      {
+        block->reset_block();
+        Block* const remaining = block->next;
+        block->next = freeBlocks.load();
+        freeBlocks = block;
+        usedBlocks = remaining;
+      }
+
+      /* remove all shared blocks as they are re-added during build */
+      Block* const withoutShared = Block::remove_shared_blocks(freeBlocks.load());
+      freeBlocks.store(withoutShared);
+
+      /* empty all thread block slots */
+      for (size_t slot=0; slot<MAX_THREAD_USED_BLOCK_SLOTS; slot++) {
+        threadUsedBlocks[slot] = nullptr;
+        threadBlocks[slot] = nullptr;
+      }
+
+      /* unbind all thread local allocators */
+      for (auto joined : thread_local_allocators) {
+        joined->unbind(this);
+      }
+      thread_local_allocators.clear();
+    }
+
+    /*! frees all allocated memory */
+    __forceinline void clear()
+    {
+      cleanup();
+      bytesUsed.store(0);
+      bytesFree.store(0);
+      bytesWasted.store(0);
+      /* free both block lists; the list heads are nulled whether or not they held blocks */
+      if (Block* used = usedBlocks.load()) used->clear_list(device);
+      usedBlocks = nullptr;
+      if (Block* unused = freeBlocks.load()) unused->clear_list(device);
+      freeBlocks = nullptr;
+      for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++) { threadUsedBlocks[i] = nullptr; threadBlocks[i] = nullptr; }
+      primrefarray.clear();
+    }
+
+    __forceinline size_t incGrowSizeScale()
+    {
+      const size_t bumped = log2_grow_size_scale.fetch_add(1)+1; // counter value after this increment
+      return size_t(1) << min(size_t(16),bumped); // power-of-two factor, shift clamped to 16
+    }
+
+    /*! thread safe allocation of memory; 'bytes' is in/out: on success Block::malloc stores the size actually handed out back into it */
+    void* malloc(size_t& bytes, size_t align, bool partial)
+    {
+      assert(align <= maxAlignment);
+      /* retry loop: once a new block got installed for the slot, the next iteration allocates from it */
+      while (true)
+      {
+        /* allocate using the block currently assigned to this thread's slot */
+        size_t threadID = TaskScheduler::threadID();
+        size_t slot = threadID & slotMask;
+	Block* myUsedBlocks = threadUsedBlocks[slot];
+        if (myUsedBlocks) {
+          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+          if (ptr) return ptr;
+        }
+
+        /* throw error if allocation is too large */
+        if (bytes > maxAllocationSize)
+          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");
+
+        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
+        if (likely(freeBlocks.load() == nullptr))
+        {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+          std::scoped_lock lock(slotMutex[slot]);
+#else
+          Lock<SpinLock> lock(slotMutex[slot]);
+#endif
+          if (myUsedBlocks == threadUsedBlocks[slot]) { // re-check under the slot lock: skip if the slot's block was already replaced
+            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
+            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
+            assert(allocSize >= bytes);
+            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
+            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
+          }
+          continue;
+        }
+
+        /* otherwise, under the global mutex, take the first free block or create a new block */
+        {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+            std::scoped_lock lock(mutex);
+#else
+            Lock<SpinLock> lock(mutex);
+#endif
+	  if (myUsedBlocks == threadUsedBlocks[slot]) // re-check under the lock, same as in the branch above
+	  {
+            if (freeBlocks.load() != nullptr) {
+	      Block* nextFreeBlock = freeBlocks.load()->next;
+	      freeBlocks.load()->next = usedBlocks;
+	      __memory_barrier();
+	      usedBlocks = freeBlocks.load();
+              threadUsedBlocks[slot] = freeBlocks.load();
+	      freeBlocks = nextFreeBlock;
+	    } else {
+              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
+	      usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
+	    }
+          }
+        }
+      }
+    }
+
+    /*! adds caller provided memory as a SHARED block at the head of the free list */
+    void addBlock(void* ptr, ssize_t bytes)
+    {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(mutex);
+#else
+      Lock<SpinLock> lock(mutex);
+#endif
+      const size_t headerBytes = offsetof(Block,data[0]);
+      void* const alignedPtr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1)); // round start up to maxAlignment
+      const size_t skipped = (size_t) alignedPtr - (size_t) ptr; // bytes lost to alignment, stored as the block's 'wasted'
+      bytes -= skipped;
+      if (bytes < 4096) return; // ignore empty or very small blocks
+      freeBlocks = new (alignedPtr) Block(SHARED,bytes-headerBytes,bytes-headerBytes,freeBlocks,skipped);
+    }
+
+    /* special allocation, used only by the morton builder and only a single time per build */
+    void* specialAlloc(size_t bytes)
+    {
+      assert(freeBlocks.load() != nullptr && bytes <= freeBlocks.load()->getBlockAllocatedBytes());
+      return freeBlocks.load()->ptr(); // current position inside the first free block; 'cur' is not advanced
+    }
+
+    struct Statistics
+    {
+      Statistics ()
+      : bytesUsed(0), bytesFree(0), bytesWasted(0) {}
+
+      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
+      : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}
+      /*! walks the used and free block lists of 'alloc' and sums the blocks matching atype/huge_pages */
+      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
+      : bytesUsed(0), bytesFree(0), bytesWasted(0)
+      {
+        const Block* const inUse  = alloc->usedBlocks.load();
+        const Block* const unused = alloc->freeBlocks.load();
+        if (inUse)  bytesUsed   += inUse->getUsedBytes(atype,huge_pages);
+        if (inUse)  bytesFree   += inUse->getFreeBytes(atype,huge_pages);
+        if (inUse)  bytesWasted += inUse->getWastedBytes(atype,huge_pages);
+        if (unused) bytesFree   += unused->getAllocatedBytes(atype,huge_pages); // a free block counts fully as free
+        if (unused) bytesWasted += unused->getWastedBytes(atype,huge_pages);
+      }
+      /*! formats the counters in MB, followed by the total bytes per primitive */
+      std::string str(size_t numPrimitives)
+      {
+        std::stringstream out;
+        out.setf(std::ios::fixed, std::ios::floatfield);
+        out << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+            << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
+            << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
+            << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesAllocatedTotal() << " MB, "
+            << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesAllocatedTotal())/double(numPrimitives);
+        return out.str();
+      }
+      /*! member-wise sum of two statistics */
+      friend Statistics operator+ ( const Statistics& a, const Statistics& b)
+      {
+        return Statistics(a.bytesUsed   + b.bytesUsed,
+                          a.bytesFree   + b.bytesFree,
+                          a.bytesWasted + b.bytesWasted);
+      }
+      /*! used + free + wasted */
+      size_t bytesAllocatedTotal() const {
+        return bytesUsed + bytesFree + bytesWasted;
+      }
+
+    public:
+      size_t bytesUsed;
+      size_t bytesFree;
+      size_t bytesWasted;
+    };
+
+    Statistics getStatistics(AllocationType atype, bool huge_pages = false) { // statistics over this allocator's blocks of the given type
+      return Statistics(this, atype, huge_pages);
+    }
+
+    size_t getUsedBytes() { // current value of the atomic bytesUsed counter
+      return bytesUsed.load();
+    }
+
+    size_t getWastedBytes() { // current value of the atomic bytesWasted counter
+      return bytesWasted.load();
+    }
+
+    struct AllStatistics
+    {
+      AllStatistics (FastAllocator* alloc)
+
+      : bytesUsed(alloc->bytesUsed),
+        bytesFree(alloc->bytesFree),
+        bytesWasted(alloc->bytesWasted),
+        stat_all(alloc,ANY_TYPE),
+        stat_malloc(alloc,ALIGNED_MALLOC),
+        stat_4K(alloc,EMBREE_OS_MALLOC,false),
+        stat_2M(alloc,EMBREE_OS_MALLOC,true),
+        stat_shared(alloc,SHARED) {}
+
+      AllStatistics (size_t bytesUsed,
+                     size_t bytesFree,
+                     size_t bytesWasted,
+                     Statistics stat_all,
+                     Statistics stat_malloc,
+                     Statistics stat_4K,
+                     Statistics stat_2M,
+                     Statistics stat_shared)
+
+      : bytesUsed(bytesUsed),
+        bytesFree(bytesFree),
+        bytesWasted(bytesWasted),
+        stat_all(stat_all),
+        stat_malloc(stat_malloc),
+        stat_4K(stat_4K),
+        stat_2M(stat_2M),
+        stat_shared(stat_shared) {}
+      /*! member-wise sum of two statistics sets */
+      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
+      {
+        return AllStatistics(a.bytesUsed   + b.bytesUsed,
+                             a.bytesFree   + b.bytesFree,
+                             a.bytesWasted + b.bytesWasted,
+                             a.stat_all    + b.stat_all,
+                             a.stat_malloc + b.stat_malloc,
+                             a.stat_4K     + b.stat_4K,
+                             a.stat_2M     + b.stat_2M,
+                             a.stat_shared + b.stat_shared);
+      }
+      /*! prints the allocator counters and the per-type block statistics to std::cout */
+      void print(size_t numPrimitives)
+      {
+        std::stringstream usedLine;
+        usedLine.setf(std::ios::fixed, std::ios::floatfield);
+        usedLine << "  alloc : "
+                 << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+                 << "                                                            "
+                 << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
+        std::cout << usedLine.str() << std::endl;
+        const size_t total = bytesUsed+bytesFree+bytesWasted;
+        std::stringstream totalLine;
+        totalLine.setf(std::ios::fixed, std::ios::floatfield);
+        totalLine << "  alloc : "
+                  << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, "
+                  << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, "
+                  << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, "
+                  << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*total << " MB, "
+                  << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(total)/double(numPrimitives);
+        std::cout << totalLine.str() << std::endl;
+
+        std::cout << "  total : " << stat_all.str(numPrimitives) << std::endl;
+        std::cout << "  4K    : " << stat_4K.str(numPrimitives) << std::endl;
+        std::cout << "  2M    : " << stat_2M.str(numPrimitives) << std::endl;
+        std::cout << "  malloc: " << stat_malloc.str(numPrimitives) << std::endl;
+        std::cout << "  shared: " << stat_shared.str(numPrimitives) << std::endl;
+      }
+
+    private:
+      size_t bytesUsed;
+      size_t bytesFree;
+      size_t bytesWasted;
+      Statistics stat_all;
+      Statistics stat_malloc;
+      Statistics stat_4K;
+      Statistics stat_2M;
+      Statistics stat_shared;
+    };
+
+    void print_blocks()
+    {
+      std::cout << "  estimatedSize = " << estimatedSize << ", slotMask = " << slotMask << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize << ", defaultBlockSize = " << defaultBlockSize << std::endl;
+      /* dump the list of used blocks */
+      std::cout << "  used blocks = ";
+      if (Block* used = usedBlocks.load()) used->print_list();
+      std::cout << "[END]" << std::endl;
+      /* dump the list of free blocks */
+      std::cout << "  free blocks = ";
+      if (Block* unused = freeBlocks.load()) unused->print_list();
+      std::cout << "[END]" << std::endl;
+    }
+
+  private:
+
+    struct Block
+    {
+      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
+      { /* allocates payload plus header, constructs the Block in place and links it in front of 'next' */
+        /* We avoid using os_malloc for small blocks as this could
+         * cause a risk of fragmenting the virtual address space and
+         * reach the limit of vm.max_map_count = 65k under Linux. */
+        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
+          atype = ALIGNED_MALLOC;
+
+        /* we need to additionally allocate some header */
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        bytesAllocate = sizeof_Header+bytesAllocate;
+        bytesReserve  = sizeof_Header+bytesReserve;
+
+        /* round both sizes up to a multiple of PAGE_SIZE when using os_malloc */
+        if (atype == EMBREE_OS_MALLOC) {
+          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+        }
+
+        /* either use alignedMalloc or os_malloc */
+        void *ptr = nullptr;
+        if (atype == ALIGNED_MALLOC)
+        {
+          /* special handling for default block size (header included, the total equals 2*PAGE_SIZE_2M) */
+          if (bytesAllocate == (2*PAGE_SIZE_2M))
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+
+            /* give hint to transparently convert these pages to 2MB pages */
+            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
+            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
+            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
+            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block
+
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+          else
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+        }
+        else if (atype == EMBREE_OS_MALLOC)
+        {
+          if (device) device->memoryMonitor(bytesAllocate,false);
+          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
+          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+        }
+        else
+          assert(false);
+
+        return NULL; // only reached for an allocation type not handled above
+      }
+      /*! initializes the header; bytesAllocate/bytesReserve are payload sizes (allocEnd/reserveEnd), 'wasted' is stored as given */
+      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
+      : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
+      {
+        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0);
+      }
+      /*! unlinks every SHARED block from the list and returns the new head; the unlinked blocks are not freed here */
+      static Block* remove_shared_blocks(Block* head)
+      {
+        Block** prev_next = &head;
+        for (Block* block = head; block; block = block->next) {
+          if (block->atype == SHARED) *prev_next = block->next;
+          else                         prev_next = &block->next;
+        }
+        return head;
+      }
+      /*! clears this block and all blocks linked behind it */
+      void clear_list(MemoryMonitorInterface* device)
+      {
+        Block* block = this;
+        while (block) {
+          Block* next = block->next; // read the link first, clear_block may release the block's memory
+          block->clear_block(device);
+          block = next;
+        }
+      }
+      /*! releases the block's memory according to its allocation type and reports the released size to the memory monitor */
+      void clear_block (MemoryMonitorInterface* device)
+      {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();
+
+        if (atype == ALIGNED_MALLOC) {
+          alignedFree(this);
+          if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else if (atype == EMBREE_OS_MALLOC) {
+         size_t sizeof_This = sizeof_Header+reserveEnd;
+         os_free(this,sizeof_This,huge_pages);
+         if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else /* if (atype == SHARED) */ { // nothing is freed or reported for SHARED blocks
+        }
+      }
+      /*! bump allocation via atomic fetch_add on 'cur'; returns nullptr if the block cannot serve the request; on success bytes_in receives the size handed out */
+      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
+      {
+        size_t bytes = bytes_in;
+        assert(align <= maxAlignment);
+        bytes = (bytes+(align-1)) & ~(align-1);
+	if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
+	const size_t i = cur.fetch_add(bytes);
+        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr; // 'cur' is not rolled back on this path
+        if (unlikely(i > reserveEnd)) return nullptr;
+        bytes_in = bytes = min(bytes,reserveEnd-i); // with 'partial' the request is clamped to what is left
+        
+	if (i+bytes > allocEnd) {
+          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true); // report only the part beyond allocEnd
+        }
+	return &data[i];
+      }
+      /*! address of the current allocation position; does not advance 'cur' */
+      void* ptr() {
+        return &data[cur];
+      }
+      /*! rewinds the block for reuse; allocEnd is raised to the old 'cur' if that was larger */
+      void reset_block ()
+      {
+        allocEnd = max(allocEnd,(size_t)cur);
+        cur = 0;
+      }
+      /*! 'cur' clamped to reserveEnd ('cur' can exceed reserveEnd, malloc never rolls it back) */
+      size_t getBlockUsedBytes() const {
+        return min(size_t(cur),reserveEnd);
+      }
+
+      size_t getBlockFreeBytes() const {
+	return getBlockAllocatedBytes() - getBlockUsedBytes();
+      }
+      /*! the larger of allocEnd and 'cur', clamped to reserveEnd */
+      size_t getBlockAllocatedBytes() const {
+        return min(max(allocEnd,size_t(cur)),reserveEnd);
+      }
+      /*! header size plus the 'wasted' value given at construction */
+      size_t getBlockWastedBytes() const {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        return sizeof_Header + wasted;
+      }
+
+      size_t getBlockReservedBytes() const {
+        return reserveEnd;
+      }
+      /*! type filter for the statistics below: ANY_TYPE matches every block; for EMBREE_OS_MALLOC blocks huge_pages must match as well */
+      bool hasType(AllocationType atype_i, bool huge_pages_i) const
+      {
+        if      (atype_i == ANY_TYPE ) return true;
+        else if (atype   == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+        else                           return atype_i == atype;
+      }
+      /*! the four functions below sum a per-block quantity over this block and all following blocks that pass hasType */
+      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockUsedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockFreeBytes();
+        }
+        return bytes;
+      }
+
+      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockWastedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const {
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockAllocatedBytes();
+        }
+        return bytes;
+      }
+      /*! prints this block and all following blocks to std::cout */
+      void print_list ()
+      {
+        for (const Block* block = this; block; block = block->next)
+          block->print_block();
+      }
+      /*! prints a type tag (A, O or S, plus H for huge pages) followed by "[used, free, wasted] " */
+      void print_block() const
+      {
+        if (atype == ALIGNED_MALLOC) std::cout << "A";
+        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
+        else if (atype == SHARED) std::cout << "S";
+        if (huge_pages) std::cout << "H";
+        size_t bytesUsed = getBlockUsedBytes();
+        size_t bytesFree = getBlockFreeBytes();
+        size_t bytesWasted = getBlockWastedBytes();
+        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
+      }
+
+    public:
+      std::atomic<size_t> cur;        //!< current location of the allocator
+      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
+      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
+      Block* next;               //!< pointer to next block in list
+      size_t wasted;             //!< amount of memory wasted through block alignment
+      AllocationType atype;      //!< allocation mode of the block
+      bool huge_pages;           //!< whether the block uses huge pages
+      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
+      char data[1];              //!< here starts memory to use for allocations
+    };
+
+  private:
+    Device* device;
+    SpinLock mutex;
+    size_t slotMask;
+    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+    std::atomic<Block*> usedBlocks;
+    std::atomic<Block*> freeBlocks;
+
+    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#else
+    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#endif
+
+    bool use_single_mode;
+    size_t defaultBlockSize;
+    size_t estimatedSize;
+    size_t growSize;
+    size_t maxGrowSize;
+    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
+    std::atomic<size_t> bytesUsed;
+    std::atomic<size_t> bytesFree;
+    std::atomic<size_t> bytesWasted;
+    static __thread ThreadLocal2* thread_local_allocator2;
+    static SpinLock s_thread_local_allocators_lock;
+    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex thread_local_allocators_lock;
+#else
+    SpinLock thread_local_allocators_lock;
+#endif
+    std::vector<ThreadLocal2*> thread_local_allocators;
+    AllocationType atype;
+    mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/buffer.h b/thirdparty/embree-aarch64/kernels/common/buffer.h
new file mode 100644
index 0000000000..02d319c59d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/buffer.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+
+namespace embree
+{
+  /*! Implements an API data buffer object. This class may or may not own the data. */
+  class Buffer : public RefCount
+  {
+  public:
+    /*! constructs an empty buffer that is not bound to any device */
+    Buffer() 
+      : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {}
+
+    /*! constructs a buffer of numBytes_in bytes: wraps ptr_in as shared memory if it is non-null, otherwise allocates */
+    Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr)
+      : device(device), ptr(nullptr), numBytes(numBytes_in), shared(false)
+    {
+      if (device) device->refInc(); // tolerate a null device, as alloc() and free() already do
+      
+      if (ptr_in)
+      {
+        shared = true;
+        ptr = (char*)ptr_in;
+      }
+      else
+      {
+        shared = false;
+        alloc();
+      }
+    }
+    
+    /*! Buffer destruction: frees owned memory and drops the device reference */
+    ~Buffer() {
+      free();
+      if (device) device->refDec(); // a default constructed buffer has no device to release
+    }
+    
+    /*! this class is not copyable */
+  private:
+    Buffer(const Buffer& other) DELETED; // do not implement
+    Buffer& operator =(const Buffer& other) DELETED; // do not implement
+    
+  public:
+    /* inits and allocates the buffer */
+    void create(Device* device_in, size_t numBytes_in)
+    {
+      init(device_in, numBytes_in);
+      alloc();
+    }
+    
+    /* inits the buffer: releases the current content, then stores device and size without allocating */
+    void init(Device* device_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in; // NOTE(review): no refInc() here although ~Buffer() calls refDec() - confirm callers balance the count
+      ptr = nullptr;
+      numBytes = numBytes_in;
+      shared = false;
+    }
+
+    /*! sets shared buffer; a numBytes_in of (size_t)-1 keeps the current size */
+    void set(Device* device_in, void* ptr_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in; // NOTE(review): no refInc() here either, see init()
+      ptr = (char*)ptr_in;
+      if (numBytes_in != (size_t)-1)
+        numBytes = numBytes_in;
+      shared = true;
+    }
+    
+    /*! allocates the buffer: reports bytes() to the memory monitor, then allocates that size rounded up to a multiple of 16 */
+    void alloc()
+    {
+      if (device)
+        device->memoryMonitor(this->bytes(), false);
+      size_t b = (this->bytes()+15) & ssize_t(-16);
+      ptr = (char*)alignedMalloc(b,16);
+    }
+    
+    /*! frees the buffer; shared memory is never freed here */
+    void free()
+    {
+      if (shared) return;
+      alignedFree(ptr); 
+      if (device) // NOTE(review): also reported when ptr is null (e.g. init() without alloc()) - confirm this is intended
+        device->memoryMonitor(-ssize_t(this->bytes()), true);
+      ptr = nullptr;
+    }
+    
+    /*! gets buffer pointer */
+    void* data()
+    {
+      /* report error if buffer is not existing */
+      if (!device)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified");
+      
+      /* return buffer */
+      return ptr;
+    }
+
+    /*! returns pointer to first element */
+    __forceinline char* getPtr() const {
+      return ptr;
+    }
+
+    /*! returns the number of bytes of the buffer */
+    __forceinline size_t bytes() const { 
+      return numBytes;
+    }
+    
+    /*! returns true if the buffer pointer is non-null */
+    __forceinline operator bool() const { 
+      return ptr; 
+    }
+
+  public:
+    Device* device;  //!< device to report memory usage to
+    char* ptr;       //!< pointer to buffer data
+    size_t numBytes; //!< number of bytes in the buffer
+    bool shared;     //!< set if memory is shared with application
+  };
+
+  /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */
+  class RawBufferView
+  {
+  public:
+    /*! constructs an empty view: null pointer, zero elements, undefined format */
+    RawBufferView()
+      : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {}
+
+  public:
+    /*! sets the buffer view, marks it modified and keeps a reference to buffer_in; throws RTC_ERROR_INVALID_ARGUMENT if the range check fails */
+    void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in)
+    {
+      if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) // NOTE(review): bound is stride_in*numBytes, not numBytes - confirm intended
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds");
+
+      ptr_ofs = buffer_in->ptr + offset_in;
+      stride = stride_in;
+      num = num_in;
+      format = format_in;
+      modCounter++;
+      modified = true;
+      buffer = buffer_in;
+    }
+
+    /*! returns pointer to the first element */
+    __forceinline char* getPtr() const {
+      return ptr_ofs;
+    }
+
+    /*! returns pointer to the i'th element */
+    __forceinline char* getPtr(size_t i) const
+    {
+      assert(i<num);
+      return ptr_ofs + i*stride;
+    }
+
+    /*! returns the number of elements of the buffer */
+    __forceinline size_t size() const { 
+      return num; 
+    }
+
+    /*! returns the number of bytes of the buffer (num*stride) */
+    __forceinline size_t bytes() const { 
+      return num*stride; 
+    }
+    
+    /*! returns the buffer stride, narrowed to unsigned */
+    __forceinline unsigned getStride() const
+    {
+      assert(stride <= unsigned(inf));
+      return unsigned(stride);
+    }
+
+    /*! return the buffer format */
+    __forceinline RTCFormat getFormat() const {
+      return format;
+    }
+
+    /*! marks the buffer as modified: increments modCounter and sets the local modified flag */
+    __forceinline void setModified() {
+      modCounter++;
+      modified = true;
+    }
+
+    /*! returns true if this view's modCounter is greater than otherModCounter */
+    __forceinline bool isModified(unsigned int otherModCounter) const {
+      return modCounter > otherModCounter;
+    }
+
+     /*! returns the local modified flag */
+    __forceinline bool isLocalModified() const {
+      return modified;
+    }
+
+    /*! clear local modified flag */
+    __forceinline void clearLocalModified() {
+      modified = false;
+    }
+
+    /*! returns true if the view points to data (ptr_ofs is non-null) */
+    __forceinline operator bool() const { 
+      return ptr_ofs; 
+    }
+
+    /*! checks padding to 16 bytes by reading the fourth int of the last element; fails hard if that memory is not accessible */
+    __forceinline void checkPadding16() const
+    {
+      if (ptr_ofs && num)
+        volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable?
+    }
+
+  public:
+    char* ptr_ofs;      //!< base pointer plus offset
+    size_t stride;      //!< stride of the buffer in bytes
+    size_t num;         //!< number of elements in the buffer
+    RTCFormat format;   //!< format of the buffer
+    unsigned int modCounter; //!< version ID of this buffer
+    bool modified;      //!< local modified data
+    int userData;       //!< special data
+    Ref<Buffer> buffer; //!< reference to the parent buffer
+  };
+
+  /*! A typed contiguous range of a buffer. This class does not own the buffer content. */
+  template<typename T>
+  class BufferView : public RawBufferView
+  {
+  public:
+    typedef T value_type;
+
+    /*! access to the ith element of the buffer; the index is checked by the assert inside getPtr(i) */
+    __forceinline       T& operator [](size_t i)       { return *(T*)getPtr(i); }
+    __forceinline const T& operator [](size_t i) const { return *(T*)getPtr(i); }
+  };
+
+  template<>
+  class BufferView<Vec3fa> : public RawBufferView
+  {
+  public:
+    typedef Vec3fa value_type;
+
+    /*! reads the ith element of the buffer via vfloat4::loadu and returns it by value */
+    __forceinline const Vec3fa operator [](size_t i) const
+    {
+      float* const src = (float*)getPtr(i); // getPtr asserts i<num
+      return Vec3fa(vfloat4::loadu(src));
+    }
+    
+    /*! writes the i'th element via vfloat4::storeu */
+    __forceinline void store(size_t i, const Vec3fa& v)
+    {
+      float* const dst = (float*)getPtr(i); // getPtr asserts i<num
+      vfloat4::storeu(dst, (vfloat4)v);
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/builder.h b/thirdparty/embree-aarch64/kernels/common/builder.h
new file mode 100644
index 0000000000..d2a1cfe3ce
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/builder.h
@@ -0,0 +1,60 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "accel.h"
+
+namespace embree
+{
+#define MODE_HIGH_QUALITY (1<<8)
+
+  /*! abstract interface implemented by all hierarchy builders */
+  class Builder : public RefCount {
+  public:
+    /* NOTE(review): presumably the size below which builders stay single threaded - confirm at the use sites */
+    static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    /*! runs the hierarchy build */
+    virtual void build() = 0;
+
+    /*! notification that geometry geomID got deleted; the default implementation does nothing */
+    virtual void deleteGeometry(size_t geomID) {}
+
+    /*! clears internal builder state */
+    virtual void clear() = 0;
+  };
+
+  /*! virtual interface for progress monitor class; called with a progress increment dn */
+  struct BuildProgressMonitor {
+    virtual ~BuildProgressMonitor() {} // polymorphic base: keeps destruction through a base pointer well defined
+    virtual void operator() (size_t dn) const = 0;
+  };
+  /*! adapts an arbitrary callable to the BuildProgressMonitor interface */
+  template<typename Closure>
+    struct ProgressMonitorClosure : BuildProgressMonitor
+  {
+  public:
+    ProgressMonitorClosure (const Closure& closure_in) : closure(closure_in) {} // stores its own copy of the callable
+    void operator() (size_t dn) const { closure(dn); } // forwards dn unchanged
+  private:
+    const Closure closure;
+  };
+  template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) { // factory that deduces Closure
+    return ProgressMonitorClosure<Closure>(closure);
+  }
+
+  struct LineSegments;
+  struct TriangleMesh;
+  struct QuadMesh;
+  struct UserGeometry;
+
+  class Scene;
+
+  typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder);
+  typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/context.h b/thirdparty/embree-aarch64/kernels/common/context.h
new file mode 100644
index 0000000000..d0185a74f2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/context.h
@@ -0,0 +1,131 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "rtcore.h"
+#include "point_query.h"
+
+namespace embree
+{
+  class Scene;
+
+  /*! pairs the internal Scene pointer with the user supplied RTCIntersectContext */
+  struct IntersectContext
+  {
+    __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context)
+      : scene(scene), user(user_context) {}
+
+    /*! true if the user context carries a non-null filter callback */
+    __forceinline bool hasContextFilter() const {
+      const bool filterSet = user->filter != nullptr;
+      return filterSet;
+    }
+
+    /*! result of embree::isCoherent applied to the flags of the user context */
+    __forceinline bool isCoherent() const { return embree::isCoherent(user->flags); }
+
+    /*! result of embree::isIncoherent applied to the flags of the user context */
+    __forceinline bool isIncoherent() const { return embree::isIncoherent(user->flags); }
+
+  public:
+    Scene* scene;               // scene pointer handed to the constructor
+    RTCIntersectContext* user;  // user context handed to the constructor
+  };
+
+  /*! with RTC_MIN_WIDTH: replaces v.w by clamp(minWidthDistanceFactor*|v.xyz-ray_org|, v.w, maxRadiusScale*v.w); otherwise returns v unchanged */
+  template<int M, typename Geometry>
+  __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v)
+  {
+#if RTC_MIN_WIDTH
+    const vfloat<M> dist = length(Vec3vf<M>(v) - ray_org);
+    return Vec4vf<M>(v.x,v.y,v.z,clamp(context->user->minWidthDistanceFactor*dist, v.w, geom->maxRadiusScale*v.w));
+#else
+    return v;
+#endif
+  }
+
+  /*! scalar variant: with RTC_MIN_WIDTH the radius v.w is replaced by clamp(minWidthDistanceFactor*|v.xyz-ray_org|, v.w, maxRadiusScale*v.w) */
+  template<typename Geometry>
+  __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v)
+  {
+#if RTC_MIN_WIDTH
+    const float dist = length(Vec3fa(v) - ray_org);
+    return Vec3ff(v.x,v.y,v.z,clamp(context->user->minWidthDistanceFactor*dist, v.w, geom->maxRadiusScale*v.w));
+#else
+    return v;
+#endif
+  }
+  
+  enum PointQueryType // kind of point query; stored in PointQueryContext::query_type
+  {
+    POINT_QUERY_TYPE_UNDEFINED = 0,
+    POINT_QUERY_TYPE_SPHERE = 1,
+    POINT_QUERY_TYPE_AABB = 2,  // the PointQueryContext constructor calls updateAABB() for this type
+  };
+
+  typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
+  
+  /*! state of one point query: the world space query, its type, the callback, the user context and the current primID/geomID */
+  struct PointQueryContext
+  {
+    __forceinline PointQueryContext(Scene* scene, 
+                                    PointQuery* query_ws, 
+                                    PointQueryType query_type,
+                                    PointQueryFunction func, 
+                                    RTCPointQueryContext* userContext,
+                                    float similarityScale,
+                                    void* userPtr)
+      : scene(scene)
+      , query_ws(query_ws)
+      , query_type(query_type)
+      , func(func)
+      , userContext(userContext)
+      , similarityScale(similarityScale)
+      , userPtr(userPtr) 
+      , primID(RTC_INVALID_GEOMETRY_ID)
+      , geomID(RTC_INVALID_GEOMETRY_ID)
+      , query_radius(query_ws->radius)
+    { 
+      // AABB queries expect a similarity scale of 0 and get their per-axis radius computed right away
+      if (query_type == POINT_QUERY_TYPE_AABB) {
+        assert(similarityScale == 0.f);
+        updateAABB();
+      }
+      // with an empty instance stack the similarity scale is expected to be 1
+      if (userContext->instStackSize == 0) assert(similarityScale == 1.f);
+    }
+
+    /*! recomputes query_radius: the plain radius for an infinite radius or an empty instance stack, else the half extent of the transformed radius box */
+    __forceinline void updateAABB() 
+    {
+      const bool plainRadius = query_ws->radius == (float)inf || userContext->instStackSize == 0;
+      if (likely(plainRadius)) { query_radius = Vec3fa(query_ws->radius); return; }
+
+      // transform the box [-radius,+radius]^3 with the topmost world2inst matrix of the instance stack
+      const auto top = userContext->instStackSize-1;
+      const AffineSpace3fa xfm = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[top]);
+      BBox3fa box(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius));
+      const BBox3fa boxInst = xfmBounds(xfm, box);
+      query_radius = 0.5f * (boxInst.upper - boxInst.lower);
+    }
+
+  public: // data members; the declaration order matches the constructor's initializer list
+    Scene* scene;
+
+    PointQuery* query_ws;              // point query in world space, as passed to the constructor
+    PointQueryType query_type;
+    PointQueryFunction func;
+    RTCPointQueryContext* userContext;
+    const float similarityScale;
+
+    void* userPtr;
+
+    unsigned int primID;               // starts as RTC_INVALID_GEOMETRY_ID
+    unsigned int geomID;               // starts as RTC_INVALID_GEOMETRY_ID
+
+    Vec3fa query_radius;               // per-axis radius, written by the constructor and updateAABB()
+  };
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/default.h b/thirdparty/embree-aarch64/kernels/common/default.h
new file mode 100644
index 0000000000..709119163b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/default.h
@@ -0,0 +1,273 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+#include "../../common/sys/thread.h"
+#include "../../common/sys/alloc.h"
+#include "../../common/sys/ref.h"
+#include "../../common/sys/intrinsics.h"
+#include "../../common/sys/atomic.h"
+#include "../../common/sys/mutex.h"
+#include "../../common/sys/vector.h"
+#include "../../common/sys/array.h"
+#include "../../common/sys/string.h"
+#include "../../common/sys/regression.h"
+#include "../../common/sys/vector.h"
+
+#include "../../common/math/math.h"
+#include "../../common/math/transcendental.h"
+#include "../../common/simd/simd.h"
+#include "../../common/math/vec2.h"
+#include "../../common/math/vec3.h"
+#include "../../common/math/vec4.h"
+#include "../../common/math/vec2fa.h"
+#include "../../common/math/vec3fa.h"
+#include "../../common/math/interval.h"
+#include "../../common/math/bbox.h"
+#include "../../common/math/obbox.h"
+#include "../../common/math/lbbox.h"
+#include "../../common/math/linearspace2.h"
+#include "../../common/math/linearspace3.h"
+#include "../../common/math/affinespace.h"
+#include "../../common/math/range.h"
+#include "../../common/lexers/tokenstream.h"
+
+#include "../../common/tasking/taskscheduler.h"
+
+#define COMMA ,
+
+#include "../config.h"
+#include "isa.h"
+#include "stat.h"
+#include "profile.h"
+#include "rtcore.h"
+#include "vector.h"
+#include "state.h"
+#include "instance_stack.h"
+
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <functional>
+#include <utility>
+#include <sstream>
+
+#if !defined(_DEBUG) && defined(BUILD_IOS)
+#undef assert
+#define assert(_EXPR)
+#endif
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec2 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec2vf  = Vec2<vfloat<N>>;
+  template<int N> using Vec2vd  = Vec2<vdouble<N>>;
+  template<int N> using Vec2vr  = Vec2<vreal<N>>;
+  template<int N> using Vec2vi  = Vec2<vint<N>>;
+  template<int N> using Vec2vl  = Vec2<vllong<N>>;
+  template<int N> using Vec2vb  = Vec2<vbool<N>>;
+  template<int N> using Vec2vbf = Vec2<vboolf<N>>;
+  template<int N> using Vec2vbd = Vec2<vboold<N>>;
+
+  typedef Vec2<vfloat4>  Vec2vf4;
+  typedef Vec2<vdouble4> Vec2vd4;
+  typedef Vec2<vreal4>   Vec2vr4;
+  typedef Vec2<vint4>    Vec2vi4;
+  typedef Vec2<vllong4>  Vec2vl4;
+  typedef Vec2<vbool4>   Vec2vb4;
+  typedef Vec2<vboolf4>  Vec2vbf4;
+  typedef Vec2<vboold4>  Vec2vbd4;
+
+  typedef Vec2<vfloat8>  Vec2vf8;
+  typedef Vec2<vdouble8> Vec2vd8;
+  typedef Vec2<vreal8>   Vec2vr8;
+  typedef Vec2<vint8>    Vec2vi8;
+  typedef Vec2<vllong8>  Vec2vl8;
+  typedef Vec2<vbool8>   Vec2vb8;
+  typedef Vec2<vboolf8>  Vec2vbf8;
+  typedef Vec2<vboold8>  Vec2vbd8;
+
+  typedef Vec2<vfloat16>  Vec2vf16;
+  typedef Vec2<vdouble16> Vec2vd16;
+  typedef Vec2<vreal16>   Vec2vr16;
+  typedef Vec2<vint16>    Vec2vi16;
+  typedef Vec2<vllong16>  Vec2vl16;
+  typedef Vec2<vbool16>   Vec2vb16;
+  typedef Vec2<vboolf16>  Vec2vbf16;
+  typedef Vec2<vboold16>  Vec2vbd16;
+
+  typedef Vec2<vfloatx>  Vec2vfx;
+  typedef Vec2<vdoublex> Vec2vdx;
+  typedef Vec2<vrealx>   Vec2vrx;
+  typedef Vec2<vintx>    Vec2vix;
+  typedef Vec2<vllongx>  Vec2vlx;
+  typedef Vec2<vboolx>   Vec2vbx;
+  typedef Vec2<vboolfx>  Vec2vbfx;
+  typedef Vec2<vbooldx>  Vec2vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec3 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec3vf  = Vec3<vfloat<N>>;
+  template<int N> using Vec3vd  = Vec3<vdouble<N>>;
+  template<int N> using Vec3vr  = Vec3<vreal<N>>;
+  template<int N> using Vec3vi  = Vec3<vint<N>>;
+  template<int N> using Vec3vl  = Vec3<vllong<N>>;
+  template<int N> using Vec3vb  = Vec3<vbool<N>>;
+  template<int N> using Vec3vbf = Vec3<vboolf<N>>;
+  template<int N> using Vec3vbd = Vec3<vboold<N>>;
+
+  typedef Vec3<vfloat4>  Vec3vf4;
+  typedef Vec3<vdouble4> Vec3vd4;
+  typedef Vec3<vreal4>   Vec3vr4;
+  typedef Vec3<vint4>    Vec3vi4;
+  typedef Vec3<vllong4>  Vec3vl4;
+  typedef Vec3<vbool4>   Vec3vb4;
+  typedef Vec3<vboolf4>  Vec3vbf4;
+  typedef Vec3<vboold4>  Vec3vbd4;
+
+  typedef Vec3<vfloat8>  Vec3vf8;
+  typedef Vec3<vdouble8> Vec3vd8;
+  typedef Vec3<vreal8>   Vec3vr8;
+  typedef Vec3<vint8>    Vec3vi8;
+  typedef Vec3<vllong8>  Vec3vl8;
+  typedef Vec3<vbool8>   Vec3vb8;
+  typedef Vec3<vboolf8>  Vec3vbf8;
+  typedef Vec3<vboold8>  Vec3vbd8;
+
+  typedef Vec3<vfloat16>  Vec3vf16;
+  typedef Vec3<vdouble16> Vec3vd16;
+  typedef Vec3<vreal16>   Vec3vr16;
+  typedef Vec3<vint16>    Vec3vi16;
+  typedef Vec3<vllong16>  Vec3vl16;
+  typedef Vec3<vbool16>   Vec3vb16;
+  typedef Vec3<vboolf16>  Vec3vbf16;
+  typedef Vec3<vboold16>  Vec3vbd16;
+
+  typedef Vec3<vfloatx>  Vec3vfx;
+  typedef Vec3<vdoublex> Vec3vdx;
+  typedef Vec3<vrealx>   Vec3vrx;
+  typedef Vec3<vintx>    Vec3vix;
+  typedef Vec3<vllongx>  Vec3vlx;
+  typedef Vec3<vboolx>   Vec3vbx;
+  typedef Vec3<vboolfx>  Vec3vbfx;
+  typedef Vec3<vbooldx>  Vec3vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec4 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec4vf  = Vec4<vfloat<N>>;
+  template<int N> using Vec4vd  = Vec4<vdouble<N>>;
+  template<int N> using Vec4vr  = Vec4<vreal<N>>;
+  template<int N> using Vec4vi  = Vec4<vint<N>>;
+  template<int N> using Vec4vl  = Vec4<vllong<N>>;
+  template<int N> using Vec4vb  = Vec4<vbool<N>>;
+  template<int N> using Vec4vbf = Vec4<vboolf<N>>;
+  template<int N> using Vec4vbd = Vec4<vboold<N>>;
+
+  typedef Vec4<vfloat4>  Vec4vf4;
+  typedef Vec4<vdouble4> Vec4vd4;
+  typedef Vec4<vreal4>   Vec4vr4;
+  typedef Vec4<vint4>    Vec4vi4;
+  typedef Vec4<vllong4>  Vec4vl4;
+  typedef Vec4<vbool4>   Vec4vb4;
+  typedef Vec4<vboolf4>  Vec4vbf4;
+  typedef Vec4<vboold4>  Vec4vbd4;
+
+  typedef Vec4<vfloat8>  Vec4vf8;
+  typedef Vec4<vdouble8> Vec4vd8;
+  typedef Vec4<vreal8>   Vec4vr8;
+  typedef Vec4<vint8>    Vec4vi8;
+  typedef Vec4<vllong8>  Vec4vl8;
+  typedef Vec4<vbool8>   Vec4vb8;
+  typedef Vec4<vboolf8>  Vec4vbf8;
+  typedef Vec4<vboold8>  Vec4vbd8;
+
+  typedef Vec4<vfloat16>  Vec4vf16;
+  typedef Vec4<vdouble16> Vec4vd16;
+  typedef Vec4<vreal16>   Vec4vr16;
+  typedef Vec4<vint16>    Vec4vi16;
+  typedef Vec4<vllong16>  Vec4vl16;
+  typedef Vec4<vbool16>   Vec4vb16;
+  typedef Vec4<vboolf16>  Vec4vbf16;
+  typedef Vec4<vboold16>  Vec4vbd16;
+
+  typedef Vec4<vfloatx>  Vec4vfx;
+  typedef Vec4<vdoublex> Vec4vdx;
+  typedef Vec4<vrealx>   Vec4vrx;
+  typedef Vec4<vintx>    Vec4vix;
+  typedef Vec4<vllongx>  Vec4vlx;
+  typedef Vec4<vboolx>   Vec4vbx;
+  typedef Vec4<vboolfx>  Vec4vbfx;
+  typedef Vec4<vbooldx>  Vec4vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Other shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using BBox3vf = BBox<Vec3vf<N>>;
+  typedef BBox<Vec3vf4>  BBox3vf4;
+  typedef BBox<Vec3vf8>  BBox3vf8;
+  typedef BBox<Vec3vf16> BBox3vf16;
+
+  /* scales time by numTimeSegments; returns the floor clamped to [0,numTimeSegments-1] as int and stores the remainder in ftime */
+  __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime)
+  {
+    const float scaled  = time * numTimeSegments;
+    const float segment = clamp(floorf(scaled), 0.0f, numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return static_cast<int>(segment);
+  }
+
+  __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime)
+  {
+    /* express time relative to [start_time,end_time], then split it exactly like the three argument overload */
+    const float span = end_time-start_time;
+    const float normalized = (time-start_time)/span;
+    return getTimeSegment(normalized, numTimeSegments, ftime);
+  }
+
+  template<int N> /* SIMD variant: returns the clamped segment index per lane and stores the per-lane remainder in ftime */
+  __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    const vfloat<N> scaled  = time * numTimeSegments;
+    const vfloat<N> segment = clamp(floor(scaled), vfloat<N>(zero), numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return vint<N>(segment);
+  }
+
+  template<int N> /* SIMD variant for times given relative to [start_time,end_time] */
+  __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    /* normalize time to the interval, then split it exactly like the three argument overload */
+    const vfloat<N> span = end_time-start_time;
+    const vfloat<N> normalized = (time-start_time)/span;
+    return getTimeSegment<N>(normalized, numTimeSegments, ftime);
+  }
+
+  /* returns the time segment range overlapped by time_range; both ends are scaled by 1 +/- 2 ulp so that values sitting on a time step up to rounding match it precisely */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments)
+  {
+    const float two_ulp = 2.0f*float(ulp);
+    const float lowerScaled = (1.0f+two_ulp)*time_range.lower*numTimeSegments;
+    const int first = (int)max(floor(lowerScaled), 0.0f);
+    const int last  = (int)min(ceil ((1.0f-two_ulp)*time_range.upper*numTimeSegments), numTimeSegments);
+    return make_range(first, last);
+  }
+
+  /* normalizes both ends of range via (x-time_range.lower)/time_range.size() and returns the time segment range overlapped by the result */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments)
+  {
+    const float extent = time_range.size();
+    const BBox1f normalized((range.lower-time_range.lower)/extent, (range.upper-time_range.lower)/extent);
+    return getTimeSegmentRange(normalized,numTimeSegments);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.cpp b/thirdparty/embree-aarch64/kernels/common/device.cpp
new file mode 100644
index 0000000000..16ec11b892
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.cpp
@@ -0,0 +1,567 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "device.h"
+#include "../hash.h"
+#include "scene_triangle_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_subdiv_mesh.h"
+
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+#include "../geometry/cylinder.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+
+#include "../../common/tasking/taskscheduler.h"
+#include "../../common/sys/alloc.h"
+
+namespace embree
+{
+  /*! some global variables that can be set through the hidden properties 1000000..1000003 of Device::setProperty for debugging purposes */
+  ssize_t Device::debug_int0 = 0;
+  ssize_t Device::debug_int1 = 0;
+  ssize_t Device::debug_int2 = 0;
+  ssize_t Device::debug_int3 = 0;
+
+  DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs);
+
+  static MutexSys g_mutex;
+  static std::map<Device*,size_t> g_cache_size_map;
+  static std::map<Device*,size_t> g_num_threads_map;
+
+  Device::Device (const char* cfg)
+  {
+    /* Constructs the device from the configuration string cfg (parsed by State::parseString below). First: check that CPU supports lowest ISA */
+    if (!hasISA(ISA)) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR);
+    }
+
+    /* set default frequency level for detected CPU (this happens before the configuration is parsed below) */
+    switch (getCPUModel()) {
+    case CPU::UNKNOWN:         frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_KABY_LAKE:  frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::SANDY_BRIDGE:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::NEHALEM:         frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE2:           frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE1:           frequency_level = FREQUENCY_SIMD128; break;
+    }
+
+    /* initialize global state; parse order: compile-time EMBREE_CONFIG, the cfg argument, then the ".embree<major>" file in the executable folder and in the home folder (files are skipped if ignore_config_files is set or the folder is empty) */
+#if defined(EMBREE_CONFIG)
+    State::parseString(EMBREE_CONFIG);
+#endif
+    State::parseString(cfg);
+    if (!ignore_config_files && FileName::executableFolder() != FileName(""))
+      State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+    if (!ignore_config_files && FileName::homeFolder() != FileName(""))
+      State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+    State::verify();
+
+    /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */    
+    if (!checkISASupport()) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA");
+    }    
+    
+    /*! do some internal tests (wrapped in assert, so only evaluated in builds where assert is active) */
+    assert(isa::Cylinder::verify());
+
+    /*! enable huge page support if desired; a failure of either step clears State::hugepages_success */
+#if defined(__WIN32__)
+    if (State::enable_selockmemoryprivilege)
+      State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3));
+#endif
+    State::hugepages_success &= os_init(State::hugepages,State::verbosity(3));
+    
+    /*! set tessellation cache size (registers this device's request, see setCacheSize) */
+    setCacheSize( State::tessellation_cache_size );
+
+    /*! enable some floating point exceptions to catch bugs: start from _MM_MASK_MASK and clear only the denormal and divide-by-zero mask bits */
+    if (State::float_exceptions)
+    {
+      int exceptions = _MM_MASK_MASK;
+      //exceptions &= ~_MM_MASK_INVALID;
+      exceptions &= ~_MM_MASK_DENORM;
+      exceptions &= ~_MM_MASK_DIV_ZERO;
+      //exceptions &= ~_MM_MASK_OVERFLOW;
+      //exceptions &= ~_MM_MASK_UNDERFLOW;
+      //exceptions &= ~_MM_MASK_INEXACT;
+      _MM_SET_EXCEPTION_MASK(exceptions);
+    }
+    
+    /* print info header (device info at verbosity 1, additionally the State at verbosity 2) */
+    if (State::verbosity(1))
+      print();
+    if (State::verbosity(2)) 
+      State::print();
+
+    /* register all algorithms (the BVH8 factory only exists when EMBREE_TARGET_SIMD8 is defined) */
+    bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features));
+
+#if defined(EMBREE_TARGET_SIMD8)
+    bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features));
+#endif
+
+    /* setup tasking system (registers numThreads for this device, see initTaskingSystem) */
+    initTaskingSystem(numThreads);
+
+    /* ray stream SOA to AOS conversion: selects the rayStreamFilterFuncs symbol for the enabled CPU features and stores its result */
+#if defined(EMBREE_RAY_PACKETS)
+    RayStreamFilterFuncsType rayStreamFilterFuncs;
+    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs);
+    rayStreamFilters = rayStreamFilterFuncs();
+#endif
+  }
+
+  Device::~Device ()
+  {
+    setCacheSize(0);      // a size of 0 makes setCacheSize erase this device's tessellation cache request
+    exitTaskingSystem();  // removes this device's thread count request; destroys or reconfigures the task scheduler
+  }
+
+  std::string getEnabledTargets()
+  {
+    std::string targets; // space separated names of the ISA targets whose EMBREE_TARGET_* macro is defined
+#if defined(EMBREE_TARGET_SSE2)
+    targets.append("SSE2 ");
+#endif
+#if defined(EMBREE_TARGET_SSE42)
+    targets.append("SSE4.2 ");
+#endif
+#if defined(EMBREE_TARGET_AVX)
+    targets.append("AVX ");
+#endif
+#if defined(EMBREE_TARGET_AVX2)
+    targets.append("AVX2 ");
+#endif
+#if defined(EMBREE_TARGET_AVX512KNL)
+    targets.append("AVX512KNL ");
+#endif
+#if defined(EMBREE_TARGET_AVX512SKX)
+    targets.append("AVX512SKX ");
+#endif
+    return targets;
+  }
+
+  std::string getEmbreeFeatures()
+  {
+    std::string features; // space separated names of the optional features whose EMBREE_* macro is defined
+#ifdef EMBREE_RAY_MASK
+    features.append("raymasks ");
+#endif
+#ifdef EMBREE_BACKFACE_CULLING
+    features.append("backfaceculling ");
+#endif
+#ifdef EMBREE_BACKFACE_CULLING_CURVES
+    features.append("backfacecullingcurves ");
+#endif
+#ifdef EMBREE_FILTER_FUNCTION
+    features.append("intersection_filter ");
+#endif
+#ifdef EMBREE_COMPACT_POLYS
+    features.append("compact_polys ");
+#endif
+    return features;
+  }
+
+  void Device::print()
+  { // writes version, compiler, build type, platform, CPU and configuration information of this device to std::cout
+    const int cpu_features = getCPUFeatures(); // printed in the "CPU" section; enabled_cpu_features is printed in the "Config" section
+    std::cout << std::endl;
+    std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl;
+    std::cout << "  Compiler  : " << getCompilerName() << std::endl;
+    std::cout << "  Build     : ";
+#if defined(DEBUG)
+    std::cout << "Debug " << std::endl;
+#else
+    std::cout << "Release " << std::endl;
+#endif
+    std::cout << "  Platform  : " << getPlatformName() << std::endl;
+    std::cout << "  CPU       : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl;
+    std::cout << "   Threads  : " << getNumberOfLogicalThreads() << std::endl;
+    std::cout << "   ISA      : " << stringOfCPUFeatures(cpu_features) << std::endl;
+    std::cout << "   Targets  : " << supportedTargetList(cpu_features) << std::endl;
+    const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON;      // flush-to-zero bit of the value returned by _mm_getcsr()
+    const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON;  // denormals-are-zero bit of the value returned by _mm_getcsr()
+    std::cout << "   MXCSR    : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl;
+    std::cout << "  Config" << std::endl;
+    std::cout << "    Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl;
+    std::cout << "    ISA     : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl;
+    std::cout << "    Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl;
+    std::cout << "              " << getEnabledTargets() << " (compile time enabled)" << std::endl;
+    std::cout << "    Features: " << getEmbreeFeatures() << std::endl;
+    std::cout << "    Tasking : ";
+#if defined(TASKING_TBB)
+    std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " ";
+  #if TBB_INTERFACE_VERSION >= 12002
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " ";
+  #else
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " ";
+  #endif
+#endif
+#if defined(TASKING_INTERNAL)
+    std::cout << "internal_tasking_system ";
+#endif
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+    std::cout << "GCD tasking system ";
+#endif
+#if defined(TASKING_PPL)
+	std::cout << "PPL ";
+#endif
+    std::cout << std::endl;
+
+    /* check if the FTZ and DAZ flags are set in the CSR; when _DEBUG is defined the warning is printed regardless of verbosity, otherwise only at verbosity 1 */
+    if (!hasFTZ || !hasDAZ) 
+    {
+#if !defined(_DEBUG)
+      if (State::verbosity(1)) 
+#endif
+      {
+        std::cout << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << "  WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled "         << std::endl 
+                  << "           in the MXCSR control and status register. This can have a severe "     << std::endl
+                  << "           performance impact. Please enable these modes for each application "   << std::endl
+                  << "           thread the following way:" << std::endl
+                  << std::endl 
+                  << "           #include \"xmmintrin.h\"" << std::endl 
+                  << "           #include \"pmmintrin.h\"" << std::endl 
+                  << std::endl 
+                  << "           _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl 
+                  << "           _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << std::endl;
+      }
+    }
+    std::cout << std::endl;
+  }
+
+  void Device::setDeviceErrorCode(RTCError error)
+  {
+    /* keep the first recorded error: the slot is only written while it still holds RTC_ERROR_NONE */
+    RTCError& slot = *errorHandler.error();
+    if (slot == RTC_ERROR_NONE) slot = error;
+  }
+
+  RTCError Device::getDeviceErrorCode()
+  {
+    RTCError& slot = *errorHandler.error();  // error slot provided by the device's errorHandler
+    const RTCError pending = slot;
+    slot = RTC_ERROR_NONE;                   // reading the error code also clears it
+    return pending;
+  }
+
+  void Device::setThreadErrorCode(RTCError error)
+  {
+    /* same first-error-wins rule as for devices, applied to the slot of the global g_errorHandler */
+    RTCError& slot = *g_errorHandler.error();
+    if (slot == RTC_ERROR_NONE) slot = error;
+  }
+
+  RTCError Device::getThreadErrorCode()
+  {
+    RTCError& slot = *g_errorHandler.error();  // error slot provided by the global g_errorHandler
+    const RTCError pending = slot;
+    slot = RTC_ERROR_NONE;                     // reading the error code also clears it
+    return pending;
+  }
+
+  void Device::process_error(Device* device, RTCError error, const char* str)
+  { 
+    /* without a device (device construction failed) only the thread error code can be recorded */
+    if (device == nullptr) { setThreadErrorCode(error); return; }
+
+    /* in verbose mode print a description of the error code followed by the optional detail string */
+    if (device->verbosity(1)) {
+      const char* text = "Embree: Invalid error code";
+      switch (error) {
+      case RTC_ERROR_NONE             : text = "Embree: No error"; break;
+      case RTC_ERROR_UNKNOWN          : text = "Embree: Unknown error"; break;
+      case RTC_ERROR_INVALID_ARGUMENT : text = "Embree: Invalid argument"; break;
+      case RTC_ERROR_INVALID_OPERATION: text = "Embree: Invalid operation"; break;
+      case RTC_ERROR_OUT_OF_MEMORY    : text = "Embree: Out of memory"; break;
+      case RTC_ERROR_UNSUPPORTED_CPU  : text = "Embree: Unsupported CPU"; break;
+      default                         : break;
+      }
+      std::cerr << text;
+      if (str) std::cerr << ", (" << str << ")";
+      std::cerr << std::endl;
+    }
+
+    /* hand the error to the user registered error callback, if there is one */
+    if (device->error_function) 
+      device->error_function(device->error_function_userptr,error,str); 
+
+    /* finally store the error code in the device */
+    device->setDeviceErrorCode(error);
+  }
+
+  void Device::memoryMonitor(ssize_t bytes, bool post)
+  {
+    /* nothing to report without a registered callback or for a change of zero bytes */
+    if (!State::memory_monitor_function || bytes == 0) return;
+    /* a false result of the callback only throws for allocations (bytes > 0), to never throw inside a destructor */
+    const bool proceed = State::memory_monitor_function(State::memory_monitor_userptr,bytes,post);
+    if (!proceed && bytes > 0) {
+      throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination");
+    }
+  }
+
+  size_t getMaxNumThreads()
+  {
+    /* largest thread count stored in g_num_threads_map; a result of 0 (e.g. empty map) is turned into the size_t maximum */
+    size_t result = 0;
+    for (const auto& entry : g_num_threads_map)
+      result = max(result, entry.second);
+
+    return result != 0 ? result : std::numeric_limits<size_t>::max();
+  }
+
+  size_t getMaxCacheSize()
+  {
+    size_t result = 0; // largest cache size stored in g_cache_size_map; stays 0 for an empty map
+    for (const auto& entry : g_cache_size_map)
+      result = max(result, entry.second);
+    return result;
+  }
+ 
+  void Device::setCacheSize(size_t bytes) 
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    /* a size of 0 removes this device's entry; the tessellation cache is then resized to the largest remaining entry */
+    Lock<MutexSys> lock(g_mutex);
+    if (bytes != 0) g_cache_size_map[this] = bytes;
+    else            g_cache_size_map.erase(this);
+    
+    resizeTessellationCache(getMaxCacheSize());
+#endif
+  }
+
+  void Device::initTaskingSystem(size_t numThreads) 
+  {
+    Lock<MutexSys> lock(g_mutex);
+
+    /* store the request of this device; a request of 0 threads is stored as the size_t maximum */
+    const size_t unlimited = std::numeric_limits<size_t>::max();
+    g_num_threads_map[this] = (numThreads == 0) ? unlimited : numThreads;
+
+    /* create the task scheduler with the maximum over all stored requests */
+    const size_t maxNumThreads = getMaxNumThreads();
+    TaskScheduler::create(maxNumThreads,State::set_affinity,State::start_threads);
+#if USE_TASK_ARENA
+    const size_t nThreads = min(maxNumThreads,TaskScheduler::threadCount());
+    const size_t uThreads = min(max(numUserThreads,(size_t)1),nThreads);
+    arena = make_unique(new tbb::task_arena((int)nThreads,(unsigned int)uThreads));
+#endif
+  }
+
+  void Device::exitTaskingSystem() 
+  {
+    Lock<MutexSys> lock(g_mutex);
+    g_num_threads_map.erase(this);
+
+    if (!g_num_threads_map.empty()) {
+      /* other devices are still registered: configure the scheduler for their maximum thread count */
+      TaskScheduler::create(getMaxNumThreads(),State::set_affinity,State::start_threads);
+    }
+    else {
+      /* no device is registered any more: terminate the tasking system */
+      TaskScheduler::destroy();
+    }
+
+#if USE_TASK_ARENA
+    arena.reset();
+#endif
+  }
+
+  void Device::setProperty(const RTCDeviceProperty prop, ssize_t val)
+  {
+    /* only the hidden internal properties 1000000..1000003 are writable; they map to debug_int0..debug_int3 */
+    ssize_t* target = nullptr;
+    switch ((size_t)prop) {
+    case 1000000: target = &debug_int0; break;
+    case 1000001: target = &debug_int1; break;
+    case 1000002: target = &debug_int2; break;
+    case 1000003: target = &debug_int3; break;
+    }
+    if (target == nullptr) { throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property"); }
+    *target = val;
+  }
+
+  ssize_t Device::getProperty(const RTCDeviceProperty prop)
+  {
+    size_t iprop = (size_t)prop;
+    /* Returns the value of property prop. Most values are fixed at compile time by the EMBREE_* / TASKING_* macros; unknown properties raise RTC_ERROR_INVALID_ARGUMENT. */
+    /* get name of internal regression test number iprop-2000000: the C string pointer cast to ssize_t, or 0 if no such test exists */
+    if (iprop >= 2000000 && iprop < 3000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-2000000);
+      if (test) return (ssize_t) test->name.c_str();
+      else      return 0;
+    }
+
+    /* run internal regression test number iprop-3000000 and return its result, or 0 if no such test exists */
+    if (iprop >= 3000000 && iprop < 4000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-3000000);
+      if (test) return test->run();
+      else      return 0;
+    }
+
+    /* documented properties */
+    switch (prop) 
+    {
+    case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR;
+    case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR;
+    case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH;
+    case RTC_DEVICE_PROPERTY_VERSION      : return RTC_VERSION;
+
+#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return hasISA(SSE2);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return hasISA(AVX);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512KNL) | hasISA(AVX512SKX);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 0;
+#endif
+    
+#if defined(EMBREE_RAY_MASK)
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING_CURVES)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_COMPACT_POLYS)
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0;
+#endif
+    /* tasking system id: 0 = internal, 1 = TBB, 2 = PPL, 3 = GCD (iOS builds only); if none of these macro combinations is defined the property falls through to the default case */
+#if defined(TASKING_INTERNAL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0;
+#endif
+
+#if defined(TASKING_TBB)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1;
+#endif
+
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2;
+#endif
+            
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3;
+#endif
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0;
+#endif
+        
+#if defined(EMBREE_GEOMETRY_QUAD)
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0;
+#endif
+    /* join commit is reported as unsupported for PPL and for TBB with TBB_INTERFACE_VERSION_MAJOR < 8 */
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#else
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1;
+#endif
+    /* parallel commit is reported as supported only for TBB with TASKING_TBB_USE_TASK_ISOLATION */
+#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0;
+#endif
+
+    default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break;
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.h b/thirdparty/embree-aarch64/kernels/common/device.h
new file mode 100644
index 0000000000..e9a81bb109
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "state.h"
+#include "accel.h"
+
+namespace embree
+{
+  class BVH4Factory;
+  class BVH8Factory;
+
+  class Device : public State, public MemoryMonitorInterface
+  {
+    ALIGNED_CLASS_(16);
+    // Device object: error codes, properties, memory monitoring, tasking setup and the BVH factories.
+  public:
+
+    /*! Device construction */
+    Device (const char* cfg);
+
+    /*! Device destruction */
+    virtual ~Device ();
+
+    /*! prints info about the device */
+    void print();
+
+    /*! sets the device error code */
+    void setDeviceErrorCode(RTCError error);
+
+    /*! returns and clears the device error code */
+    RTCError getDeviceErrorCode();
+
+    /*! sets the thread error code (static, needs no Device instance) */
+    static void setThreadErrorCode(RTCError error);
+
+    /*! returns and clears the thread error code (static, needs no Device instance) */
+    static RTCError getThreadErrorCode();
+
+    /*! processes error codes, do not call directly */
+    static void process_error(Device* device, RTCError error, const char* str);
+
+    /*! invokes the memory monitor callback */
+    void memoryMonitor(ssize_t bytes, bool post);
+
+    /*! sets the size of the software cache. */
+    void setCacheSize(size_t bytes);
+
+    /*! sets a property */
+    void setProperty(const RTCDeviceProperty prop, ssize_t val);
+
+    /*! gets a property */
+    ssize_t getProperty(const RTCDeviceProperty prop);
+
+  private:
+
+    /*! initializes the tasking system */
+    void initTaskingSystem(size_t numThreads);
+
+    /*! shuts down the tasking system */
+    void exitTaskingSystem();
+
+    /*! some variables that can be set via rtcSetParameter1i for debugging purposes */
+  public:
+    static ssize_t debug_int0;
+    static ssize_t debug_int1;
+    static ssize_t debug_int2;
+    static ssize_t debug_int3;
+
+  public:
+    std::unique_ptr<BVH4Factory> bvh4_factory; // owned by the device
+#if defined(EMBREE_TARGET_SIMD8)
+    std::unique_ptr<BVH8Factory> bvh8_factory; // only present when EMBREE_TARGET_SIMD8 is defined
+#endif
+    
+#if USE_TASK_ARENA
+    std::unique_ptr<tbb::task_arena> arena;
+#endif
+    
+    /* ray streams filter */
+    RayStreamFilterFuncs rayStreamFilters;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.cpp b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
new file mode 100644
index 0000000000..b3aa8e3396
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
@@ -0,0 +1,259 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "geometry.h"
+#include "scene.h"
+
+namespace embree
+{
+  const char* Geometry::gtype_names[Geometry::GTY_END] = // indexed by GType value; "" marks values no GType uses
+  {
+    "flat_linear_curve",
+    "round_linear_curve",
+    "oriented_linear_curve",
+    "cone_linear_curve", // GTY_CONE_LINEAR_CURVE = 3 is a valid type and needs a name
+    "flat_bezier_curve",
+    "round_bezier_curve",
+    "oriented_bezier_curve",
+    "",
+    "flat_bspline_curve",
+    "round_bspline_curve",
+    "oriented_bspline_curve",
+    "",
+    "flat_hermite_curve",
+    "round_hermite_curve",
+    "oriented_hermite_curve",
+    "",
+    "flat_catmull_rom_curve",
+    "round_catmull_rom_curve",
+    "oriented_catmull_rom_curve",
+    "",
+    "triangles",
+    "quads",
+    "grid",
+    "subdivs",
+    "",
+    "sphere",
+    "disc",
+    "oriented_disc",
+    "",
+    "usergeom",
+    "instance_cheap",
+    "instance_expensive",
+  };
+     
+  Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) 
+    : device(device), userPtr(nullptr), // NOTE: numTimeSteps-1 below is unsigned arithmetic, so numTimeSteps == 0 would wrap
+      numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f),
+      mask(-1), // mask is unsigned, so -1 sets all bits
+      gtype(gtype),
+      gsubtype(GTY_SUBTYPE_DEFAULT),
+      quality(RTC_BUILD_QUALITY_MEDIUM),
+      state((unsigned)State::MODIFIED), // a new geometry starts in the MODIFIED (not yet committed) state
+      enabled(true),
+      intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr)
+  {
+    device->refInc(); // take a device reference; the destructor releases it with refDec()
+  }
+
+  Geometry::~Geometry()
+  {
+    device->refDec(); // releases the device reference taken by refInc() in the constructor
+  }
+
+  void Geometry::setNumPrimitives(unsigned int numPrimitives_in)
+  {
+    /* an unchanged count is a no-op; a new count marks the geometry as modified */
+    if (numPrimitives != numPrimitives_in) {
+      numPrimitives = numPrimitives_in;
+      Geometry::update();
+    }
+  }
+
+  void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in)
+  {
+    /* an unchanged time step count needs no update */
+    if (numTimeSteps == numTimeSteps_in)
+      return;
+
+    /* the cached float segment count is always the step count minus one */
+    fnumTimeSegments = float(numTimeSteps_in-1);
+    numTimeSteps = numTimeSteps_in;
+    Geometry::update();
+  }
+
+  void Geometry::setTimeRange (const BBox1f range)
+  { // stores the time range; unlike setNumTimeSteps there is no equality check, so the geometry is always marked modified
+    time_range = range;
+    Geometry::update();
+  }
+  
+  void Geometry::update()
+  { // bumps the modification counter and puts the geometry into the MODIFIED state
+    ++modCounter_; // FIXME: required?
+    state = (unsigned)State::MODIFIED; // preCommit() throws while the geometry is in this state
+  }
+  
+  void Geometry::commit() 
+  { // bumps the modification counter and puts the geometry into the COMMITTED state
+    ++modCounter_;
+    state = (unsigned)State::COMMITTED; // the state preCommit() accepts
+  }
+
+  void Geometry::preCommit()
+  {
+    const bool uncommitted = (State)state == State::MODIFIED; // MODIFIED is set by update(), COMMITTED by commit()
+    if (uncommitted) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed");
+  }
+
+  void Geometry::postCommit()
+  { // the base implementation does nothing; the method is declared virtual in geometry.h
+  }
+
+  void Geometry::enable () 
+  {
+    /* only an actual disabled -> enabled transition counts as a modification */
+    if (!isEnabled()) {
+      enabled = true;
+      ++modCounter_;
+    }
+  }
+
+  void Geometry::disable () 
+  {
+    /* only an actual enabled -> disabled transition counts as a modification */
+    if (isEnabled()) {
+      enabled = false;
+      ++modCounter_;
+    }
+  }
+
+  void Geometry::setUserData (void* ptr)
+  {
+    userPtr = ptr; // stored as-is and handed back by getUserData(); does not mark the geometry as modified
+  }
+  
+  void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) 
+  {
+    const auto supported = MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH; // geometry kinds that accept filter functions
+    if ((getTypeMask() & supported) == 0)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); 
+    intersectionFilterN = filter;
+  }
+
+  void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) 
+  {
+    const auto supported = MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH; // geometry kinds that accept filter functions
+    if ((getTypeMask() & supported) == 0)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); 
+    occlusionFilterN = filter;
+  }
+  
+  void Geometry::setPointQueryFunction (RTCPointQueryFunction func) 
+  {
+    pointQueryFunc = func; // invoked by pointQuery() when non-null; no type check and no modification flag here
+  }
+
+  void Geometry::interpolateN(const RTCInterpolateNArguments* const args)
+  { // batched interpolation: calls the single-query interpolate() once per active entry and scatters the results
+    const void* valid_i = args->valid;
+    const unsigned* primIDs = args->primIDs;
+    const float* u = args->u;
+    const float* v = args->v;
+    unsigned int N = args->N;
+    RTCBufferType bufferType = args->bufferType;
+    unsigned int bufferSlot = args->bufferSlot;
+    float* P = args->P;
+    float* dPdu = args->dPdu;
+    float* dPdv = args->dPdv;
+    float* ddPdudu = args->ddPdudu;
+    float* ddPdvdv = args->ddPdvdv;
+    float* ddPdudv = args->ddPdudv;
+    unsigned int valueCount = args->valueCount;
+    // the scratch arrays below hold 256 floats each, hence the limit on valueCount
+    if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex");
+    const int* valid = (const int*) valid_i;
+    // scratch buffers handed to interpolate() for one query at a time
+    __aligned(64) float P_tmp[256];
+    __aligned(64) float dPdu_tmp[256];
+    __aligned(64) float dPdv_tmp[256];
+    __aligned(64) float ddPdudu_tmp[256];
+    __aligned(64) float ddPdvdv_tmp[256];
+    __aligned(64) float ddPdudv_tmp[256];
+    // scratch is only passed for requested outputs: dPdu alone gates both first derivatives, ddPdudu alone gates all second derivatives
+    float* Pt = P ? P_tmp : nullptr;
+    float* dPdut = nullptr, *dPdvt = nullptr;
+    if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; }
+    float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr;
+    if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; }
+    // entries whose valid flag is zero are skipped; a null valid pointer means every entry is processed
+    for (unsigned int i=0; i<N; i++)
+    {
+      if (valid && !valid[i]) continue;
+      // forward the i-th query to the single-query interpolate()
+      RTCInterpolateArguments iargs;
+      iargs.primID = primIDs[i];
+      iargs.u = u[i];
+      iargs.v = v[i];
+      iargs.bufferType = bufferType;
+      iargs.bufferSlot = bufferSlot;
+      iargs.P = Pt;
+      iargs.dPdu = dPdut;
+      iargs.dPdv = dPdvt;
+      iargs.ddPdudu = ddPdudut;
+      iargs.ddPdvdv = ddPdvdvt;
+      iargs.ddPdudv = ddPdudvt;
+      iargs.valueCount = valueCount;
+      interpolate(&iargs);
+      // outputs are written with stride N: value j of query i is stored at index j*N+i
+      if (likely(P)) {
+        for (unsigned int j=0; j<valueCount; j++) 
+          P[j*N+i] = Pt[j];
+      }
+      if (likely(dPdu)) 
+      {
+        for (unsigned int j=0; j<valueCount; j++) {
+          dPdu[j*N+i] = dPdut[j];
+          dPdv[j*N+i] = dPdvt[j]; // NOTE(review): dPdv is written without its own null check - presumably callers pass both or neither; confirm
+        }
+      }
+      if (likely(ddPdudu)) 
+      {
+        for (unsigned int j=0; j<valueCount; j++) {
+          ddPdudu[j*N+i] = ddPdudut[j];
+          ddPdvdv[j*N+i] = ddPdvdvt[j];
+          ddPdudv[j*N+i] = ddPdudvt[j];
+        }
+      }
+    }
+  }
+    
+  bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context)
+  {
+    assert(context->primID < size());
+    // package the context fields into the argument struct passed to the callbacks
+    RTCPointQueryFunctionArguments args;
+    args.query           = (RTCPointQuery*)context->query_ws;
+    args.userPtr         = context->userPtr;
+    args.primID          = context->primID;
+    args.geomID          = context->geomID;
+    args.context         = context->userContext;
+    args.similarityScale = context->similarityScale;
+    // call the context callback and then the geometry callback, each only if set; their results are OR-ed together
+    bool update = false;
+    if(context->func)  update |= context->func(&args);
+    if(pointQueryFunc) update |= pointQueryFunc(&args);
+    // after a reported update with a non-empty instance stack, refresh the query from context->query_ws
+    if (update && context->userContext->instStackSize > 0)
+    {
+      // update point query
+      if (context->query_type == POINT_QUERY_TYPE_AABB) {
+        context->updateAABB();
+      } else {
+        assert(context->similarityScale > 0.f);
+        query->radius = context->query_ws->radius * context->similarityScale;
+      }
+    }
+    return update;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.h b/thirdparty/embree-aarch64/kernels/common/geometry.h
new file mode 100644
index 0000000000..953974bfd2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.h
@@ -0,0 +1,582 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "buffer.h"
+#include "../common/point_query.h"
+#include "../builders/priminfo.h"
+
+namespace embree
+{
+  class Scene;
+  class Geometry;
+
+  struct GeometryCounts // per-kind counters of enabled primitives, kept separately for static and motion blurred (MB) geometry
+  {
+    __forceinline GeometryCounts()
+      : numFilterFunctions(0),
+        numTriangles(0), numMBTriangles(0), 
+        numQuads(0), numMBQuads(0), 
+        numBezierCurves(0), numMBBezierCurves(0), 
+        numLineSegments(0), numMBLineSegments(0), 
+        numSubdivPatches(0), numMBSubdivPatches(0), 
+        numUserGeometries(0), numMBUserGeometries(0), 
+        numInstancesCheap(0), numMBInstancesCheap(0), 
+        numInstancesExpensive(0), numMBInstancesExpensive(0), 
+        numGrids(0), numMBGrids(0), 
+        numPoints(0), numMBPoints(0) {}
+    /*! total number of counted primitives, static plus motion blurred; numFilterFunctions is not part of the sum */
+    __forceinline size_t size() const {
+      return    numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints
+              + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints;
+    }
+    /*! bit mask of the geometry kinds with a non-zero count: static kinds in bits 16..24, motion blurred kinds in bits 0..8 */
+    __forceinline unsigned int enabledGeometryTypesMask() const
+    {
+      unsigned int mask = 0;
+      if (numTriangles) mask |= 1 << 0;
+      if (numQuads) mask |= 1 << 1;
+      if (numBezierCurves+numLineSegments) mask |= 1 << 2;
+      if (numSubdivPatches) mask |= 1 << 3;
+      if (numUserGeometries) mask |= 1 << 4;
+      if (numInstancesCheap) mask |= 1 << 5;
+      if (numInstancesExpensive) mask |= 1 << 6;
+      if (numGrids) mask |= 1 << 7;
+      if (numPoints) mask |= 1 << 8;
+
+      unsigned int maskMB = 0;
+      if (numMBTriangles) maskMB |= 1 << 0;
+      if (numMBQuads) maskMB |= 1 << 1;
+      if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2;
+      if (numMBSubdivPatches) maskMB |= 1 << 3;
+      if (numMBUserGeometries) maskMB |= 1 << 4;
+      if (numMBInstancesCheap) maskMB |= 1 << 5;
+      if (numMBInstancesExpensive) maskMB |= 1 << 6;
+      if (numMBGrids) maskMB |= 1 << 7;
+      if (numMBPoints) maskMB |= 1 << 8;
+      /* each half uses 9 bits (0..8); a shift of 8 made static bit 0 collide with MB bit 8, so shift by 16 and OR the halves */
+      return (mask<<16) | maskMB;
+    }
+    /*! member-wise sum of two counter sets */
+    __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const
+    {
+      GeometryCounts ret;
+      ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions;
+      ret.numTriangles = numTriangles + rhs.numTriangles;
+      ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles;
+      ret.numQuads = numQuads + rhs.numQuads;
+      ret.numMBQuads = numMBQuads + rhs.numMBQuads;
+      ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves;
+      ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves;
+      ret.numLineSegments = numLineSegments + rhs.numLineSegments;
+      ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments;
+      ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches;
+      ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches;
+      ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries;
+      ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries;
+      ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap;
+      ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap;
+      ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive;
+      ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive;
+      ret.numGrids = numGrids + rhs.numGrids;
+      ret.numMBGrids = numMBGrids + rhs.numMBGrids;
+      ret.numPoints = numPoints + rhs.numPoints;
+      ret.numMBPoints = numMBPoints + rhs.numMBPoints;
+
+      return ret;
+    }
+
+    size_t numFilterFunctions;       //!< number of geometries with filter functions enabled
+    size_t numTriangles;             //!< number of enabled triangles
+    size_t numMBTriangles;           //!< number of enabled motion blurred triangles
+    size_t numQuads;                 //!< number of enabled quads
+    size_t numMBQuads;               //!< number of enabled motion blurred quads
+    size_t numBezierCurves;          //!< number of enabled curves
+    size_t numMBBezierCurves;        //!< number of enabled motion blurred curves
+    size_t numLineSegments;          //!< number of enabled line segments
+    size_t numMBLineSegments;        //!< number of enabled motion blurred line segments
+    size_t numSubdivPatches;         //!< number of enabled subdivision patches
+    size_t numMBSubdivPatches;       //!< number of enabled motion blurred subdivision patches
+    size_t numUserGeometries;        //!< number of enabled user geometries
+    size_t numMBUserGeometries;      //!< number of enabled motion blurred user geometries
+    size_t numInstancesCheap;        //!< number of enabled cheap instances
+    size_t numMBInstancesCheap;      //!< number of enabled motion blurred cheap instances
+    size_t numInstancesExpensive;    //!< number of enabled expensive instances
+    size_t numMBInstancesExpensive;  //!< number of enabled motion blurred expensive instances
+    size_t numGrids;                 //!< number of enabled grid geometries
+    size_t numMBGrids;               //!< number of enabled motion blurred grid geometries
+    size_t numPoints;                //!< number of enabled points
+    size_t numMBPoints;              //!< number of enabled motion blurred points
+  };
+
+  /*! Base class all geometries are derived from */
+  class Geometry : public RefCount
+  {
+    friend class Scene;
+  public:
+
+    /*! type of geometry */
+    enum GType
+    {
+      GTY_FLAT_LINEAR_CURVE = 0,
+      GTY_ROUND_LINEAR_CURVE = 1,
+      GTY_ORIENTED_LINEAR_CURVE = 2,
+      GTY_CONE_LINEAR_CURVE = 3,
+      
+      GTY_FLAT_BEZIER_CURVE = 4,
+      GTY_ROUND_BEZIER_CURVE = 5,
+      GTY_ORIENTED_BEZIER_CURVE = 6,
+      
+      GTY_FLAT_BSPLINE_CURVE = 8,
+      GTY_ROUND_BSPLINE_CURVE = 9,
+      GTY_ORIENTED_BSPLINE_CURVE = 10,
+
+      GTY_FLAT_HERMITE_CURVE = 12,
+      GTY_ROUND_HERMITE_CURVE = 13,
+      GTY_ORIENTED_HERMITE_CURVE = 14,
+      
+      GTY_FLAT_CATMULL_ROM_CURVE = 16,
+      GTY_ROUND_CATMULL_ROM_CURVE = 17,
+      GTY_ORIENTED_CATMULL_ROM_CURVE = 18,      
+
+      GTY_TRIANGLE_MESH = 20,
+      GTY_QUAD_MESH = 21,
+      GTY_GRID_MESH = 22,
+      GTY_SUBDIV_MESH = 23,
+
+      GTY_SPHERE_POINT = 25,
+      GTY_DISC_POINT = 26,
+      GTY_ORIENTED_DISC_POINT = 27,
+      
+      GTY_USER_GEOMETRY = 29,
+      GTY_INSTANCE_CHEAP = 30,
+      GTY_INSTANCE_EXPENSIVE = 31,
+      GTY_END = 32,
+
+      GTY_BASIS_LINEAR = 0,
+      GTY_BASIS_BEZIER = 4,
+      GTY_BASIS_BSPLINE = 8,
+      GTY_BASIS_HERMITE = 12,
+      GTY_BASIS_CATMULL_ROM = 16,
+      GTY_BASIS_MASK = 28,
+
+      GTY_SUBTYPE_FLAT_CURVE = 0,
+      GTY_SUBTYPE_ROUND_CURVE = 1,
+      GTY_SUBTYPE_ORIENTED_CURVE = 2,
+      GTY_SUBTYPE_MASK = 3,
+    };
+
+    enum GSubType
+    {
+      GTY_SUBTYPE_DEFAULT= 0,
+      GTY_SUBTYPE_INSTANCE_LINEAR = 0,
+      GTY_SUBTYPE_INSTANCE_QUATERNION = 1
+    };
+
+    enum GTypeMask
+    {
+      MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE,
+      MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE,
+      MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE,
+      MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE,
+      MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE,
+      MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE,
+      
+      MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE,
+      MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE,
+      MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE,
+
+      MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE,
+      MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE,
+      MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE,
+
+      MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE,
+      MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE,
+      MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE |
+                   MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE |
+                   MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE |
+                   MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT,
+      MTY_DISC_POINT = 1ul << GTY_DISC_POINT,
+      MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT,
+
+      MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT,
+
+      MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS,
+
+      MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH,
+      MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH,
+      MTY_GRID_MESH = 1ul << GTY_GRID_MESH,
+      MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH,
+      MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY,
+
+      MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP,
+      MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE,
+      MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE
+    };
+
+    static const char* gtype_names[GTY_END];
+
+    enum class State : unsigned {
+      MODIFIED = 0,
+      COMMITTED = 1,
+    };
+
+  public:
+    
+    /*! Geometry constructor */
+    Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps);
+
+    /*! Geometry destructor */
+    virtual ~Geometry();
+
+  public:
+
+    /*! tests if geometry is enabled */
+    __forceinline bool isEnabled() const { return enabled; }
+
+    /*! tests if geometry is disabled */
+    __forceinline bool isDisabled() const { return !isEnabled(); }
+
+    /*! tests if that geometry has some filter function set */
+    __forceinline bool hasFilterFunctions () const {
+      return (intersectionFilterN  != nullptr) || (occlusionFilterN  != nullptr);
+    }
+
+    /*! returns geometry type */
+    __forceinline GType getType() const { return gtype; }
+
+    /*! returns curve type */
+    __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); }
+
+    /*! returns curve basis */
+    __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); }
+
+    /*! returns geometry type mask (bit number gtype set); shifts 1ul like the MTY_* enumerators because gtype can be 31, which overflows a signed int shift */
+    __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1ul << gtype); }
+
+    /*! returns number of primitives */
+    __forceinline size_t size() const { return numPrimitives; }
+
+    /*! sets the number of primitives */
+    virtual void setNumPrimitives(unsigned int numPrimitives_in);
+
+    /*! sets number of time steps */
+    virtual void setNumTimeSteps (unsigned int numTimeSteps_in);
+
+    /*! sets motion blur time range */
+    void setTimeRange (const BBox1f range);
+
+    /*! sets number of vertex attributes */
+    virtual void setVertexAttributeCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets number of topologies */
+    virtual void setTopologyCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets the build quality */
+    void setBuildQuality(RTCBuildQuality quality_in)
+    {
+      this->quality = quality_in;
+      Geometry::update();
+    }
+
+    /* calculate time segment itime and fractional time ftime */
+    __forceinline int timeSegment(float time, float& ftime) const {
+      return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime);
+    }
+
+    template<int N>
+      __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const {
+      return getTimeSegment(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime);
+    }
+    
+    /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,fnumTimeSegments);
+    }
+
+    /* returns the time of time step i, spacing the steps evenly over time_range; NOTE(review): fnumTimeSegments is set to numTimeSteps-1, so with a single time step this divides by zero - confirm callers avoid that case */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<(int)numTimeSteps);
+      return time_range.lower + time_range.size()*float(i)/fnumTimeSegments;
+    }
+    
+    /*! for all geometries */
+  public:
+
+    /*! Enable geometry. */
+    virtual void enable();
+
+    /*! Update geometry. */
+    void update();
+    
+    /*! commit of geometry */
+    virtual void commit();
+
+    /*! Update geometry buffer. */
+    virtual void updateBuffer(RTCBufferType type, unsigned int slot) {
+      update(); // update everything for geometries not supporting this call
+    }
+    
+    /*! Disable geometry. */
+    virtual void disable();
+
+    /*! Verify the geometry */
+    virtual bool verify() { return true; }
+
+    /*! called before every build */
+    virtual void preCommit();
+  
+    /*! called after every build */
+    virtual void postCommit();
+
+    virtual void addElementsToCount (GeometryCounts & counts) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    };
+
+    /*! sets constant tessellation rate for the geometry */
+    virtual void setTessellationRate(float N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets the maximal curve radius scale allowed by min-width feature. */
+    virtual void setMaxRadiusScale(float s) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set user data pointer. */
+    virtual void setUserData(void* ptr);
+      
+    /*! Get user data pointer. */
+    __forceinline void* getUserData() const {
+      return userPtr;
+    }
+
+    /*! interpolates user data to the specified u/v location */
+    virtual void interpolate(const RTCInterpolateArguments* const args) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! interpolates user data to the specified u/v locations */
+    virtual void interpolateN(const RTCInterpolateNArguments* const args);
+
+    /* point query api */
+    bool pointQuery(PointQuery* query, PointQueryContext* context);
+
+    /*! for subdivision surfaces only */
+  public:
+    virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set displacement function. */
+    virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFirstHalfEdge(unsigned int faceID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFace(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    virtual unsigned int getNextHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! get fast access to first vertex buffer if applicable */
+    virtual float * getCompactVertexArray () const {
+      return nullptr;
+    }
+
+    /*! Returns the modified counter - how many times the geo has been modified */
+    __forceinline unsigned int getModCounter () const {
+      return modCounter_;
+    }
+
+    /*! for triangle meshes and bezier curves only */
+  public:
+
+
+    /*! Sets ray mask. */
+    virtual void setMask(unsigned mask) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Sets specified buffer. */
+    virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Gets specified buffer. */
+    virtual void* getBuffer(RTCBufferType type, unsigned int slot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Set intersection filter function for ray packets of size N. */
+    virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! Set occlusion filter function for ray packets of size N. */
+    virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! for instances only */
+  public:
+
+    /*! Sets the instanced scene */
+    virtual void setInstancedScene(const Ref<Scene>& scene) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Sets transformation of the instance */
+    virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets transformation of the instance */
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Returns the transformation of the instance */
+    virtual AffineSpace3fa getTransform(float time) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! for user geometries only */
+  public:
+
+    /*! Set bounds function. */
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set intersect function for ray packets of size N. */
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set occlusion function for ray packets of size N. */
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set point query function. */
+    void setPointQueryFunction(RTCPointQueryFunction func);
+
+    /*! returns number of time segments */
+    __forceinline unsigned numTimeSegments () const {
+      return numTimeSteps-1;
+    }
+
+  public:
+
+    virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); 
+    }
+
+    virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
+    }
+
+    virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); 
+    }
+    
+    virtual Vec3fa computeDirection(unsigned int primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual Vec3fa computeDirection(unsigned int primID, size_t time) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+    
+    virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+    virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+  public:
+    __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; }
+    __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; }
+
+  public:
+    Device* device;             //!< device this geometry belongs to
+
+    void* userPtr;              //!< user pointer
+    unsigned int numPrimitives; //!< number of primitives of this geometry
+    
+    unsigned int numTimeSteps;  //!< number of time steps
+    float fnumTimeSegments;     //!< number of time segments (precalculation)
+    BBox1f time_range;          //!< motion blur time range
+    
+    unsigned int mask;             //!< for masking out geometry
+    unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified
+    
+    struct {
+      GType gtype : 8;                //!< geometry type
+      GSubType gsubtype : 8;          //!< geometry subtype
+      RTCBuildQuality quality : 3;    //!< build quality for geometry
+      unsigned state : 2;
+      bool enabled : 1;              //!< true if geometry is enabled
+    };
+       
+    RTCFilterFunctionN intersectionFilterN;
+    RTCFilterFunctionN occlusionFilterN;
+    RTCPointQueryFunction pointQueryFunc;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/hit.h b/thirdparty/embree-aarch64/kernels/common/hit.h
new file mode 100644
index 0000000000..32a198cdfe
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/hit.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "instance_stack.h"
+
+namespace embree
+{
+  /* Hit record for K hits; every field is a K-wide vector */
+  template<int K>
+    struct HitK
+  {
+    /* Default constructor: performs no initialization */
+    __forceinline HitK() {}
+
+    /* Builds a hit from IDs, UVs and normal; the instance IDs are taken from the context */
+    __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng)
+      : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID)
+    {
+      /* mark every level invalid first, then overwrite from the context's stack */
+      for (vuint<K>& levelID : instID) levelID = RTC_INVALID_GEOMETRY_ID;
+      instance_id_stack::copy(context->instID, instID);
+    }
+
+    /* Number of hits held by this record */
+    static __forceinline size_t size() { return K; }
+
+  public:
+    Vec3vf<K> Ng;                                   // geometry normals
+    vfloat<K> u;                                    // barycentric u coordinates of the hits
+    vfloat<K> v;                                    // barycentric v coordinates of the hits
+    vuint<K> primID;                                // primitive IDs
+    vuint<K> geomID;                                // geometry IDs
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT];  // instance IDs, one vector per instancing level
+  };
+
+  /* Specialization for a single hit: plain scalar members instead of K-wide vectors */
+  template<>
+    struct __aligned(16) HitK<1>
+  {
+     /* Default construction does nothing (members stay uninitialized) */
+    __forceinline HitK() {}
+
+    /* Constructs a hit; instID is written only by instance_id_stack::copy (unlike HitK<K>, no reset to invalid happens first) */
+    __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng)
+      : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID)
+    {
+      instance_id_stack::copy(context->instID, instID);
+    }
+
+    /* Returns the size of the hit, which is always 1 for this specialization */
+    static __forceinline size_t size() { return 1; }
+
+  public:
+    Vec3<float> Ng;  // geometry normal (x, y, z taken from the Vec3fa passed to the constructor)
+    float u;         // barycentric u coordinate of hit
+    float v;         // barycentric v coordinate of hit
+    unsigned int primID;      // primitive ID
+    unsigned int geomID;      // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID, one entry per instancing level
+  };
+
+  /* Shortcuts */
+  typedef HitK<1>  Hit;
+  typedef HitK<4>  Hit4;
+  typedef HitK<8>  Hit8;
+  typedef HitK<16> Hit16;
+
+  /* Stream insertion for hits: writes every field on its own line,
+     wrapped in a pair of braces, and returns the stream. */
+  template<int K>
+  __forceinline embree_ostream operator<<(embree_ostream cout, const HitK<K>& ray)
+  {
+    cout << "{ " << embree_endl;
+    cout << "  Ng = " << ray.Ng << embree_endl;
+    cout << "  u = " << ray.u << embree_endl;
+    cout << "  v = " << ray.v << embree_endl;
+    cout << "  primID = " << ray.primID << embree_endl;
+    cout << "  geomID = " << ray.geomID << embree_endl;
+    cout << "  instID =";
+    /* all instance levels share one line, each preceded by a single space */
+    for (const auto& levelID : ray.instID)
+      cout << " " << levelID;
+    cout << embree_endl;
+    return cout << "}";
+  }
+
+  template<typename Hit> // NOTE: the parameter is named Hit like the HitK<1> typedef above; any type with these members is accepted
+    __forceinline void copyHitToRay(RayHit& ray, const Hit& hit) // copies one hit record into the hit fields of a single ray
+  {
+    ray.Ng   = hit.Ng;
+    ray.u    = hit.u;
+    ray.v    = hit.v;
+    ray.primID = hit.primID;
+    ray.geomID = hit.geomID;
+    instance_id_stack::copy(hit.instID, ray.instID); // instance IDs go through the shared stack copy helper
+  }
+
+  template<int K> // K-wide variant: every field is written with a masked store
+    __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit)
+  {
+    vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x); // presumably only the entries enabled in mask are written - confirm against vfloat<K>::storeu
+    vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y);
+    vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z);
+    vfloat<K>::storeu(mask,&ray.u, hit.u);
+    vfloat<K>::storeu(mask,&ray.v, hit.v);
+    vuint<K>::storeu(mask,&ray.primID, hit.primID);
+    vuint<K>::storeu(mask,&ray.geomID, hit.geomID);
+    instance_id_stack::copy(hit.instID, ray.instID, mask); // the mask is passed on so the per-level copy is masked as well
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/instance_stack.h b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
new file mode 100644
index 0000000000..d7e3637f7b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
@@ -0,0 +1,199 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore.h"
+
+namespace embree {
+namespace instance_id_stack {
+
+static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, 
+              "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0.");
+
+/*******************************************************************************
+ * Instance ID stack manipulation.
+ * This is used from the instance intersector.
+ ******************************************************************************/
+
+/* Push an instance ID onto the context's instance stack.
+ * Returns true on success; returns false (after asserting) when the stack is
+ * already full, in which case the ID is dropped. */
+RTC_FORCEINLINE bool push(RTCIntersectContext* context, unsigned instanceId)
+{
+  assert(context); /* same non-null precondition check that pop() performs */
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT;
+  /* We assert here because instances are silently dropped when the stack is full. 
+     This might be quite hard to find in production. */
+  assert(spaceAvailable); 
+  if (likely(spaceAvailable))
+    context->instID[context->instStackSize++] = instanceId;
+  return spaceAvailable;
+#else
+  const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID);
+  assert(spaceAvailable); 
+  if (likely(spaceAvailable))
+    context->instID[0] = instanceId;
+  return spaceAvailable;
+#endif
+}
+
+
+/* 
+ * Pop the last instance pushed to the stack: the top slot is reset to
+ * RTC_INVALID_GEOMETRY_ID. Do not call on an empty stack (asserted).
+ */
+RTC_FORCEINLINE void pop(RTCIntersectContext* context)
+{
+  assert(context);
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  assert(context->instStackSize > 0);
+  context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID; // decrement the size first, then clear the vacated slot
+#else
+  assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID); // single-level build: slot 0 is the whole stack
+  context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+#endif
+}
+
+/*******************************************************************************
+ * Optimized instance id stack copy.
+ * The copy() function at the bottom of this block will either copy full
+ * stacks or copy only until the last valid element has been copied, depending
+ * on RTC_MAX_INSTANCE_LEVEL_COUNT.
+ ******************************************************************************/
+
+/*
+ * Unmasked copy of one level: tgt[level] = src[level]. This works for
+ * scalar->scalar, scalar->vector, and vector->vector.
+ */
+template <class Src, class Tgt>
+RTC_FORCEINLINE void level_copy(unsigned level, Src* src, Tgt* tgt)
+{
+  tgt[level] = src[level];
+}
+
+/*
+ * Masked SIMD vector->vector store of one level (uses vuint<K>::storeu).
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+  vuint<K>::storeu(mask, tgt + level, src[level]);
+}
+
+/*
+ * Masked scalar->SIMD vector store of one level (uses vuint<K>::store).
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+  vuint<K>::store(mask, tgt + level, src[level]); // NOTE(review): store() here, storeu() in the vector->vector overload - confirm the difference is intended
+}
+
+/*
+ * Indexed assign from vector to scalar: reads element idx of src[level].
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, unsigned* tgt, const size_t& idx)
+{
+  tgt[level] = src[level][idx];
+}
+
+/*
+ * Indexed assign from scalar to vector: writes element idx of tgt[level].
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const size_t& idx)
+{
+  tgt[level][idx] = src[level];
+}
+
+/*
+ * Indexed assign from vector to vector: element i of src goes to element j of tgt.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const size_t& i, const size_t& j)
+{
+  tgt[level][j] = src[level][i];
+}
+
+/*
+ * Check if the given stack level is valid, i.e. differs from RTC_INVALID_GEOMETRY_ID.
+ * Only the RTC_MAX_INSTANCE_LEVEL_COUNT > 4 branch of copy() calls these overloads.
+ */
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack)
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const size_t& /*i*/) // index ignored: a scalar stack has one value per level
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const vbool<K>& /*mask*/) // mask ignored for the same reason
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack) // vector stack: valid if any element of the level is valid
+{
+  return any(stack[level] != RTC_INVALID_GEOMETRY_ID);
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const vbool<K>& mask) // as above, restricted to the elements selected by mask
+{
+  return any(mask & (stack[level] != RTC_INVALID_GEOMETRY_ID));
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i) // tests only element i of the level
+{
+  return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i, const size_t& /*j*/) // i indexes the source stack; the target index j is not needed
+{
+  return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+
+/*
+ * Copy an instance ID stack.
+ *
+ * The extra arguments (none, a mask, or one/two indices) are passed on
+ * unchanged and select the matching level_copy()/level_valid() overload.
+ */
+template <class Src, class Tgt, class... Args>
+RTC_FORCEINLINE void copy(Src src, Tgt tgt, Args&&... args)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  /* 
+   * Avoid all loops for only one level: copy level 0 directly.
+   */
+  level_copy(0, src, tgt, std::forward<Args>(args)...);
+
+#elif (RTC_MAX_INSTANCE_LEVEL_COUNT <= 4)
+  /* 
+   * It is faster to avoid the valid test for low level counts.
+   * Just copy the whole stack.
+   */
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    level_copy(l, src, tgt, std::forward<Args>(args)...); // NOTE(review): args are forwarded once per level; the overloads above take them by const reference
+
+#else
+  /* 
+   * For general stack sizes, it pays off to test for validity.
+   */
+  bool valid = true;
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT && valid; ++l)
+  {
+    level_copy(l, src, tgt, std::forward<Args>(args)...); // the level is copied before it is tested, so the first invalid level is still copied
+    valid = level_valid(l, src, std::forward<Args>(args)...); // an invalid source level ends the loop after this iteration
+  }
+#endif
+}
+
+} // namespace instance_id_stack
+} // namespace embree
+
diff --git a/thirdparty/embree-aarch64/kernels/common/isa.h b/thirdparty/embree-aarch64/kernels/common/isa.h
new file mode 100644
index 0000000000..63fb8d3351
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/isa.h
@@ -0,0 +1,271 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+
+namespace embree
+{
+#define DEFINE_SYMBOL2(type,name)               \
+  typedef type (*name##Func)();                 \
+  name##Func name;
+  /* DEFINE_SYMBOL2 (above) declares the pointer type name##Func and a variable `name` of that type. DECLARE_SYMBOL2 (below) declares name() in each ISA namespace and defines the name##_error2 / name##_error / name##_zero fallbacks; NOTE(review): those three are not `inline`, unlike symbol##_error in DECLARE_ISA_FUNCTION. */
+#define DECLARE_SYMBOL2(type,name)                                       \
+  namespace sse2      { extern type name(); }                           \
+  namespace sse42     { extern type name(); }                           \
+  namespace avx       { extern type name(); }                           \
+  namespace avx2      { extern type name(); }                           \
+  namespace avx512knl { extern type name(); }                           \
+  namespace avx512skx { extern type name(); }                           \
+  void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \
+  type name##_error() { return type(name##_error2); }                   \
+  type name##_zero() { return type(nullptr); }
+/* DECLARE_ISA_FUNCTION: declares symbol(args) in each ISA namespace, an inline symbol##_error stub that reports RTC_ERROR_UNSUPPORTED_CPU, and the pointer type symbol##Ty. */
+#define DECLARE_ISA_FUNCTION(type,symbol,args)                            \
+  namespace sse2      { extern type symbol(args); }                       \
+  namespace sse42     { extern type symbol(args); }                       \
+  namespace avx       { extern type symbol(args); }                       \
+  namespace avx2      { extern type symbol(args); }                       \
+  namespace avx512knl { extern type symbol(args); }                       \
+  namespace avx512skx { extern type symbol(args); }                     \
+  inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \
+  typedef type (*symbol##Ty)(args);                                       \
+  
+#define DEFINE_ISA_FUNCTION(type,symbol,args)   \
+  typedef type (*symbol##Func)(args);           \
+  symbol##Func symbol;
+  /* The three macros below assign a starting value to a symbol; their `features` argument is not used. ZERO_SYMBOL uses the <symbol>_zero function. */
+#define ZERO_SYMBOL(features,intersector)                      \
+  intersector = intersector##_zero;
+/* INIT_SYMBOL: starts from the <symbol>_error stub, cast to the symbol's own type. */
+#define INIT_SYMBOL(features,intersector)                      \
+  intersector = decltype(intersector)(intersector##_error);
+/* SELECT_SYMBOL_DEFAULT: starts from the implementation in namespace isa. */
+#define SELECT_SYMBOL_DEFAULT(features,intersector) \
+  intersector = isa::intersector;
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#if !defined(EMBREE_TARGET_SIMD4)
+#define EMBREE_TARGET_SIMD4
+#endif // !EMBREE_TARGET_SIMD4
+#endif // __SSE__ || __ARM_NEON
+/* Each SELECT_SYMBOL_<ISA> below assigns the <isa>:: implementation when EMBREE_TARGET_<ISA> is defined and the feature bits match; otherwise it expands to nothing. */
+#if defined(EMBREE_TARGET_SSE42)
+#define SELECT_SYMBOL_SSE42(features,intersector) \
+  if ((features & SSE42) == SSE42) intersector = sse42::intersector;
+#else // SSE4.2 target not enabled
+#define SELECT_SYMBOL_SSE42(features,intersector)
+#endif // EMBREE_TARGET_SSE42
+
+#if defined(EMBREE_TARGET_AVX) || defined(__AVX__)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif // !EMBREE_TARGET_SIMD8
+#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & ISA) == ISA) intersector = isa::intersector;
+#else // default ISA is below AVX: use the dedicated avx:: build
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & AVX) == AVX) intersector = avx::intersector;
+#endif // __AVX__
+#else // neither EMBREE_TARGET_AVX nor __AVX__
+#define SELECT_SYMBOL_AVX(features,intersector)
+#endif // EMBREE_TARGET_AVX || __AVX__
+
+#if defined(EMBREE_TARGET_AVX2)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif // !EMBREE_TARGET_SIMD8
+#define SELECT_SYMBOL_AVX2(features,intersector) \
+  if ((features & AVX2) == AVX2) intersector = avx2::intersector;
+#else // AVX2 target not enabled
+#define SELECT_SYMBOL_AVX2(features,intersector)
+#endif // EMBREE_TARGET_AVX2
+
+#if defined(EMBREE_TARGET_AVX512KNL)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif // !EMBREE_TARGET_SIMD16
+#define SELECT_SYMBOL_AVX512KNL(features,intersector) \
+  if ((features & AVX512KNL) == AVX512KNL) intersector = avx512knl::intersector;
+#else // AVX512KNL target not enabled
+#define SELECT_SYMBOL_AVX512KNL(features,intersector)
+#endif // EMBREE_TARGET_AVX512KNL
+
+#if defined(EMBREE_TARGET_AVX512SKX)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif // !EMBREE_TARGET_SIMD16
+#define SELECT_SYMBOL_AVX512SKX(features,intersector) \
+  if ((features & AVX512SKX) == AVX512SKX) intersector = avx512skx::intersector;
+#else // AVX512SKX target not enabled
+#define SELECT_SYMBOL_AVX512SKX(features,intersector)
+#endif // EMBREE_TARGET_AVX512SKX
+
+#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);            \
+  SELECT_SYMBOL_SSE42(features,intersector);                                  
+  /* Every SELECT_SYMBOL_<base>_<ISA...> combination in this group expands to a base assignment followed by the per-ISA selections in the listed order, so a later matching ISA overrides an earlier one. */
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                \
+  SELECT_SYMBOL_SSE42(features,intersector);                  \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                     \
+  SELECT_SYMBOL_SSE42(features,intersector);                       \
+  SELECT_SYMBOL_AVX(features,intersector);                         \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                          \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                         \
+  SELECT_SYMBOL_AVX(features,intersector);                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                            \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                         \
+  SELECT_SYMBOL_SSE42(features,intersector);                                           \
+  SELECT_SYMBOL_AVX(features,intersector);                                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                                            \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                       \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                               \
+  SELECT_SYMBOL_SSE42(features,intersector);                                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+  
+#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);          \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+  
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                        \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                              \
+  SELECT_SYMBOL_AVX(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                            \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                        \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+  /* The INIT variants below start from the <symbol>_error stub (INIT_SYMBOL) instead of the isa:: default. */
+#define SELECT_SYMBOL_INIT_AVX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                
+  
+#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                      \
+  SELECT_SYMBOL_AVX(features,intersector);                \
+  SELECT_SYMBOL_AVX2(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                         \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                            \
+  SELECT_SYMBOL_SSE42(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                      \
+  SELECT_SYMBOL_AVX2(features,intersector);
+  
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,intersector) \
+  INIT_SYMBOL(features,intersector);                           \
+  SELECT_SYMBOL_AVX(features,intersector);                     \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                     \
+  SELECT_SYMBOL_AVX(features,intersector);                               \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                         \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                         \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                          \
+  SELECT_SYMBOL_AVX(features,intersector);                                    \
+  SELECT_SYMBOL_AVX2(features,intersector);                                   \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                              \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                                \
+  SELECT_SYMBOL_SSE42(features,intersector);                                        \
+  SELECT_SYMBOL_AVX(features,intersector);                                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                                         \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                    \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+/* The ZERO variant starts from <symbol>_zero (ZERO_SYMBOL). */
+#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  ZERO_SYMBOL(features,intersector);                                    \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX2(features,intersector);                             \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                               \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+/* NOTE(review): the next macro repeats, token for token, the SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX definition given earlier in this header; the redefinition is identical and therefore harmless, but redundant. */
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+  
+#define SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                     \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+  /* No base assignment here: the symbol keeps its previous value unless one of the listed ISAs matches. */
+#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_SSE42(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                 \
+  SELECT_SYMBOL_AVX2(features,intersector);
+
+  struct VerifyMultiTargetLinking { // helper whose only member reports the ISA macro of the including translation unit
+    static __noinline int getISA(int depth = 5) { // recurses depth times, then returns ISA
+      if (depth == 0) return ISA; 
+      else return getISA(depth-1); // NOTE(review): the recursion plus __noinline looks like a way to keep the call out of line - confirm intent
+    }
+  };
+  namespace sse2      { int getISA(); } // per-ISA declarations; this header does not define them
+  namespace sse42     { int getISA(); }
+  namespace avx       { int getISA(); }
+  namespace avx2      { int getISA(); }
+  namespace avx512knl { int getISA(); }
+  namespace avx512skx { int getISA(); }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
new file mode 100644
index 0000000000..82953f0e89
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/math/affinespace.h"
+#include "../../common/math/interval.h"
+
+#include <functional>
+
+namespace embree {
+
+#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f // findRoots treats a candidate closer than this to a stored root as a duplicate
+
+static void motion_derivative_coefficients(const float *p, float *coeff); // forward declaration; the generated definition follows further down in this header
+
+struct MotionDerivativeCoefficients
+{
+  float theta;          // acos of the clamped dot product of the two quaternions
+  float coeffs[3*8*7];  // 3 blocks of 8x7 values; MotionDerivative reads them as coeffs[8*7*dim + i*7 + j]
+
+  MotionDerivativeCoefficients() {} // leaves theta and coeffs uninitialized
+
+  // xfm0 and xfm1 are interpreted as quaternion decompositions: the quaternion is read from the .w components of l.vx, l.vy, l.vz and p
+  MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1)
+  {
+    // cosTheta of the two quaternions: their 4D dot product, clamped to [-1, 1] before acos
+    const float cosTheta = min(1.f, max(-1.f,
+                           xfm0.l.vx.w * xfm1.l.vx.w
+                         + xfm0.l.vy.w * xfm1.l.vy.w
+                         + xfm0.l.vz.w * xfm1.l.vz.w
+                         + xfm0.p.w * xfm1.p.w));
+
+    theta = std::acos(cosTheta);
+    Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w); // starts as the quaternion of xfm1
+    if (cosTheta < 0.995f) { // at or above this threshold qperp stays the unmodified xfm1 quaternion
+      // compute perpendicular quaternion: q1 - cosTheta * q0, then normalize
+      qperp.x = xfm1.p.w    - cosTheta * xfm0.p.w;
+      qperp.y = xfm1.l.vx.w - cosTheta * xfm0.l.vx.w;
+      qperp.z = xfm1.l.vy.w - cosTheta * xfm0.l.vy.w;
+      qperp.w = xfm1.l.vz.w - cosTheta * xfm0.l.vz.w;
+      qperp = normalize(qperp);
+    }
+    const float p[33] = { // input vector for the generated code: 1 + 3 + 3 + 4 + 4 + 9 + 9 entries
+      theta,
+      xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0
+      xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1
+      xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0
+      qperp.x, qperp.y, qperp.z, qperp.w,
+      xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0
+                   xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y,
+                                xfm0.l.vz.z, xfm0.p.z,
+      xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1
+                   xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y,
+                                xfm1.l.vz.z, xfm1.p.z
+    };
+    motion_derivative_coefficients(p, coeffs); // fills coeffs from p using the sympy-generated routine
+  }
+};
+
+struct MotionDerivative
+{
+  float twoTheta; // 2 * mdc.theta, the argument scale of the cos/sin terms
+  float c[8];     // coefficients of the function evaluated by EvalMotionDerivative
+  // Builds c[i] as the dot product of row i of coefficient block `dim` with (1, p0, p1); dim is not range-checked (coeffs holds 3 blocks)
+  MotionDerivative(MotionDerivativeCoefficients const& mdc,
+                    int dim, Vec3fa const& p0, Vec3fa const& p1)
+    : twoTheta(2.f*mdc.theta)
+  {
+    const float p[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z };
+    for (int i = 0; i < 8; ++i) {
+      c[i] = 0;
+      for (int j = 0; j < 7; ++j) {
+        c[i] += mdc.coeffs[8*7*dim + i*7 + j] * p[j];
+      }
+    }
+  }
+  // Functor: c0 + c1*t + (c2 + c3*t + c4*t^2)*cos(twoTheta*t) + (c5 + c6*t + c7*t^2)*sin(twoTheta*t) + offset, for a value type T
+  template<typename T>
+  struct EvalMotionDerivative
+  {
+    MotionDerivative const& md; // referenced, not copied: md must outlive this functor
+    float offset;
+
+    EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {}
+
+    T operator()(T const& time) const {
+      return md.c[0] + md.c[1] * time
+          + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time)
+          + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time)
+          + offset;
+    }
+  };
+  // Searches `interval` for roots of the function above (with `offset` added); stores at most maxNumRoots in `roots` and returns the count
+  unsigned int findRoots(
+    Interval1f const& interval,
+    float offset,
+    float* roots,
+    unsigned int maxNumRoots)
+  {
+    unsigned int numRoots = 0;
+    EvalMotionDerivative<Interval1f> eval(*this, offset); // evaluated on whole intervals, not on single points
+    findRoots(eval, interval, numRoots, roots, maxNumRoots);
+    return numRoots;
+  }
+  // Recursive bisection: drops intervals whose evaluated range excludes 0, records the midpoint once the interval is tiny
+  template<typename Eval>
+  static void findRoots(
+
+    Eval const& eval,
+    Interval1f const& interval,
+    unsigned int& numRoots,
+    float* roots,
+    unsigned int maxNumRoots)
+  {
+    Interval1f range = eval(interval); // presumably a conservative bound of the function over the interval - confirm Interval1f arithmetic
+    if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return;
+    // termination: interval narrower than 1e-7, or the midpoint no longer separable from an end point
+    const float split = 0.5f * (interval.upper + interval.lower);
+    if (interval.upper-interval.lower < 1e-7f || abs(split-interval.lower) < 1e-7f ||  abs(split-interval.upper) < 1e-7f)
+    {
+      // check if the root already exists
+      for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k) {
+        if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON)
+        return;
+      }
+      if (numRoots < maxNumRoots) { // once the array is full, further roots are silently dropped
+        roots[numRoots++] = split;
+      }
+      if (numRoots > maxNumRoots) {
+        printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS
+        return;
+      }
+      return;
+    }
+    // not yet small enough: search the lower half first, then the upper half
+    findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots);
+    findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots);
+  }
+};
+
+/******************************************************************************
+ *                       Code generated with sympy 1.4                        *
+ *              See http://www.sympy.org/ for more information.               *
+ *                                                                            *
+ * see                                                                        *
+ *                                                                            *
+ *     scripts/generate_motion_derivative_coefficients.py                     *
+ *                                                                            *
+ * for how this code is generated                                             *
+ *                                                                            *
+ ******************************************************************************/
+static void motion_derivative_coefficients(const float *p, float *coeff)
+{
+   coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27];
+   coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24];
+   coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25];
+   coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26];
+   coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15];
+   coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16];
+   coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17];
+   coeff[7] = 0;
+   coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24];
+   coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25];
+   coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26];
+   coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24];
+   coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25];
+   coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26];
+   coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27];
+   coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24];
+   coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25];
+   coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26];
+   coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15];
+   coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16];
+   coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17];
+   coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0];
+   coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24];
+   coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25];
+   coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26];
+   coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24];
+   coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25];
+   coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26];
+   coeff[28] = 0;
+   coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0];
+   coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0];
+   coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0];
+   coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0];
+   coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0];
+   coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0];
+   coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27];
+   coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24];
+   coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25];
+   coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26];
+   coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15];
+   coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16];
+   coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17];
+   coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0];
+   coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24];
+   coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25];
+   coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26];
+   coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24];
+   coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25];
+   coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26];
+   coeff[49] = 0;
+   coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0];
+   coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0];
+   coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0];
+   coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0];
+   coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0];
+   coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0];
+   coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30];
+   coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24];
+   coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28];
+   coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29];
+   coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15];
+   coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19];
+   coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20];
+   coeff[63] = 0;
+   coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28];
+   coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29];
+   coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28];
+   coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29];
+   coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30];
+   coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24];
+   coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28];
+   coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29];
+   coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15];
+   coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19];
+   coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20];
+   coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0];
+   coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28];
+   coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29];
+   coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28];
+   coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29];
+   coeff[84] = 0;
+   coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0];
+   coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0];
+   coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0];
+   coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0];
+   coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0];
+   coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0];
+   coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30];
+   coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24];
+   coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28];
+   coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29];
+   coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15];
+   coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19];
+   coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20];
+   coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0];
+   coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24];
+   coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28];
+   coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29];
+   coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24];
+   coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28];
+   coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29];
+   coeff[105] = 0;
+   coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0];
+   coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0];
+   coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0];
+   coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0];
+   coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0];
+   coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0];
+   coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32];
+   coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24];
+   coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28];
+   coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31];
+   coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15];
+   coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19];
+   coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22];
+   coeff[119] = 0;
+   coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31];
+   coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31];
+   coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30];
+   coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24];
+   coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28];
+   coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29];
+   coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15];
+   coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19];
+   coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20];
+   coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0];
+   coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29];
+   coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29];
+   coeff[140] = 0;
+   coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0];
+   coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0];
+   coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0];
+   coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0];
+   coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0];
+   coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0];
+   coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32];
+   coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24];
+   coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28];
+   coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31];
+   coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15];
+   coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19];
+   coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22];
+   coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0];
+   coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24];
+   coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28];
+   coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31];
+   coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24];
+   coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28];
+   coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31];
+   coeff[161] = 0;
+   coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0];
+   coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0];
+   coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0];
+   coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0];
+   coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0];
+   coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0];
+}
+
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/point_query.h b/thirdparty/embree-aarch64/kernels/common/point_query.h
new file mode 100644
index 0000000000..27d158ca3a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/point_query.h
@@ -0,0 +1,136 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* Packet of K point queries used for closest point queries */
+  template<int K>
+  struct RTC_ALIGN(16) PointQueryK
+  {
+    /* Default construction does nothing */
+    __forceinline PointQueryK() {}
+
+    /* Builds a packet from the query positions, the search radii (infinite
+     * by default) and the query times (zero by default) */
+    __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* Returns the number of point queries in the packet */
+    static __forceinline size_t size() { return K; }
+
+    /* Per-lane test: every position component within FLT_LARGE, radius >= 0, |time| < inf */
+    __forceinline vbool<K> valid() const
+    {
+      const vfloat<K> maxCoord(FLT_LARGE);
+      const vbool<K> inRange  = (abs(p.x) <= maxCoord) & (abs(p.y) <= maxCoord) & (abs(p.z) <= maxCoord);
+      const vbool<K> radiusOk = radius >= vfloat<K>(0);
+      const vbool<K> timeOk   = abs(time) < vfloat<K>(inf);
+      /* a lane is valid only if all three conditions hold */
+      return inRange & radiusOk & timeOk;
+    }
+
+    __forceinline void get(PointQueryK<1>* query) const;
+    __forceinline void get(size_t i, PointQueryK<1>& query) const;
+    __forceinline void set(const PointQueryK<1>* query);
+    __forceinline void set(size_t i, const PointQueryK<1>& query);
+
+    Vec3vf<K> p;      // location of the query point
+    vfloat<K> time;   // time for motion blur
+    vfloat<K> radius; // radius for the point query
+  };
+  
+  /* Specialization for a single point query */
+  template<>
+  struct RTC_ALIGN(16) PointQueryK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline PointQueryK() {}
+
+    /* Builds a query from its position, search radius (infinite by default) and time (zero by default) */
+    __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* True if every position component is within FLT_LARGE, radius >= 0 and |time| < inf */
+    __forceinline bool valid() const {
+      const bool spatialOk = all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius)));
+      return spatialOk && abs(time) < float(inf);
+    }
+
+    Vec3f p;      // location of the query point
+    float time;   // time for motion blur
+    float radius; // radius for the point query
+  };
+  
+  /* Writes each of the K lanes of the packet to the matching entry of an array of single point queries */
+  template<int K>
+  __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const
+  {
+    for (size_t lane = 0; lane < K; ++lane) // FIXME: use SIMD transpose
+    {
+      PointQueryK<1>& out = query[lane];
+      out.p.x  = p.x[lane];
+      out.p.y  = p.y[lane];
+      out.p.z  = p.z[lane];
+      out.time = time[lane]; out.radius = radius[lane];
+    }
+  }
+
+  /* Copies lane i of the packet into a single point query */
+  template<int K>
+  __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const
+  {
+    query.time   = time[i];
+    query.radius = radius[i];
+    query.p.x    = p.x[i];
+    query.p.y    = p.y[i];
+    query.p.z    = p.z[i];
+  }
+
+  /* Fills the K lanes of the packet from an array of K single point queries */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query)
+  {
+    for (size_t lane = 0; lane < K; ++lane)
+    {
+      const PointQueryK<1>& in = query[lane];
+      p.x[lane]    = in.p.x;
+      p.y[lane]    = in.p.y;
+      p.z[lane]    = in.p.z;
+      radius[lane] = in.radius; time[lane] = in.time;
+    }
+  }
+
+  /* Stores a single point query into lane i of the packet */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query)
+  {
+    time[i]   = query.time;
+    radius[i] = query.radius;
+    p.x[i]    = query.p.x;
+    p.y[i]    = query.p.y;
+    p.z[i]    = query.p.z;
+  }
+
+  /* Convenience aliases for the single query and the packet widths 4, 8 and 16 */
+  using PointQuery   = PointQueryK<1>;
+  using PointQuery4  = PointQueryK<4>;
+  using PointQuery8  = PointQueryK<8>;
+  using PointQuery16 = PointQueryK<16>;
+  struct PointQueryN; // forward declaration only; not defined in this header
+
+  /* Prints position, radius and time of a point query (packet), one field per line */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query)
+  {
+    cout << "{ " << embree_endl;
+    cout << "  p = " << query.p << embree_endl;
+    cout << "  r = " << query.radius << embree_endl;
+    cout << "  time = " << query.time << embree_endl;
+    cout << "}";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/primref.h b/thirdparty/embree-aarch64/kernels/common/primref.h
new file mode 100644
index 0000000000..ce75c982bb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref.h
@@ -0,0 +1,138 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! A primitive reference stores the bounds of the primitive and its ID. The IDs are kept in the 4th component of lower and upper. */
+  struct __aligned(32) PrimRef 
+  {
+    __forceinline PrimRef () {} // empty body: no explicit initialization
+    /*! with AVX, copy construction and assignment transfer the object with one 8-float vector load/store */
+#if defined(__AVX__)
+    __forceinline PrimRef(const PrimRef& v) { 
+      vfloat8::store((float*)this,vfloat8::load((float*)&v));
+    }
+    __forceinline PrimRef& operator=(const PrimRef& v) { 
+      vfloat8::store((float*)this,vfloat8::load((float*)&v)); return *this;
+    }
+#endif
+    /*! stores geomID together with bounds.lower and primID together with bounds.upper */
+    __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) 
+    {
+      lower = Vec3fx(bounds.lower, geomID);
+      upper = Vec3fx(bounds.upper, primID);
+    }
+    /*! stores a size_t ID: on 64-bit targets the low 32 bits go with lower and bits 32..63 with upper, otherwise upper gets 0 */
+    __forceinline PrimRef (const BBox3fa& bounds, size_t id) 
+    {
+#if defined(__X86_64__) || defined(__aarch64__)
+      lower = Vec3fx(bounds.lower, (unsigned)(id & 0xFFFFFFFF));
+      upper = Vec3fx(bounds.upper, (unsigned)((id >> 32) & 0xFFFFFFFF));
+#else
+      lower = Vec3fx(bounds.lower, (unsigned)id);
+      upper = Vec3fx(bounds.upper, (unsigned)0);
+#endif
+    }
+
+    /*! calculates twice the center of the primitive (lower+upper, not divided by 2) */
+    __forceinline const Vec3fa center2() const {
+      return lower+upper;
+    }
+    
+    /*! return the bounding box of the primitive */
+    __forceinline const BBox3fa bounds() const {
+      return BBox3fa(lower,upper);
+    }
+
+    /*! size for bin heuristic is 1 */
+    __forceinline unsigned size() const { 
+      return 1;
+    }
+
+    /*! returns bounds and centroid used for binning; center_o receives embree::center2 of the bounds */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const 
+    {
+      bounds_o = bounds();
+      center_o = embree::center2(bounds_o);
+    }
+    /*! mutable references to the unsigned values stored with lower (geomID) and upper (primID) */
+    __forceinline unsigned& geomIDref() {  // FIXME: remove !!!!!!!
+      return lower.u;
+    }
+    __forceinline unsigned& primIDref() {  // FIXME: remove !!!!!!!
+      return upper.u;
+    }
+    
+    /*! returns the geometry ID (read through the .a alias of lower) */
+    __forceinline unsigned geomID() const { 
+      return lower.a;
+    }
+
+    /*! returns the primitive ID (read through the .a alias of upper) */
+    __forceinline unsigned primID() const { 
+      return upper.a;
+    }
+
+    /*! returns an size_t sized ID; inverse of the (bounds, size_t id) constructor */
+    __forceinline size_t ID() const { 
+#if defined(__X86_64__) || defined(__aarch64__)
+      return size_t(lower.u) + (size_t(upper.u) << 32);
+#else
+      return size_t(lower.u);
+#endif
+    }
+
+    /*! special function for operator<: primID in the high 32 bits, geomID in the low 32 bits */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+    
+    /*! allows sorting the primrefs by ID (ordered by primID first, then geomID, via ID64) */
+    friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) {
+      return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }";
+    }
+
+  public:
+    Vec3fx lower;     //!< lower bounds and geomID
+    Vec3fx upper;     //!< upper bounds and primID
+  };
+
+  /*! swaps two PrimRefs; with AVX each one is moved as a single 8-float vector */
+  __forceinline void xchg(PrimRef& a, PrimRef& b)
+  {
+#if !defined(__AVX__)
+    std::swap(a,b);
+#else
+    float* const pa = (float*)&a;
+    float* const pb = (float*)&b;
+    const vfloat8 va = vfloat8::load(pa), vb = vfloat8::load(pb); // read both before writing either
+    vfloat8::store(pa,vb); vfloat8::store(pb,va);
+#endif
+  }
+
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  
+  struct SubGridBuildData {
+    unsigned short sx,sy; // narrowed from the unsigned int constructor arguments
+    unsigned int primID;
+    /*! default construction does nothing */
+    __forceinline SubGridBuildData() {}
+    __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID)
+      : sx(sx), sy(sy), primID(primID) {}
+    /*! x() and y() return only the low 15 bits of sx and sy */
+    __forceinline size_t x() const { return size_t(sx) & 0x7fff; }
+    __forceinline size_t y() const { return size_t(sy) & 0x7fff; }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/primref_mb.h b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
new file mode 100644
index 0000000000..b6c1ad5712
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
@@ -0,0 +1,262 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+#define MBLUR_BIN_LBBOX 1
+
+namespace embree
+{
+#if MBLUR_BIN_LBBOX
+
+  /*! A primitive reference stores the linear bounds of the primitive, its ID and its time segment counts (kept in the 4th components of the bounds). */
+  struct PrimRefMB
+  {
+    typedef LBBox3fa BBox;
+
+    __forceinline PrimRefMB () {}
+    /*! stores geomID/primID with bounds0.lower/upper and the active/total time segment counts with bounds1.lower/upper */
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      lbounds.bounds0.lower.a = geomID;
+      lbounds.bounds0.upper.a = primID;
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+    /*! size_t ID variant with an additional EmptyTy tag; the ID is written through the unsigned alias, exactly like the untagged size_t constructor and as read back by ID() */
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
+      lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+      lbounds.bounds0.lower.u = id;
+      lbounds.bounds0.upper.u = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+    /*! size_t ID variant: low 32 bits go with bounds0.lower, bits 32..63 with bounds0.upper (64-bit targets only) */
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      lbounds.bounds0.lower.u = id & 0xFFFFFFFF;
+      lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+      lbounds.bounds0.lower.u = id;
+      lbounds.bounds0.upper.u = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    /*! returns bounds for binning */
+    __forceinline LBBox3fa bounds() const {
+      return lbounds;
+    }
+
+    /*! returns the number of time segments of this primref (the activeTimeSegments passed at construction) */
+    __forceinline unsigned size() const {
+      return lbounds.bounds1.lower.a;
+    }
+
+    __forceinline unsigned totalTimeSegments() const {
+      return lbounds.bounds1.upper.a;
+    }
+
+     /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,float(totalTimeSegments()));
+    }
+
+     /* returns time that corresponds to time step i, spacing the steps uniformly over time_range */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<=(int)totalTimeSegments());
+      return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments());
+    }
+    
+    /*! checks if time range overlaps; the 0.9999/1.0001 factors shrink time_range slightly before the comparison */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      if (0.9999f*time_range.upper <= range.lower) return false;
+      if (1.0001f*time_range.lower >= range.upper) return false;
+      return true;
+    }
+
+    /*! returns center for binning (center2 of the bounds interpolated at t = 0.5) */
+    __forceinline Vec3fa binCenter() const {
+      return center2(lbounds.interpolate(0.5f));
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = binCenter();
+    }
+
+    /*! returns the geometry ID */
+    __forceinline unsigned geomID() const {
+      return lbounds.bounds0.lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned primID() const {
+      return lbounds.bounds0.upper.a;
+    }
+
+    /*! returns an size_t sized ID; inverse of the size_t id constructors */
+    __forceinline size_t ID() const {
+#if defined(__X86_64__) || defined(__aarch64__)
+      return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32);
+#else
+      return size_t(lbounds.bounds0.lower.u);
+#endif
+    }
+
+    /*! special function for operator<: primID in the high 32 bits, geomID in the low 32 bits */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+      return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    LBBox3fx lbounds;  // linear bounds; 4th components hold geomID, primID, active and total time segments
+    BBox1f time_range; // entire geometry time range
+  };
+
+#else
+
+  /*! A primitive reference stores the bounds of the primitive and its ID. This variant is only compiled when MBLUR_BIN_LBBOX is 0 (this header defines it as 1). */
+  struct __aligned(16) PrimRefMB
+  {
+    typedef BBox3fa BBox;
+
+    __forceinline PrimRefMB () {}
+
+    __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      bbox.lower.a = geomID;
+      bbox.upper.a = primID;
+    }
+    /*! size_t ID variant: low 32 bits go with bbox.lower, bits 32..63 with bbox.upper (64-bit targets only) */
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      bbox.lower.u = id & 0xFFFFFFFF;
+      bbox.upper.u = (id >> 32) & 0xFFFFFFFF;
+#else
+      bbox.lower.u = id;
+      bbox.upper.u = 0;
+#endif
+    }
+    
+    /*! returns bounds for binning (the linear bounds interpolated at t = 0.5 during construction) */
+    __forceinline BBox3fa bounds() const {
+      return bbox;
+    }
+
+    /*! returns the number of time segments of this primref */
+    __forceinline unsigned int size() const { 
+      return _activeTimeSegments;
+    }
+
+    __forceinline unsigned int totalTimeSegments() const { 
+      return _totalTimeSegments;
+    }
+
+     /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,float(_totalTimeSegments));
+    }
+
+     /* returns time that corresponds to time step i, spacing the steps uniformly over time_range */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<=(int)_totalTimeSegments);
+      return time_range.lower + time_range.size()*float(i)/float(_totalTimeSegments);
+    }
+    
+    /*! checks if time range overlaps; the 0.9999/1.0001 factors shrink time_range slightly before the comparison */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      if (0.9999f*time_range.upper <= range.lower) return false;
+      if (1.0001f*time_range.lower >= range.upper) return false;
+      return true;
+    }
+
+    /*! returns center for binning */
+    __forceinline Vec3fa binCenter() const {
+      return center2(bounds());
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = center2(bounds());
+    }
+
+    /*! returns the geometry ID */
+    __forceinline unsigned int geomID() const { 
+      return bbox.lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned int primID() const { 
+      return bbox.upper.a;
+    }
+
+    /*! returns an size_t sized ID; inverse of the size_t id constructor */
+    __forceinline size_t ID() const { 
+#if defined(__X86_64__) || defined(__aarch64__)
+      return size_t(bbox.lower.u) + (size_t(bbox.upper.u) << 32);
+#else
+      return size_t(bbox.lower.u);
+#endif
+    }
+
+    /*! special function for operator<: primID in the high 32 bits, geomID in the low 32 bits */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+    
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+      return cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    BBox3fa bbox; // bounds, geomID, primID
+    unsigned int _activeTimeSegments;
+    unsigned int _totalTimeSegments;
+    BBox1f time_range; // entire geometry time range
+  };
+
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/profile.h b/thirdparty/embree-aarch64/kernels/common/profile.h
new file mode 100644
index 0000000000..a7de36414d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/profile.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! helper structure for the implementation of the profile functions below; keeps first/min/summed/max timings for up to N named sections */
+  struct ProfileTimer
+  {
+    static const size_t N = 20; //!< number of section slots; the "total" recorded by end() uses one of them
+    /*! default construction behaves like ProfileTimer(0), so that no member is left uninitialized */
+    ProfileTimer () : ProfileTimer(size_t(0)) {}
+    /*! numSkip is the number of leading iterations that are excluded from the min/avg/max statistics */
+    ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0), tj(0)
+    {
+      for (size_t k=0; k<N; k++) names[k] = nullptr;
+      for (size_t k=0; k<N; k++) dt_fst[k] = 0.0;
+      for (size_t k=0; k<N; k++) dt_min[k] = pos_inf;
+      for (size_t k=0; k<N; k++) dt_avg[k] = 0.0;
+      for (size_t k=0; k<N; k++) dt_max[k] = neg_inf;
+    }
+    /*! starts an iteration: rewinds the section index and takes both reference time stamps */
+    __forceinline void begin() 
+    {
+      j=0;
+      t0 = tj = getSeconds();
+    }
+    /*! ends an iteration: records the time since begin() as section "total" and advances the iteration count */
+    __forceinline void end() {
+      absolute("total");
+      i++;
+    }
+    /*! shorthand for relative(name) */
+    __forceinline void operator() (const char* name) {
+      relative(name);
+    }
+    /*! records the time elapsed since begin() in the next section slot */
+    __forceinline void absolute (const char* name) 
+    {
+      const double t1 = getSeconds();
+      const double dt = t1-t0;
+      if (j >= N) return; // all N slots are in use: drop the sample instead of writing past the arrays
+      assert(names[j] == nullptr || names[j] == name);
+      names[j] = name;
+      if (i == 0) dt_fst[j] = dt;
+      if (i>=numSkip) {
+        dt_min[j] = min(dt_min[j],dt);
+        dt_avg[j] = dt_avg[j] + dt; // running sum; divided by the iteration count only when reported
+        dt_max[j] = max(dt_max[j],dt);
+      }
+      j++; maxJ = max(maxJ,j);
+    }
+    /*! records the time elapsed since the last relative() call (or begin()) in the next section slot */
+    __forceinline void relative (const char* name) 
+    {
+      const double t1 = getSeconds();
+      const double dt = t1-tj;
+      tj = t1;
+      if (j >= N) return; // all N slots are in use: drop the sample instead of writing past the arrays
+      assert(names[j] == nullptr || names[j] == name);
+      names[j] = name;
+      if (i == 0) dt_fst[j] = dt;
+      if (i>=numSkip) {
+        dt_min[j] = min(dt_min[j],dt);
+        dt_avg[j] = dt_avg[j] + dt; // running sum; divided by the iteration count only when reported
+        dt_max[j] = max(dt_max[j],dt);
+      }
+      j++; maxJ = max(maxJ,j);
+    }
+    /*! prints per-section throughput (numElements per second, in millions) followed by the timings in ms */
+    void print(size_t numElements) 
+    {
+      /* divide on the fly: the accumulated sums stay untouched, so repeated print()/avg() calls stay correct */
+      const double n = double(i-numSkip);
+
+      printf("  profile [M/s]:\n");
+      for (size_t k=0; k<maxJ; k++)
+        printf("%20s:  fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n",
+               names[k],numElements/dt_fst[k]*1E-6,numElements/dt_max[k]*1E-6,numElements/(dt_avg[k]/n)*1E-6,numElements/dt_min[k]*1E-6);
+
+      printf("  profile [ms]:\n");
+      for (size_t k=0; k<maxJ; k++) 
+        printf("%20s:  fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+               names[k],1000.0*dt_fst[k],1000.0*dt_min[k],1000.0*(dt_avg[k]/n),1000.0*dt_max[k]);
+    }
+    /*! prints the per-section timings in ms */
+    void print() 
+    {
+      printf("  profile:\n");
+
+      /* divide on the fly: the accumulated sums stay untouched, so repeated print()/avg() calls stay correct */
+      const double n = double(i-numSkip);
+
+      for (size_t k=0; k<maxJ; k++) {
+        printf("%20s:  fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+               names[k],1000.0*dt_fst[k],1000.0*dt_min[k],1000.0*(dt_avg[k]/n),1000.0*dt_max[k]);
+      }
+    }
+    /*! average time of the last recorded section slot (the "total" written by end()); 0 if nothing was recorded */
+    double avg() {
+      return maxJ ? dt_avg[maxJ-1]/double(i-numSkip) : 0.0;
+    }
+    
+  private:
+    size_t i;             // number of finished iterations
+    size_t j;             // index of the next section slot within the current iteration
+    size_t maxJ;          // largest number of sections recorded in any iteration
+    size_t numSkip;       // iterations excluded from min/avg/max
+    double t0;            // time stamp taken by begin()
+    double tj;            // time stamp of the last relative() call (or begin())
+    const char* names[N];
+    double dt_fst[N];     // timings of the first iteration
+    double dt_min[N];
+    double dt_avg[N];     // summed timings of the measured iterations
+    double dt_max[N];
+  };
+
+  /*! Runs the closure numSkip+numIter times, timing the sections it marks, and prints the result.
+      Use the following way:
+
+      profile(1,10,1000,[&](ProfileTimer& timer) {
+        // code
+        timer("A");
+        // code
+        timer("B");
+      });
+  */
+  template<typename Closure>
+    void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+    {
+      ProfileTimer timer(numSkip);
+      const size_t numRuns = numSkip+numIter;
+
+      for (size_t run=0; run<numRuns; ++run) {
+        timer.begin();
+        closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+
+  /*! same as the overload above, but resets and uses a timer object supplied by the caller */
+  template<typename Closure>
+    void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+    {
+      timer = ProfileTimer(numSkip);
+      const size_t numRuns = numSkip+numIter;
+      for (size_t run=0; run<numRuns; ++run)
+      {
+        timer.begin();
+        closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/ray.h b/thirdparty/embree-aarch64/kernels/common/ray.h
new file mode 100644
index 0000000000..336d48942c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/ray.h
@@ -0,0 +1,1517 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "instance_stack.h"
+
+// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted
+
+namespace embree
+{
+  static const size_t MAX_INTERNAL_STREAM_SIZE = 32; // NOTE(review): presumably the maximum number of rays handled per internal stream chunk - confirm against the users of this constant
+
+  /* Packet of K rays; every attribute is stored as a vector with one lane per ray */
+  template<int K>
+  struct RayK
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Builds a ray packet from origins, directions and the ray segments
+     * [tnear,tfar]. Near has to be smaller than far */
+    __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                       const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                       const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : org(org), _tnear(tnear), dir(dir), _time(time), tfar(tfar), mask(mask), id(id), flags(flags) {}
+
+    /* Returns the number of rays in the packet */
+    static __forceinline size_t size() { return K; }
+
+    /* Per-lane test whether a ray is valid and does not cause issues during traversal */
+    __forceinline vbool<K> valid() const
+    {
+      const vfloat<K> maxAbs(FLT_LARGE);
+      const vbool<K> xOk = (abs(org.x) <= maxAbs) & (abs(dir.x) <= maxAbs);
+      const vbool<K> yOk = (abs(org.y) <= maxAbs) & (abs(dir.y) <= maxAbs);
+      const vbool<K> zOk = (abs(org.z) <= maxAbs) & (abs(dir.z) <= maxAbs);
+      const vbool<K> tOk = (abs(tnear()) <= vfloat<K>(inf)) & (abs(tfar) <= vfloat<K>(inf));
+      return xOk & yOk & zOk & tOk;
+    }
+
+    __forceinline void get(RayK<1>* ray) const;
+    __forceinline void get(size_t i, RayK<1>& ray) const;
+    __forceinline void set(const RayK<1>* ray);
+    __forceinline void set(size_t i, const RayK<1>& ray);
+
+    __forceinline void copy(size_t dest, size_t source);
+
+    /* Per-lane 3-bit code: bit 0, 1, 2 is set if dir.x, dir.y, dir.z is negative */
+    __forceinline vint<K> octant() const
+    {
+      const vint<K> bitX = select(dir.x < 0.0f, vint<K>(1), vint<K>(zero));
+      const vint<K> bitY = select(dir.y < 0.0f, vint<K>(2), vint<K>(zero));
+      return bitX | bitY | select(dir.z < 0.0f, vint<K>(4), vint<K>(zero));
+    }
+
+    /* Ray data */
+    Vec3vf<K> org;    // ray origin
+    vfloat<K> _tnear; // start of ray segment
+    Vec3vf<K> dir;    // ray direction
+    vfloat<K> _time;  // time of this ray for motion blur
+    vfloat<K> tfar;   // end of ray segment
+    vint<K> mask;     // used to mask out objects during traversal
+    vint<K> id;
+    vint<K> flags;
+
+    __forceinline vfloat<K>& tnear() { return _tnear; }
+    __forceinline vfloat<K>& time()  { return _time; }
+    __forceinline const vfloat<K>& tnear() const { return _tnear; }
+    __forceinline const vfloat<K>& time()  const { return _time; }
+  };
+
+  /* Ray+hit structure for K rays: the ray packet extended by per-lane hit data */
+  template<int K>
+  struct RayHitK : RayK<K>
+  {
+    using RayK<K>::org;
+    using RayK<K>::_tnear;
+    using RayK<K>::dir;
+    using RayK<K>::_time;
+    using RayK<K>::tfar;
+    using RayK<K>::mask;
+    using RayK<K>::id;
+    using RayK<K>::flags;
+
+    using RayK<K>::tnear;
+    using RayK<K>::time;
+
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far. geomID and all instID levels start as invalid */
+    __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                          const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                          const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+    /* Copies the ray part; geomID and all instID levels start as invalid */
+    __forceinline RayHitK(const RayK<K>& ray)
+      : RayK<K>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+    /* Assigns the ray part and resets geomID and all instID levels to invalid */
+    __forceinline RayHitK<K>& operator =(const RayK<K>& ray)
+    {
+      org    = ray.org;
+      _tnear = ray._tnear;
+      dir    = ray.dir;
+      _time  = ray._time;
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Throws RTC_ERROR_UNKNOWN if an active lane with a hit has an out-of-range tfar, u, v or Ng component */
+    __forceinline void verifyHit(const vbool<K>& valid0) const
+    {
+      vbool<K> valid = valid0 & (geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID)); // only lanes that are active and recorded a hit
+      const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf));
+      const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vv = (abs(v) <= vfloat<K>(FLT_LARGE)); // checks v; this previously tested u a second time
+      const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE);
+      if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t");
+      if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u");
+      if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v");
+      if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x");
+      if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y");
+      if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z");
+    }
+
+    __forceinline void get(RayHitK<1>* ray) const;
+    __forceinline void get(size_t i, RayHitK<1>& ray) const;
+    __forceinline void set(const RayHitK<1>* ray);
+    __forceinline void set(size_t i, const RayHitK<1>& ray);
+
+    __forceinline void copy(size_t dest, size_t source);
+
+    /* Hit data */
+    Vec3vf<K> Ng;   // geometry normal
+    vfloat<K> u;    // barycentric u coordinate of hit
+    vfloat<K> v;    // barycentric v coordinate of hit
+    vuint<K> primID; // primitive ID
+    vuint<K> geomID; // geometry ID
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Specialization for a single ray */
+  template<>
+  struct RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Builds a ray from origin, direction and the segment [tnear,tfar]; tnear is
+     *  passed to org and time to dir. Near has to be smaller than far */
+    __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {}
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline bool valid() const {
+      const bool vecOk = all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE)));
+      if (!vecOk) return false;
+      return abs(tnear()) <= float(inf) && abs(tfar) <= float(inf);
+    }
+
+    /* Ray data */
+    Vec3ff org;  // 3 floats for ray origin, 1 float for tnear
+    // (no separate tnear member: the start of the ray segment is org.w)
+    Vec3ff dir;  // 3 floats for ray direction, 1 float for time
+    // (no separate time member: the ray time is dir.w)
+    float tfar;  // end of ray segment
+    int mask;    // used to mask out objects during traversal
+    int id;      // ray ID
+    int flags;   // ray flags
+
+    __forceinline float& tnear() { return org.w; }
+    __forceinline const float& tnear() const { return org.w; }
+    __forceinline float& time() { return dir.w; }
+    __forceinline const float& time() const { return dir.w; }
+  };
+
+  template<>
+  struct RayHitK<1> : RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far. geomID starts as invalid */
+    __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+    __forceinline RayHitK(const RayK<1>& ray)
+      : RayK<1>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+    /* Assigns the ray part (tnear and time travel inside org and dir) and resets geomID to invalid */
+    __forceinline RayHitK<1>& operator =(const RayK<1>& ray)
+    {
+      org    = ray.org;
+      dir    = ray.dir;
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Throws RTC_ERROR_UNKNOWN if a recorded hit has an out-of-range tfar, u, v or Ng component; no-op without a hit */
+    __forceinline void verifyHit() const
+    {
+      if (geomID == RTC_INVALID_GEOMETRY_ID) return;
+      const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf));
+      const bool vu = (abs(u) <= FLT_LARGE);
+      const bool vv = (abs(v) <= FLT_LARGE); // checks v; this previously tested u a second time
+      const bool vnx = abs(Ng.x) <= FLT_LARGE;
+      const bool vny = abs(Ng.y) <= FLT_LARGE;
+      const bool vnz = abs(Ng.z) <= FLT_LARGE;
+      if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t");
+      if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u");
+      if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v");
+      if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x");
+      if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y");
+      if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z");
+    }
+
+    /* Hit data */
+    Vec3f Ng;            // not normalized geometry normal
+    float u;             // barycentric u coordinate of hit
+    float v;             // barycentric v coordinate of hit
+    unsigned int primID; // primitive ID
+    unsigned int geomID; // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Converts ray packet to single rays: lane i of the packet is written to ray[i] */
+  template<int K>
+  __forceinline void RayK<K>::get(RayK<1>* ray) const
+  {
+    // FIXME: use SIMD transpose
+    for (size_t lane = 0; lane < K; lane++)
+    {
+      RayK<1>& dst = ray[lane];
+
+      dst.org.x   = org.x[lane];
+      dst.org.y   = org.y[lane];
+      dst.org.z   = org.z[lane];
+      dst.tnear() = tnear()[lane];
+
+      dst.dir.x   = dir.x[lane];
+      dst.dir.y   = dir.y[lane];
+      dst.dir.z   = dir.z[lane];
+      dst.time()  = time()[lane];
+
+      dst.tfar    = tfar[lane];
+      dst.mask    = mask[lane];
+      dst.id      = id[lane];
+      dst.flags   = flags[lane];
+    }
+  }
+
+  /* Converts ray/hit packet to single rays: lane i of the packet is written to ray[i] */
+  template<int K>
+  __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const
+  {
+    // FIXME: use SIMD transpose
+    for (size_t lane = 0; lane < K; lane++)
+    {
+      get(lane, ray[lane]);
+    }
+  }
+
+  /* Extracts lane i of the ray packet into a single ray */
+  template<int K>
+  __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const
+  {
+    /* origin and start of ray segment */
+    ray.org.x   = org.x[i];
+    ray.org.y   = org.y[i];
+    ray.org.z   = org.z[i];
+    ray.tnear() = tnear()[i];
+
+    /* direction and time */
+    ray.dir.x   = dir.x[i];
+    ray.dir.y   = dir.y[i];
+    ray.dir.z   = dir.z[i];
+    ray.time()  = time()[i];
+
+    /* remaining ray members */
+    ray.tfar    = tfar[i];
+    ray.mask    = mask[i];
+    ray.id      = id[i];
+    ray.flags   = flags[i];
+  }
+
+  /* Extracts lane i of the ray/hit packet (ray and hit members) into a single ray/hit */
+  template<int K>
+  __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const
+  {
+    /* ray members */
+    ray.org.x   = org.x[i];
+    ray.org.y   = org.y[i];
+    ray.org.z   = org.z[i];
+    ray.tnear() = tnear()[i];
+    ray.dir.x   = dir.x[i];
+    ray.dir.y   = dir.y[i];
+    ray.dir.z   = dir.z[i];
+    ray.time()  = time()[i];
+    ray.tfar    = tfar[i];
+    ray.mask    = mask[i];
+    ray.id      = id[i];
+    ray.flags   = flags[i];
+
+    /* hit members */
+    ray.Ng.x    = Ng.x[i];
+    ray.Ng.y    = Ng.y[i];
+    ray.Ng.z    = Ng.z[i];
+    ray.u       = u[i];
+    ray.v       = v[i];
+    ray.primID  = primID[i];
+    ray.geomID  = geomID[i];
+
+    instance_id_stack::copy(instID, ray.instID, i);
+  }
+
+  /* Converts single rays to ray packet: ray[i] is written to lane i */
+  template<int K>
+  __forceinline void RayK<K>::set(const RayK<1>* ray)
+  {
+    // FIXME: use SIMD transpose
+    for (size_t lane = 0; lane < K; lane++)
+    {
+      set(lane, ray[lane]);
+    }
+  }
+
+  /* Converts single ray/hits to ray/hit packet: ray[i] is written to lane i */
+  template<int K>
+  __forceinline void RayHitK<K>::set(const RayHitK<1>* ray)
+  {
+    // FIXME: use SIMD transpose
+    for (size_t lane = 0; lane < K; lane++)
+    {
+      set(lane, ray[lane]);
+    }
+  }
+
+  /* Inserts a single ray into lane i of the ray packet */
+  template<int K>
+  __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray)
+  {
+    /* origin and start of ray segment */
+    org.x[i]   = ray.org.x;
+    org.y[i]   = ray.org.y;
+    org.z[i]   = ray.org.z;
+    tnear()[i] = ray.tnear();
+
+    /* direction and time */
+    dir.x[i]   = ray.dir.x;
+    dir.y[i]   = ray.dir.y;
+    dir.z[i]   = ray.dir.z;
+    time()[i]  = ray.time();
+
+    /* remaining ray members */
+    tfar[i]    = ray.tfar;
+    mask[i]    = ray.mask;
+    id[i]      = ray.id;
+    flags[i]   = ray.flags;
+  }
+
+  /* Inserts a single ray/hit (ray and hit members) into lane i of the packet */
+  template<int K>
+  __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray)
+  {
+    /* ray members */
+    org.x[i]   = ray.org.x;
+    org.y[i]   = ray.org.y;
+    org.z[i]   = ray.org.z;
+    tnear()[i] = ray.tnear();
+    dir.x[i]   = ray.dir.x;
+    dir.y[i]   = ray.dir.y;
+    dir.z[i]   = ray.dir.z;
+    time()[i]  = ray.time();
+    tfar[i]    = ray.tfar;
+    mask[i]    = ray.mask;
+    id[i]      = ray.id;
+    flags[i]   = ray.flags;
+
+    /* hit members */
+    Ng.x[i]    = ray.Ng.x;
+    Ng.y[i]    = ray.Ng.y;
+    Ng.z[i]    = ray.Ng.z;
+    u[i]       = ray.u;
+    v[i]       = ray.v;
+    primID[i]  = ray.primID;
+    geomID[i]  = ray.geomID;
+
+    instance_id_stack::copy(ray.instID, instID, i);
+  }
+
+  /* Copies lane `source` of the ray packet onto lane `dest` of the same packet */
+  template<int K>
+  __forceinline void RayK<K>::copy(size_t dest, size_t source)
+  {
+    org.x[dest]   = org.x[source];
+    org.y[dest]   = org.y[source];
+    org.z[dest]   = org.z[source];
+    tnear()[dest] = tnear()[source];
+
+    dir.x[dest]   = dir.x[source];
+    dir.y[dest]   = dir.y[source];
+    dir.z[dest]   = dir.z[source];
+    time()[dest]  = time()[source];
+
+    tfar[dest]    = tfar[source];
+    mask[dest]    = mask[source];
+    id[dest]      = id[source];
+    flags[dest]   = flags[source];
+  }
+
+  /* Copies lane `source` of the ray/hit packet (ray and hit members) onto lane `dest` */
+  template<int K>
+  __forceinline void RayHitK<K>::copy(size_t dest, size_t source)
+  {
+    /* ray members */
+    org.x[dest]   = org.x[source];
+    org.y[dest]   = org.y[source];
+    org.z[dest]   = org.z[source];
+    tnear()[dest] = tnear()[source];
+    dir.x[dest]   = dir.x[source];
+    dir.y[dest]   = dir.y[source];
+    dir.z[dest]   = dir.z[source];
+    time()[dest]  = time()[source];
+    tfar[dest]    = tfar[source];
+    mask[dest]    = mask[source];
+    id[dest]      = id[source];
+    flags[dest]   = flags[source];
+
+    /* hit members */
+    Ng.x[dest]    = Ng.x[source];
+    Ng.y[dest]    = Ng.y[source];
+    Ng.z[dest]    = Ng.z[source];
+    u[dest]       = u[source];
+    v[dest]       = v[source];
+    primID[dest]  = primID[source];
+    geomID[dest]  = geomID[source];
+
+    instance_id_stack::copy(instID, instID, source, dest);
+  }
+
+  /* Shortcuts for the ray types of the supported packet sizes */
+  using Ray   = RayK<1>;
+  using Ray4  = RayK<4>;
+  using Ray8  = RayK<8>;
+  using Ray16 = RayK<16>;
+  struct RayN;
+
+  /* Shortcuts for the ray/hit types of the supported packet sizes */
+  using RayHit   = RayHitK<1>;
+  using RayHit4  = RayHitK<4>;
+  using RayHit8  = RayHitK<8>;
+  using RayHit16 = RayHitK<16>;
+  struct RayHitN;
+
+  /* Selects the ray type for packet size K: RayHitK<K> when `intersect` is
+   * true and RayK<K> when it is false */
+  template<int K, bool intersect>
+  struct RayTypeHelper;
+
+  template<int K>
+  struct RayTypeHelper<K, true>
+  {
+    using Ty = RayHitK<K>;
+  };
+
+  template<int K>
+  struct RayTypeHelper<K, false>
+  {
+    using Ty = RayK<K>;
+  };
+
+  /* Single-ray variant of the selection */
+  template<bool intersect>
+  using RayType = typename RayTypeHelper<1, intersect>::Ty;
+
+  /* Packet variant of the selection */
+  template<int K, bool intersect>
+  using RayTypeK = typename RayTypeHelper<K, intersect>::Ty;
+
+  /* Outputs ray to stream, one member per line, enclosed in braces */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray)
+  {
+    cout << "{ " << embree_endl;
+    cout << "  org = " << ray.org << embree_endl;
+    cout << "  dir = " << ray.dir << embree_endl;
+    cout << "  near = " << ray.tnear() << embree_endl;
+    cout << "  far = " << ray.tfar << embree_endl;
+    cout << "  time = " << ray.time() << embree_endl;
+    cout << "  mask = " << ray.mask << embree_endl;
+    cout << "  id = " << ray.id << embree_endl;
+    cout << "  flags = " << ray.flags << embree_endl;
+    return cout << "}";
+  }
+
+  /* Outputs ray and hit to stream, one member per line, enclosed in braces.
+   * All RTC_MAX_INSTANCE_LEVEL_COUNT instance ID levels are printed on a
+   * single line, separated by spaces. */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray)
+  {
+    cout << "{ " << embree_endl
+         << "  org = " << ray.org << embree_endl
+         << "  dir = " << ray.dir << embree_endl
+         << "  near = " << ray.tnear() << embree_endl
+         << "  far = " << ray.tfar << embree_endl
+         << "  time = " << ray.time() << embree_endl
+         << "  mask = " << ray.mask << embree_endl
+         << "  id = " << ray.id << embree_endl
+         << "  flags = " << ray.flags << embree_endl
+         << "  Ng = " << ray.Ng << embree_endl   // line break so that u does not end up on the Ng line
+         << "  u = " << ray.u <<  embree_endl
+         << "  v = " << ray.v << embree_endl
+         << "  primID = " << ray.primID <<  embree_endl
+         << "  geomID = " << ray.geomID << embree_endl
+         << "  instID =";
+    for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    {
+      cout << " " << ray.instID[l];
+    }
+    cout << embree_endl;
+    return cout << "}";
+  }
+
+  /* View over a ray (and hit) stream stored in SOA layout inside one buffer.
+   * Every member is addressed as an array of N 4-byte entries, and member
+   * number f starts at byte f*4*N of the buffer. All `offset` arguments are
+   * byte offsets that are added to the start of the respective member array. */
+  struct RayStreamSOA
+  {
+    /* Wraps the buffer `rays` holding a stream of N rays; no data is copied */
+    __forceinline RayStreamSOA(void* rays, size_t N)
+      : ptr((char*)rays), N(N) {}
+
+    /* ray data access functions */
+    __forceinline float* org_x(size_t offset = 0) { return (float*)&ptr[0*4*N+offset]; }  // x coordinate of ray origin
+    __forceinline float* org_y(size_t offset = 0) { return (float*)&ptr[1*4*N+offset]; }  // y coordinate of ray origin
+    __forceinline float* org_z(size_t offset = 0) { return (float*)&ptr[2*4*N+offset]; }  // z coordinate of ray origin
+    __forceinline float* tnear(size_t offset = 0) { return (float*)&ptr[3*4*N+offset]; }  // start of ray segment
+
+    __forceinline float* dir_x(size_t offset = 0) { return (float*)&ptr[4*4*N+offset]; }  // x coordinate of ray direction
+    __forceinline float* dir_y(size_t offset = 0) { return (float*)&ptr[5*4*N+offset]; }  // y coordinate of ray direction
+    __forceinline float* dir_z(size_t offset = 0) { return (float*)&ptr[6*4*N+offset]; }  // z coordinate of ray direction
+    __forceinline float* time (size_t offset = 0) { return (float*)&ptr[7*4*N+offset]; }  // time of this ray for motion blur
+
+    __forceinline float* tfar (size_t offset = 0) { return (float*)&ptr[8*4*N+offset]; }  // end of ray segment (set to hit distance)
+    __forceinline int*   mask (size_t offset = 0) { return (int*)&ptr[9*4*N+offset];   }  // used to mask out objects during traversal (optional)
+    __forceinline int*   id   (size_t offset = 0) { return (int*)&ptr[10*4*N+offset];  }  // id
+    __forceinline int*   flags(size_t offset = 0) { return (int*)&ptr[11*4*N+offset];  }  // flags
+
+    /* hit data access functions */
+    __forceinline float* Ng_x(size_t offset = 0) { return (float*)&ptr[12*4*N+offset]; }  // x coordinate of geometry normal
+    __forceinline float* Ng_y(size_t offset = 0) { return (float*)&ptr[13*4*N+offset]; }  // y coordinate of geometry normal
+    __forceinline float* Ng_z(size_t offset = 0) { return (float*)&ptr[14*4*N+offset]; }  // z coordinate of geometry normal
+
+    __forceinline float* u(size_t offset = 0) { return (float*)&ptr[15*4*N+offset]; }     // barycentric u coordinate of hit
+    __forceinline float* v(size_t offset = 0) { return (float*)&ptr[16*4*N+offset]; }     // barycentric v coordinate of hit
+
+    __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)&ptr[17*4*N+offset]; }   // primitive ID
+    __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)&ptr[18*4*N+offset]; }   // geometry ID
+    __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)&ptr[19*4*N+level*4*N+offset]; }   // instance ID of the given level
+
+    /* Loads the single ray stored at the given byte offset */
+    __forceinline Ray getRayByOffset(size_t offset)
+    {
+      Ray ray;
+      ray.org.x   = org_x(offset)[0];
+      ray.org.y   = org_y(offset)[0];
+      ray.org.z   = org_z(offset)[0];
+      ray.tnear() = tnear(offset)[0];
+      ray.dir.x   = dir_x(offset)[0];
+      ray.dir.y   = dir_y(offset)[0];
+      ray.dir.z   = dir_z(offset)[0];
+      ray.time()  = time(offset)[0];
+      ray.tfar    = tfar(offset)[0];
+      ray.mask    = mask(offset)[0];
+      ray.id      = id(offset)[0];
+      ray.flags   = flags(offset)[0];
+      return ray;
+    }
+
+    /* Loads K consecutive rays starting at the given byte offset using
+     * unmasked unaligned loads */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(org_x(offset));
+      ray.org.y   = vfloat<K>::loadu(org_y(offset));
+      ray.org.z   = vfloat<K>::loadu(org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(tnear(offset)); // tnear is reached through its accessor, as in the masked overload
+      ray.dir.x   = vfloat<K>::loadu(dir_x(offset));
+      ray.dir.y   = vfloat<K>::loadu(dir_y(offset));
+      ray.dir.z   = vfloat<K>::loadu(dir_z(offset));
+      ray.time()  = vfloat<K>::loadu(time(offset));  // time is reached through its accessor, as in the masked overload
+      ray.tfar    = vfloat<K>::loadu(tfar(offset));
+      ray.mask    = vint<K>::loadu(mask(offset));
+      ray.id      = vint<K>::loadu(id(offset));
+      ray.flags   = vint<K>::loadu(flags(offset));
+      return ray;
+    }
+
+    /* Loads up to K consecutive rays starting at the given byte offset;
+     * `valid` selects the lanes that are loaded */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(valid, org_x(offset));
+      ray.org.y   = vfloat<K>::loadu(valid, org_y(offset));
+      ray.org.z   = vfloat<K>::loadu(valid, org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(valid, tnear(offset));
+      ray.dir.x   = vfloat<K>::loadu(valid, dir_x(offset));
+      ray.dir.y   = vfloat<K>::loadu(valid, dir_y(offset));
+      ray.dir.z   = vfloat<K>::loadu(valid, dir_z(offset));
+      ray.time()  = vfloat<K>::loadu(valid, time(offset));
+      ray.tfar    = vfloat<K>::loadu(valid, tfar(offset));
+
+#if !defined(__AVX__)
+      /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults,
+         because the SSE masked loads always access the entire vector */
+      if (unlikely(!all(valid)))
+      {
+        ray.mask  = zero;
+        ray.id    = zero;
+        ray.flags = zero;
+
+        for (size_t k = 0; k < K; k++)
+        {
+          if (likely(valid[k]))
+          {
+            ray.mask[k]  = mask(offset)[k];
+            ray.id[k]    = id(offset)[k];
+            ray.flags[k] = flags(offset)[k];
+          }
+        }
+      }
+      else
+#endif
+      {
+        ray.mask  = vint<K>::loadu(valid, mask(offset));
+        ray.id    = vint<K>::loadu(valid, id(offset));
+        ray.flags = vint<K>::loadu(valid, flags(offset));
+      }
+
+      return ray;
+    }
+
+    /* Stores tfar and the hit members of those lanes that are valid and
+     * have a valid geomID, starting at the given byte offset */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      /* 
+       * valid_i: stores which of the input rays exist (do not access nonexistent rays!)
+       * valid:   stores which of the rays actually hit something.
+       */
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+        vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x);
+        vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y);
+        vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z);
+        vfloat<K>::storeu(valid, u(offset), ray.u);
+        vfloat<K>::storeu(valid, v(offset), ray.v);
+
+#if !defined(__AVX__)
+        /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults,
+           because the SSE masked stores always access the entire vector */
+        if (unlikely(!all(valid_i)))
+        {
+          for (size_t k = 0; k < K; k++)
+          {
+            if (likely(valid[k]))
+            {
+              primID(offset)[k] = ray.primID[k];
+              geomID(offset)[k] = ray.geomID[k];
+
+              /* deeper instance levels are written only while the previous level holds a valid ID */
+              instID(0, offset)[k] = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+              for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+                instID(l, offset)[k] = ray.instID[l][k];
+#endif
+            }
+          }
+        }
+        else
+#endif
+        {
+          vuint<K>::storeu(valid, primID(offset), ray.primID);
+          vuint<K>::storeu(valid, geomID(offset), ray.geomID);
+
+          /* deeper instance levels are written only while some valid lane holds a valid ID on the previous level */
+          vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+            vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]);
+#endif
+        }
+      }
+    }
+
+    /* Stores tfar of those lanes that are valid and have a negative tfar,
+     * starting at the given byte offset */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+        vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+    }
+
+    /* Returns a 3-bit octant index for the ray at the given byte offset:
+     * bit 0, 1 and 2 are set when dir_x, dir_y and dir_z are negative */
+    __forceinline size_t getOctantByOffset(size_t offset)
+    {
+      const float dx = dir_x(offset)[0];
+      const float dy = dir_y(offset)[0];
+      const float dz = dir_z(offset)[0];
+      const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0);
+      return octantID;
+    }
+
+    /* Returns true when tnear <= tfar holds for the ray at the given byte offset */
+    __forceinline bool isValidByOffset(size_t offset)
+    {
+      const float nnear = tnear(offset)[0];
+      const float ffar  = tfar(offset)[0];
+      return nnear <= ffar;
+    }
+
+    /* Loads K rays from individual per-lane byte offsets; only lanes
+     * selected by `valid` are read */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      /* NOTE(review): the template argument 1 is presumably the byte scale applied to `offset` -- confirm against gather() */
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x(), offset);
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y(), offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z(), offset);
+      ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset);
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x(), offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y(), offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z(), offset);
+      ray.time()  = vfloat<K>::template gather<1>(valid, time(), offset);
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar(), offset);
+      ray.mask    = vint<K>::template gather<1>(valid, mask(), offset);
+      ray.id      = vint<K>::template gather<1>(valid, id(), offset);
+      ray.flags   = vint<K>::template gather<1>(valid, flags(), offset);
+#else
+      /* scalar fallback: lanes that are not valid keep the zero values assigned here */
+      ray.org     = zero;
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.time()  = zero;
+      ray.tfar    = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t k = 0; k < K; k++)
+      {
+        if (likely(valid[k]))
+        {
+          const size_t ofs = offset[k];
+
+          ray.org.x[k]   = *org_x(ofs);
+          ray.org.y[k]   = *org_y(ofs);
+          ray.org.z[k]   = *org_z(ofs);
+          ray.tnear()[k] = *tnear(ofs);
+          ray.dir.x[k]   = *dir_x(ofs);
+          ray.dir.y[k]   = *dir_y(ofs);
+          ray.dir.z[k]   = *dir_z(ofs);
+          ray.time()[k]  = *time(ofs);
+          ray.tfar[k]    = *tfar(ofs);
+          ray.mask[k]    = *mask(ofs);
+          ray.id[k]      = *id(ofs);
+          ray.flags[k]   = *flags(ofs);
+        }
+      }
+#endif
+
+      return ray;
+    }
+
+    /* Stores tfar and the hit members of those lanes that are valid and
+     * have a valid geomID to individual per-lane byte offsets */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar);
+        vfloat<K>::template scatter<1>(valid, Ng_x(), offset, ray.Ng.x);
+        vfloat<K>::template scatter<1>(valid, Ng_y(), offset, ray.Ng.y);
+        vfloat<K>::template scatter<1>(valid, Ng_z(), offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, u(), offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, v(), offset, ray.v);
+        vuint<K>::template scatter<1>(valid, primID(), offset, ray.primID);
+        vuint<K>::template scatter<1>(valid, geomID(), offset, ray.geomID);
+
+        vuint<K>::template scatter<1>(valid, instID(0), offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          vuint<K>::template scatter<1>(valid, instID(l), offset, ray.instID[l]);
+#endif
+#else
+        /* scalar fallback: visit the lanes set in the mask one at a time
+           (bscf presumably returns the next set bit and clears it -- confirm) */
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *tfar(ofs) = ray.tfar[k];
+
+          *Ng_x(ofs)   = ray.Ng.x[k];
+          *Ng_y(ofs)   = ray.Ng.y[k];
+          *Ng_z(ofs)   = ray.Ng.z[k];
+          *u(ofs)      = ray.u[k];
+          *v(ofs)      = ray.v[k];
+          *primID(ofs) = ray.primID[k];
+          *geomID(ofs) = ray.geomID[k];
+
+          *instID(0, ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+            *instID(l, ofs) = ray.instID[l][k];
+#endif
+        }
+#endif
+      }
+    }
+
+    /* Stores tfar of those lanes that are valid and have a negative tfar
+     * to individual per-lane byte offsets */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar(), offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *tfar(ofs) = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    char* __restrict__ ptr; // start of the SOA buffer
+    size_t N;               // number of entries per member array
+  };
+
+  /* RayStreamSOA that views its own embedded byte array, which is sized as
+   * MAX_K/4 RayHit4 packets */
+  template<size_t MAX_K>
+  struct StackRayStreamSOA : public RayStreamSOA
+  {
+    /* Sets up a stream of K rays on top of `data`; K must not exceed MAX_K */
+    __forceinline StackRayStreamSOA(size_t K)
+      : RayStreamSOA(data, K)
+    {
+      assert(K <= MAX_K);
+    }
+
+    char data[MAX_K / 4 * sizeof(RayHit4)];
+  };
+
+
+  struct RayStreamSOP
+  {
+    /* Points every member pointer of this stream at the member of the
+     * same name inside `t` */
+    template<class T>
+    __forceinline void init(T& t)
+    {
+      /* ray members */
+      org_x = (float*)&t.org.x;
+      org_y = (float*)&t.org.y;
+      org_z = (float*)&t.org.z;
+      tnear = (float*)&t.tnear;
+
+      dir_x = (float*)&t.dir.x;
+      dir_y = (float*)&t.dir.y;
+      dir_z = (float*)&t.dir.z;
+      time  = (float*)&t.time;
+
+      tfar  = (float*)&t.tfar;
+      mask  = (unsigned int*)&t.mask;
+      id    = (unsigned int*)&t.id;
+      flags = (unsigned int*)&t.flags;
+
+      /* hit members */
+      Ng_x = (float*)&t.Ng.x;
+      Ng_y = (float*)&t.Ng.y;
+      Ng_z = (float*)&t.Ng.z;
+
+      u = (float*)&t.u;
+      v = (float*)&t.v;
+
+      primID = (unsigned int*)&t.primID;
+      geomID = (unsigned int*)&t.geomID;
+
+      /* one pointer per instance level */
+      for (unsigned level = 0; level < RTC_MAX_INSTANCE_LEVEL_COUNT; ++level)
+      {
+        instID[level] = (unsigned int*)&t.instID[level];
+      }
+    }
+
+    /* Loads the single ray found `offset` bytes behind each member pointer.
+     * The tnear, time, mask, id and flags pointers may be null; tnear and
+     * time then become 0 and mask, id and flags become -1. */
+    __forceinline Ray getRayByOffset(size_t offset)
+    {
+      Ray ray;
+
+      /* these pointers are dereferenced unconditionally */
+      ray.org.x = *(float* __restrict__)((char*)org_x + offset);
+      ray.org.y = *(float* __restrict__)((char*)org_y + offset);
+      ray.org.z = *(float* __restrict__)((char*)org_z + offset);
+      ray.dir.x = *(float* __restrict__)((char*)dir_x + offset);
+      ray.dir.y = *(float* __restrict__)((char*)dir_y + offset);
+      ray.dir.z = *(float* __restrict__)((char*)dir_z + offset);
+      ray.tfar  = *(float* __restrict__)((char*)tfar + offset);
+
+      /* these pointers are optional */
+      if (tnear) ray.tnear() = *(float* __restrict__)((char*)tnear + offset);
+      else       ray.tnear() = 0.0f;
+
+      if (time) ray.time() = *(float* __restrict__)((char*)time + offset);
+      else      ray.time() = 0.0f;
+
+      if (mask) ray.mask = *(unsigned int* __restrict__)((char*)mask + offset);
+      else      ray.mask = -1;
+
+      if (id) ray.id = *(unsigned int* __restrict__)((char*)id + offset);
+      else    ray.id = -1;
+
+      if (flags) ray.flags = *(unsigned int* __restrict__)((char*)flags + offset);
+      else       ray.flags = -1;
+
+      return ray;
+    }
+
+    /* Loads up to K consecutive rays found `offset` bytes behind each member
+     * pointer; `valid` selects the lanes that are loaded. The tnear, time,
+     * mask, id and flags pointers may be null; tnear and time then become 0
+     * and mask, id and flags become -1. */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    {
+      RayK<K> ray;
+
+      /* these pointers are used unconditionally */
+      ray.org.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset));
+      ray.org.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset));
+      ray.org.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset));
+      ray.dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+      ray.dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+      ray.dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+      ray.tfar  = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+
+      /* these pointers are optional */
+      if (tnear) ray.tnear() = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset));
+      else       ray.tnear() = vfloat<K>(0.0f);
+
+      if (time) ray.time() = vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset));
+      else      ray.time() = vfloat<K>(0.0f);
+
+      if (mask) ray.mask = vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset));
+      else      ray.mask = vint<K>(-1);
+
+      if (id) ray.id = vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset));
+      else    ray.id = vint<K>(-1);
+
+      if (flags) ray.flags = vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset));
+      else       ray.flags = vint<K>(-1);
+
+      return ray;
+    }
+
+    /* Loads only the ray directions of up to K consecutive rays found
+     * `offset` bytes behind the dir_x/dir_y/dir_z pointers */
+    template<int K>
+    __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset)
+    {
+      float* __restrict__ const src_x = (float* __restrict__)((char*)dir_x + offset);
+      float* __restrict__ const src_y = (float* __restrict__)((char*)dir_y + offset);
+      float* __restrict__ const src_z = (float* __restrict__)((char*)dir_z + offset);
+
+      Vec3vf<K> dir;
+      dir.x = vfloat<K>::loadu(valid, src_x);
+      dir.y = vfloat<K>::loadu(valid, src_y);
+      dir.z = vfloat<K>::loadu(valid, src_z);
+      return dir;
+    }
+
+    /* Writes tfar and the hit members of a single ray/hit `offset` bytes
+     * behind each member pointer. Nothing is written when geomID is
+     * RTC_INVALID_GEOMETRY_ID. The Ng_x/Ng_y/Ng_z and instID[0] pointers
+     * may be null, in which case those members are skipped. */
+    __forceinline void setHitByOffset(size_t offset, const RayHit& ray)
+    {
+      if (ray.geomID == RTC_INVALID_GEOMETRY_ID)
+        return;
+
+      *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+
+      if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x;
+      if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y;
+      if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z;
+
+      *(float* __restrict__)((char*)u + offset) = ray.u;
+      *(float* __restrict__)((char*)v + offset) = ray.v;
+
+      *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID;
+      *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID;
+
+      if (unlikely(!instID[0]))
+        return;
+
+      /* deeper instance levels are written only while the previous level holds a valid ID */
+      *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+      for (unsigned level = 1; level < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[level-1] != RTC_INVALID_GEOMETRY_ID; ++level)
+        *(unsigned int* __restrict__)((char*)instID[level] + offset) = ray.instID[level];
+#endif
+    }
+
+    /* Writes tfar of a single ray `offset` bytes behind the tfar pointer (unconditionally) */
+    __forceinline void setHitByOffset(size_t offset, const Ray& ray)
+    {
+      float* __restrict__ const dst = (float* __restrict__)((char*)tfar + offset);
+      *dst = ray.tfar;
+    }
+
+    /* Stores tfar and the hit members of those lanes that are valid and have
+     * a valid geomID, `offset` bytes behind each member pointer. The
+     * Ng_x/Ng_y/Ng_z and instID[0] pointers may be null, in which case
+     * those members are skipped. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (unlikely(!any(valid)))
+        return;
+
+      vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+
+      if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x);
+      if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y);
+      if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z);
+
+      vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u);
+      vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v);
+
+      vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID);
+      vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID);
+
+      if (unlikely(!instID[0]))
+        return;
+
+      /* deeper instance levels are written only while some valid lane holds a valid ID on the previous level */
+      vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+      for (unsigned level = 1; level < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[level-1] != RTC_INVALID_GEOMETRY_ID)); ++level)
+        vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[level] + offset), ray.instID[level]);
+#endif
+    }
+
+    /* Stores tfar of those lanes that are valid and have a negative tfar,
+     * `offset` bytes behind the tfar pointer */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (unlikely(!any(valid)))
+        return;
+
+      vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+    }
+
+    /* Returns a 3-bit octant index for the ray found `offset` bytes behind
+     * the direction pointers: 1, 2 and 4 are added when the x, y and z
+     * direction components are negative */
+    __forceinline size_t getOctantByOffset(size_t offset)
+    {
+      const float dx = *(float* __restrict__)((char*)dir_x + offset);
+      const float dy = *(float* __restrict__)((char*)dir_y + offset);
+      const float dz = *(float* __restrict__)((char*)dir_z + offset);
+
+      size_t octantID = 0;
+      if (dx < 0.0f) octantID += 1;
+      if (dy < 0.0f) octantID += 2;
+      if (dz < 0.0f) octantID += 4;
+      return octantID;
+    }
+
+    /* Returns true when tnear <= tfar holds for the ray found `offset` bytes
+     * behind the pointers; tnear counts as 0 when its pointer is null */
+    __forceinline bool isValidByOffset(size_t offset)
+    {
+      float nnear = 0.0f;
+      if (tnear)
+        nnear = *(float* __restrict__)((char*)tnear + offset);
+
+      const float ffar = *(float* __restrict__)((char*)tfar + offset);
+      return nnear <= ffar;
+    }
+
+    /* Packet variant: returns per lane whether tnear <= tfar holds for the
+     * rays found `offset` bytes behind the pointers; tnear counts as 0 when
+     * its pointer is null */
+    template<int K>
+    __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset)
+    {
+      vfloat<K> nnear = vfloat<K>(0.0f);
+      if (tnear)
+        nnear = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset));
+
+      const vfloat<K> ffar = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+      return nnear <= ffar;
+    }
+
+    /* Loads K rays from individual per-lane byte offsets; only lanes selected
+     * by `valid` are read. The tnear, time, mask, id and flags pointers may
+     * be null; tnear and time then become 0 and mask, id and flags become -1
+     * for the loaded lanes. */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      /* gather path */
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x, offset);
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y, offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z, offset);
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x, offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y, offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z, offset);
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar, offset);
+      ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero);
+      ray.time()  = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero);
+      ray.mask    = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1);
+      ray.id      = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1);
+      ray.flags   = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1);
+#else
+      /* scalar path: lanes that are not valid keep the zero values assigned here */
+      ray.org     = zero;
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.tfar    = zero;
+      ray.time()  = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t lane = 0; lane < K; lane++)
+      {
+        if (unlikely(!valid[lane]))
+          continue;
+
+        const size_t ofs = offset[lane];
+
+        ray.org.x[lane]   = *(float* __restrict__)((char*)org_x + ofs);
+        ray.org.y[lane]   = *(float* __restrict__)((char*)org_y + ofs);
+        ray.org.z[lane]   = *(float* __restrict__)((char*)org_z + ofs);
+        ray.dir.x[lane]   = *(float* __restrict__)((char*)dir_x + ofs);
+        ray.dir.y[lane]   = *(float* __restrict__)((char*)dir_y + ofs);
+        ray.dir.z[lane]   = *(float* __restrict__)((char*)dir_z + ofs);
+        ray.tfar[lane]    = *(float* __restrict__)((char*)tfar + ofs);
+        ray.tnear()[lane] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f;
+        ray.time()[lane]  = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f;
+        ray.mask[lane]    = mask ? *(int* __restrict__)((char*)mask + ofs) : -1;
+        ray.id[lane]      = id ? *(int* __restrict__)((char*)id + ofs) : -1;
+        ray.flags[lane]   = flags ? *(int* __restrict__)((char*)flags + ofs) : -1;
+      }
+#endif
+
+      return ray;
+    }
+
+    /*! Stores the hit (tfar, Ng, u, v, primID, geomID, instID) of every active lane that recorded a geometry ID, each at that lane's byte offset into the stream arrays. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      const vbool<K> hitLanes = valid_i & (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(hitLanes)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(hitLanes, tfar, offset, ray.tfar);
+        /* each normal component array is optional and only written when it is present */
+        if (likely(Ng_x)) vfloat<K>::template scatter<1>(hitLanes, Ng_x, offset, ray.Ng.x);
+        if (likely(Ng_y)) vfloat<K>::template scatter<1>(hitLanes, Ng_y, offset, ray.Ng.y);
+        if (likely(Ng_z)) vfloat<K>::template scatter<1>(hitLanes, Ng_z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(hitLanes, u, offset, ray.u);
+        vfloat<K>::template scatter<1>(hitLanes, v, offset, ray.v);
+        vuint<K>::template scatter<1>(hitLanes, (unsigned int*)geomID, offset, ray.geomID);
+        vuint<K>::template scatter<1>(hitLanes, (unsigned int*)primID, offset, ray.primID);
+        /* instance IDs: level 0 first, deeper levels while any hit lane holds a valid ID one level up */
+        if (likely(instID[0])) {
+          vuint<K>::template scatter<1>(hitLanes, (unsigned int*)instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned level = 1; level < RTC_MAX_INSTANCE_LEVEL_COUNT && any(hitLanes & (ray.instID[level-1] != RTC_INVALID_GEOMETRY_ID)); ++level)
+            vuint<K>::template scatter<1>(hitLanes, (unsigned int*)instID[level], offset, ray.instID[level]);
+#endif
+        }
+#else
+        size_t pending = movemask(hitLanes);
+        while (pending != 0)
+        {
+          const size_t lane = bscf(pending);
+          const size_t byteOfs = offset[lane];
+          /* hit distance */
+          *(float* __restrict__)((char*)tfar + byteOfs) = ray.tfar[lane];
+          /* optional geometry normal, then barycentrics and IDs */
+          if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + byteOfs) = ray.Ng.x[lane];
+          if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + byteOfs) = ray.Ng.y[lane];
+          if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + byteOfs) = ray.Ng.z[lane];
+          *(float* __restrict__)((char*)u + byteOfs) = ray.u[lane];
+          *(float* __restrict__)((char*)v + byteOfs) = ray.v[lane];
+          *(unsigned int* __restrict__)((char*)primID + byteOfs) = ray.primID[lane];
+          *(unsigned int* __restrict__)((char*)geomID + byteOfs) = ray.geomID[lane];
+          /* instance IDs: level 0 first, deeper levels while this lane's ID one level up is valid */
+          if (likely(instID[0])) {
+            *(unsigned int* __restrict__)((char*)instID[0] + byteOfs) = ray.instID[0][lane];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+            for (unsigned level = 1; level < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[level-1][lane] != RTC_INVALID_GEOMETRY_ID; ++level)
+              *(unsigned int* __restrict__)((char*)instID[level] + byteOfs) = ray.instID[level][lane];
+#endif
+          }
+        }
+#endif
+      }
+    }
+
+    /*! Writes only tfar back, and only for active lanes whose tfar is negative (presumably the marker for an occluded ray -- confirm against the occlusion kernels). */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      const vbool<K> negativeLanes = valid_i & (ray.tfar < 0.0f);
+
+      if (likely(any(negativeLanes)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(negativeLanes, tfar, offset, ray.tfar);
+#else
+        size_t pending = movemask(negativeLanes);
+        while (pending != 0)
+        {
+          const size_t lane = bscf(pending);
+          const size_t byteOfs = offset[lane];
+
+          *(float* __restrict__)((char*)tfar + byteOfs) = ray.tfar[lane];
+        }
+#endif
+      }
+    }
+
+    /* ray data: one base pointer per component, each indexed with per-ray byte offsets */
+    float* __restrict__ org_x; // x coordinate of ray origin
+    float* __restrict__ org_y; // y coordinate of ray origin
+    float* __restrict__ org_z; // z coordinate of ray origin
+    float* __restrict__ tnear; // start of ray segment (optional; 0 is used when NULL)
+
+    float* __restrict__ dir_x; // x coordinate of ray direction
+    float* __restrict__ dir_y; // y coordinate of ray direction
+    float* __restrict__ dir_z; // z coordinate of ray direction
+    float* __restrict__ time;         // time of this ray for motion blur (optional; 0 is used when NULL)
+
+    float* __restrict__ tfar;  // end of ray segment (set to hit distance)
+    unsigned int* __restrict__ mask;  // used to mask out objects during traversal (optional; -1 is used when NULL)
+    unsigned int* __restrict__ id;    // ray ID (optional; -1 is used when NULL)
+    unsigned int* __restrict__ flags; // ray flags (optional; -1 is used when NULL)
+
+    /* hit data: written back by setHitByOffset */
+    float* __restrict__ Ng_x; // x coordinate of geometry normal (optional; not written when NULL)
+    float* __restrict__ Ng_y; // y coordinate of geometry normal (optional; not written when NULL)
+    float* __restrict__ Ng_z; // z coordinate of geometry normal (optional; not written when NULL)
+
+    float* __restrict__ u;    // barycentric u coordinate of hit
+    float* __restrict__ v;    // barycentric v coordinate of hit
+
+    unsigned int* __restrict__ primID; // primitive ID
+    unsigned int* __restrict__ geomID; // geometry ID
+    unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID per level (optional; nothing is written when instID[0] is NULL)
+  };
+
+
+  struct RayStreamAOS
+  { /*! Ray stream reached through one base pointer plus per-ray byte offsets; hit write-back treats the same memory as RayHit. */
+    __forceinline RayStreamAOS(void* rays)
+      : ptr((Ray*)rays) {}
+    /*! returns the ray located 'offset' bytes past the base pointer */
+    __forceinline Ray& getRayByOffset(size_t offset)
+    {
+      return *(Ray*)((char*)ptr + offset);
+    }
+    /*! gathers K rays from K byte offsets; only declared here, the definitions are explicit specializations per width */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vint<K>& offset);
+    /*! masked gather: lanes outside 'valid' are redirected to offset 0 before the unmasked gather runs */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      const vint<K> valid_offset = select(valid, offset, vint<K>(zero)); // zero of width K, so all select operands agree even when K differs from the native vintx width
+      return getRayByOffset(valid_offset);
+    }
+    /*! writes tfar, Ng, u, v, primID, geomID and instID back for every lane of 'valid_i' whose geomID is not RTC_INVALID_GEOMETRY_ID */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID);
+        /* instance IDs: level 0 always, deeper levels while any hit lane holds a valid ID one level up */
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]);
+#endif
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]);
+          ray_k->tfar   = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+          /* presumably copies lane k of the instance ID stack into the scalar hit -- helper defined elsewhere */
+          instance_id_stack::copy(ray.instID, ray_k->instID, k);
+        }
+#endif
+      }
+    }
+    /*! writes only tfar back, for every lane of 'valid_i' whose tfar is negative */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]);
+          ray_k->tfar = ray.tfar[k];
+        }
+#endif
+      }
+    }
+    /*! base address that all byte offsets are applied to */
+    Ray* __restrict__ ptr;
+  };
+
+  template<>
+  __forceinline Ray4 RayStreamAOS::getRayByOffset(const vint4& offset)
+  {
+    Ray4 gathered;
+
+    /* one 4-float row per ray starting at org, transposed into org.x, org.y, org.z, tnear */
+    const vfloat4 orgRow0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->org);
+    const vfloat4 orgRow1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+    const vfloat4 orgRow2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+    const vfloat4 orgRow3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+
+    transpose(orgRow0,orgRow1,orgRow2,orgRow3, gathered.org.x, gathered.org.y, gathered.org.z, gathered.tnear());
+
+    /* one 4-float row per ray starting at dir, transposed into dir.x, dir.y, dir.z, time */
+    const vfloat4 dirRow0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->dir);
+    const vfloat4 dirRow1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->dir);
+    const vfloat4 dirRow2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->dir);
+    const vfloat4 dirRow3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->dir);
+
+    transpose(dirRow0,dirRow1,dirRow2,dirRow3, gathered.dir.x, gathered.dir.y, gathered.dir.z, gathered.time());
+
+    /* one 4-float row per ray starting at tfar, transposed into tfar, mask, id, flags */
+    const vfloat4 tailRow0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar);
+    const vfloat4 tailRow1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+    const vfloat4 tailRow2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+    const vfloat4 tailRow3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+
+    vfloat4 maskBits, idBits, flagBits;
+    transpose(tailRow0,tailRow1,tailRow2,tailRow3, gathered.tfar, maskBits, idBits, flagBits);
+    gathered.mask  = asInt(maskBits);
+    gathered.id    = asInt(idBits);
+    gathered.flags = asInt(flagBits);
+
+    return gathered;
+  }
+
+#if defined(__AVX__)
+  template<>
+  __forceinline Ray8 RayStreamAOS::getRayByOffset(const vint8& offset)
+  {
+    Ray8 gathered;
+
+    /* one 8-float row per ray starting at org, transposed into org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 orgDir0 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[0]))->org);
+    const vfloat8 orgDir1 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[1]))->org);
+    const vfloat8 orgDir2 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[2]))->org);
+    const vfloat8 orgDir3 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[3]))->org);
+    const vfloat8 orgDir4 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[4]))->org);
+    const vfloat8 orgDir5 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[5]))->org);
+    const vfloat8 orgDir6 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[6]))->org);
+    const vfloat8 orgDir7 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[7]))->org);
+
+    transpose(orgDir0,orgDir1,orgDir2,orgDir3,orgDir4,orgDir5,orgDir6,orgDir7, gathered.org.x, gathered.org.y, gathered.org.z, gathered.tnear(), gathered.dir.x, gathered.dir.y, gathered.dir.z, gathered.time());
+
+    /* one 4-float row per ray starting at tfar, transposed into tfar, mask, id, flags */
+    const vfloat4 tailRow0 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[0]))->tfar);
+    const vfloat4 tailRow1 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[1]))->tfar);
+    const vfloat4 tailRow2 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[2]))->tfar);
+    const vfloat4 tailRow3 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[3]))->tfar);
+    const vfloat4 tailRow4 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[4]))->tfar);
+    const vfloat4 tailRow5 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[5]))->tfar);
+    const vfloat4 tailRow6 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[6]))->tfar);
+    const vfloat4 tailRow7 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[7]))->tfar);
+
+    vfloat8 maskBits, idBits, flagBits;
+    transpose(tailRow0,tailRow1,tailRow2,tailRow3,tailRow4,tailRow5,tailRow6,tailRow7, gathered.tfar, maskBits, idBits, flagBits);
+    gathered.mask  = asInt(maskBits);
+    gathered.id    = asInt(idBits);
+    gathered.flags = asInt(flagBits);
+
+    return gathered;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<>
+  __forceinline Ray16 RayStreamAOS::getRayByOffset(const vint16& offset)
+  {
+    Ray16 gathered;
+
+    /* one 8-float row per ray starting at org, transposed into org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 orgDir0  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 0]))->org);
+    const vfloat8 orgDir1  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 1]))->org);
+    const vfloat8 orgDir2  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 2]))->org);
+    const vfloat8 orgDir3  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 3]))->org);
+    const vfloat8 orgDir4  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 4]))->org);
+    const vfloat8 orgDir5  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 5]))->org);
+    const vfloat8 orgDir6  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 6]))->org);
+    const vfloat8 orgDir7  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 7]))->org);
+    const vfloat8 orgDir8  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 8]))->org);
+    const vfloat8 orgDir9  = vfloat8::loadu(&((Ray*)((char*)ptr + offset[ 9]))->org);
+    const vfloat8 orgDir10 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[10]))->org);
+    const vfloat8 orgDir11 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[11]))->org);
+    const vfloat8 orgDir12 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[12]))->org);
+    const vfloat8 orgDir13 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[13]))->org);
+    const vfloat8 orgDir14 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[14]))->org);
+    const vfloat8 orgDir15 = vfloat8::loadu(&((Ray*)((char*)ptr + offset[15]))->org);
+
+    transpose(orgDir0,orgDir1,orgDir2,orgDir3,orgDir4,orgDir5,orgDir6,orgDir7,orgDir8,orgDir9,orgDir10,orgDir11,orgDir12,orgDir13,orgDir14,orgDir15,
+              gathered.org.x, gathered.org.y, gathered.org.z, gathered.tnear(), gathered.dir.x, gathered.dir.y, gathered.dir.z, gathered.time());
+
+    /* one 4-float row per ray starting at tfar, transposed into tfar, mask, id, flags */
+    const vfloat4 tailRow0  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 0]))->tfar);
+    const vfloat4 tailRow1  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 1]))->tfar);
+    const vfloat4 tailRow2  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 2]))->tfar);
+    const vfloat4 tailRow3  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 3]))->tfar);
+    const vfloat4 tailRow4  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 4]))->tfar);
+    const vfloat4 tailRow5  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 5]))->tfar);
+    const vfloat4 tailRow6  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 6]))->tfar);
+    const vfloat4 tailRow7  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 7]))->tfar);
+    const vfloat4 tailRow8  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 8]))->tfar);
+    const vfloat4 tailRow9  = vfloat4::loadu(&((Ray*)((char*)ptr + offset[ 9]))->tfar);
+    const vfloat4 tailRow10 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[10]))->tfar);
+    const vfloat4 tailRow11 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[11]))->tfar);
+    const vfloat4 tailRow12 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[12]))->tfar);
+    const vfloat4 tailRow13 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[13]))->tfar);
+    const vfloat4 tailRow14 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[14]))->tfar);
+    const vfloat4 tailRow15 = vfloat4::loadu(&((Ray*)((char*)ptr + offset[15]))->tfar);
+
+    vfloat16 maskBits, idBits, flagBits;
+    transpose(tailRow0,tailRow1,tailRow2,tailRow3,tailRow4,tailRow5,tailRow6,tailRow7,tailRow8,tailRow9,tailRow10,tailRow11,tailRow12,tailRow13,tailRow14,tailRow15,
+              gathered.tfar, maskBits, idBits, flagBits);
+    gathered.mask  = asInt(maskBits);
+    gathered.id    = asInt(idBits);
+    gathered.flags = asInt(flagBits);
+
+    return gathered;
+  }
+#endif
+
+
+  struct RayStreamAOP
+  { /*! Ray stream given as an array of Ray pointers; rays are selected by index into that array, and hit write-back treats the pointee as RayHit. */
+    __forceinline RayStreamAOP(void* rays)
+      : ptr((Ray**)rays) {}
+    /*! returns the ray that pointer number 'index' refers to */
+    __forceinline Ray& getRayByIndex(size_t index)
+    {
+      return *ptr[index];
+    }
+    /*! gathers K rays from K indices; only declared here, the definitions are explicit specializations per width */
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vint<K>& index);
+    /*! masked gather: lanes outside 'valid' are redirected to index 0 before the unmasked gather runs */
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index)
+    {
+      const vint<K> valid_index = select(valid, index, vint<K>(zero)); // zero of width K, so all select operands agree even when K differs from the native vintx width
+      return getRayByIndex(valid_index);
+    }
+    /*! writes tfar, Ng, u, v, primID, geomID and instID back for every lane of 'valid_i' whose geomID is not RTC_INVALID_GEOMETRY_ID */
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]];
+
+          ray_k->tfar = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+          instance_id_stack::copy(ray.instID, ray_k->instID, k); // presumably copies lane k of the instance ID stack -- helper defined elsewhere
+        }
+      }
+    }
+    /*! writes only tfar back, for every lane of 'valid_i' whose tfar is negative */
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          Ray* __restrict__ ray_k = ptr[index[k]];
+
+          ray_k->tfar = ray.tfar[k];
+        }
+      }
+    }
+    /*! array of ray pointers that all indices refer to */
+    Ray** __restrict__ ptr;
+  };
+
+  template<>
+  __forceinline Ray4 RayStreamAOP::getRayByIndex(const vint4& index)
+  {
+    Ray4 gathered;
+
+    /* one 4-float row per ray starting at org, transposed into org.x, org.y, org.z, tnear */
+    const vfloat4 orgRow0 = vfloat4::loadu(&ptr[index[0]]->org);
+    const vfloat4 orgRow1 = vfloat4::loadu(&ptr[index[1]]->org);
+    const vfloat4 orgRow2 = vfloat4::loadu(&ptr[index[2]]->org);
+    const vfloat4 orgRow3 = vfloat4::loadu(&ptr[index[3]]->org);
+
+    transpose(orgRow0,orgRow1,orgRow2,orgRow3, gathered.org.x, gathered.org.y, gathered.org.z, gathered.tnear());
+
+    /* one 4-float row per ray starting at dir, transposed into dir.x, dir.y, dir.z, time */
+    const vfloat4 dirRow0 = vfloat4::loadu(&ptr[index[0]]->dir);
+    const vfloat4 dirRow1 = vfloat4::loadu(&ptr[index[1]]->dir);
+    const vfloat4 dirRow2 = vfloat4::loadu(&ptr[index[2]]->dir);
+    const vfloat4 dirRow3 = vfloat4::loadu(&ptr[index[3]]->dir);
+
+    transpose(dirRow0,dirRow1,dirRow2,dirRow3, gathered.dir.x, gathered.dir.y, gathered.dir.z, gathered.time());
+
+    /* one 4-float row per ray starting at tfar, transposed into tfar, mask, id, flags */
+    const vfloat4 tailRow0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 tailRow1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 tailRow2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 tailRow3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+
+    vfloat4 maskBits, idBits, flagBits;
+    transpose(tailRow0,tailRow1,tailRow2,tailRow3, gathered.tfar, maskBits, idBits, flagBits);
+    gathered.mask  = asInt(maskBits);
+    gathered.id    = asInt(idBits);
+    gathered.flags = asInt(flagBits);
+
+    return gathered;
+  }
+
+#if defined(__AVX__)
+  template<>
+  __forceinline Ray8 RayStreamAOP::getRayByIndex(const vint8& index)
+  {
+    Ray8 gathered;
+
+    /* one 8-float row per ray starting at org, transposed into org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 orgDir0 = vfloat8::loadu(&ptr[index[0]]->org);
+    const vfloat8 orgDir1 = vfloat8::loadu(&ptr[index[1]]->org);
+    const vfloat8 orgDir2 = vfloat8::loadu(&ptr[index[2]]->org);
+    const vfloat8 orgDir3 = vfloat8::loadu(&ptr[index[3]]->org);
+    const vfloat8 orgDir4 = vfloat8::loadu(&ptr[index[4]]->org);
+    const vfloat8 orgDir5 = vfloat8::loadu(&ptr[index[5]]->org);
+    const vfloat8 orgDir6 = vfloat8::loadu(&ptr[index[6]]->org);
+    const vfloat8 orgDir7 = vfloat8::loadu(&ptr[index[7]]->org);
+
+    transpose(orgDir0,orgDir1,orgDir2,orgDir3,orgDir4,orgDir5,orgDir6,orgDir7, gathered.org.x, gathered.org.y, gathered.org.z, gathered.tnear(), gathered.dir.x, gathered.dir.y, gathered.dir.z, gathered.time());
+
+    /* one 4-float row per ray starting at tfar, transposed into tfar, mask, id, flags */
+    const vfloat4 tailRow0 = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 tailRow1 = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 tailRow2 = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 tailRow3 = vfloat4::loadu(&ptr[index[3]]->tfar);
+    const vfloat4 tailRow4 = vfloat4::loadu(&ptr[index[4]]->tfar);
+    const vfloat4 tailRow5 = vfloat4::loadu(&ptr[index[5]]->tfar);
+    const vfloat4 tailRow6 = vfloat4::loadu(&ptr[index[6]]->tfar);
+    const vfloat4 tailRow7 = vfloat4::loadu(&ptr[index[7]]->tfar);
+
+    vfloat8 maskBits, idBits, flagBits;
+    transpose(tailRow0,tailRow1,tailRow2,tailRow3,tailRow4,tailRow5,tailRow6,tailRow7, gathered.tfar, maskBits, idBits, flagBits);
+    gathered.mask  = asInt(maskBits);
+    gathered.id    = asInt(idBits);
+    gathered.flags = asInt(flagBits);
+
+    return gathered;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<>
+  __forceinline Ray16 RayStreamAOP::getRayByIndex(const vint16& index)
+  {
+    Ray16 gathered;
+
+    /* one 8-float row per ray starting at org, transposed into org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    const vfloat8 orgDir0  = vfloat8::loadu(&ptr[index[0]]->org);
+    const vfloat8 orgDir1  = vfloat8::loadu(&ptr[index[1]]->org);
+    const vfloat8 orgDir2  = vfloat8::loadu(&ptr[index[2]]->org);
+    const vfloat8 orgDir3  = vfloat8::loadu(&ptr[index[3]]->org);
+    const vfloat8 orgDir4  = vfloat8::loadu(&ptr[index[4]]->org);
+    const vfloat8 orgDir5  = vfloat8::loadu(&ptr[index[5]]->org);
+    const vfloat8 orgDir6  = vfloat8::loadu(&ptr[index[6]]->org);
+    const vfloat8 orgDir7  = vfloat8::loadu(&ptr[index[7]]->org);
+    const vfloat8 orgDir8  = vfloat8::loadu(&ptr[index[8]]->org);
+    const vfloat8 orgDir9  = vfloat8::loadu(&ptr[index[9]]->org);
+    const vfloat8 orgDir10 = vfloat8::loadu(&ptr[index[10]]->org);
+    const vfloat8 orgDir11 = vfloat8::loadu(&ptr[index[11]]->org);
+    const vfloat8 orgDir12 = vfloat8::loadu(&ptr[index[12]]->org);
+    const vfloat8 orgDir13 = vfloat8::loadu(&ptr[index[13]]->org);
+    const vfloat8 orgDir14 = vfloat8::loadu(&ptr[index[14]]->org);
+    const vfloat8 orgDir15 = vfloat8::loadu(&ptr[index[15]]->org);
+
+    transpose(orgDir0,orgDir1,orgDir2,orgDir3,orgDir4,orgDir5,orgDir6,orgDir7,orgDir8,orgDir9,orgDir10,orgDir11,orgDir12,orgDir13,orgDir14,orgDir15,
+              gathered.org.x, gathered.org.y, gathered.org.z, gathered.tnear(), gathered.dir.x, gathered.dir.y, gathered.dir.z, gathered.time());
+
+    /* one 4-float row per ray starting at tfar, transposed into tfar, mask, id, flags */
+    const vfloat4 tailRow0  = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 tailRow1  = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 tailRow2  = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 tailRow3  = vfloat4::loadu(&ptr[index[3]]->tfar);
+    const vfloat4 tailRow4  = vfloat4::loadu(&ptr[index[4]]->tfar);
+    const vfloat4 tailRow5  = vfloat4::loadu(&ptr[index[5]]->tfar);
+    const vfloat4 tailRow6  = vfloat4::loadu(&ptr[index[6]]->tfar);
+    const vfloat4 tailRow7  = vfloat4::loadu(&ptr[index[7]]->tfar);
+    const vfloat4 tailRow8  = vfloat4::loadu(&ptr[index[8]]->tfar);
+    const vfloat4 tailRow9  = vfloat4::loadu(&ptr[index[9]]->tfar);
+    const vfloat4 tailRow10 = vfloat4::loadu(&ptr[index[10]]->tfar);
+    const vfloat4 tailRow11 = vfloat4::loadu(&ptr[index[11]]->tfar);
+    const vfloat4 tailRow12 = vfloat4::loadu(&ptr[index[12]]->tfar);
+    const vfloat4 tailRow13 = vfloat4::loadu(&ptr[index[13]]->tfar);
+    const vfloat4 tailRow14 = vfloat4::loadu(&ptr[index[14]]->tfar);
+    const vfloat4 tailRow15 = vfloat4::loadu(&ptr[index[15]]->tfar);
+
+    vfloat16 maskBits, idBits, flagBits;
+    transpose(tailRow0,tailRow1,tailRow2,tailRow3,tailRow4,tailRow5,tailRow6,tailRow7,tailRow8,tailRow9,tailRow10,tailRow11,tailRow12,tailRow13,tailRow14,tailRow15,
+              gathered.tfar, maskBits, idBits, flagBits);
+
+    gathered.mask  = asInt(maskBits);
+    gathered.id    = asInt(idBits);
+    gathered.flags = asInt(flagBits);
+
+    return gathered;
+  }
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
new file mode 100644
index 0000000000..625fbf6d4f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
@@ -0,0 +1,1799 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "../../include/embree3/rtcore_ray.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+using namespace embree;
+
+RTC_NAMESPACE_BEGIN;
+
+  /* global mutex taken by the device entry points below and by rtcSetSceneProgressMonitorFunction; aarch64 iOS builds use std::mutex, all other builds MutexSys */
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    static std::mutex g_mutex;
+#else
+    static MutexSys g_mutex;
+#endif
+
+  RTC_API RTCDevice rtcNewDevice(const char* config)
+  { /* constructs a Device from the config string under g_mutex and returns it as an opaque handle */
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewDevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    Device* device = new Device(config);
+    return (RTCDevice) device->refInc(); // the returned handle carries one reference
+    RTC_CATCH_END(nullptr); // no device is available to the error handler here
+    return (RTCDevice) nullptr; // presumably reached only after RTC_CATCH_END handled an error -- macro defined elsewhere
+  }
+
+  RTC_API void rtcRetainDevice(RTCDevice hdevice) 
+  { /* calls refInc on the device behind hdevice while holding g_mutex */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainDevice);
+    RTC_VERIFY_HANDLE(hdevice); // a NULL handle is rejected before the device is dereferenced
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    device->refInc();
+    RTC_CATCH_END(nullptr); // NOTE(review): nullptr is passed here although 'device' is in scope -- confirm this is intended
+  }
+  
+  RTC_API void rtcReleaseDevice(RTCDevice hdevice) 
+  { /* calls refDec on the device behind hdevice while holding g_mutex */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseDevice);
+    RTC_VERIFY_HANDLE(hdevice); // a NULL handle is rejected before the device is dereferenced
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    device->refDec();
+    RTC_CATCH_END(nullptr); // nullptr rather than 'device': presumably because the device may no longer exist after refDec -- confirm
+  }
+  
+  RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop)
+  { /* returns Device::getProperty(prop) for the device behind hdevice, read while holding g_mutex */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceProperty);
+    RTC_VERIFY_HANDLE(hdevice); // a NULL handle is rejected before the device is dereferenced
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    return device->getProperty(prop);
+    RTC_CATCH_END(device);
+    return 0; // presumably reached only after RTC_CATCH_END handled an error -- macro defined elsewhere
+  }
+
+  RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val)
+  { /* forwards a property write to Device::setProperty while holding g_mutex */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceProperty);
+    const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004; // property ids 1000000..1000003
+    if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    device->setProperty(prop,val); // NOTE(review): still invoked when device is NULL for the internal ids; presumably setProperty touches no instance state for them -- confirm
+    RTC_CATCH_END(device);
+  }
+
+  RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice)
+  { /* returns the error code of the given device, or the thread error code when hdevice is NULL; takes no lock */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceError);
+    if (device == nullptr) return Device::getThreadErrorCode(); // a NULL handle is allowed here, so the handle is not verified
+    else                   return device->getDeviceErrorCode();
+    RTC_CATCH_END(device);
+    return RTC_ERROR_UNKNOWN; // presumably reached only after RTC_CATCH_END handled an error -- macro defined elsewhere
+  }
+
+  RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr)
+  { /* hands the error callback and its user pointer to Device::setErrorFunction; g_mutex is not taken */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceErrorFunction);
+    RTC_VERIFY_HANDLE(hdevice); // a NULL handle is rejected before the device is dereferenced
+    device->setErrorFunction(error, userPtr);
+    RTC_CATCH_END(device);
+  }
+
+  RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr)
+  { /* hands the memory monitor callback and its user pointer to Device::setMemoryMonitorFunction; g_mutex is not taken */
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceMemoryMonitorFunction);
+    RTC_VERIFY_HANDLE(hdevice); // reject a NULL handle before it is dereferenced, as rtcSetDeviceErrorFunction does
+    device->setMemoryMonitorFunction(memoryMonitor, userPtr);
+    RTC_CATCH_END(device);
+  }
+  RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize)
+  { /* constructs a Buffer of byteSize bytes on the device and returns it as an opaque handle */
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewBuffer);
+    RTC_VERIFY_HANDLE(hdevice); // a NULL handle is rejected before the Buffer is constructed
+    Buffer* buffer = new Buffer((Device*)hdevice, byteSize);
+    return (RTCBuffer)buffer->refInc(); // the returned handle carries one reference
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr; // presumably reached only after RTC_CATCH_END handled an error -- macro defined elsewhere
+  }
+
+  RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize)
+  { /* like rtcNewBuffer, but also passes 'ptr' to the Buffer constructor (presumably caller-owned storage the buffer uses instead of allocating -- confirm in Buffer) */
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewSharedBuffer);
+    RTC_VERIFY_HANDLE(hdevice); // a NULL handle is rejected before the Buffer is constructed
+    Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr);
+    return (RTCBuffer)buffer->refInc(); // the returned handle carries one reference
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr; // presumably reached only after RTC_CATCH_END handled an error -- macro defined elsewhere
+  }
+
+  RTC_API void* rtcGetBufferData(RTCBuffer hbuffer)
+  { /* returns Buffer::data() of the buffer behind hbuffer */
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetBufferData);
+    RTC_VERIFY_HANDLE(hbuffer); // a NULL handle is rejected before the buffer is dereferenced
+    return buffer->data();
+    RTC_CATCH_END2(buffer);
+    return nullptr; // presumably reached only after RTC_CATCH_END2 handled an error -- macro defined elsewhere
+  }
+
+  RTC_API void rtcRetainBuffer(RTCBuffer hbuffer)
+  { /* calls refInc on the buffer behind hbuffer; g_mutex is not taken */
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainBuffer);
+    RTC_VERIFY_HANDLE(hbuffer); // a NULL handle is rejected before the buffer is dereferenced
+    buffer->refInc();
+    RTC_CATCH_END2(buffer);
+  }
+  
+  RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer)
+  { /* calls refDec on the buffer behind hbuffer; g_mutex is not taken */
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseBuffer);
+    RTC_VERIFY_HANDLE(hbuffer); // a NULL handle is rejected before the buffer is dereferenced
+    buffer->refDec();
+    RTC_CATCH_END2(buffer); // NOTE(review): 'buffer' is handed to the handler after refDec -- confirm it cannot be used once released
+  }
+
+  RTC_API RTCScene rtcNewScene (RTCDevice hdevice) 
+  { /* constructs a Scene on the given device and returns it as an opaque handle */
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewScene);
+    RTC_VERIFY_HANDLE(hdevice); // a NULL handle is rejected before the Scene is constructed
+    Scene* scene = new Scene((Device*)hdevice);
+    return (RTCScene) scene->refInc(); // the returned handle carries one reference
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr; // presumably reached only after RTC_CATCH_END handled an error -- macro defined elsewhere
+  }
+
+  RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene)
+  { /* returns the device the scene belongs to, after adding a reference to it */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneDevice);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    return (RTCDevice)scene->device->refInc(); // user will own one additional device reference
+    RTC_CATCH_END2(scene);
+    return (RTCDevice)nullptr; // presumably reached only after RTC_CATCH_END2 handled an error -- macro defined elsewhere
+  }
+
+  RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) 
+  { /* hands the progress callback and its user pointer to Scene::setProgressMonitorFunction while holding g_mutex */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneProgressMonitorFunction);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    scene->setProgressMonitorFunction(progress,ptr);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) 
+  { /* forwards quality to Scene::setBuildQuality; any value other than LOW, MEDIUM or HIGH aborts the process */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneBuildQuality);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    if (quality != RTC_BUILD_QUALITY_LOW &&
+        quality != RTC_BUILD_QUALITY_MEDIUM &&
+        quality != RTC_BUILD_QUALITY_HIGH)
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort(); // the only statement governed by the brace-less if above
+      // -- GODOT end --
+    scene->setBuildQuality(quality);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) 
+  { /* forwards flags to Scene::setSceneFlags */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    scene->setSceneFlags(flags);
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene)
+  { /* returns Scene::getSceneFlags() of the scene behind hscene */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    return scene->getSceneFlags();
+    RTC_CATCH_END2(scene);
+    return RTC_SCENE_FLAG_NONE; // presumably reached only after RTC_CATCH_END2 handled an error -- macro defined elsewhere
+  }
+  
+  RTC_API void rtcCommitScene (RTCScene hscene) 
+  { /* commits the scene through Scene::commit with its bool argument set to false */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitScene);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    scene->commit(false); // false: presumably the non-joining commit, in contrast to rtcJoinCommitScene -- confirm in Scene::commit
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcJoinCommitScene (RTCScene hscene) 
+  { /* commits the scene through Scene::commit with its bool argument set to true */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcJoinCommitScene);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    scene->commit(true); // true: presumably lets the calling thread join a commit in progress -- confirm in Scene::commit
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o)
+  { /* copies the bounding box of a committed scene into *bounds_o and zeroes its two alignment members */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneBounds);
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    if (bounds_o == nullptr) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer"); // same guard and message as rtcGetSceneLinearBounds
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    BBox3fa bounds = scene->bounds.bounds();
+    bounds_o->lower_x = bounds.lower.x;
+    bounds_o->lower_y = bounds.lower.y;
+    bounds_o->lower_z = bounds.lower.z;
+    bounds_o->upper_x = bounds.upper.x;
+    bounds_o->upper_y = bounds.upper.y;
+    bounds_o->upper_z = bounds.upper.z;
+    bounds_o->align0  = bounds_o->align1 = 0; // both alignment members are always reported as zero
+    RTC_CATCH_END2(scene);
+  }
+
+  RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o)
+  { /* copies both boxes (bounds0, bounds1) of a committed scene's linear bounds into *bounds_o and zeroes the alignment members */
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneLinearBounds); // trace under this function's own name
+    RTC_VERIFY_HANDLE(hscene); // a NULL handle is rejected before the scene is dereferenced
+    if (bounds_o == nullptr)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
+    if (scene->isModified())
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    
+    bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x;
+    bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y;
+    bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z;
+    bounds_o->bounds0.align0  = 0;
+    bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x;
+    bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y;
+    bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z;
+    bounds_o->bounds0.align1  = 0;
+    bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x;
+    bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y;
+    bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z;
+    bounds_o->bounds1.align0  = 0;
+    bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x;
+    bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y;
+    bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z;
+    bounds_o->bounds1.align1  = 0;
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Collides two scenes by calling scene0->intersectors.collide with
+   *  both scenes, the user callback and userPtr.
+   *  DEBUG builds first verify both handles, that neither scene is marked
+   *  as modified, that both scenes share the same device, and that every
+   *  primitive of each scene is counted by
+   *  getNumPrimitives(Geometry::MTY_USER_GEOMETRY,false). */
+  RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr)
+  {
+    Scene* scene0 = (Scene*) hscene0;
+    Scene* scene1 = (Scene*) hscene1;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCollide);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene0);
+    RTC_VERIFY_HANDLE(hscene1);
+    if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices");
+    auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    /* it is an error if EITHER scene holds primitives other than the counted user geometries */
+    if (scene0->numPrimitives() != nUserPrims0 || scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep");
+#endif
+    scene0->intersectors.collide(scene0,scene1,callback,userPtr);
+    RTC_CATCH_END(scene0->device);
+  }
+  
+  /*! Runs a single point query on the scene and returns the result of
+   *  scene->intersectors.pointQuery.
+   *  If the user context has an empty instance stack the query is passed
+   *  through unchanged as a spherical query with scale 1. Otherwise the
+   *  query point is transformed by the top-most world2inst matrix of the
+   *  stack and the radius is multiplied by the similarity scale of that
+   *  matrix; the query type is a sphere when the matrix is a similarity
+   *  transform and an AABB when it is not. */
+  inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    /* empty instance stack: query the scene directly */
+    if (!(userContext->instStackSize > 0))
+    {
+      PointQuery* const queryWorld = (PointQuery*)query;
+      PointQueryContext contextWorld(scene, queryWorld,
+        POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr);
+      const bool changedWorld = scene->intersectors.pointQuery(queryWorld, &contextWorld);
+      return changedWorld;
+    }
+
+    /* transformation stored at the top of the instance stack */
+    const AffineSpace3fa world2inst = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+
+    /* scale stays 0 when the transformation is not a similarity transform */
+    float scale = 0.f;
+    const bool isSimilarity = similarityTransform(world2inst, &scale);
+    assert((isSimilarity && scale > 0) || (!isSimilarity && scale == 0.f));
+
+    /* query expressed in the transformed space */
+    PointQuery queryInst;
+    queryInst.p = xfmPoint(world2inst, Vec3fa(query->x, query->y, query->z));
+    queryInst.radius = query->radius * scale;
+    queryInst.time = query->time;
+
+    /* the context keeps the original (untransformed) query */
+    PointQueryContext contextInst(scene, (PointQuery*)query,
+      isSimilarity ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB,
+      queryFunc, userContext, scale, userPtr);
+    const bool changedInst = scene->intersectors.pointQuery((PointQuery*)&queryInst, &contextInst);
+    return changedInst;
+  }
+
+  /*! Public entry point for a single point query; forwards all arguments
+   *  to pointQuery() and returns its result.
+   *  DEBUG builds additionally verify the scene and context handles, that
+   *  the scene is not marked as modified, and that query and userContext
+   *  are 16-byte aligned. NOTE(review): RTC_CATCH_END2_FALSE presumably
+   *  returns false on a caught error - confirm against its definition. */
+  RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(userContext);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+    if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes");   
+#endif
+
+    return pointQuery(scene, query, userContext, queryFunc, userPtr);
+    RTC_CATCH_END2_FALSE(scene);
+  }
+  
+  /*! Point query for a packet of 4 queries. Each lane whose valid entry is
+   *  non-zero is extracted into a single PointQuery, run through
+   *  pointQuery() and written back into the packet; the return value is
+   *  the OR of the per-lane results. userPtrN may be null, in which case
+   *  NULL is passed as user pointer for every lane.
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 16-byte alignment of valid and query. */
+  RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* statistics: count the lanes whose valid entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery4* query4 = (PointQuery4*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      query4->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query4->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+  
+  /*! Point query for a packet of 8 queries. Each lane whose valid entry is
+   *  non-zero is extracted into a single PointQuery, run through
+   *  pointQuery() and written back into the packet; the return value is
+   *  the OR of the per-lane results. userPtrN may be null, in which case
+   *  NULL is passed as user pointer for every lane.
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 16-byte alignment of valid and query. */
+  RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery8);
+    
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* statistics: count the active lanes over all 8 entries of the mask */
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery8* query8 = (PointQuery8*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      query8->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query8->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  /*! Point query for a packet of 16 queries. Each lane whose valid entry
+   *  is non-zero is extracted into a single PointQuery, run through
+   *  pointQuery() and written back into the packet; the return value is
+   *  the OR of the per-lane results. userPtrN may be null, in which case
+   *  NULL is passed as user pointer for every lane.
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 16-byte alignment of valid and query. */
+  RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* statistics: count the active lanes over all 16 entries of the mask */
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery16* query16 = (PointQuery16*)query;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      /* one fresh single query per active lane */
+      PointQuery query1; query16->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query16->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  /*! Intersects a single ray with the scene: builds an IntersectContext
+   *  from the scene and the user context and calls
+   *  scene->intersectors.intersect on the ray/hit structure.
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, that rayhit is 16-byte aligned, and call
+   *  verifyHit() on the result. */
+  RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    STAT3(normal.travs,1,1,1);
+    IntersectContext context(scene,user_context);
+    scene->intersectors.intersect(*rayhit,&context);
+#if defined(DEBUG)
+    ((RayHit*)rayhit)->verifyHit();
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Intersects a packet of 4 rays with the scene.
+   *  Without EMBREE_RAY_PACKETS every lane with a non-zero valid entry is
+   *  extracted, intersected as a single ray and written back; otherwise
+   *  the whole packet is passed to scene->intersectors.intersect4.
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 16-byte alignment of valid and rayhit. */
+  RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)rayhit)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes");   
+#endif
+    /* statistics: count the lanes whose valid entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet support compiled in: trace the active lanes one by one */
+    RayHit4* rayhit4 = (RayHit4*)rayhit;
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; rayhit4->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      rayhit4->set(i,ray1);
+    }
+#else
+    scene->intersectors.intersect4(valid,*rayhit,&context);
+#endif
+    
+    RTC_CATCH_END2(scene);
+  }
+  
+  /*! Intersects a packet of 8 rays with the scene.
+   *  Without EMBREE_RAY_PACKETS every lane with a non-zero valid entry is
+   *  extracted, intersected as a single ray and written back. With packet
+   *  support, intersect8 is used when scene->intersectors.intersector8 is
+   *  set; otherwise the packet is handed to rayStreamFilters.intersectSOA
+   *  as one packet of width 8 with stride sizeof(RTCRayHit8).
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 32-byte alignment of valid and rayhit. */
+  RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect8);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");   
+    if (((size_t)rayhit)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes");   
+#endif
+    /* statistics: count the lanes whose valid entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet support compiled in: trace the active lanes one by one */
+    RayHit8* rayhit8 = (RayHit8*) rayhit;
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; rayhit8->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      rayhit8->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector8))
+      scene->intersectors.intersect8(valid,*rayhit,&context);
+    else
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context);
+#endif
+    RTC_CATCH_END2(scene);
+  }
+  
+  /*! Intersects a packet of 16 rays with the scene.
+   *  Without EMBREE_RAY_PACKETS every lane with a non-zero valid entry is
+   *  extracted, intersected as a single ray and written back. With packet
+   *  support, intersect16 is used when scene->intersectors.intersector16
+   *  is set; otherwise the packet is handed to
+   *  rayStreamFilters.intersectSOA as one packet of width 16 with stride
+   *  sizeof(RTCRayHit16).
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 64-byte alignment of valid and rayhit. */
+  RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");   
+    if (((size_t)rayhit)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes");   
+#endif
+    /* statistics: count the lanes whose valid entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet support compiled in: trace the active lanes one by one */
+    RayHit16* rayhit16 = (RayHit16*) rayhit;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; rayhit16->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      rayhit16->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector16))
+      scene->intersectors.intersect16(valid,*rayhit,&context);
+    else
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context);
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Intersects a stream of M single rays, stored byteStride bytes apart,
+   *  with the scene. For M == 1 the ray is intersected directly, and only
+   *  if tnear <= tfar; otherwise the stream is passed to
+   *  rayStreamFilters.intersectAOS.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, that the scene is not marked as modified, and 4-byte
+   *  alignment of rayhit. */
+  RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1M);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for single rays */
+    if (likely(M == 1)) {
+      if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) 
+        scene->intersectors.intersect(*rayhit,&context);
+    } 
+
+    /* codepath for streams */
+    else {
+      scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context);   
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Intersects a stream of M single rays, given as an array of pointers
+   *  rn, with the scene. For M == 1 the ray rn[0] is intersected directly,
+   *  and only if tnear <= tfar; otherwise the pointer array is passed to
+   *  rayStreamFilters.intersectAOP.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, that the scene is not marked as modified, and 4-byte
+   *  alignment of the pointer array rn. */
+  RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1Mp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for single rays */
+    if (likely(M == 1)) {
+      if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) 
+        scene->intersectors.intersect(*rn[0],&context);
+    } 
+
+    /* codepath for streams */
+    else {
+      scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Intersects a stream of M ray packets of width N with the scene.
+   *  N == 1 and M == 1: the data is treated as one RTCRayHit and
+   *  intersected directly, and only if tnear <= tfar.
+   *  N == 1 and M != 1: the data is treated as a stream of RTCRayHit and
+   *  passed to rayStreamFilters.intersectAOS.
+   *  N != 1: the data is passed to rayStreamFilters.intersectSOA.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, that the scene is not marked as modified, and 4-byte
+   *  alignment of rayhit. */
+  RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,N*M,N*M,N*M);
+    IntersectContext context(scene,user_context);
+
+    /* code path for single ray streams */
+    if (likely(N == 1))
+    {
+      /* fast code path for streams of size 1 */
+      if (likely(M == 1)) {
+        if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar))
+          scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context);
+      } 
+      /* normal codepath for single ray streams */
+      else {
+        scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context);
+      }
+    }
+    /* code path for ray packet streams */
+    else {
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Intersects N rays, given as a structure of per-component pointers,
+   *  with the scene by passing them to rayStreamFilters.intersectSOP.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, that the scene is not marked as modified, and that each
+   *  checked component pointer is 4-byte aligned; the error message names
+   *  the offending component. */
+  RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes");   
+    /* messages below name the component that is actually tested */
+    if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tfar not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes");   
+    if (((size_t)rayhit->ray.mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_x  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_y  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.Ng_z  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.u     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.v     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes");   
+    if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+  
+  /*! Occlusion test for a single ray: builds an IntersectContext from the
+   *  scene and the user context and calls scene->intersectors.occluded on
+   *  the ray.
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and that ray is 16-byte aligned. */
+  RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1);
+    STAT3(shadow.travs,1,1,1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    IntersectContext context(scene,user_context);
+    scene->intersectors.occluded(*ray,&context);
+    RTC_CATCH_END2(scene);
+  }
+  
+  /*! Occlusion test for a packet of 4 rays.
+   *  Without EMBREE_RAY_PACKETS every lane with a non-zero valid entry is
+   *  extracted, tested as a single ray and written back; otherwise the
+   *  whole packet is passed to scene->intersectors.occluded4.
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 16-byte alignment of valid and ray. */
+  RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)ray)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    /* statistics: count the lanes whose valid entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet support compiled in: test the active lanes one by one */
+    Ray4* ray4 = (Ray4*) ray;
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      Ray ray1; ray4->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray4->set(i,ray1);
+    }
+#else
+    scene->intersectors.occluded4(valid,*ray,&context);
+#endif
+    
+    RTC_CATCH_END2(scene);
+  }
+ 
+  /*! Occlusion test for a packet of 8 rays.
+   *  Without EMBREE_RAY_PACKETS every lane with a non-zero valid entry is
+   *  extracted, tested as a single ray and written back. With packet
+   *  support, occluded8 is used when scene->intersectors.intersector8 is
+   *  set; otherwise the packet is handed to rayStreamFilters.occludedSOA
+   *  as one packet of width 8 with stride sizeof(RTCRay8).
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 32-byte alignment of valid and ray. */
+  RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded8);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");   
+    if (((size_t)ray)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes");   
+#endif
+    /* statistics: count the lanes whose valid entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet support compiled in: test the active lanes one by one */
+    Ray8* ray8 = (Ray8*) ray;
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      Ray ray1; ray8->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray8->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector8))
+      scene->intersectors.occluded8(valid,*ray,&context);
+    else
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  /*! Occlusion test for a packet of 16 rays.
+   *  Without EMBREE_RAY_PACKETS every lane with a non-zero valid entry is
+   *  extracted, tested as a single ray and written back. With packet
+   *  support, occluded16 is used when scene->intersectors.intersector16 is
+   *  set; otherwise the packet is handed to rayStreamFilters.occludedSOA
+   *  as one packet of width 16 with stride sizeof(RTCRay16).
+   *  DEBUG builds additionally verify the scene handle, that the scene is
+   *  not marked as modified, and 64-byte alignment of valid and ray. */
+  RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");   
+    if (((size_t)ray)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes");   
+#endif
+    /* statistics: count the lanes whose valid entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet support compiled in: test the active lanes one by one */
+    Ray16* ray16 = (Ray16*) ray;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      Ray ray1; ray16->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray16->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector16))
+      scene->intersectors.occluded16(valid,*ray,&context);
+    else
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  /*! Occlusion test for a stream of M single rays stored byteStride bytes
+   *  apart. For M == 1 the ray is tested directly, and only if
+   *  tnear <= tfar; otherwise the stream is passed to
+   *  rayStreamFilters.occludedAOS.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, that the scene is not marked as modified, and 4-byte
+   *  alignment of ray. */
+  RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1M);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+    /* fast codepath for streams of size 1 */
+    if (likely(M == 1)) {
+      if (likely(ray->tnear <= ray->tfar)) 
+        scene->intersectors.occluded (*ray,&context);
+    } 
+    /* codepath for normal streams */
+    else {
+      scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Occlusion test for a stream of M single rays given as an array of
+   *  pointers. For M == 1 the ray ray[0] is tested directly, and only if
+   *  tnear <= tfar; otherwise the pointer array is passed to
+   *  rayStreamFilters.occludedAOP.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, that the scene is not marked as modified, and 4-byte
+   *  alignment of the pointer array. */
+  RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1Mp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for streams of size 1 */
+    if (likely(M == 1)) {
+      if (likely(ray[0]->tnear <= ray[0]->tfar)) 
+        scene->intersectors.occluded (*ray[0],&context);
+    } 
+    /* codepath for normal streams */
+    else {
+      scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Occlusion test for a stream of M ray packets of width N.
+   *  N == 1 and M == 1: the data is treated as one RTCRay and tested
+   *  directly, and only if tnear <= tfar.
+   *  N == 1 and M != 1: the data is treated as a stream of RTCRay and
+   *  passed to rayStreamFilters.occludedAOS.
+   *  N != 1: the data is passed to rayStreamFilters.occludedSOA.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, a minimum byteStride, that the scene is not marked as
+   *  modified, and 4-byte alignment of ray. */
+  RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    /* NOTE(review): the stride is compared against sizeof(RTCRayHit) although
+       this function operates on rays only - confirm whether sizeof(RTCRay) is meant */
+    if (byteStride < sizeof(RTCRayHit)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small");
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    /* N*M rays in all three statistics slots, matching rtcIntersectNM */
+    STAT3(shadow.travs,N*M,N*M,N*M);
+    IntersectContext context(scene,user_context);
+
+    /* codepath for single rays */
+    if (likely(N == 1))
+    {
+      /* fast path for streams of size 1 */
+      if (likely(M == 1)) {
+        if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar))
+          scene->intersectors.occluded (*(RTCRay*)ray,&context);
+      } 
+      /* codepath for normal ray streams */
+      else {
+        scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context);
+      }
+    }
+    /* code path for ray packet streams */
+    else {
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Occlusion test for N rays, given as a structure of per-component
+   *  pointers, by passing them to rayStreamFilters.occludedSOP.
+   *  Without EMBREE_RAY_PACKETS the function only signals
+   *  RTC_ERROR_INVALID_OPERATION. DEBUG builds additionally verify the
+   *  scene handle, that the scene is not marked as modified, and that each
+   *  checked component pointer is 4-byte aligned; the error message names
+   *  the offending component. */
+  RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes");   
+    if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes");   
+    if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes");   
+    if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");   
+    if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes");   
+    if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes");   
+    /* messages below name the component that is actually tested */
+    if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes");   
+    if (((size_t)ray->tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tfar not aligned to 4 bytes");   
+    if (((size_t)ray->time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes");   
+    if (((size_t)ray->mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes");   
+#endif
+    STAT3(shadow.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Increments the reference count of the scene by calling refInc() after
+   *  the handle has been checked with RTC_VERIFY_HANDLE. */
+  RTC_API void rtcRetainScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->refInc();
+    RTC_CATCH_END2(scene);
+  }
+  
+  /*! Decrements the reference count of the scene by calling refDec() after
+   *  the handle has been checked with RTC_VERIFY_HANDLE.
+   *  NOTE(review): refDec() presumably destroys the scene when the count
+   *  drops to zero - confirm against RefCount. */
+  RTC_API void rtcReleaseScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->refDec();
+    RTC_CATCH_END2(scene);
+  }
+
+  /*! Sets the scene instanced by a geometry: wraps the scene handle in a
+   *  Ref<Scene> and passes it to geometry->setInstancedScene. Both handles
+   *  are checked with RTC_VERIFY_HANDLE first; the geometry is the object
+   *  handed to RTC_CATCH_END2. */
+  RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    Ref<Scene> scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryInstancedScene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    geometry->setInstancedScene(scene);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /*! Builds an AffineSpace3fa from a float array in the given layout.
+   *  Supported formats are RTC_FORMAT_FLOAT3X4_ROW_MAJOR (12 floats read),
+   *  RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR (12 floats read) and
+   *  RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR (indices 3, 7, 11 and 15 are not
+   *  read). Any other format signals RTC_ERROR_INVALID_OPERATION through
+   *  throw_RTCError. */
+  AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
+  {
+    switch (format)
+    {
+    case RTC_FORMAT_FLOAT3X4_ROW_MAJOR:
+    {
+      /* columns are strided by 4 floats */
+      const Vec3fa vx(xfm[ 0], xfm[ 4], xfm[ 8]);
+      const Vec3fa vy(xfm[ 1], xfm[ 5], xfm[ 9]);
+      const Vec3fa vz(xfm[ 2], xfm[ 6], xfm[10]);
+      const Vec3fa p (xfm[ 3], xfm[ 7], xfm[11]);
+      return AffineSpace3fa(vx, vy, vz, p);
+    }
+
+    case RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR:
+    {
+      /* four consecutive columns of 3 floats */
+      const Vec3fa vx(xfm[ 0], xfm[ 1], xfm[ 2]);
+      const Vec3fa vy(xfm[ 3], xfm[ 4], xfm[ 5]);
+      const Vec3fa vz(xfm[ 6], xfm[ 7], xfm[ 8]);
+      const Vec3fa p (xfm[ 9], xfm[10], xfm[11]);
+      return AffineSpace3fa(vx, vy, vz, p);
+    }
+
+    case RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR:
+    {
+      /* four columns of 4 floats, the last float of each column is skipped */
+      const Vec3fa vx(xfm[ 0], xfm[ 1], xfm[ 2]);
+      const Vec3fa vy(xfm[ 4], xfm[ 5], xfm[ 6]);
+      const Vec3fa vz(xfm[ 8], xfm[ 9], xfm[10]);
+      const Vec3fa p (xfm[12], xfm[13], xfm[14]);
+      return AffineSpace3fa(vx, vy, vz, p);
+    }
+
+    default: 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+      break;
+    }
+
+    /* only reached after the default case: same identity value the original returned */
+    AffineSpace3fa identity = one;
+    return identity;
+  }
+
+  /*! Writes an affine transformation to a float array in the given layout.
+   *  RTC_FORMAT_FLOAT3X4_ROW_MAJOR and RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR
+   *  write 12 floats; RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR writes 16 floats
+   *  with (0,0,0,1) in indices 3, 7, 11 and 15. Any other format signals
+   *  RTC_ERROR_INVALID_OPERATION through throw_RTCError. */
+  void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm)
+  {
+    if (format == RTC_FORMAT_FLOAT3X4_ROW_MAJOR)
+    {
+      /* row 0 */
+      xfm[ 0] = space.l.vx.x;
+      xfm[ 1] = space.l.vy.x;
+      xfm[ 2] = space.l.vz.x;
+      xfm[ 3] = space.p.x;
+      /* row 1 */
+      xfm[ 4] = space.l.vx.y;
+      xfm[ 5] = space.l.vy.y;
+      xfm[ 6] = space.l.vz.y;
+      xfm[ 7] = space.p.y;
+      /* row 2 */
+      xfm[ 8] = space.l.vx.z;
+      xfm[ 9] = space.l.vy.z;
+      xfm[10] = space.l.vz.z;
+      xfm[11] = space.p.z;
+    }
+    else if (format == RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR)
+    {
+      /* column vx */
+      xfm[ 0] = space.l.vx.x;
+      xfm[ 1] = space.l.vx.y;
+      xfm[ 2] = space.l.vx.z;
+      /* column vy */
+      xfm[ 3] = space.l.vy.x;
+      xfm[ 4] = space.l.vy.y;
+      xfm[ 5] = space.l.vy.z;
+      /* column vz */
+      xfm[ 6] = space.l.vz.x;
+      xfm[ 7] = space.l.vz.y;
+      xfm[ 8] = space.l.vz.z;
+      /* column p */
+      xfm[ 9] = space.p.x;
+      xfm[10] = space.p.y;
+      xfm[11] = space.p.z;
+    }
+    else if (format == RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR)
+    {
+      /* column vx, padded with 0 */
+      xfm[ 0] = space.l.vx.x;
+      xfm[ 1] = space.l.vx.y;
+      xfm[ 2] = space.l.vx.z;
+      xfm[ 3] = 0.f;
+      /* column vy, padded with 0 */
+      xfm[ 4] = space.l.vy.x;
+      xfm[ 5] = space.l.vy.y;
+      xfm[ 6] = space.l.vy.z;
+      xfm[ 7] = 0.f;
+      /* column vz, padded with 0 */
+      xfm[ 8] = space.l.vz.x;
+      xfm[ 9] = space.l.vz.y;
+      xfm[10] = space.l.vz.z;
+      xfm[11] = 0.f;
+      /* column p, padded with 1 */
+      xfm[12] = space.p.x;
+      xfm[13] = space.p.y;
+      xfm[14] = space.p.z;
+      xfm[15] = 1.f;
+    }
+    else
+    {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+    }
+  }
+
+  /*! Decodes the user matrix 'xfm' (laid out as described by 'format') and
+   *  passes it to Geometry::setTransform for the given time step. Both the
+   *  geometry handle and the matrix pointer are validated first. */
+  RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransform);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(xfm);
+    const float* const matrix = (const float*)xfm;
+    const AffineSpace3fa decoded = loadTransform(format, matrix);
+    geom->setTransform(decoded, timeStep);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Packs a quaternion decomposition into an AffineSpace3fx and hands it to
+   *  Geometry::setQuaternionDecomposition for the given time step. The
+   *  quaternion is normalized before it is stored in the w components. */
+  RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransformQuaternion);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(qd);
+
+    /* normalized rotation quaternion */
+    const Quaternion3f rot = normalize(Quaternion3f(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k));
+
+    AffineSpace3fx packed;
+
+    /* vx: scale x, translation x/y, quaternion i */
+    packed.l.vx.x = qd->scale_x;
+    packed.l.vx.y = qd->translation_x;
+    packed.l.vx.z = qd->translation_y;
+    packed.l.vx.w = rot.i;
+
+    /* vy: skew xy, scale y, translation z, quaternion j */
+    packed.l.vy.x = qd->skew_xy;
+    packed.l.vy.y = qd->scale_y;
+    packed.l.vy.z = qd->translation_z;
+    packed.l.vy.w = rot.j;
+
+    /* vz: skew xz/yz, scale z, quaternion k */
+    packed.l.vz.x = qd->skew_xz;
+    packed.l.vz.y = qd->skew_yz;
+    packed.l.vz.z = qd->scale_z;
+    packed.l.vz.w = rot.k;
+
+    /* p: shift, quaternion real part */
+    packed.p.x = qd->shift_x;
+    packed.p.y = qd->shift_y;
+    packed.p.z = qd->shift_z;
+    packed.p.w = rot.r;
+
+    geom->setQuaternionDecomposition(packed, timeStep);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Queries Geometry::getTransform at 'time' and writes the result to 'xfm'
+   *  in the requested layout.
+   *  NOTE(review): neither 'hgeometry' nor 'xfm' is validated here, unlike in
+   *  rtcSetGeometryTransform - confirm this is intentional. */
+  RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryTransform);
+    float* const dst = (float*)xfm;
+    const AffineSpace3fa current = geom->getTransform(time);
+    storeTransform(current, format, dst);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Invokes the 'report' callback stored in the internal intersect
+   *  arguments, passing on the user supplied filter arguments. */
+  RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    /* the public argument struct is reinterpreted as the internal one */
+    IntersectFunctionNArguments* internalArgs = (IntersectFunctionNArguments*) args_i;
+    internalArgs->report(internalArgs,filter_args);
+  }
+
+  /*! Invokes the 'report' callback stored in the internal occluded
+   *  arguments, passing on the user supplied filter arguments. */
+  RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    /* the public argument struct is reinterpreted as the internal one */
+    OccludedFunctionNArguments* internalArgs = (OccludedFunctionNArguments*) args_i;
+    internalArgs->report(internalArgs,filter_args);
+  }
+  
+  /*! Creates a new geometry object of the requested type for the device.
+   *
+   *  Each case looks up a creation function through one of the SELECT_SYMBOL_*
+   *  macros (presumably choosing an ISA specific implementation from the CPU
+   *  feature mask - confirm against the macro definition), calls it, and
+   *  returns the new object with its reference count incremented.
+   *
+   *  When support for a geometry type is compiled out (the matching
+   *  EMBREE_GEOMETRY_* define is missing), or the type is not recognized, the
+   *  error is reported through throw_RTCError.
+   *
+   *  \param hdevice  device handle, must not be null
+   *  \param type     geometry type to create
+   *  \return         handle of the new geometry; nullptr if control reaches
+   *                  the end of the function */
+  RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewGeometry);
+    RTC_VERIFY_HANDLE(hdevice);
+
+    switch (type)
+    {
+    case RTC_GEOMETRY_TYPE_TRIANGLE:
+    {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+      createTriangleMeshTy createTriangleMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createTriangleMesh);
+      Geometry* geom = createTriangleMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_QUAD:
+    {
+#if defined(EMBREE_GEOMETRY_QUAD)
+      createQuadMeshTy createQuadMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createQuadMesh);
+      Geometry* geom = createQuadMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported");
+#endif
+    }
+    
+    /* all point flavours share one creation function and differ only in the
+       internal geometry type passed to it */
+    case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+    case RTC_GEOMETRY_TYPE_DISC_POINT:
+    case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+    {
+#if defined(EMBREE_GEOMETRY_POINT)
+      createPointsTy createPoints = nullptr;
+      /* NOTE(review): this is the only case that selects on
+         enabled_builder_cpu_features instead of enabled_cpu_features - confirm
+         this is intended */
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_builder_cpu_features, createPoints);
+
+      Geometry *geom;
+      switch(type) {
+        case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+          geom = createPoints(device, Geometry::GTY_SPHERE_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_DISC_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT);
+          break;
+        default:
+          /* not reachable: the enclosing case labels already restrict 'type'
+             to the three values handled above */
+          geom = nullptr;
+          break;
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported");
+#endif
+    }
+
+    /* linear curves are created through createLineSegments, all other curve
+       bases through createCurves */
+    case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE:
+    {
+#if defined(EMBREE_GEOMETRY_CURVE)
+      createLineSegmentsTy createLineSegments = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createLineSegments);
+      createCurvesTy createCurves = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createCurves);
+      
+      Geometry* geom;
+      switch (type) {
+      case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE            : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break;
+      //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE  : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE            : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE             : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE  : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break;
+
+      case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break;
+      /* not reachable: the enclosing case labels list exactly the types handled above */
+      default:                                    geom = nullptr; break;
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_SUBDIVISION:
+    {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+      createSubdivMeshTy createSubdivMesh = nullptr;
+      /* subdivision meshes use the narrower DEFAULT_AVX selection macro, see
+         the FIXME on the disabled line below */
+      SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh);
+      //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason?
+      Geometry* geom = createSubdivMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_USER:
+    {
+#if defined(EMBREE_GEOMETRY_USER)
+      createUserGeometryTy createUserGeometry = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createUserGeometry);
+      Geometry* geom = createUserGeometry(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_INSTANCE:
+    {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+      createInstanceTy createInstance = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createInstance);
+      Geometry* geom = createInstance(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_GRID:
+    {
+#if defined(EMBREE_GEOMETRY_GRID)
+      createGridMeshTy createGridMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createGridMesh);
+      Geometry* geom = createGridMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported");
+#endif
+    }
+    
+    default:
+      throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type");
+    }
+    
+    RTC_CATCH_END(device);
+    return nullptr;
+  }
+
+  /*! Sets the primitive count of a user geometry. Any other geometry type is
+   *  rejected with RTC_ERROR_INVALID_OPERATION. */
+  RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserPrimitiveCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    /* only user geometries carry a user defined primitive count */
+    if (unlikely(geom->getType() != Geometry::GTY_USER_GEOMETRY))
+    {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries");
+    }
+
+    geom->setNumPrimitives(userPrimitiveCount);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Sets the number of time steps of a geometry. Counts above
+   *  RTC_MAX_TIME_STEP_COUNT are rejected with RTC_ERROR_INVALID_ARGUMENT. */
+  RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeStepCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    const bool tooMany = timeStepCount > RTC_MAX_TIME_STEP_COUNT;
+    if (tooMany)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range");
+    }
+
+    geom->setNumTimeSteps(timeStepCount);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Sets the time range [startTime,endTime] of a geometry. A start time
+   *  larger than the end time is rejected with RTC_ERROR_INVALID_ARGUMENT. */
+  RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime)
+  {
+    /* unlike most setters this one holds the geometry through a Ref */
+    Ref<Geometry> geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeRange);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    const bool reversed = startTime > endTime;
+    if (reversed)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime");
+    }
+
+    const BBox1f timeRange(startTime,endTime);
+    geom->setTimeRange(timeRange);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and forwards the vertex attribute count N to
+   *  Geometry::setVertexAttributeCount. */
+  RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setVertexAttributeCount(N);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and forwards the topology count N to
+   *  Geometry::setTopologyCount. */
+  RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTopologyCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setTopologyCount(N);
+
+    RTC_CATCH_END2(geom);
+  }
+ 
+  /*! Sets the build quality of a geometry. Only the LOW, MEDIUM, HIGH and
+   *  REFIT qualities are accepted; any other value aborts the process. */
+  RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuildQuality);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    const bool knownQuality =
+      quality == RTC_BUILD_QUALITY_LOW    ||
+      quality == RTC_BUILD_QUALITY_MEDIUM ||
+      quality == RTC_BUILD_QUALITY_HIGH   ||
+      quality == RTC_BUILD_QUALITY_REFIT;
+
+    if (!knownQuality)
+    {
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort();
+      // -- GODOT end --
+    }
+
+    geom->setBuildQuality(quality);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Sets the maximal radius scale of a geometry. Only available when
+   *  RTC_MIN_WIDTH is enabled; values below 1 are rejected. Without
+   *  RTC_MIN_WIDTH every call reports RTC_ERROR_INVALID_OPERATION. */
+  RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMaxRadiusScale);
+    RTC_VERIFY_HANDLE(hgeometry);
+#if RTC_MIN_WIDTH
+    if (maxRadiusScale < 1.0f)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1");
+    }
+    geom->setMaxRadiusScale(maxRadiusScale);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled");
+#endif
+    RTC_CATCH_END2(geom);
+  }
+  
+  /*! Validates the handle and forwards the mask to Geometry::setMask. */
+  RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMask);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setMask(mask);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and forwards the subdivision mode of one topology
+   *  to Geometry::setSubdivisionMode. */
+  RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometrySubdivisionMode);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setSubdivisionMode(topologyID,mode);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and forwards the (vertex attribute, topology) pair
+   *  to Geometry::setVertexAttributeTopology. */
+  RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeTopology);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setVertexAttributeTopology(vertexAttributeID, topologyID);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Binds an existing buffer object to a buffer slot of the geometry.
+   *  The geometry and the buffer have to belong to the same device and the
+   *  item count has to fit into 32 bits. */
+  RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    Ref<Buffer> buf = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hbuffer);
+
+    const bool sameDevice = geom->device == buf->device;
+    if (!sameDevice)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    }
+
+    /* the item count is narrowed to unsigned int below */
+    if (itemCount > 0xFFFFFFFFu)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    }
+
+    const unsigned int numItems = (unsigned int)itemCount;
+    geom->setBuffer(type, slot, format, buf, byteOffset, byteStride, numItems);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Wraps user memory in a new Buffer object and binds it to a buffer slot
+   *  of the geometry. The byte offset is applied to the pointer when the
+   *  buffer is created, so the slot itself is bound with offset 0. */
+  RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSharedGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    /* the item count is narrowed to unsigned int below */
+    if (itemCount > 0xFFFFFFFFu)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    }
+
+    char* const first = (char*)ptr + byteOffset;
+    const size_t numBytes = itemCount*byteStride;
+    Ref<Buffer> shared = new Buffer(geom->device, numBytes, first);
+    geom->setBuffer(type, slot, format, shared, 0, byteStride, (unsigned int)itemCount);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Allocates a new Buffer object, binds it to a buffer slot of the
+   *  geometry and returns the buffer's data pointer. */
+  RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetNewGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    /* the item count is narrowed to unsigned int below */
+    if (itemCount > 0xFFFFFFFFu)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    }
+
+    /* vertex buffers need to get overallocated slightly as elements are
+       accessed using SSE loads; the padding rounds byteStride up to the next
+       multiple of 16 */
+    const bool isVertexData = (type == RTC_BUFFER_TYPE_VERTEX) || (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE);
+    const size_t padding = isVertexData ? (16 - (byteStride%16))%16 : 0;
+    const size_t numBytes = itemCount*byteStride + padding;
+
+    Ref<Buffer> created = new Buffer(geom->device, numBytes);
+    geom->setBuffer(type, slot, format, created, 0, byteStride, (unsigned int)itemCount);
+    return created->data();
+    RTC_CATCH_END2(geom);
+    return nullptr;
+  }
+
+  /*! Validates the handle and returns what Geometry::getBuffer yields for
+   *  the given buffer type and slot. */
+  RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryBufferData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    void* const data = geom->getBuffer(type, slot);
+    return data;
+    RTC_CATCH_END2(geom);
+    return nullptr;
+  }
+  
+  /*! Validates the handle and calls Geometry::enable. */
+  RTC_API void rtcEnableGeometry (RTCGeometry hgeometry) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcEnableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->enable();
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and calls Geometry::updateBuffer for the given
+   *  buffer type and slot. */
+  RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcUpdateGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->updateBuffer(type, slot);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and calls Geometry::disable. */
+  RTC_API void rtcDisableGeometry (RTCGeometry hgeometry) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDisableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->disable();
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and forwards the tessellation rate to
+   *  Geometry::setTessellationRate. */
+  RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTessellationRate);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setTessellationRate(tessellationRate);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and stores the user pointer via
+   *  Geometry::setUserData. */
+  RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setUserData(ptr);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and returns the pointer reported by
+   *  Geometry::getUserData. */
+  RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry)
+  {
+    /* plain pointer on purpose: no ref counting here! */
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    void* const userData = geom->getUserData();
+    return userData;
+    RTC_CATCH_END2(geom);
+    return nullptr;
+  }
+
+  /*! Validates the handle and registers the bounds callback together with
+   *  its user pointer via Geometry::setBoundsFunction. */
+  RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBoundsFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setBoundsFunction(bounds,userPtr);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and registers the displacement callback via
+   *  Geometry::setDisplacementFunction. */
+  RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryDisplacementFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setDisplacementFunction(displacement);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and registers the intersect callback via
+   *  Geometry::setIntersectFunctionN. */
+  RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setIntersectFunctionN(intersect);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and registers the point query callback via
+   *  Geometry::setPointQueryFunction. */
+  RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryPointQueryFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setPointQueryFunction(pointQuery);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Returns the result of Geometry::getFirstHalfEdge for the given face.
+   *  NOTE(review): the geometry handle is not validated here. */
+  RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFirstHalfEdge);
+    const unsigned int firstEdge = geom->getFirstHalfEdge(faceID);
+    return firstEdge;
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! Returns the result of Geometry::getFace for the given half edge.
+   *  NOTE(review): the geometry handle is not validated here. */
+  RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFace);
+    const unsigned int face = geom->getFace(edgeID);
+    return face;
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! Returns the result of Geometry::getNextHalfEdge for the given half edge.
+   *  NOTE(review): the geometry handle is not validated here. */
+  RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryNextHalfEdge);
+    const unsigned int nextEdge = geom->getNextHalfEdge(edgeID);
+    return nextEdge;
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! Returns the result of Geometry::getPreviousHalfEdge for the given half
+   *  edge.
+   *  NOTE(review): the geometry handle is not validated here. */
+  RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryPreviousHalfEdge);
+    const unsigned int previousEdge = geom->getPreviousHalfEdge(edgeID);
+    return previousEdge;
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! Returns the result of Geometry::getOppositeHalfEdge for the given
+   *  topology and half edge.
+   *  NOTE(review): the geometry handle is not validated here. */
+  RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryOppositeHalfEdge);
+    const unsigned int oppositeEdge = geom->getOppositeHalfEdge(topologyID,edgeID);
+    return oppositeEdge;
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! Validates the handle and registers the occluded callback via
+   *  Geometry::setOccludedFunctionN. */
+  RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    /* RTC_TRACE stringifies its argument when tracing is enabled, so it has to
+       name this API function (it previously carried a stale name) */
+    RTC_TRACE(rtcSetGeometryOccludedFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setOccludedFunctionN(occluded);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /*! Validates the handle and registers the intersection filter callback via
+   *  Geometry::setIntersectionFilterFunctionN. */
+  RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setIntersectionFilterFunctionN(filter);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and registers the occlusion filter callback via
+   *  Geometry::setOcclusionFilterFunctionN. */
+  RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter) 
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryOccludedFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->setOcclusionFilterFunctionN(filter);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Forwards the interpolation request to Geometry::interpolate of the
+   *  geometry referenced by the arguments. The geometry handle is only
+   *  validated in DEBUG builds; 'args' itself is dereferenced unchecked. */
+  RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args)
+  {
+    Geometry* const geom = (Geometry*) args->geometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolate);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry);
+#endif
+
+    geom->interpolate(args);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Forwards the interpolation request to Geometry::interpolateN of the
+   *  geometry referenced by the arguments. The geometry handle is only
+   *  validated in DEBUG builds; 'args' itself is dereferenced unchecked. */
+  RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args)
+  {
+    Geometry* const geom = (Geometry*) args->geometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolateN);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry);
+#endif
+
+    geom->interpolateN(args);
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Validates the handle and calls Geometry::commit. */
+  RTC_API void rtcCommitGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->commit();
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Binds a geometry to a scene without requesting a specific ID
+   *  (RTC_INVALID_GEOMETRY_ID is passed to Scene::bind) and returns the value
+   *  reported by Scene::bind. Scene and geometry have to belong to the same
+   *  device. */
+  RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry)
+  {
+    Scene* const scn = (Scene*) hscene;
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    const bool sameDevice = scn->device == geom->device;
+    if (!sameDevice)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    }
+
+    return scn->bind(RTC_INVALID_GEOMETRY_ID,geom);
+    RTC_CATCH_END2(scn);
+    return -1;
+  }
+
+  /*! Binds a geometry to a scene under the caller supplied geometry ID.
+   *  The ID must not be RTC_INVALID_GEOMETRY_ID, and scene and geometry have
+   *  to belong to the same device. */
+  RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID)
+  {
+    Scene* const scn = (Scene*) hscene;
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometryByID);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_GEOMID(geomID);
+
+    const bool sameDevice = scn->device == geom->device;
+    if (!sameDevice)
+    {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    }
+
+    scn->bind(geomID,geom);
+    RTC_CATCH_END2(scn);
+  }
+  
+  /*! Validates the scene handle and geometry ID and calls
+   *  Scene::detachGeometry for that ID. */
+  RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* const scn = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDetachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+
+    scn->detachGeometry(geomID);
+
+    RTC_CATCH_END2(scn);
+  }
+
+  /*! Validates the handle and increments the reference count of the geometry. */
+  RTC_API void rtcRetainGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->refInc();
+
+    RTC_CATCH_END2(geom);
+  }
+  
+  /*! Validates the handle and decrements the reference count of the geometry. */
+  RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    geom->refDec();
+
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! Returns the geometry the scene reports for the given geometry ID.
+   *  The scene handle and the ID are only validated in DEBUG builds. */
+  RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* const scn = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometry);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+#endif
+
+    return (RTCGeometry) scn->get(geomID);
+    RTC_CATCH_END2(scn);
+    return nullptr;
+  }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h
new file mode 100644
index 0000000000..4b070e122b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h
@@ -0,0 +1,142 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../include/embree3/rtcore.h"
+RTC_NAMESPACE_USE
+
+namespace embree
+{  
+  /*! decoding of intersection flags */
+  /*! returns true when the COHERENT bit is set in the flags */
+  __forceinline bool isCoherent  (RTCIntersectContextFlags flags)
+  {
+    const auto coherentBit = flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+    return coherentBit == RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+  }
+  /*! returns true when the flags, masked with the COHERENT bit, equal the INCOHERENT value */
+  __forceinline bool isIncoherent(RTCIntersectContextFlags flags)
+  {
+    const auto coherentBit = flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+    return coherentBit == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT;
+  }
+
+/*! USE_TASK_ARENA is 1 when the TBB tasking backend is selected and
+ *  TBB_INTERFACE_VERSION_MAJOR is at least 8, otherwise 0. */
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8)
+#  define USE_TASK_ARENA 1
+#else
+#  define USE_TASK_ARENA 0
+#endif
+
+/*! TASKING_TBB_USE_TASK_ISOLATION is 1 when the TBB tasking backend is
+ *  selected and TBB_INTERFACE_VERSION is at least 11009, otherwise 0. */
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9
+#  define TASKING_TBB_USE_TASK_ISOLATION 1
+#else
+#  define TASKING_TBB_USE_TASK_ISOLATION 0
+#endif
+
+/*! Macros used in the rtcore API implementation */
+/* The original macros (kept below as comments) wrapped every API function
+ * body in a try block and translated exceptions into Device::process_error
+ * calls. In this patched version RTC_CATCH_BEGIN, RTC_CATCH_END and
+ * RTC_CATCH_END2 expand to nothing and ignore their argument, so API function
+ * bodies run without any exception handling (presumably because the library
+ * is built with exceptions disabled - confirm against the build setup). */
+// -- GODOT start --
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
+  
+// #define RTC_CATCH_END(device)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END(device)
+  
+// #define RTC_CATCH_END2(scene)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END2(scene)
+
+// #define RTC_CATCH_END2_FALSE(scene)                                             \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//     return false;                                                               \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//     return false;                                                               \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//     return false;                                                               \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//     return false;                                                               \
+//   }
+/* Unlike the other replacements this one is not empty: it unconditionally
+ * returns false at the point where the macro is placed. */
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
+
+/*! Reports RTC_ERROR_INVALID_ARGUMENT through throw_RTCError when the handle is null. */
+#define RTC_VERIFY_HANDLE(handle)                               \
+  if (handle == nullptr) {                                         \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+/*! Reports RTC_ERROR_INVALID_ARGUMENT through throw_RTCError when the ID equals RTC_INVALID_GEOMETRY_ID. */
+#define RTC_VERIFY_GEOMID(id)                                   \
+  if (id == RTC_INVALID_GEOMETRY_ID) {                             \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+/*! Reports RTC_ERROR_INVALID_ARGUMENT through throw_RTCError when id is larger than upper. */
+#define RTC_VERIFY_UPPER(id,upper)                              \
+  if (id > upper) {                                                \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+/*! Reports RTC_ERROR_INVALID_OPERATION through throw_RTCError when id lies
+ *  outside [lower,upper]. Unlike the macros above the expansion is an unbraced
+ *  if statement, and the macro arguments are not parenthesized. */
+#define RTC_VERIFY_RANGE(id,lower,upper)	\
+  if (id < lower || id > upper)						  \
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds");
+  
+/*! RTC_TRACE(x) prints the stringified argument to std::cout when the branch
+ *  below is enabled; in the default configuration it expands to nothing. */
+#if 0 // enable to debug print all API calls
+#define RTC_TRACE(x) std::cout << #x << std::endl;
+#else
+#define RTC_TRACE(x) 
+#endif
+
+// -- GODOT begin --
+//   /*! used to throw embree API errors */
+//   struct rtcore_error : public std::exception
+//   {
+//     __forceinline rtcore_error(RTCError error, const std::string& str)
+//       : error(error), str(str) {}
+//     
+//     ~rtcore_error() throw() {}
+//     
+//     const char* what () const throw () {
+//       return str.c_str();
+//     }
+//     
+//     RTCError error;
+//     std::string str;
+//   };
+// -- GODOT end --
+
+#if defined(DEBUG) // only report file and line in debug mode
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  #define throw_RTCError(error,str) \
+    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+  // -- GODOT end --
+#else
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,str);
+  #define throw_RTCError(error,str) \
+    abort();
+  // -- GODOT end --
+#endif
+
+/*! Evaluates to true when settings.byteSize is larger than the end offset of
+ *  'member' inside RTCBuildArguments, i.e. when the caller's structure is big
+ *  enough to contain that member.
+ *  NOTE(review): the comparison is strict, so a member that ends exactly at
+ *  byteSize is reported as absent - confirm this is intended. */
+#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+  (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
new file mode 100644
index 0000000000..6bb96bba07
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
@@ -0,0 +1,442 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "alloc.h"
+
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/bvh_builder_morton.h"
+
+namespace embree
+{ 
+  namespace isa // FIXME: support more ISAs for builders
+  {
+    /*! Object behind an RTCBVH handle: the device it references, the node allocator and the two Morton builder arrays. */
+    struct BVH : public RefCount
+    {
+      /*! sets up the allocator and both arrays for the device, then takes a device reference */
+      BVH (Device* device)
+        : device(device), allocator(device,true),
+          morton_src(device,0), morton_tmp(device,0)
+      { device->refInc(); }
+
+      /*! gives back the device reference taken by the constructor */
+      ~BVH() { device->refDec(); }
+
+    public:
+      Device* device;                                  //!< device this object holds a reference on
+      FastAllocator allocator;                         //!< source of the cached allocators passed to the build callbacks
+      mvector<BVHBuilderMorton::BuildPrim> morton_src; //!< primitive array of the Morton builder
+      mvector<BVHBuilderMorton::BuildPrim> morton_tmp; //!< second array handed to the Morton builder
+    };
+    /*! Builds a BVH with the Morton code builder (used by rtcBuildBVH for RTC_BUILD_QUALITY_LOW) and returns the pointer produced for the root node. */
+    void* rtcBuildBVHMorton(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims_i =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      /* running sum of the dn values passed to the progress lambda below */
+      std::atomic<size_t> progress(0);
+      
+      /* initialize temporary arrays for morton builder */
+      PrimRef* prims = (PrimRef*) prims_i; // NOTE(review): treats RTCBuildPrimitive as PrimRef - relies on both having the same layout, confirm
+      mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src;
+      mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp;
+      morton_src.resize(primitiveCount);
+      morton_tmp.resize(primitiveCount);
+
+      /* compute centroid bounds: parallel reduction that extends a box by center2() of each primitive's bounds */
+      const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa {
+
+          BBox3fa bounds(empty);
+          for (size_t i=r.begin(); i<r.end(); i++) 
+            bounds.extend(prims[i].bounds().center2());
+          return bounds;
+        }, BBox3fa::merge);
+      
+      /* compute morton codes: each range gets a generator that starts at morton_src[r.begin()] and is fed the bounds and index i of every primitive */
+      BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+      parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) {
+          BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]);
+          for (size_t i=r.begin(); i<r.end(); i++) {
+            generator(prims[i].bounds(),(unsigned) i);
+          }
+        });
+
+      /* start morton build */
+      std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+        
+        /* lambda function that allocates BVH nodes through the user's createNode callback */
+        [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* {
+          return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+        },
+        
+        /* lambda function that sets bounds: merges the N child bounds and reports child bounds and child pointers to the user callbacks */
+        [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa>
+        {
+          BBox3fa bounds = empty;
+          void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) {
+            bounds.extend(children[i].second);
+            childptrs[i] = children[i].first;
+            cbounds[i] = (const RTCBounds*)&children[i].second;
+          }
+          setNodeBounds(node,cbounds,(unsigned int)N,userPtr);
+          setNodeChildren(node,childptrs, (unsigned int)N,userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda function that creates BVH leaves: copies the primitives selected by morton_src[].index into a local array and hands it to createLeaf */
+        [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa>
+        {
+	  RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF]; // NOTE(review): assumes current.size() never exceeds this capacity - confirm against the builder settings
+	  BBox3fa bounds = empty;
+	  for (size_t i=0;i<current.size();i++)
+	    {
+	      const size_t id = morton_src[current.begin()+i].index;
+	      bounds.extend(prims[id].bounds());
+	      localBuildPrims[i] = prims_i[id];
+	    }
+          void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda that calculates the bounds for some primitive */
+        [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa {
+          return prims[morton.index].bounds();
+        },
+        
+        /* progress monitor function: returns true when no callback is set, otherwise forwards the accumulated fraction (at most 1.0) to buildProgress */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        morton_src.data(),morton_tmp.data(),primitiveCount,
+        *arguments);
+
+      bvh->allocator.cleanup();
+      return root.first; // only the node pointer is returned, the merged bounds are dropped
+    }
+    /*! Builds a BVH with the binned SAH builder (used by rtcBuildBVH for RTC_BUILD_QUALITY_MEDIUM and as fallback for RTC_BUILD_QUALITY_HIGH) and returns the root node pointer. */
+    void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      /* running sum of the dn values passed to the progress lambda below */
+      std::atomic<size_t> progress(0);
+  
+      /* calculate priminfo: parallel reduction over the bounds of all primitives */
+      auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+        {
+          CentGeomBBox3fa bounds(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            bounds.extend((BBox3fa&)prims[j]); // NOTE(review): reads an RTCBuildPrimitive as a BBox3fa - relies on matching layout, confirm
+          return bounds;
+        };
+      const CentGeomBBox3fa bounds = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+      
+      /* build BVH */
+      void* root = BVHBuilderBinnedSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes and reports the geometry bounds of the N children to setNodeBounds */
+        [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes: passes the N child pointers to setNodeChildren; precord and crecords are not used */
+        [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves from the primitives in [range.begin(),range.end()) */
+        [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* progress monitor function: returns true when no callback is set, otherwise forwards the accumulated fraction (at most 1.0) to buildProgress */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+
+    static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) {
+      /* reduction step: merge2 of both bounds paired with the larger of the two geomIDs */
+      return std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa::merge2(a.first,b.first),
+                                                     max(a.second,b.second));
+    }
+    /*! Builds a BVH with the spatial split SAH builder and returns the root node pointer; falls back to rtcBuildBVHBinnedSAH when the largest geomID reaches the bits reserved for spatial splits. */
+    void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      /* running sum of the dn values passed to the progress lambda below */
+      std::atomic<size_t> progress(0);
+  
+      /* calculate priminfo: bounds of all primitives together with the largest geomID */
+
+      auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int>
+        {
+          CentGeomBBox3fa bounds(empty);
+          unsigned maxGeomID = 0;
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            bounds.extend((BBox3fa&)prims[j]); // NOTE(review): reads an RTCBuildPrimitive as a BBox3fa - relies on matching layout, confirm
+            maxGeomID = max(maxGeomID,prims[j].geomID);
+          }
+          return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID);
+        };
+
+
+      const std::pair<CentGeomBBox3fa,unsigned int> pair = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair);
+
+      CentGeomBBox3fa bounds = pair.first;
+      const unsigned int maxGeomID = pair.second;
+      
+      if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))))
+        {
+          /* fallback code for max geomID larger than threshold */
+          return rtcBuildBVHBinnedSAH(arguments);
+        }
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+
+      /* function that splits a build primitive by calling the user's splitPrimitive callback */
+      struct Splitter
+      {
+        Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr)
+          : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {}
+        
+        __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const 
+        {
+          prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK; // NOTE(review): presumably removes the bits reserved for spatial splits before the callback sees the primitive - confirm
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+          left_o.geomIDref()  = geomID; left_o.primIDref()  = primID; // both halves receive the IDs stored in this Splitter
+          right_o.geomIDref() = geomID; right_o.primIDref() = primID;
+        }
+
+        __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const 
+        {
+          PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID); // temporary primitive built from the box and the stored IDs
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+        }
+   
+        RTCSplitPrimitiveFunction splitPrimitive;
+        unsigned geomID;
+        unsigned primID;
+        void* userPtr;
+      };
+
+      /* build BVH */
+      void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes and reports the geometry bounds of the N children to setNodeBounds */
+        [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes: passes the N child pointers to setNodeChildren; precord and crecords are not used */
+        [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves from the primitives in [range.begin(),range.end()) */
+        [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* returns the splitter, initialized with the IDs of the primitive to split */
+        [&] ( const PrimRef& prim ) -> Splitter {
+          return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr);
+        },
+
+        /* progress monitor function: returns true when no callback is set, otherwise forwards the accumulated fraction (at most 1.0) to buildProgress */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,
+        arguments->primitiveArrayCapacity,
+        pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+  }
+}
+
+using namespace embree;
+using namespace embree::isa;
+
+RTC_NAMESPACE_BEGIN
+
+    /*! Creates a BVH object for the device, increments its reference count and returns it as an RTCBVH handle. */
+    RTC_API RTCBVH rtcNewBVH(RTCDevice device)
+    {
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcNewBVH); // trace under this function's own name
+      RTC_VERIFY_HANDLE(device);
+      return (RTCBVH) (new BVH((Device*)device))->refInc();
+      RTC_CATCH_END((Device*)device);
+      return nullptr;
+    }
+
+    /*! Builds a BVH with the builder selected by arguments->buildQuality and returns the root node pointer. */
+    RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = arguments ? (BVH*) arguments->bvh : nullptr; // a null 'arguments' must reach the handle checks instead of being dereferenced here
+      Device* device = bvh ? bvh->device : nullptr;           // resolved up front so error reporting never touches a null handle
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcBuildBVH);
+      RTC_VERIFY_HANDLE(arguments);
+      RTC_VERIFY_HANDLE(bvh);
+      RTC_VERIFY_HANDLE(arguments->createNode);
+      RTC_VERIFY_HANDLE(arguments->setNodeChildren);
+      RTC_VERIFY_HANDLE(arguments->setNodeBounds);
+      RTC_VERIFY_HANDLE(arguments->createLeaf);
+      if (arguments->primitiveArrayCapacity < arguments->primitiveCount)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount");
+      /* initialize the allocator */
+      bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
+      bvh->allocator.reset();
+
+      /* switch between different builders based on quality level */
+      void* root = nullptr;
+      if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
+        root = rtcBuildBVHMorton(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
+        root = rtcBuildBVHBinnedSAH(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) {
+        if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount)
+          root = rtcBuildBVHBinnedSAH(arguments); // spatial splits need a split callback and spare capacity in the primitive array
+        else
+          root = rtcBuildBVHSpatialSAH(arguments);
+      }
+      else
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality");
+
+      /* if we are in dynamic mode, then do not clear temporary data; the result is kept in 'root' so that this step is reached */
+      if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) {
+        bvh->morton_src.clear();
+        bvh->morton_tmp.clear();
+      }
+      return root;
+      RTC_CATCH_END(device);
+      return nullptr;
+    }
+
+    /*! Returns the result of malloc0(bytes,align) on the cached allocator that 'localAllocator' points to. */
+    RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) {
+      FastAllocator::CachedAllocator* const cached = (FastAllocator::CachedAllocator*) localAllocator;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcThreadLocalAlloc);
+      return cached->malloc0(bytes,align);
+      RTC_CATCH_END(cached->alloc->getDevice());
+      return nullptr;
+    }
+    /*! Clears the two Morton builder arrays kept by the BVH object behind the handle. */
+    RTC_API void rtcMakeStaticBVH(RTCBVH hbvh)
+    {
+      BVH* bvh = (BVH*) hbvh;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcMakeStaticBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->morton_src.clear();
+      bvh->morton_tmp.clear();
+      RTC_CATCH_END(bvh ? bvh->device : nullptr); // guarded like rtcRetainBVH/rtcReleaseBVH: no dereference of a null handle
+    }
+
+    /*! Increments the reference count of the BVH object behind the handle. */
+    RTC_API void rtcRetainBVH(RTCBVH hbvh) {
+      BVH* const object = (BVH*) hbvh;
+      Device* const owner = (object != nullptr) ? object->device : nullptr; // looked up before the handle is verified
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcRetainBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      object->refInc();
+      RTC_CATCH_END(owner);
+    }
+    
+    /*! Decrements the reference count of the BVH object behind the handle. */
+    RTC_API void rtcReleaseBVH(RTCBVH hbvh) {
+      BVH* const object = (BVH*) hbvh;
+      Device* const owner = (object != nullptr) ? object->device : nullptr; // looked up before the handle is verified
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcReleaseBVH);
+      RTC_VERIFY_HANDLE(hbvh);
+      object->refDec();
+      RTC_CATCH_END(owner);
+    }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp
new file mode 100644
index 0000000000..1e23aeb415
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp
@@ -0,0 +1,976 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  /* error raising rtcIntersect and rtcOccluded functions: each one only raises RTC_ERROR_INVALID_OPERATION with a fixed message */
+  void missing_rtcCommit()      { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); } // passed to Accel::Intersectors by the Scene constructor
+  void invalid_rtcIntersect1()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); }
+  void invalid_rtcIntersect4()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); }
+  void invalid_rtcIntersect8()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); }
+  void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); }
+  void invalid_rtcIntersectN()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); }
+
+  /*! Initializes the scene members to their defaults, takes a device reference and applies the device's debugging overrides. */
+  Scene::Scene (Device* device)
+    : device(device), flags_modified(true), enabled_geometry_types(0),
+      scene_flags(RTC_SCENE_FLAG_NONE), quality_flags(RTC_BUILD_QUALITY_MEDIUM),
+      is_build(false), modified(true), progressInterface(this),
+      progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0)
+  {
+    device->refInc();
+
+    /* start with intersectors built from missing_rtcCommit, which raises "scene not committed" */
+    intersectors = Accel::Intersectors(missing_rtcCommit);
+
+    /* a device value other than -1 replaces the corresponding default (meant for debugging) */
+    const bool overrideQuality    = device->quality_flags != -1;
+    const bool overrideSceneFlags = device->scene_flags != -1;
+    if (overrideQuality)    quality_flags = (RTCBuildQuality) device->quality_flags;
+    if (overrideSceneFlags) scene_flags = (RTCSceneFlags) device->scene_flags;
+  }
+
+  Scene::~Scene() noexcept
+  {
+    device->refDec(); // balances the device->refInc() done in the Scene constructor
+  }
+
+  /*! Prints a table to std::cout: one row per named geometry type, one column per time segment count. */
+  void Scene::printStatistics()
+  {
+    /* number of columns: the largest numTimeSteps of any geometry in the scene */
+    unsigned numColumns = 0;
+    for (size_t geomID=0; geomID<size(); geomID++)
+      if (get(geomID)) numColumns = max(numColumns,get(geomID)->numTimeSteps);
+
+    /* one row of counters per geometry type */
+    std::vector<size_t> counts[Geometry::GTY_END];
+    for (auto& row : counts)
+      row.resize(numColumns);
+
+    /* add each geometry's size() to the cell selected by its type and numTimeSegments() */
+    for (size_t geomID=0; geomID<size(); geomID++)
+    {
+      auto geom = get(geomID);
+      if (!geom) continue;
+      const int row = geom->getType();
+      assert(row<Geometry::GTY_END);
+      const int column = geom->numTimeSegments();
+      assert((unsigned int)column < numColumns);
+      counts[row][column] += geom->size();
+    }
+
+    /* header line followed by a separator line of matching width */
+    std::cout << std::setw(23) << "segments" << ": ";
+    for (size_t column=0; column<numColumns; column++)
+      std::cout << std::setw(10) << column;
+    std::cout << std::endl;
+    std::cout << "-------------------------";
+    for (size_t column=0; column<numColumns; column++)
+      std::cout << "----------";
+    std::cout << std::endl;
+
+    /* one line per geometry type that has a non-empty name */
+    for (size_t type=0; type<Geometry::GTY_END; type++)
+    {
+      if (std::string(Geometry::gtype_names[type]).empty()) continue;
+      std::cout << std::setw(23) << Geometry::gtype_names[type] << ": ";
+      for (size_t column=0; column<numColumns; column++)
+        std::cout << std::setw(10) << counts[type][column];
+      std::cout << std::endl;
+    }
+  }
+  /*! Adds the triangle acceleration structure named by device->tri_accel; "default" selects one from the build quality, the compact/robust flags and AVX support. Does nothing unless EMBREE_GEOMETRY_TRIANGLE is defined. */
+  void Scene::createTriangleAccel()
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel == "default")
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW)
+      {
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1: compact, bit 0: robust
+        switch (mode) {
+        case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else
+#endif
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+            accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif
+            accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+
+          break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* dynamic: taken when quality_flags == RTC_BUILD_QUALITY_LOW */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1: compact, bit 0: robust
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1: compact, bit 0: robust
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->tri_accel == "bvh4.triangle4")       accels_add(device->bvh4_factory->BVH4Triangle4 (this));
+    else if (device->tri_accel == "bvh4.triangle4v")      accels_add(device->bvh4_factory->BVH4Triangle4v(this));
+    else if (device->tri_accel == "bvh4.triangle4i")      accels_add(device->bvh4_factory->BVH4Triangle4i(this));
+    else if (device->tri_accel == "qbvh4.triangle4i")     accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel == "bvh8.triangle4")       accels_add(device->bvh8_factory->BVH8Triangle4 (this));
+    else if (device->tri_accel == "bvh8.triangle4v")      accels_add(device->bvh8_factory->BVH8Triangle4v(this));
+    else if (device->tri_accel == "bvh8.triangle4i")      accels_add(device->bvh8_factory->BVH8Triangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4i")     accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4")      accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel); // any other name is rejected
+#endif
+  }
+  /*! Adds the motion blur triangle acceleration structure named by device->tri_accel_mb; "default" selects one from the compact/robust flags and AVX2 support. Does nothing unless EMBREE_GEOMETRY_TRIANGLE is defined. */
+  void Scene::createTriangleMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel_mb == "default")
+    {
+      int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1: compact, bit 0: robust
+
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else
+#endif
+      {
+        switch (mode) { // without AVX2 only the robust bit changes the choice: every case uses BVH4Triangle4iMB
+        case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+    }
+    else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb); // any other name is rejected
+#endif
+  }
+  /*! Adds the quad acceleration structure named by device->quad_accel; "default" selects one from the build quality, the compact/robust flags and AVX support. Does nothing unless EMBREE_GEOMETRY_QUAD is defined. */
+  void Scene::createQuadAccel()
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    if (device->quad_accel == "default")
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW)
+      {
+        /* static */
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1: compact, bit 0: robust
+        switch (mode) {
+        case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else
+#endif
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+            accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif
+            accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          break;
+
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* dynamic: taken when quality_flags == RTC_BUILD_QUALITY_LOW */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1: compact, bit 0: robust
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // only the robust bit changes the choice below: every case uses BVH4Quad4v
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->quad_accel == "bvh4.quad4v")       accels_add(device->bvh4_factory->BVH4Quad4v(this));
+    else if (device->quad_accel == "bvh4.quad4i")       accels_add(device->bvh4_factory->BVH4Quad4i(this));
+    else if (device->quad_accel == "qbvh4.quad4i")      accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->quad_accel == "bvh8.quad4v")       accels_add(device->bvh8_factory->BVH8Quad4v(this));
+    else if (device->quad_accel == "bvh8.quad4i")       accels_add(device->bvh8_factory->BVH8Quad4i(this));
+    else if (device->quad_accel == "qbvh8.quad4i")      accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel); // any other name is rejected
+#endif
+  }
+
+  /*! Selects and registers the acceleration structure used for motion blurred
+   *  quad meshes, honoring the device's quad_accel_mb setting. An unknown
+   *  setting is reported as RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createQuadMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    const auto& requested = device->quad_accel_mb;
+    if (requested == "default")
+    {
+      /* the robust scene flag only switches the intersector variant */
+      const BVHFactory::IntersectVariant ivariant =
+        isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+
+#if defined (EMBREE_TARGET_SIMD8)
+      /* the 8-wide BVH is only picked for non-compact scenes on AVX capable devices */
+      if (!isCompactAccel() && device->canUseAVX())
+        accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,ivariant));
+    }
+    else if (requested == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (requested == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+requested);
+#endif
+  }
+
+  /*! Selects and registers the acceleration structure for curve and point
+   *  geometry without motion blur, honoring the device's hair_accel setting.
+   *  An unknown setting is reported as RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createHairAccel()
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    const auto& requested = device->hair_accel;
+    if (requested == "default")
+    {
+      /* compact scenes use the 'i' primitive layouts, robust scenes the ROBUST intersector */
+      const bool compact = isCompactAccel();
+      const BVHFactory::IntersectVariant ivariant =
+        isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower
+      {
+        if (compact) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,ivariant));
+        else         accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,ivariant));
+      }
+      else
+#endif
+      {
+        if (compact) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,ivariant));
+        else         accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,ivariant));
+      }
+    }
+    else if (requested == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST));
+    else if (requested == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (requested == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST));
+    else if (requested == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+requested);
+#endif
+  }
+
+  /*! Selects and registers the acceleration structure for motion blurred curve
+   *  and point geometry, honoring the device's hair_accel_mb setting. An
+   *  unknown setting is reported as RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createHairMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    const auto& requested = device->hair_accel_mb;
+    if (requested == "default")
+    {
+      /* only the intersector variant depends on the scene flags here */
+      const BVHFactory::IntersectVariant ivariant =
+        isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower
+        accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,ivariant));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,ivariant));
+    }
+    else if (requested == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (requested == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+    else if (requested == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+requested);
+#endif
+  }
+
+  /*! Registers the acceleration structure for subdivision meshes without
+   *  motion blur. All accepted values of the device's subdiv_accel setting map
+   *  to the same BVH4SubdivPatch1 structure; any other value is reported as
+   *  RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createSubdivAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    const auto& requested = device->subdiv_accel;
+    if (requested == "default" ||
+        requested == "bvh4.grid.eager" ||
+        requested == "bvh4.subdivpatch1eager")
+      accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+requested);
+#endif
+  }
+
+  /*! Registers the acceleration structure for motion blurred subdivision
+   *  meshes. Only the "default" value of the device's subdiv_accel_mb setting
+   *  is accepted; anything else is reported as RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createSubdivMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    const auto& requested = device->subdiv_accel_mb;
+    if (requested == "default")
+      accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this));
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+requested);
+#endif
+  }
+
+  /*! Selects and registers the acceleration structure for user geometries
+   *  without motion blur, honoring the device's object_accel setting. An
+   *  unknown setting is reported as RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createUserGeometryAccel()
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    const auto& requested = device->object_accel;
+    if (requested == "default")
+    {
+      /* low build quality selects the DYNAMIC build variant, everything else STATIC */
+      const BVHFactory::BuildVariant bvariant =
+        (quality_flags == RTC_BUILD_QUALITY_LOW) ? BVHFactory::BuildVariant::DYNAMIC : BVHFactory::BuildVariant::STATIC;
+
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+        accels_add(device->bvh8_factory->BVH8UserGeometry(this,bvariant));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4UserGeometry(this,bvariant));
+    }
+    else if (requested == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (requested == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+requested);
+#endif
+  }
+
+  /*! Selects and registers the acceleration structure for motion blurred user
+   *  geometries, honoring the device's object_accel_mb setting. An unknown
+   *  setting is reported as RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createUserGeometryMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    const auto& requested = device->object_accel_mb;
+    if (requested == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      /* 8-wide BVH only for non-compact scenes on AVX capable devices */
+      const bool wide = device->canUseAVX() && !isCompactAccel();
+      if (wide) accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+    }
+    else if (requested == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (requested == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+requested);
+#endif
+  }
+
+  /*! Registers the acceleration structure for instances of the
+   *  MTY_INSTANCE_CHEAP kind without motion blur. No device setting is
+   *  consulted; the default selection is always applied. */
+  void Scene::createInstanceAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    /* low build quality selects the DYNAMIC build variant, everything else STATIC */
+    const BVHFactory::BuildVariant bvariant =
+      (quality_flags == RTC_BUILD_QUALITY_LOW) ? BVHFactory::BuildVariant::DYNAMIC : BVHFactory::BuildVariant::STATIC;
+
+#if defined (EMBREE_TARGET_SIMD8)
+    if (device->canUseAVX() && !isCompactAccel())
+      accels_add(device->bvh8_factory->BVH8Instance(this, false, bvariant));
+    else
+#endif
+      accels_add(device->bvh4_factory->BVH4Instance(this, false, bvariant));
+#endif
+  }
+
+  /*! Registers the acceleration structure for motion blurred instances of the
+   *  MTY_INSTANCE_CHEAP kind. No device setting is consulted; the default
+   *  selection is always applied. */
+  void Scene::createInstanceMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+#if defined (EMBREE_TARGET_SIMD8)
+    /* 8-wide BVH only for non-compact scenes on AVX capable devices */
+    const bool wide = device->canUseAVX() && !isCompactAccel();
+    if (wide) accels_add(device->bvh8_factory->BVH8InstanceMB(this, false));
+    else
+#endif
+      accels_add(device->bvh4_factory->BVH4InstanceMB(this, false));
+#endif
+  }
+
+  /*! Registers the acceleration structure for instances of the
+   *  MTY_INSTANCE_EXPENSIVE kind without motion blur. No device setting is
+   *  consulted; the default selection is always applied. */
+  void Scene::createInstanceExpensiveAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    /* low build quality selects the DYNAMIC build variant, everything else STATIC */
+    const BVHFactory::BuildVariant bvariant =
+      (quality_flags == RTC_BUILD_QUALITY_LOW) ? BVHFactory::BuildVariant::DYNAMIC : BVHFactory::BuildVariant::STATIC;
+
+#if defined (EMBREE_TARGET_SIMD8)
+    if (device->canUseAVX() && !isCompactAccel())
+      accels_add(device->bvh8_factory->BVH8Instance(this, true, bvariant));
+    else
+#endif
+      accels_add(device->bvh4_factory->BVH4Instance(this, true, bvariant));
+#endif
+  }
+
+  /*! Registers the acceleration structure for motion blurred instances of the
+   *  MTY_INSTANCE_EXPENSIVE kind. No device setting is consulted; the default
+   *  selection is always applied. */
+  void Scene::createInstanceExpensiveMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+#if defined (EMBREE_TARGET_SIMD8)
+    /* 8-wide BVH only for non-compact scenes on AVX capable devices */
+    const bool wide = device->canUseAVX() && !isCompactAccel();
+    if (wide) accels_add(device->bvh8_factory->BVH8InstanceMB(this, true));
+    else
+#endif
+      accels_add(device->bvh4_factory->BVH4InstanceMB(this, true));
+#endif
+  }
+
+  /*! Selects and registers the acceleration structure for grid meshes without
+   *  motion blur, honoring the device's grid_accel setting. The intersector
+   *  variant follows the scene's robust flag. An unknown setting is reported as
+   *  RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createGridAccel()
+  {
+#if defined(EMBREE_GEOMETRY_GRID)
+    /* declared inside the guard so that builds without grid support do not
+       carry an unused local */
+    BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST;
+    if (device->grid_accel == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel())
+      {
+        accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+      else
+#endif
+      {
+        accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+    }
+    else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel);
+#endif
+
+  }
+
+  /*! Selects and registers the acceleration structure for motion blurred grid
+   *  meshes, honoring the device's grid_accel_mb setting. An unknown setting is
+   *  reported as RTC_ERROR_INVALID_ARGUMENT. */
+  void Scene::createGridMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_GRID)
+    if (device->grid_accel_mb == "default")
+    {
+      accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC));
+    }
+    else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this));
+    /* report the motion blur setting that was actually rejected, not grid_accel */
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel_mb);
+#endif
+
+  }
+
+  /*! Currently a no-op: the body contains no statements. */
+  void Scene::clear() {
+  }
+
+  /*! Attaches a geometry to the scene under geomID and returns the ID in use.
+   *  Passing RTC_INVALID_GEOMETRY_ID requests a fresh ID from the pool. Raises
+   *  RTC_ERROR_INVALID_OPERATION when the pool is exhausted or rejects the
+   *  requested ID. */
+  unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry)
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(geometriesMutex);
+#else
+    Lock<SpinLock> lock(geometriesMutex);
+#endif
+
+    /* obtain an ID: either a fresh one or the one requested by the caller */
+    const bool autoAssign = (geomID == RTC_INVALID_GEOMETRY_ID);
+    if (autoAssign)
+    {
+      geomID = id_pool.allocate();
+      if (geomID == RTC_INVALID_GEOMETRY_ID)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene");
+    }
+    else if (!id_pool.add(geomID))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided");
+
+    /* grow the per-geometry arrays in lockstep so geomID is a valid index in each */
+    if (!(geomID < geometries.size()))
+    {
+      const unsigned required = geomID+1;
+      geometries.resize(required);
+      vertices.resize(required);
+      geometryModCounters_.resize(required);
+    }
+
+    geometries[geomID] = geometry;
+    geometryModCounters_[geomID] = 0;
+
+    /* attaching an enabled geometry changes what the next commit has to build */
+    if (geometry->isEnabled())
+      setModified ();
+
+    return geomID;
+  }
+
+  /*! Removes the geometry stored under geomID from the scene and returns its
+   *  ID to the pool. Raises RTC_ERROR_INVALID_OPERATION when the ID is out of
+   *  range or the slot holds no geometry. */
+  void Scene::detachGeometry(size_t geomID)
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(geometriesMutex);
+#else
+    Lock<SpinLock> lock(geometriesMutex);
+#endif
+
+    if (!(geomID < geometries.size()))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
+
+    Ref<Geometry>& slot = geometries[geomID];
+    if (slot == null)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
+
+    /* detaching an enabled geometry changes what the next commit has to build */
+    if (slot->isEnabled())
+      setModified ();
+
+    const unsigned id = unsigned(geomID);
+    accels_deleteGeometry(id);
+    id_pool.deallocate(id);
+
+    /* clear every per-geometry entry belonging to this slot */
+    slot = null;
+    vertices[geomID] = nullptr;
+    geometryModCounters_[geomID] = 0;
+  }
+
+  /*! Marks the scene as built by setting the is_build flag. */
+  void Scene::updateInterface()
+  {
+    is_build = true;
+  }
+
+  /*! Performs the actual scene build. Returns immediately when neither the
+   *  scene nor any of its geometries is modified. Otherwise it gathers the
+   *  geometry counts (calling preCommit on every enabled geometry), re-selects
+   *  the acceleration structures when the flags or the set of enabled geometry
+   *  types changed, builds them, calls postCommit on every enabled geometry and
+   *  finally clears the modified flag. */
+  void Scene::commit_task ()
+  {
+    /* nothing to do if neither the scene nor any geometry changed */
+    checkIfModifiedAndSet ();
+    if (!isModified()) {
+      return;
+    }
+
+    /* print scene statistics */
+    if (device->verbosity(2))
+      printStatistics();
+
+    /* restart progress reporting for this build */
+    progress_monitor_counter = 0;
+
+    /* gather scene stats and call preCommit function of each geometry */
+    this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (),
+      [this](const range<size_t>& r)->GeometryCounts
+      {
+        GeometryCounts c;
+        for (auto i=r.begin(); i<r.end(); ++i)
+        {
+          /* empty slots and disabled geometries do not contribute */
+          if (geometries[i] && geometries[i]->isEnabled())
+          {
+            geometries[i]->preCommit();
+            geometries[i]->addElementsToCount (c);
+            c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions();
+          }
+        }
+        return c;
+      },
+      std::plus<GeometryCounts>()
+    );
+
+    /* select acceleration structures to build */
+    unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask();
+    if (flags_modified || new_enabled_geometry_types != enabled_geometry_types)
+    {
+      accels_init();
+
+      /* we need to make all geometries modified, otherwise two level builder will
+        not rebuild currently not modified geometries */
+      parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) {
+          geometryModCounters_[i] = 0;
+        });
+
+      /* one acceleration structure per geometry kind that has primitives;
+         the second argument distinguishes the motion blur variant */
+      if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel();
+      if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel();
+      if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel();
+      if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel();
+
+      flags_modified = false;
+      enabled_geometry_types = new_enabled_geometry_types;
+    }
+
+    /* select fast code path if no filter function is present */
+    accels_select(hasFilterFunction());
+
+    /* build all hierarchies of this scene */
+    accels_build();
+
+    /* make static geometry immutable */
+    if (!isDynamicAccel()) {
+      accels_immutable();
+      flags_modified = true; // in non-dynamic mode we have to re-create accels
+    }
+
+    /* call postCommit function of each geometry */
+    parallel_for(geometries.size(), [&] ( const size_t i ) {
+        if (geometries[i] && geometries[i]->isEnabled()) {
+          geometries[i]->postCommit();
+          /* remember the vertex array and the modification counter seen by this build */
+          vertices[i] = geometries[i]->getCompactVertexArray();
+          geometryModCounters_[i] = geometries[i]->getModCounter();
+        }
+      });
+
+    updateInterface();
+
+    if (device->verbosity(2)) {
+      std::cout << "created scene intersector" << std::endl;
+      accels_print(2);
+      std::cout << "selected scene intersector" << std::endl;
+      intersectors.print(2);
+    }
+
+    /* the scene is now in sync with its geometries */
+    setModified(false);
+  }
+
+  /*! Stores a new build quality. A value different from the current one also
+   *  raises flags_modified; an identical value leaves the scene untouched. */
+  void Scene::setBuildQuality(RTCBuildQuality quality_flags_i)
+  {
+    const bool changed = (quality_flags != quality_flags_i);
+    if (changed) {
+      quality_flags = quality_flags_i;
+      flags_modified = true;
+    }
+  }
+
+  /*! Returns the build quality currently stored for this scene. */
+  RTCBuildQuality Scene::getBuildQuality() const {
+    return quality_flags;
+  }
+
+  /*! Stores new scene flags. A value different from the current one also
+   *  raises flags_modified; an identical value leaves the scene untouched. */
+  void Scene::setSceneFlags(RTCSceneFlags scene_flags_i)
+  {
+    const bool changed = (scene_flags != scene_flags_i);
+    if (changed) {
+      scene_flags = scene_flags_i;
+      flags_modified = true;
+    }
+  }
+
+  /*! Returns the scene flags currently stored for this scene. */
+  RTCSceneFlags Scene::getSceneFlags() const {
+    return scene_flags;
+  }
+
+#if defined(TASKING_INTERNAL)
+
+  /*! Commits the scene when the internal tasking system is used. The caller
+   *  that finds no scheduler attached to the scene creates one, takes the
+   *  build lock and runs commit_task as the root task. Any other caller joins
+   *  the running build through the existing scheduler, which is only permitted
+   *  when join is true; otherwise RTC_ERROR_INVALID_OPERATION is raised. */
+  void Scene::commit (bool join)
+  {
+    /* NOTE(review): the 'false' argument presumably constructs the guard
+       without taking buildMutex; it is locked explicitly below - confirm
+       against the Lock implementation */
+    Lock<MutexSys> buildLock(buildMutex,false);
+
+    /* allocates own taskscheduler for each build */
+    Ref<TaskScheduler> scheduler = nullptr;
+    {
+      Lock<MutexSys> lock(schedulerMutex);
+      scheduler = this->scheduler;
+      /* no scheduler attached yet: this caller becomes the one that initiates the build */
+      if (scheduler == null) {
+        buildLock.lock();
+        this->scheduler = scheduler = new TaskScheduler;
+      }
+    }
+
+    /* worker threads join build */
+    if (!buildLock.isLocked())
+    {
+      if (!join)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation");
+
+      scheduler->join();
+      return;
+    }
+
+    /* initiate build */
+    /* the root task detaches the scheduler from the scene again once commit_task returns;
+       the exception handling around it is disabled by the GODOT modification below */
+    // -- GODOT start --
+    // try {
+      scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+    // }
+    // catch (...) {
+    //   accels_clear();
+    //   updateInterface();
+    //   Lock<MutexSys> lock(schedulerMutex);
+    //   this->scheduler = nullptr;
+    //   throw;
+    // }
+    // -- GODOT end --
+  }
+
+#endif
+
+#if defined(TASKING_TBB) || defined(TASKING_GCD)
+
+  /*! Commits the scene when TBB or GCD is the tasking backend.
+   *
+   *  The caller that obtains the build lock runs commit_task with the FTZ and
+   *  DAZ bits set in MXCSR and restores the previous MXCSR value afterwards, on
+   *  both the normal and the exceptional path. A caller that does not obtain
+   *  the lock waits until the running build has released it and then returns.
+   *
+   *  \param join true when invoked on behalf of rtcJoinCommitScene. Rejected
+   *         with RTC_ERROR_INVALID_OPERATION for TBB interface versions below 8.
+   *
+   *  If the build throws, the acceleration structures are cleared, the scene is
+   *  still marked as built and the exception is rethrown. */
+  void Scene::commit (bool join)
+  {
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version");
+#endif
+
+    /* try to obtain build lock */
+    Lock<MutexSys> lock(buildMutex,buildMutex.try_lock());
+
+    /* join hierarchy build */
+    if (!lock.isLocked())
+    {
+#if !TASKING_TBB_USE_TASK_ISOLATION
+      if (!join)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version");
+#endif
+
+      /* help with (TBB) or wait for (GCD) the running build until its lock becomes available */
+      do {
+
+#if defined(TASKING_GCD)
+      // Do Nothing
+#else
+#if USE_TASK_ARENA
+        if (join) {
+          device->arena->execute([&]{ group.wait(); });
+        }
+        else
+#endif
+        {
+          group.wait();
+        }
+#endif
+
+        pause_cpu();
+        yield();
+
+      } while (!buildMutex.try_lock());
+
+      /* the lock was only taken to detect the end of the build */
+      buildMutex.unlock();
+      return;
+    }
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int mxcsr = _mm_getcsr();
+    _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+
+    try {
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION_MAJOR < 8
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits);
+#else
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
+#endif
+      //ctx.set_priority(tbb::priority_high);
+
+#if USE_TASK_ARENA
+      if (join)
+      {
+        device->arena->execute([&]{
+            group.run([&]{
+                tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+              });
+            group.wait();
+          });
+      }
+      else
+#endif
+      {
+        group.run([&]{
+            tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+          });
+        group.wait();
+      }
+
+#elif defined(TASKING_GCD)
+
+      commit_task();
+
+#endif  // #if defined(TASKING_TBB)
+
+      /* reset MXCSR register again; placed after the backend specific section
+         so that the GCD path restores it as well */
+      _mm_setcsr(mxcsr);
+    }
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+
+      accels_clear();
+      updateInterface();
+      throw;
+    }
+  }
+#endif
+
+#if defined(TASKING_PPL)
+
+  /*! Commits the scene when PPL is the tasking backend. Joining a build is
+   *  rejected with RTC_ERROR_INVALID_OPERATION. Otherwise commit_task runs
+   *  through the task group with the FTZ and DAZ bits set in MXCSR; the
+   *  previous MXCSR value is restored on both the normal and the exceptional
+   *  path. If the build throws, the acceleration structures are cleared, the
+   *  scene is still marked as built and the exception is rethrown. */
+  void Scene::commit (bool join)
+  {
+    /* this definition is only compiled for PPL, so no further guard is needed */
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL");
+
+    /* try to obtain build lock */
+    Lock<MutexSys> lock(buildMutex);
+
+    checkIfModifiedAndSet ();
+    if (!isModified())
+      return;
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int savedMxcsr = _mm_getcsr();
+    const unsigned int ftzDazBits = /* FTZ */ (1<<15) | /* DAZ */ (1<<6);
+    _mm_setcsr(savedMxcsr | ftzDazBits);
+
+    try {
+      group.run([&]{
+          concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); });
+        });
+      group.wait();
+    }
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(savedMxcsr);
+
+      accels_clear();
+      updateInterface();
+      throw;
+    }
+
+    /* reset MXCSR register again */
+    _mm_setcsr(savedMxcsr);
+  }
+#endif
+
+  /*! Stores the progress monitor callback together with the user pointer that
+   *  Scene::progressMonitor passes back to it. */
+  void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr)
+  {
+    progress_monitor_function = func;
+    progress_monitor_ptr      = ptr;
+  }
+
+  /*! Adds dn (truncated to an integer) to the progress counter and reports the
+   *  accumulated count divided by numPrimitives() to the registered callback.
+   *  Does nothing when no callback is registered. Raises RTC_ERROR_CANCELLED
+   *  when the callback returns false. */
+  void Scene::progressMonitor(double dn)
+  {
+    if (!progress_monitor_function)
+      return;
+
+    /* fetch_add yields the value before the addition, so add the step once more */
+    const size_t step = size_t(dn);
+    const size_t n = step + progress_monitor_counter.fetch_add(step);
+
+    const bool keepGoing = progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())));
+    if (!keepGoing) {
+      throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination");
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.h b/thirdparty/embree-aarch64/kernels/common/scene.h
new file mode 100644
index 0000000000..b41c6cde91
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.h
@@ -0,0 +1,390 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+ 
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "builder.h"
+#include "../../common/algorithms/parallel_any_of.h"
+#include "scene_triangle_mesh.h"
+#include "scene_quad_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_line_segments.h"
+#include "scene_subdiv_mesh.h"
+#include "scene_grid_mesh.h"
+#include "scene_points.h"
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+#include <mutex>
+
+namespace embree
+{
+  /*! Base class all scenes are derived from */
+  class Scene : public AccelN
+  {
+    ALIGNED_CLASS_(std::alignment_of<Scene>::value);
+
+  public:
+    /*! Typed view onto the geometry slots of a scene. at(i) yields the geometry
+     *  in slot i as Ty* when the slot is occupied, the geometry is enabled (or
+     *  'all' was requested), its type mask matches Ty::geom_type and its motion
+     *  blur state matches mblur; otherwise nullptr. A default constructed
+     *  iterator refers to no scene and must be assigned before use. */
+    template<typename Ty, bool mblur = false>
+      class Iterator
+      {
+      public:
+      /* members are initialized so that a default constructed iterator has a defined state */
+      Iterator () : scene(nullptr), all(false) {}
+
+      Iterator (Scene* scene, bool all = false)
+      : scene(scene), all(all) {}
+
+      /*! returns the geometry in slot i if it passes all filters, nullptr otherwise */
+      __forceinline Ty* at(const size_t i)
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!all && !geom->isEnabled()) return nullptr;
+        const size_t mask = geom->getTypeMask() & Ty::geom_type;
+        if (!(mask)) return nullptr;
+        /* a geometry with a single time step is treated as not motion blurred */
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+        return (Ty*) geom;
+      }
+
+      __forceinline Ty* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      /*! number of geometry slots of the scene, matching or not */
+      __forceinline size_t size() const {
+        return scene->size();
+      }
+
+      /*! primitive count the scene reports for Ty::geom_type and mblur */
+      __forceinline size_t numPrimitives() const {
+        return scene->getNumPrimitives(Ty::geom_type,mblur);
+      }
+
+      /*! largest size() over all matching geometries, 0 if there is none */
+      __forceinline size_t maxPrimitivesPerGeometry()
+      {
+        size_t ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->size());
+        }
+        return ret;
+      }
+
+      /*! largest slot index holding a matching geometry, 0 if there is none */
+      __forceinline unsigned int maxGeomID()
+      {
+        unsigned int ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,(unsigned int)i);
+        }
+        return ret;
+      }
+
+      /*! largest numTimeSteps over all matching geometries, 0 if there is none */
+      __forceinline unsigned maxTimeStepsPerGeometry()
+      {
+        unsigned ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->numTimeSteps);
+        }
+        return ret;
+      }
+
+    private:
+      Scene* scene;
+      bool all;
+      };
+
+      /*! Untyped view onto the geometry slots of a scene. at(i) yields the
+       *  geometry in slot i when the slot is occupied, the geometry is enabled,
+       *  its type mask intersects typemask and its motion blur state matches
+       *  mblur; otherwise nullptr. A default constructed iterator refers to no
+       *  scene and must be assigned before use. */
+      class Iterator2
+      {
+      public:
+      /* members are initialized so that a default constructed iterator has a defined state */
+      Iterator2 () : scene(nullptr), typemask(), mblur(false) {}
+
+      Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur)
+      : scene(scene), typemask(typemask), mblur(mblur) {}
+
+      /*! returns the geometry in slot i if it passes all filters, nullptr otherwise */
+      __forceinline Geometry* at(const size_t i)
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!geom->isEnabled()) return nullptr;
+        if (!(geom->getTypeMask() & typemask)) return nullptr;
+        /* a geometry with a single time step is treated as not motion blurred */
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr;
+        return geom;
+      }
+
+      __forceinline Geometry* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      /*! number of geometry slots of the scene, matching or not */
+      __forceinline size_t size() const {
+        return scene->size();
+      }
+
+    private:
+      Scene* scene;
+      Geometry::GTypeMask typemask;
+      bool mblur;
+    };
+
+  public:
+    
+    /*! Scene construction */
+    Scene (Device* device);
+
+    /*! Scene destruction */
+    ~Scene () noexcept;
+
+  private:
+    /*! class is non-copyable */
+    Scene (const Scene& other) DELETED; // do not implement
+    Scene& operator= (const Scene& other) DELETED; // do not implement
+
+  public:
+    /* Acceleration structure selection, one function per geometry kind and
+       motion blur variant ("MB"). Scene::commit_task calls each of them when
+       the scene contains primitives of the matching kind and the acceleration
+       structures have to be (re)created. */
+    void createTriangleAccel();
+    void createTriangleMBAccel();
+    void createQuadAccel();
+    void createQuadMBAccel();
+    void createHairAccel();
+    void createHairMBAccel();
+    void createSubdivAccel();
+    void createSubdivMBAccel();
+    void createUserGeometryAccel();
+    void createUserGeometryMBAccel();
+    void createInstanceAccel();
+    void createInstanceMBAccel();
+    void createInstanceExpensiveAccel();
+    void createInstanceExpensiveMBAccel();
+    void createGridAccel();
+    void createGridMBAccel();
+
+    /*! prints statistics about the scene */
+    void printStatistics();
+
+    /*! clears the scene */
+    void clear();
+
+    /*! detaches some geometry */
+    void detachGeometry(size_t geomID);
+
+    void setBuildQuality(RTCBuildQuality quality_flags);
+    RTCBuildQuality getBuildQuality() const;
+    
+    void setSceneFlags(RTCSceneFlags scene_flags);
+    RTCSceneFlags getSceneFlags() const;
+    
+    /*! commits the scene; the definition is selected by the tasking backend at compile time */
+    void commit (bool join);
+    /*! build step run by commit: gathers geometry counts, (re)creates and builds the acceleration structures */
+    void commit_task ();
+    /*! empty hook, performs no work */
+    void build () {}
+
+    /*! marks the scene as built */
+    void updateInterface();
+
+    /* returns the number of geometry slots, i.e. the size of the geometries
+       array; empty slots are included in the count */
+    __forceinline size_t size() const { return geometries.size(); }
+    
+    /* bind geometry to the scene */
+    unsigned int bind (unsigned geomID, Ref<Geometry> geometry);
+    
+    /* returns the scene's modified flag; the flag is raised through
+       setModified and cleared again at the end of Scene::commit_task */
+    __forceinline bool isModified() const { return modified; }
+
+    /* stores f in the scene's modified flag; calling it without an argument
+       marks the scene as modified */
+    __forceinline void setModified(bool f = true) { modified = f; }
+
+    /*! Tells whether the geometry in slot geomID carries a modification counter
+     *  newer than the one recorded for that slot; an empty slot counts as
+     *  unmodified. */
+    __forceinline bool isGeometryModified(size_t geomID)
+    {
+      Ref<Geometry>& geom = geometries[geomID];
+      return geom ? (geom->getModCounter() > geometryModCounters_[geomID]) : false;
+    }
+
+  protected:
+    
+    /*! Raises the scene's modified flag if it is not raised yet and at least
+     *  one geometry slot reports a modification. */
+    __forceinline void checkIfModifiedAndSet ()
+    {
+      /* the geometries only need to be scanned while the scene itself is unmodified */
+      if (!isModified ())
+      {
+        const bool anyGeometryModified = parallel_any_of (size_t(0), geometries.size (),
+          [this](size_t geomID)->bool { return isGeometryModified(geomID); });
+
+        if (anyGeometryModified)
+          setModified ();
+      }
+    }
+    
+  public:
+
+    /* returns the raw geometry pointer stored in slot i; the index is only
+       checked by an assertion, and the result is whatever the slot holds */
+    __forceinline       Geometry* get(size_t i)       { assert(i < geometries.size()); return geometries[i].ptr; }
+    __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; }
+
+    template<typename Mesh>
+      __forceinline       Mesh* get(size_t i)       { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+      return (Mesh*)geometries[i].ptr; 
+    }
+    template<typename Mesh>
+      __forceinline const Mesh* get(size_t i) const { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type);
+      return (Mesh*)geometries[i].ptr; 
+    }
+
+    template<typename Mesh>
+    __forceinline Mesh* getSafe(size_t i) {
+      assert(i < geometries.size());
+      /* nullptr for an empty slot or when the type mask lacks Mesh::geom_type */
+      if (geometries[i] == null) return nullptr;
+      return (geometries[i]->getTypeMask() & Mesh::geom_type) ? (Mesh*) geometries[i].ptr : nullptr;
+    }
+
+    __forceinline Ref<Geometry> get_locked(size_t i)  { // returns a Ref copy of geometries[i]
+      Lock<SpinLock> lock(geometriesMutex); // presumably a scoped lock: taken before the bounds assert and the read of geometries
+      assert(i < geometries.size()); 
+      return geometries[i]; 
+    }
+
+    /* scene flag predicates, each derived from the bits of scene_flags */
+    __forceinline bool isFastAccel() const { return !(isCompactAccel() || isRobustAccel()); }
+    __forceinline bool isCompactAccel() const { return (scene_flags & RTC_SCENE_FLAG_COMPACT) != 0; }
+    __forceinline bool isRobustAccel()  const { return (scene_flags & RTC_SCENE_FLAG_ROBUST) != 0; }
+    __forceinline bool isStaticAccel()  const { return !isDynamicAccel(); }
+    __forceinline bool isDynamicAccel() const { return (scene_flags & RTC_SCENE_FLAG_DYNAMIC) != 0; }
+
+    __forceinline bool hasContextFilterFunction() const {
+      return (scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION) != 0;
+    }
+    
+    /* true if world counts at least one filter function; const so it is usable on a const Scene */
+    __forceinline bool hasGeometryFilterFunction() const { return world.numFilterFunctions != 0; }
+
+    /* true if a filter function is enabled through the scene flags or counted in world */
+    __forceinline bool hasFilterFunction() const {
+      return hasContextFilterFunction() || hasGeometryFilterFunction();
+    }
+
+    /* tests if the scene has already been built (returns the is_build flag) */
+    __forceinline bool isBuild() const { return is_build; }
+
+  public:
+    IDPool<unsigned,0xFFFFFFFE> id_pool;
+    vector<Ref<Geometry>> geometries; //!< list of all user geometries
+    vector<unsigned int> geometryModCounters_; //!< per-geomID counters that isGeometryModified() compares against Geometry::getModCounter()
+    vector<float*> vertices;
+    
+  public:
+    Device* device;
+
+    /* these are to detect if we need to recreate the acceleration structures */
+    bool flags_modified;
+    unsigned int enabled_geometry_types;
+    
+    RTCSceneFlags scene_flags;       //!< bit flags decoded by isCompactAccel(), isRobustAccel(), isDynamicAccel(), hasContextFilterFunction()
+    RTCBuildQuality quality_flags;
+    MutexSys buildMutex;
+    SpinLock geometriesMutex;        //!< taken by get_locked() while it reads geometries
+    bool is_build;                   //!< value returned by isBuild()
+  private:
+    bool modified;                   //!< true if scene got modified
+
+  public:
+    
+    /*! global lock step task scheduler; which member exists depends on the TASKING_* macro that is defined */
+#if defined(TASKING_INTERNAL) 
+    MutexSys schedulerMutex;
+    Ref<TaskScheduler> scheduler;
+#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    tbb::isolated_task_group group;
+#elif defined(TASKING_TBB)
+    tbb::task_group group;
+#elif defined(TASKING_PPL)
+    concurrency::task_group group;
+#endif
+    
+  public:
+    /*! adapter that forwards progress increments to Scene::progressMonitor of the given scene */
+    struct BuildProgressMonitorInterface : public BuildProgressMonitor {
+      BuildProgressMonitorInterface(Scene* owner) : scene(owner) {}
+      void operator() (size_t dn) const { scene->progressMonitor(static_cast<double>(dn)); }
+    private:
+      Scene* scene;
+    };
+    BuildProgressMonitorInterface progressInterface;   //!< adapter whose operator() forwards to progressMonitor()
+    RTCProgressMonitorFunction progress_monitor_function;
+    void* progress_monitor_ptr;
+    std::atomic<size_t> progress_monitor_counter;
+    void progressMonitor(double nprims);               //!< called by BuildProgressMonitorInterface with its size_t increment converted to double
+    void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr);
+
+  private:
+    GeometryCounts world;               //!< counts for geometry (read by numPrimitives(), getNumPrimitives() and hasGeometryFilterFunction())
+
+  public:
+
+    /* total number of primitives in the scene,
+       taken from the geometry counts (world.size()) */
+    __forceinline size_t numPrimitives() const { return world.size(); }
+
+    __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const
+    {
+      /* sums the world counters of every geometry type selected in 'mask'; 'mblur' switches to the numMB* counters */
+      size_t total = 0;
+      /* triangles */
+      if (mask & Geometry::MTY_TRIANGLE_MESH) total += mblur ? world.numMBTriangles : world.numTriangles;
+
+      /* quads */
+      if (mask & Geometry::MTY_QUAD_MESH) total += mblur ? world.numMBQuads : world.numQuads;
+
+      /* line segments */
+      if (mask & Geometry::MTY_CURVE2) total += mblur ? world.numMBLineSegments : world.numLineSegments;
+
+      /* bezier curves */
+      if (mask & Geometry::MTY_CURVE4) total += mblur ? world.numMBBezierCurves : world.numBezierCurves;
+
+      /* points */
+      if (mask & Geometry::MTY_POINTS) total += mblur ? world.numMBPoints : world.numPoints;
+
+      /* subdivision patches */
+      if (mask & Geometry::MTY_SUBDIV_MESH) total += mblur ? world.numMBSubdivPatches : world.numSubdivPatches;
+
+      /* user geometries */
+      if (mask & Geometry::MTY_USER_GEOMETRY) total += mblur ? world.numMBUserGeometries : world.numUserGeometries;
+
+      /* cheap instances */
+      if (mask & Geometry::MTY_INSTANCE_CHEAP) total += mblur ? world.numMBInstancesCheap : world.numInstancesCheap;
+
+      /* expensive instances */
+      if (mask & Geometry::MTY_INSTANCE_EXPENSIVE) total += mblur ? world.numMBInstancesExpensive : world.numInstancesExpensive;
+
+      /* grids */
+      if (mask & Geometry::MTY_GRID_MESH) total += mblur ? world.numMBGrids : world.numGrids;
+
+      return total;
+    }
+    
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned getNumTimeSteps()
+    {
+      if (mblur) {
+        Scene::Iterator<Mesh,mblur> meshes(this);
+        return meshes.maxTimeStepsPerGeometry();
+      }
+      return 1; // without motion blur the iterator is not consulted
+    }
+
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned int getMaxGeomID()
+    {
+      /* delegates to the scene iterator for this mesh type / motion blur combination */
+      return Scene::Iterator<Mesh,mblur>(this).maxGeomID();
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_curves.h b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
new file mode 100644
index 0000000000..2649ab0e3e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
@@ -0,0 +1,341 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! curve geometry (type mask MTY_CURVE4): curve indices plus per-timestep vertex, normal, tangent and normal derivative buffers */
+  struct CurveGeometry : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4;
+
+  public:
+    
+    /*! curve geometry construction */
+    CurveGeometry (Device* device, Geometry::GType gtype);
+    
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+    
+    /*! returns the number of vertices (size of the vertex buffer of time step 0) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th entry of the curve index buffer (getNormalOrientedCurve uses it as first vertex index) */
+    __forceinline const unsigned int& curve(size_t i) const {
+      return curves[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th tangent of the first time step */
+    __forceinline Vec3ff tangent(size_t i) const {
+      return tangents0[i];
+    }
+
+    /*! returns i'th normal derivative of the first time step */
+    __forceinline Vec3fa dnormal(size_t i) const {
+      return dnormals0[i];
+    }
+
+    /*! returns i'th radius of the first time step (the w component of the vertex) */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th tangent of itime'th timestep */
+    __forceinline Vec3ff tangent(size_t i, size_t itime) const {
+      return tangents[itime][i];
+    }
+
+    /*! returns i'th normal derivative of itime'th timestep */
+    __forceinline Vec3fa dnormal(size_t i, size_t itime) const {
+      return dnormals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep (the w component of the vertex) */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! gathers the four consecutive vertices i..i+3 of the first time step */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+    }
+
+    /*! gathers the four consecutive vertices i..i+3 of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+    }
+
+    /*! gathers vertices i..i+3 and normals i..i+3 of the first time step */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+      n0 = normal(i+0);
+      n1 = normal(i+1);
+      n2 = normal(i+2);
+      n3 = normal(i+3);
+    }
+
+    /*! gathers vertices i..i+3 and normals i..i+3 of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+      n0 = normal(i+0,itime);
+      n1 = normal(i+1,itime);
+      n2 = normal(i+2,itime);
+      n3 = normal(i+3,itime);
+    }
+
+    /*! L1-prefetches first-time-step vertex data at the pointer of vertex i and at offset +64 from that pointer (no timestep parameter) */
+    __forceinline void prefetchL1_vertices(size_t i) const
+    {
+      prefetchL1(vertices0.getPtr(i)+0);
+      prefetchL1(vertices0.getPtr(i)+64);
+    }
+
+    /*! L2-prefetches first-time-step vertex data at the pointer of vertex i and at offset +64 from that pointer (no timestep parameter) */
+    __forceinline void prefetchL2_vertices(size_t i) const
+    {
+      prefetchL2(vertices0.getPtr(i)+0);
+      prefetchL2(vertices0.getPtr(i)+64);
+    }  
+
+    /*! loads vertices i..i+3 for the specified time: blends time steps itime and itime+1 with weights (1-ftime) and ftime from timeSegment() */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+
+      const float t0 = 1.0f - ftime;
+      const float t1 = ftime;
+      Vec3ff a0,a1,a2,a3;
+      gather(a0,a1,a2,a3,i,itime);
+      Vec3ff b0,b1,b2,b3;
+      gather(b0,b1,b2,b3,i,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+      p2 = madd(Vec3ff(t0),a2,t1*b2);
+      p3 = madd(Vec3ff(t0),a3,t1*b3);
+    }
+
+    /*! loads vertices and normals i..i+3 for the specified time: blends time steps itime and itime+1 with weights (1-ftime) and ftime */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+
+      const float t0 = 1.0f - ftime;
+      const float t1 = ftime;
+      Vec3ff a0,a1,a2,a3; Vec3fa an0,an1,an2,an3;
+      gather(a0,a1,a2,a3,an0,an1,an2,an3,i,itime);
+      Vec3ff b0,b1,b2,b3; Vec3fa bn0,bn1,bn2,bn3;
+      gather(b0,b1,b2,b3,bn0,bn1,bn2,bn3,i,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+      p2 = madd(Vec3ff(t0),a2,t1*b2);
+      p3 = madd(Vec3ff(t0),a3,t1*b3);
+      n0 = madd(Vec3ff(t0),an0,t1*bn0);
+      n1 = madd(Vec3ff(t0),an1,t1*bn1);
+      n2 = madd(Vec3ff(t0),an2,t1*bn2);
+      n3 = madd(Vec3ff(t0),an3,t1*bn3);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    { // gathers vertices and normals at curve(primID) for time step itime, passes the center curve through enlargeRadiusToMinWidth, then builds the surface
+      Vec3ff v0,v1,v2,v3; Vec3fa n0,n1,n2,n3;
+      unsigned int vertexID = curve(primID);
+      gather(v0,v1,v2,v3,n0,n1,n2,n3,vertexID,itime);
+      SourceCurve3ff ccurve(v0,v1,v2,v3);
+      SourceCurve3fa ncurve(n0,n1,n2,n3);
+      ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve);
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    { // builds the surfaces of time steps itime and itime+1 and combines them with clerp by ftime from timeSegment()
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+0);
+      const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,itime+1);
+      return clerp(curve0,curve1,ftime);
+    }
+
+    /*! gathers the hermite segment at vertex i of the first time step: end points i, i+1 and their tangents */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const
+    {
+      p0 = vertex (i+0);
+      p1 = vertex (i+1);
+      t0 = tangent(i+0);
+      t1 = tangent(i+1);
+    }
+
+    /*! gathers the hermite segment at vertex i of itime'th timestep: end points i, i+1 and their tangents */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const
+    {
+      p0 = vertex (i+0,itime);
+      p1 = vertex (i+1,itime);
+      t0 = tangent(i+0,itime);
+      t1 = tangent(i+1,itime);
+    }
+
+    /*! loads hermite end points and tangents for the specified time: blends time steps itime and itime+1 with weights (1-ftime) and ftime */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const float f0 = 1.0f - ftime, f1 = ftime;
+      Vec3ff ap0,at0,ap1,at1;
+      gather_hermite(ap0,at0,ap1,at1,i,itime);
+      Vec3ff bp0,bt0,bp1,bt1;
+      gather_hermite(bp0,bt0,bp1,bt1,i,itime+1);
+      p0 = madd(Vec3ff(f0),ap0,f1*bp0);
+      t0 = madd(Vec3ff(f0),at0,f1*bt0);
+      p1 = madd(Vec3ff(f0),ap1,f1*bp1);
+      t1 = madd(Vec3ff(f0),at1,f1*bt1);
+    }
+
+    /*! gathers the hermite segment at vertex i of the first time step: end points, tangents, normals and normal derivatives */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const
+    {
+      p0 = vertex (i+0);
+      p1 = vertex (i+1);
+      t0 = tangent(i+0);
+      t1 = tangent(i+1);
+      n0 = normal(i+0);
+      n1 = normal(i+1);
+      dn0 = dnormal(i+0);
+      dn1 = dnormal(i+1);
+    }
+
+    /*! gathers the hermite segment at vertex i of itime'th timestep: end points, tangents, normals and normal derivatives */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const
+    {
+      p0 = vertex (i+0,itime);
+      p1 = vertex (i+1,itime);
+      t0 = tangent(i+0,itime);
+      t1 = tangent(i+1,itime);
+      n0 = normal(i+0,itime);
+      n1 = normal(i+1,itime);
+      dn0 = dnormal(i+0,itime);
+      dn1 = dnormal(i+1,itime);
+    }
+
+    /*! loads the hermite segment for the specified time (blend of itime and itime+1). NOTE(review): t0/t1 are Vec3fa here but Vec3ff in the other overloads - confirm intended */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const float f0 = 1.0f - ftime, f1 = ftime;
+      Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1;
+      gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime);
+      Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1;
+      gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1);
+      p0 = madd(Vec3ff(f0),ap0,f1*bp0);
+      t0 = madd(Vec3ff(f0),at0,f1*bt0);
+      n0 = madd(Vec3ff(f0),an0,f1*bn0);
+      dn0= madd(Vec3ff(f0),adn0,f1*bdn0);
+      p1 = madd(Vec3ff(f0),ap1,f1*bp1);
+      t1 = madd(Vec3ff(f0),at1,f1*bt1);
+      n1 = madd(Vec3ff(f0),an1,f1*bn1);
+      dn1= madd(Vec3ff(f0),adn1,f1*bdn1);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+      __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    { // hermite counterpart of getNormalOrientedCurve: gathers points, tangents, normals and normal derivatives at curve(primID) for time step itime
+      Vec3ff v0,t0,v1,t1; Vec3fa n0,dn0,n1,dn1;
+      unsigned int vertexID = curve(primID);
+      gather_hermite(v0,t0,n0,dn0,v1,t1,n1,dn1,vertexID,itime);
+
+      SourceCurve3ff ccurve(v0,t0,v1,t1);
+      SourceCurve3fa ncurve(n0,dn0,n1,dn1);
+      ccurve = enlargeRadiusToMinWidth(context,this,ray_org,ccurve);
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(ccurve,ncurve);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    { // builds the hermite surfaces of time steps itime and itime+1 and combines them with clerp by ftime from timeSegment()
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+      const TensorLinearCubicBezierSurface3fa curve0 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+0);
+      const TensorLinearCubicBezierSurface3fa curve1 = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,itime+1);
+      return clerp(curve0,curve1,ftime);
+    }
+
+  private:
+    void resizeBuffers(unsigned int numSteps);
+
+  public:
+    BufferView<unsigned int> curves;        //!< array of curve indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<Vec3ff> tangents0;           //!< fast access to first tangent buffer
+    BufferView<Vec3fa> dnormals0;           //!< fast access to first normal derivative buffer
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<Vec3ff>> tangents;    //!< tangent array for each timestep
+    vector<BufferView<Vec3fa>> dnormals;    //!< normal derivative array for each timestep
+    BufferView<char> flags;                 //!< start, end flag per segment
+    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    int tessellationRate;                   //!< tessellation rate for flat curve
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
+  };
+  
+  DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
new file mode 100644
index 0000000000..c08658466a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Grid mesh geometry: grid descriptors (Grid) indexing into per-timestep vertex buffers */
+  struct GridMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH;
+
+    /*! grid descriptor: first vertex index, vertex offset between rows and resolution in x/y */
+    struct Grid 
+    {
+      unsigned int startVtxID;
+      unsigned int lineVtxOffset;
+      unsigned short resX,resY;
+
+      /* border flag of the 3x3 vertex pattern in x: bit 15 is set when x + 2 >= resX */
+      __forceinline unsigned int get3x3FlagsX(const unsigned int x) const
+      {
+        return (x + 2 >= (unsigned int)resX) ? (1<<15) : 0;
+      }
+
+      /* border flag of the 3x3 vertex pattern in y: bit 15 is set when y + 2 >= resY */
+      __forceinline unsigned int get3x3FlagsY(const unsigned int y) const
+      {
+        return (y + 2 >= (unsigned int)resY) ? (1<<15) : 0;
+      }
+
+      /*! outputs grid structure */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Grid& t) {
+        return cout << "Grid { startVtxID " << t.startVtxID << ", lineVtxOffset " << t.lineVtxOffset << ", resX " << t.resX << ", resY " << t.resY << " }";
+      }
+    };
+
+  public:
+
+    /*! grid mesh construction */
+    GridMesh (Device* device); 
+
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+    __forceinline unsigned int getNumSubGrids(const size_t gridID) const // (resX/2)*(resY/2), but at least 1; const because it only reads
+    {
+      const Grid &g = grid(gridID);
+      return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1));
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+
+    /*! returns number of vertices (size of the vertex buffer of time step 0) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th grid*/
+    __forceinline const Grid& grid(size_t i) const {
+      return grids[i];
+    }
+
+    /*! returns i'th vertex of the first time step  */
+    __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load
+      return vertices0[i];
+    }
+
+    /*! returns a pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns the vertex buffer index of grid vertex (x,y): startVtxID + x + y * lineVtxOffset */
+    __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const {
+      assert(x < (size_t)g.resX);
+      assert(y < (size_t)g.resY);
+      return g.startVtxID + x + y * g.lineVtxOffset;
+    }
+    
+    /*! returns grid vertex (x,y) of the first timestep */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index);
+    }
+
+    /*! returns grid vertex (x,y) of the itime'th timestep */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index,itime);
+    }
+
+    /*! bounds over all time steps of the (up to) 3x3 vertex block at (sx,sy); returns false and leaves bbox untouched if a vertex is invalid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
+    {
+      BBox3fa b(empty);
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+          for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+          {
+            const Vec3fa v = grid_vertex(g,x,y,t);
+            if (unlikely(!isvalid(v))) return false;
+            b.extend(v);
+          }
+      }
+
+      bbox = b;
+      return true;
+    }
+
+    /*! bounds at time step itime of the (up to) 3x3 vertex block at (sx,sy); returns false and leaves bbox untouched if a vertex is invalid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const
+    {
+      assert(itime < numTimeSteps);
+      BBox3fa b0(empty);
+      for (size_t y=sy;y<min(sy+3,(size_t)g.resY);y++)
+        for (size_t x=sx;x<min(sx+3,(size_t)g.resX);x++)
+        {
+          const Vec3fa v = grid_vertex(g,x,y,itime);
+          if (unlikely(!isvalid(v))) return false;
+          b0.extend(v);
+        }
+
+      /* report the bounds of time step itime */
+      bbox = b0;
+      return true;
+    }
+
+    __forceinline bool valid(size_t gridID, size_t itime=0) const { // single time step variant of the range check below
+      return valid(gridID, make_range(itime, itime));
+    }
+
+    /*! checks gridID, the grid's first/last vertex index (against vertices0) and all grid vertices for every time step in [begin,end] of itime_range */
+    __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const
+    {
+      if (unlikely(gridID >= grids.size())) return false;
+      const Grid &g = grid(gridID);
+      if (unlikely(g.startVtxID + 0                                     >= vertices0.size())) return false;
+      if (unlikely(g.startVtxID + (g.resY-1)*g.lineVtxOffset + g.resX-1 >= vertices0.size())) return false;
+
+      for (size_t y=0;y<g.resY;y++)
+        for (size_t x=0;x<g.resX;x++)
+          for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+            if (!isvalid(grid_vertex(g,x,y,itime))) return false;
+      return true;
+    }
+
+    /*! bounds of the (up to) 3x3 vertex block at (sx,sy) for time step itime; stays empty if a vertex is invalid */
+    __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const
+    {
+      BBox3fa box(empty);
+      buildBounds(g,sx,sy,itime,box);
+      return box;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const {
+      BBox3fa bounds0(empty), bounds1(empty); // start empty (like bounds()): buildBounds() writes nothing when it rejects a vertex
+      buildBounds(g,sx,sy,itime+0,bounds0);
+      buildBounds(g,sx,sy,itime+1,bounds1);
+      return LBBox3fa(bounds0,bounds1);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+  public:
+    BufferView<Grid> grids;      //!< array of grids
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    /*! ISA-namespace grid mesh type; its only member is a constructor forwarding to GridMesh */
+    struct GridMeshISA : public GridMesh
+    {
+      GridMeshISA (Device* dev) : GridMesh(dev) {}
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_instance.h b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
new file mode 100644
index 0000000000..7ff82a4fb8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
@@ -0,0 +1,272 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "accel.h"
+
+namespace embree
+{
+  struct MotionDerivativeCoefficients;
+
+  /*! Instanced acceleration structure */
+  struct Instance : public Geometry
+  {
+    ALIGNED_STRUCT_(16);
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE;
+
+  public:
+    Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1);
+    ~Instance();
+
+  private:
+    Instance (const Instance& other) DELETED; // do not implement
+    Instance& operator= (const Instance& other) DELETED; // do not implement
+
+  private:
+    LBBox3fa nonlinearBounds(const BBox1f& time_range_in,
+                             const BBox1f& geom_time_range,
+                             float geom_time_segments) const;
+
+    BBox3fa boundSegment(size_t itime,
+      BBox3fa const& obbox0, BBox3fa const& obbox1,
+      BBox3fa const& bbox0, BBox3fa const& bbox1,
+      float t_min, float t_max) const;
+
+    /* calculates the interpolated bounds: the transformations and object bounds of time steps itime0 and itime1 are blended with factor f (slerp of the transformations for quaternion instances, lerp otherwise) */
+    __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return xfmBounds(slerp(local2world[itime0], local2world[itime1], f),
+                         lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+      return xfmBounds(lerp(local2world[itime0], local2world[itime1], f),
+                        lerp(getObjectBounds(itime0), getObjectBounds(itime1), f));
+    }
+
+  public:
+    virtual void setNumTimeSteps (unsigned int numTimeSteps) override;
+    virtual void setInstancedScene(const Ref<Scene>& scene) override;
+    virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override;
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override;
+    virtual AffineSpace3fa getTransform(float time) override;
+    virtual void setMask (unsigned mask) override;
+    virtual void build() {} // empty body: no work is done here
+    virtual void addElementsToCount (GeometryCounts & counts) const override;
+    virtual void commit() override;
+
+  public:
+
+     /*! calculates the bounds of the instance: the object's bounds transformed by the first transformation local2world[0] */
+    __forceinline BBox3fa bounds(size_t i) const {
+      assert(i == 0);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion decomposition is converted to an affine space first
+        return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds());
+      return xfmBounds(local2world[0],object->bounds.bounds());
+    }
+
+    /*! gets the bounds of the instanced acceleration structure for time step itime (queried via object->getBounds(timeStep(itime))) */
+    __forceinline BBox3fa getObjectBounds(size_t itime) const {
+      return object->getBounds(timeStep(itime));
+    }
+
+     /*! calculates the bounds of the instance for time step itime: object bounds of that step transformed by local2world[itime] */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const {
+      assert(i == 0);
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion decomposition is converted to an affine space first
+        return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),getObjectBounds(itime));
+      return xfmBounds(local2world[itime],getObjectBounds(itime));
+    }
+
+    /*! linear bounds of the instance (only primitive 0 is accepted) over the time range dt */
+    __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const {
+      assert(i == 0);
+      // delegate to nonlinearBounds with dt and this geometry's time_range / fnumTimeSegments
+      return nonlinearBounds(dt, time_range, fnumTimeSegments);
+    }
+
+    /*! computes the instance bounds, stores them in *bbox when bbox is non-null, and returns whether they are valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      assert(i==0);
+      const BBox3fa instanceBounds = bounds(i);
+      if (bbox != nullptr) *bbox = instanceBounds; // written even if the bounds turn out invalid
+      return isvalid(instanceBounds);
+    }
+
+     /*! calculates the build bounds of the i'th item for itime; bbox receives bounds() of the linear bounds, the return value is their validity */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      assert(i==0);
+      const LBBox3fa bounds = linearBounds(i,itime); // NOTE(review): itime is a size_t, but the linearBounds overload visible in this struct takes a BBox1f time range - confirm the conversion is intended
+      bbox = bounds.bounds ();
+      return isvalid(bounds);
+    }
+
+    /* gets version info of topology (numPrimitives is used as the version) */
+    unsigned int getTopologyVersion() const {
+      return numPrimitives;
+    }
+  
+    /* returns true if topology changed, i.e. numPrimitives differs from otherVersion */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return numPrimitives != otherVersion;
+    }
+
+    /*! returns true if the instance bounds are valid at every time step from itime_range.begin() to itime_range.end() inclusive */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      assert(i == 0);
+      bool allValid = true; // becomes false at the first invalid time step, which ends the loop
+      for (size_t step = itime_range.begin(); allValid && step <= itime_range.end(); step++)
+        allValid = isvalid(bounds(i,step));
+      return allValid;
+    }
+
+    __forceinline AffineSpace3fa getLocal2World() const // local-to-world transformation stored at index 0
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return quaternionDecompositionToAffineSpace(local2world[0]); // stored as quaternion decomposition: convert first
+      return local2world[0];
+    }
+
+    __forceinline AffineSpace3fa getLocal2World(float t) const // local-to-world transformation interpolated for time t
+    {
+      float ftime; const unsigned int itime = timeSegment(t, ftime); // timeSegment yields the index itime and is handed ftime to fill
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return slerp(local2world[itime+0],local2world[itime+1],ftime); // quaternion instances use slerp
+      return lerp(local2world[itime+0],local2world[itime+1],ftime);
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local() const { // returns the stored world2local0 member (world-to-local for timestep 0)
+      return world2local0;
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local(float t) const { // rcp() of the local-to-world transformation interpolated for time t
+      return rcp(getLocal2World(t));
+    }
+
+    template<int K> // K-wide variant: picks the slerp or lerp helper depending on the instance subtype
+    __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return getWorld2LocalSlerp(valid, t);
+      return getWorld2LocalLerp(valid, t);
+    }
+
+    private:
+
+    template<int K> // K-wide world-to-local transformations for times t, quaternion (slerp) path
+    __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment(t, ftime); // per-lane segment index; ftime is handed to timeSegment to fill
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // lane index derived from the valid mask
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: presumably all valid lanes share segment itime
+        return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]),
+                         AffineSpace3vff<K>(local2world[itime+1]),
+                         ftime));
+      }
+      else {
+        AffineSpace3vff<K> space0,space1;
+        vbool<K> valid1 = valid;
+        while (any(valid1)) { // presumably one distinct itime value is handled per iteration - confirm against next_unique
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2);
+          space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0); // lanes selected by valid2 take the new value
+          space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1);
+        }
+        return rcp(slerp(space0, space1, ftime));
+      }
+    }
+
+    template<int K> // K-wide world-to-local transformations for times t, linear (lerp) path
+    __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      vfloat<K> ftime;
+      const vint<K> itime_k = timeSegment(t, ftime); // per-lane segment index; ftime is handed to timeSegment to fill
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // lane index derived from the valid mask
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: presumably all valid lanes share segment itime
+        return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]),
+                        AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]),
+                        ftime));
+      } else {
+        AffineSpace3vf<K> space0,space1;
+        vbool<K> valid1 = valid;
+        while (any(valid1)) { // presumably one distinct itime value is handled per iteration - confirm against next_unique
+          vbool<K> valid2;
+          const int itime = next_unique(valid1, itime_k, valid2);
+          space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0); // lanes selected by valid2 take the new value
+          space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1);
+        }
+        return rcp(lerp(space0, space1, ftime));
+      }
+    }
+
+  public:
+    Accel* object;                 //!< pointer to instanced acceleration structure
+    AffineSpace3ff* local2world;   //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition)
+    AffineSpace3fa world2local0;   //!< transformation from world space to local space for timestep 0
+  };
+
+  namespace isa
+  {
+    struct InstanceISA : public Instance
+    {
+      InstanceISA (Device* device) // forwards device to Instance; its other constructor arguments keep their defaults
+        : Instance(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // emits at most one PrimRef (primitive 0) into prims[k] and returns the accumulated PrimInfo
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfo pinfo(empty);
+        BBox3fa b = empty;
+        if (!buildBounds(0,&b)) return pinfo; // invalid bounds: return the empty info, prims untouched
+        // const BBox3fa b = bounds(0);
+        // if (!isvalid(b)) return pinfo;
+
+        const PrimRef prim(b,geomID,unsigned(0));
+        pinfo.add_center2(prim);
+        prims[k++] = prim;
+        return pinfo;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // NOTE(review): itime is not used by the active code below; bounds come from buildBounds(0,&b) - confirm this is intended
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfo pinfo(empty);
+        BBox3fa b = empty;
+        if (!buildBounds(0,&b)) return pinfo; // invalid bounds: return the empty info, prims untouched
+        // if (!valid(0,range<size_t>(itime))) return pinfo;
+        // const PrimRef prim(linearBounds(0,itime).bounds(),geomID,unsigned(0));
+        const PrimRef prim(b,geomID,unsigned(0));
+        pinfo.add_center2(prim);
+        prims[k++] = prim;
+        return pinfo;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // emits at most one PrimRefMB (primitive 0) for the time range t0t1 into prims[k]
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+
+        PrimInfoMB pinfo(empty);
+        if (!valid(0, timeSegmentRange(t0t1))) return pinfo; // skipped when invalid within the time steps covering t0t1
+        const PrimRefMB prim(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0));
+        pinfo.add_primref(prim);
+        prims[k++] = prim;
+        return pinfo;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
new file mode 100644
index 0000000000..c0f9ee8f77
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
@@ -0,0 +1,307 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! represents an array of line segments */
+  struct LineSegments : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2;
+
+  public:
+
+    /*! line segments construction */
+    LineSegments (Device* device, Geometry::GType gtype);
+
+  public:
+    void setMask (unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify ();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns the number of vertices (size of the vertex buffer of the first time step) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th segment entry; bounds() uses it as the index of the segment's first vertex (the second is index+1) */
+    __forceinline const unsigned int& segment(size_t i) const {
+      return segments[i];
+    }
+
+    /*! returns true if the RTC_CURVE_FLAG_NEIGHBOR_LEFT bit is set in the flags of the i'th segment */
+    __forceinline bool segmentLeftExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0;
+    }
+
+    /*! returns true if the RTC_CURVE_FLAG_NEIGHBOR_RIGHT bit is set in the flags of the i'th segment */
+    __forceinline bool segmentRightExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0;
+    }
+
+     /*! returns i'th vertex of the first time step (read from vertices0; w holds the radius, see radius()) */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns a pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step (read from normals0) */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step (the w component of the i'th vertex) */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep (read from vertices[itime]) */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep (read from normals[itime]) */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep (the w component of the i'th vertex of that timestep) */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! calculates the bounding box of a segment with end vertices v0 and v1, enlarged by maxRadiusScale times the larger of the two radii (w) */
+    __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const
+    {
+      const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1));
+      return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w)));
+    }
+
+    /*! bounding box of the i'th line segment, using the vertices of the first time step */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const unsigned int first = segment(i); // index of the segment's first vertex
+      const Vec3ff start = vertex(first+0);
+      const Vec3ff end   = vertex(first+1);  // the second vertex is stored right after the first
+      return bounds(start,end);
+    }
+
+    /*! bounding box of the i'th line segment, using the vertices of time step itime */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const unsigned int first = segment(i); // index of the segment's first vertex
+      const Vec3ff start = vertex(first+0,itime);
+      const Vec3ff end   = vertex(first+1,itime);  // the second vertex is stored right after the first
+      return bounds(start,end);
+    }
+
+    /*! calculates bounding box of i'th line segment (first time step) after transforming its vertices with space */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const unsigned int index = segment(i);
+      const Vec3ff v0 = vertex(index+0);
+      const Vec3ff v1 = vertex(index+1);
+      const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); // position is transformed, the radius (w) is carried over unchanged
+      const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w);
+      return bounds(w0,w1);
+    }
+
+    /*! calculates bounding box of i'th line segment for the itime'th time step after transforming its vertices with space */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const unsigned int index = segment(i);
+      const Vec3ff v0 = vertex(index+0,itime);
+      const Vec3ff v1 = vertex(index+1,itime);
+      const Vec3ff w0(xfmVector(space,(Vec3fa)v0),v0.w); // position is transformed, the radius (w) is carried over unchanged
+      const Vec3ff w1(xfmVector(space,(Vec3fa)v1),v1.w);
+      return bounds(w0,w1);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep (forwards the single-step range [itime,itime] to the range overload) */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range (end inclusive): both vertices must exist, pass isvalid4 and have a non-negative radius (w) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int index = segment(i);
+      // do not form index+1 for the range check: it wraps to 0 for index == 0xFFFFFFFF and would let an out-of-range index through
+      if (numVertices() < 2 || index >= numVertices()-1) return false;
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false;
+        const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false;
+        if (min(v0.w,v1.w) < 0.0f) return false;
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! writes the bounds of the i'th segment to *bbox and returns true; returns false and leaves *bbox untouched if the segment is invalid at time step 0 */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      const bool usable = valid(i,0);
+      if (usable) *bbox = bounds(i);
+      return usable;
+    }
+
+    /*! writes the bounds of the i'th segment at time step itime to bbox if it is valid at time steps itime and itime+1; returns that validity */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const bool usable = valid(i,itime+0) && valid(i,itime+1); // both ends of the time segment are checked
+      if (usable) bbox = bounds(i,itime);  // use bounds of first time step in builder
+      return usable;
+    }
+
+    /*! calculates the linear bounds of primitive primID for the time range dt, built from the per-time-step bounds(primID,itime) */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of primitive primID in the given space for the time range dt, built from bounds(space,primID,itime) */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! stores the linear bounds of the i'th primitive for time_range in bbox if the primitive is valid over timeSegmentRange(time_range); returns that validity */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      const bool usable = valid(i, timeSegmentRange(time_range));
+      if (usable) bbox = linearBounds(i, time_range); // bbox is left untouched for invalid primitives
+      return usable;
+    }
+
+    /*! get fast access to first vertex buffer: returns vertices0.getPtr() cast to float* */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+    BufferView<unsigned int> segments;      //!< array of line segment indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<char> flags;                 //!< start, end flag per segment
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    int tessellationRate;                   //!< tessellation rate for bezier curve
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
+  };
+
+  namespace isa
+  {
+    struct LineSegmentsISA : public LineSegments
+    {
+      LineSegmentsISA (Device* device, Geometry::GType gtype) // forwards both arguments to the LineSegments constructor
+        : LineSegments(device,gtype) {}
+
+      Vec3fa computeDirection(unsigned int primID) const
+      { // direction of segment primID at the first time step: second vertex minus first vertex
+        const unsigned first = segment(primID);
+        const Vec3fa start = vertex(first+0);
+        const Vec3fa end   = vertex(first+1);
+        return end-start;
+      }
+
+      Vec3fa computeDirection(unsigned int primID, size_t time) const
+      { // direction of segment primID at time step time: second vertex minus first vertex
+        const unsigned first = segment(primID);
+        const Vec3fa start = vertex(first+0,time);
+        const Vec3fa end   = vertex(first+1,time);
+        return end-start;
+      }
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // appends a PrimRef for every valid segment of r to prims (starting at slot k) and returns the accumulated info
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++)
+        {
+          BBox3fa box = empty;
+          if (buildBounds(primID,&box)) { // segments failing buildBounds are skipped
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // like createPrimRefArray, but the bounds are built for time segment itime
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++)
+        {
+          BBox3fa box = empty;
+          if (buildBounds(primID,itime,box)) { // segments failing buildBounds are skipped
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // appends a PrimRefMB with linear bounds over t0t1 for every valid segment of r, starting at prims[k]
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++)
+        {
+          if (valid(primID, timeSegmentRange(t0t1))) { // invalid segments are skipped
+            const PrimRefMB ref(linearBounds(primID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+            info.add_primref(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      BBox3fa vbounds(size_t i) const { // non-inline wrapper around bounds(i)
+        return bounds(i);
+      }
+      
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const { // wrapper around bounds(space,i)
+        return bounds(space,i);
+      }
+
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const { // wrapper around linearBounds(primID,time_range)
+        return linearBounds(primID,time_range);
+      }
+      
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const { // wrapper around linearBounds(space,primID,time_range)
+        return linearBounds(space,primID,time_range);
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_points.h b/thirdparty/embree-aarch64/kernels/common/scene_points.h
new file mode 100644
index 0000000000..1d39ed07ba
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_points.h
@@ -0,0 +1,282 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "buffer.h"
+#include "default.h"
+#include "geometry.h"
+
+namespace embree
+{
+  /*! represents an array of points */
+  struct Points : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS;
+
+   public:
+    /*! points construction */
+    Points(Device* device, Geometry::GType gtype);
+
+   public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps(unsigned int numTimeSteps);
+    void setVertexAttributeCount(unsigned int N);
+    void setBuffer(RTCBufferType type,
+                   unsigned int slot,
+                   RTCFormat format,
+                   const Ref<Buffer>& buffer,
+                   size_t offset,
+                   size_t stride,
+                   unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+   public:
+    /*! returns the number of vertices (size of the vertex buffer of the first time step) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns i'th vertex of the first time step (read from vertices0; w holds the radius, see radius()) */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns a pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step (read from normals0) */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step (the w component of the i'th vertex) */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep (read from vertices[itime]) */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep (read from normals[itime]) */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep (the w component of the i'th vertex of that timestep) */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! calculates the bounding box of a point v0: its position enlarged by maxRadiusScale times its radius (w) */
+    __forceinline BBox3fa bounds(const Vec3ff& v0) const {
+      return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w));
+    }
+
+    /*! calculates bounding box of the i'th point at the first time step */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of the i'th point for the itime'th time step */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of the i'th point (first time step) after transforming its position with space */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); // position is transformed, the radius (w) is carried over unchanged
+      return bounds(w0);
+    }
+
+    /*! calculates bounding box of the i'th point for the itime'th time step after transforming its position with space */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w); // position is transformed, the radius (w) is carried over unchanged
+      return bounds(w0);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep (forwards the single-step range [itime,itime] to the range overload) */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! returns true if point i is valid at every time step from itime_range.begin() to itime_range.end() inclusive */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int pointID = (unsigned int)i;
+      // the point must exist in the vertex buffer of the first time step
+      if (pointID >= numVertices()) return false;
+
+      for (size_t step = itime_range.begin(); step <= itime_range.end(); step++) {
+        const Vec3ff p = vertex(pointID + 0, step);
+        // a vertex is rejected if it fails isvalid4
+        // or if its radius (w component) is negative
+        if (unlikely(!isvalid4(p)) || p.w < 0.0f)
+          return false;
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1));
+    }
+
+    /*! writes the bounds of the i'th point to *bbox and returns true; returns false and leaves *bbox untouched if the point is invalid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      // validity is tested at time step 0 only
+      const bool usable = valid(i, 0);
+      if (usable) *bbox = bounds(i);
+      return usable;
+    }
+
+    /*! writes the bounds of the i'th point at time step itime to bbox if it is valid at time steps itime and itime+1; returns that validity */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      // both ends of the time segment are checked
+      const bool usable = valid(i, itime + 0) && valid(i, itime + 1);
+      if (usable) bbox = bounds(i, itime);  // use bounds of first time step in builder
+      return usable;
+    }
+
+    /*! calculates the linear bounds of primitive primID for the time range dt, built from the per-time-step bounds(primID,itime) */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of primitive primID in the given space for the time range dt, built from bounds(space,primID,itime) */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! stores the linear bounds of the i'th primitive for time_range in bbox if the primitive is valid over timeSegmentRange(time_range); returns that validity */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      const bool usable = valid(i, timeSegmentRange(time_range));
+      if (usable) bbox = linearBounds(i, time_range); // bbox is left untouched for invalid primitives
+      return usable;
+    }
+
+    /*! get fast access to first vertex buffer: returns vertices0.getPtr() cast to float* */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+   public:
+    BufferView<Vec3ff> vertices0;            //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;             //!< fast access to first normal buffer
+    vector<BufferView<Vec3ff>> vertices;     //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;      //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs;  //!< user buffers
+    float maxRadiusScale = 1.0;              //!< maximal min-width scaling of curve radii
+  };
+
+  namespace isa
+  {
+    struct PointsISA : public Points
+    {
+      PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {} // forwards both arguments to Points
+
+      Vec3fa computeDirection(unsigned int primID) const
+      { // always returns the constant vector (1,0,0); primID is not used
+        return Vec3fa(1, 0, 0);
+      }
+
+      Vec3fa computeDirection(unsigned int primID, size_t time) const
+      { // always returns the constant vector (1,0,0); primID and time are not used
+        return Vec3fa(1, 0, 0);
+      }
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // appends a PrimRef for every valid point of r to prims (starting at slot k) and returns the accumulated info
+        PrimInfo info(empty);
+        for (size_t primID = r.begin(); primID < r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID, &box)) { // points failing buildBounds are skipped
+            const PrimRef ref(box, geomID, unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      { // like createPrimRefArray, but the bounds are built for time segment itime
+        PrimInfo info(empty);
+        for (size_t primID = r.begin(); primID < r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID, itime, box)) { // points failing buildBounds are skipped
+            const PrimRef ref(box, geomID, unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims,
+                                      const BBox1f& t0t1,
+                                      const range<size_t>& r,
+                                      size_t k,
+                                      unsigned int geomID) const
+      { // appends a PrimRefMB with linear bounds over t0t1 for every valid point of r, starting at prims[k]
+        PrimInfoMB info(empty);
+        for (size_t primID = r.begin(); primID < r.end(); primID++) {
+          if (valid(primID, timeSegmentRange(t0t1))) { // invalid points are skipped
+            const PrimRefMB ref(linearBounds(primID, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(primID));
+            info.add_primref(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      BBox3fa vbounds(size_t i) const
+      { // wrapper around bounds(i)
+        return bounds(i);
+      }
+
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const
+      { // wrapper around bounds(space, i)
+        return bounds(space, i);
+      }
+
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const
+      { // wrapper around linearBounds(primID, time_range)
+        return linearBounds(primID, time_range);
+      }
+
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const
+      { // wrapper around linearBounds(space, primID, time_range)
+        return linearBounds(space, primID, time_range);
+      }
+    };
+  }  // namespace isa
+
+  DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType);
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
new file mode 100644
index 0000000000..d5bb054b14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Quad Mesh */
+  struct QuadMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH;
+    
+    /*! quad indices */
+    struct Quad
+    {
+      uint32_t v[4];
+
+      /*! prints the four vertex indices of the quad */
+      __forceinline friend embree_ostream operator<<(embree_ostream os, const Quad& rhs) {
+        return os << "Quad {" << rhs.v[0] << ", " << rhs.v[1] << ", " << rhs.v[2] << ", " << rhs.v[3] << " }";
+      }
+    };
+
+  public:
+
+    /*! quad mesh construction */
+    QuadMesh (Device* device); 
+  
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th quad */
+    __forceinline const Quad& quad(size_t i) const {
+      return quads[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline const Vec3fa vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns pointer to i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns pointer to i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! calculates the bounds of the i'th quad */
+    __forceinline BBox3fa bounds(size_t i) const {
+      const Quad& corners = quad(i);
+      const Vec3fa p0 = vertex(corners.v[0]);
+      const Vec3fa p1 = vertex(corners.v[1]);
+      const Vec3fa p2 = vertex(corners.v[2]);
+      const Vec3fa p3 = vertex(corners.v[3]);
+      const Vec3fa lower = min(p0,p1,p2,p3), upper = max(p0,p1,p2,p3); // extrema over the four corners
+      return BBox3fa(lower,upper);
+    }
+
+    /*! calculates the bounds of the i'th quad at the itime'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const {
+      const Quad& corners = quad(i);
+      const Vec3fa p0 = vertex(corners.v[0],itime);
+      const Vec3fa p1 = vertex(corners.v[1],itime);
+      const Vec3fa p2 = vertex(corners.v[2],itime);
+      const Vec3fa p3 = vertex(corners.v[3],itime);
+      const Vec3fa lower = min(p0,p1,p2,p3), upper = max(p0,p1,p2,p3); // extrema over the four corners
+      return BBox3fa(lower,upper);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid between the specified time range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Quad& corners = quad(i);
+
+      /* every index has to address an existing vertex */
+      for (size_t c=0; c<4; c++)
+        if (unlikely(corners.v[c] >= numVertices())) return false;
+
+      /* every corner has to pass isvalid in each time step of the range (end included) */
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        for (size_t c=0; c<4; c++)
+          if (!isvalid(vertex(corners.v[c],itime))) return false;
+      }
+
+      /* nothing was rejected */
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th quad at the itime'th time segment */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Quad& corners = quad(i);
+
+      /* every index has to address an existing vertex */
+      for (size_t c=0; c<4; c++)
+        if (corners.v[c] >= numVertices()) return false;
+
+      /* every corner has to pass isvalid in all time steps */
+      for (unsigned int t=0; t<numTimeSteps; t++)
+      {
+        for (size_t c=0; c<4; c++)
+        {
+          const Vec3fa p = vertex(corners.v[c],t);
+          if (unlikely(!isvalid(p))) return false;
+        }
+      }
+
+      /* the optional output receives bounds(i), i.e. the box of the first vertex buffer */
+      if (bbox != nullptr)
+        *bbox = bounds(i);
+
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Quad& corners = quad(i);
+      for (size_t c=0; c<4; c++)
+        if (unlikely(corners.v[c] >= numVertices())) return false;
+
+      assert(itime+1 < numTimeSteps);
+
+      /* corners at time step itime: validated and kept, since they define the box */
+      Vec3fa a[4];
+      for (size_t c=0; c<4; c++) {
+        a[c] = vertex(corners.v[c],itime+0);
+        if (unlikely(!isvalid(a[c]))) return false;
+      }
+
+      /* corners at time step itime+1: only validated */
+      for (size_t c=0; c<4; c++)
+        if (unlikely(!isvalid(vertex(corners.v[c],itime+1)))) return false;
+
+      bbox = BBox3fa(min(a[0],a[1],a[2],a[3]),max(a[0],a[1],a[2],a[3])); /* use bounds of first time step in builder */
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range, if it's valid */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const
+    {
+      const bool usable = valid(i, timeSegmentRange(dt));
+      if (usable) bbox = linearBounds(i, dt); // bbox stays untouched for rejected primitives
+      return usable;
+    }
+
+    /*! get fast access to first vertex buffer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+    /* gets version info of topology */
+    unsigned int getTopologyVersion() const {
+      return quads.modCounter;
+    }
+    
+    /* returns true if topology changed */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return quads.isModified(otherVersion); // || numPrimitivesChanged;
+    }
+
+    /* returns the projected area */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      const Quad& corners = quad(i);
+      const Vec3fa p0 = vertex(corners.v[0]);
+      const Vec3fa p1 = vertex(corners.v[1]);
+      const Vec3fa p2 = vertex(corners.v[2]);
+      const Vec3fa p3 = vertex(corners.v[3]);
+      /* sum over the two triangles (p0,p1,p3) and (p1,p2,p3) */
+      return areaProjectedTriangle(p0,p1,p3) + areaProjectedTriangle(p1,p2,p3);
+    }
+
+  public:
+    BufferView<Quad> quads;                 //!< array of quads
+    BufferView<Vec3fa> vertices0;           //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers
+  };
+
+  namespace isa
+  {
+    struct QuadMeshISA : public QuadMesh
+    {
+      QuadMeshISA (Device* device)
+        : QuadMesh(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,&box)) { // quads rejected by buildBounds consume no slot
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,itime,box)) { // quads rejected by buildBounds consume no slot
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          if (valid(primID, timeSegmentRange(t0t1))) { // quads rejected by valid() consume no slot
+            const PrimRefMB ref(linearBounds(primID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+            info.add_primref(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
new file mode 100644
index 0000000000..d0246009db
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
@@ -0,0 +1,326 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+#include "../subdiv/half_edge.h"
+#include "../subdiv/tessellation_cache.h"
+#include "../subdiv/catmullclark_coefficients.h"
+#include "../subdiv/patch.h"
+#include "../../common/algorithms/parallel_map.h"
+#include "../../common/algorithms/parallel_set.h"
+
+namespace embree
+{
+  class SubdivMesh : public Geometry
+  {
+    ALIGNED_CLASS_(16);
+  public:
+
+    typedef HalfEdge::Edge Edge;
+    
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH;
+
+    /*! structure used to sort half edges using radix sort by their key */
+    struct KeyHalfEdge
+    {
+      KeyHalfEdge() {} // leaves key and edge unset
+
+      KeyHalfEdge (uint64_t sortKey, HalfEdge* halfEdge)
+        : key(sortKey), edge(halfEdge) {}
+
+      /*! the object converts to its key */
+      __forceinline operator uint64_t() const { return key; }
+
+      /*! orders entries by ascending key */
+      friend __forceinline bool operator<(const KeyHalfEdge& lhs, const KeyHalfEdge& rhs) {
+        return lhs.key < rhs.key;
+      }
+
+    public:
+      uint64_t key;
+      HalfEdge* edge;
+    };
+
+  public:
+
+    /*! subdiv mesh construction */
+    SubdivMesh(Device* device);
+
+  public:
+    void setMask (unsigned mask);
+    void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode);
+    void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setTopologyCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void setTessellationRate(float N);
+    bool verify();
+    void commit();
+    void addElementsToCount (GeometryCounts & counts) const;
+    void setDisplacementFunction (RTCDisplacementFunctionN func);
+    unsigned int getFirstHalfEdge(unsigned int faceID);
+    unsigned int getFace(unsigned int edgeID);
+    unsigned int getNextHalfEdge(unsigned int edgeID);
+    unsigned int getPreviousHalfEdge(unsigned int edgeID);
+    unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID);
+
+  public:
+
+    /*! return the number of faces */
+    size_t numFaces() const { 
+      return faceVertices.size(); 
+    }
+
+    /*! return the number of edges */
+    size_t numEdges() const { 
+      return topology[0].vertexIndices.size(); 
+    }
+
+    /*! return the number of vertices */
+    size_t numVertices() const { 
+      return vertices[0].size(); 
+    }
+
+    /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t j = 0) const {
+      return topology[0].getHalfEdge(i)->bounds(vertices[j]);
+    }
+
+    /*! check if the i'th primitive is valid */
+    __forceinline bool valid(size_t i) const {
+      return topology[0].valid(i) && !invalidFace(i);
+    }
+
+    /*! check if the i'th primitive is valid at the j'th time step */
+    __forceinline bool valid(size_t i, size_t j) const {
+      return topology[0].valid(i) && !invalidFace(i,j);
+    }
+
+    /*! prints some statistics */
+    void printStatistics();
+
+    /*! initializes the half edge data structure */
+    void initializeHalfEdgeStructures ();
+ 
+  public:
+
+    /*! returns the vertex buffer for some time step */
+    __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const {
+      return vertices[t];
+    }
+
+    /* returns tessellation level of edge */
+    __forceinline float getEdgeLevel(const size_t i) const
+    {
+      const float requested = levels ? levels[i] : tessellationRate; // per-edge level when a level buffer is set
+      return clamp(requested,1.0f,4096.0f); // FIXME: do we want to limit edge level?
+    }
+
+  public:
+    RTCDisplacementFunctionN displFunc;    //!< displacement function
+
+    /*! all buffers in this section are provided by the application */
+  public:
+    
+    /*! the topology contains all data that may differ when
+     *  interpolating different user data buffers */
+    struct Topology
+    {
+    public:
+
+      /*! Default topology construction */
+      Topology () : mesh(nullptr), subdiv_mode(RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY), halfEdges(nullptr,0) {} // no member stays indeterminate; mode is the documented API default
+
+      /*! Topology initialization */
+      Topology (SubdivMesh* mesh);
+
+      /*! make the class movable */
+    public: 
+      Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+        : mesh(other.mesh),                               // plain pointer, copied
+          vertexIndices(std::move(other.vertexIndices)),
+          subdiv_mode(other.subdiv_mode),                 // plain enum value, copied
+          halfEdges(std::move(other.halfEdges)),
+          halfEdges0(std::move(other.halfEdges0)),
+          halfEdges1(std::move(other.halfEdges1)) {}
+      
+      Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+      {
+        mesh          = other.mesh;                      // plain pointer, copied
+        vertexIndices = std::move(other.vertexIndices);
+        subdiv_mode   = other.subdiv_mode;               // plain enum value, copied
+        halfEdges     = std::move(other.halfEdges);
+        halfEdges0    = std::move(other.halfEdges0);
+        halfEdges1    = std::move(other.halfEdges1);
+        return *this;
+      }
+
+    public:
+      /*! check if the i'th primitive is valid in this topology */
+      __forceinline bool valid(size_t i) const
+      {
+        /* only the no-boundary mode rejects faces, namely those that have a border */
+        if (unlikely(subdiv_mode == RTC_SUBDIVISION_MODE_NO_BOUNDARY))
+          return !getHalfEdge(i)->faceHasBorder();
+        return true;
+      }
+      
+      /*! updates the interpolation mode for the topology */
+      void setSubdivisionMode (RTCSubdivisionMode mode);
+
+      /*! marks all buffers as modified */
+      void update ();
+
+      /*! verifies index array */
+      bool verify (size_t numVertices);
+
+      /*! initializes the half edge data structure */
+      void initializeHalfEdgeStructures ();
+
+    private:
+      
+      /*! recalculates the half edges */
+      void calculateHalfEdges();
+      
+      /*! updates half edges when recalculation is not necessary */
+      void updateHalfEdges();
+      
+      /*! user input data */
+    public:
+
+      SubdivMesh* mesh;
+
+      /*! indices of the vertices composing each face */
+      BufferView<unsigned int> vertexIndices;
+      
+      /*! subdiv interpolation mode */
+      RTCSubdivisionMode subdiv_mode;
+
+      /*! generated data */
+    public:
+
+      /*! returns the start half edge for face f */
+      __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const { 
+        return &halfEdges[mesh->faceStartEdge[f]]; 
+      }
+
+      /*! Half edge structure, generated by initializeHalfEdgeStructures */
+      mvector<HalfEdge> halfEdges;
+
+      /*! the following data is only required during construction of the
+       *  half edge structure and can be cleared for static scenes */
+    private:
+      
+      /*! two arrays used to sort the half edges */
+      std::vector<KeyHalfEdge> halfEdges0;
+      std::vector<KeyHalfEdge> halfEdges1;
+    };
+
+    /*! returns the start half edge for topology t and face f */
+    __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const { 
+      return topology[t].getHalfEdge(f);
+    }
+
+    /*! buffer containing the number of vertices for each face */
+    BufferView<unsigned int> faceVertices;
+
+    /*! array of topologies */
+    vector<Topology> topology;
+
+    /*! vertex buffer (one buffer for each time step) */
+    vector<BufferView<Vec3fa>> vertices;
+
+    /*! user data buffers */
+    vector<RawBufferView> vertexAttribs;
+
+    /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */
+    BufferView<Edge> edge_creases;
+    
+    /*! edge crease weights for each edge of the edge_creases buffer */
+    BufferView<float> edge_crease_weights;
+    
+    /*! vertex crease buffer containing all vertices that carry vertex crease weights */
+    BufferView<unsigned int> vertex_creases;
+    
+    /*! vertex crease weights for each vertex of the vertex_creases buffer */
+    BufferView<float> vertex_crease_weights;
+
+    /*! subdivision level for each half edge of the vertexIndices buffer */
+    BufferView<float> levels;
+    float tessellationRate;  // constant rate that is used when levels is not set
+
+    /*! buffer that marks specific faces as holes */
+    BufferView<unsigned> holes;
+
+    /*! all data in this section is generated by initializeHalfEdgeStructures function */
+  private:
+
+    /*! number of half edges used by faces */
+    size_t numHalfEdges; 
+
+    /*! fast lookup table to find the first half edge for some face */
+    mvector<uint32_t> faceStartEdge;
+
+    /*! fast lookup table to find the face for some half edge */
+    mvector<uint32_t> halfEdgeFace;
+
+    /*! set with all holes */
+    parallel_set<uint32_t> holeSet;
+
+    /*! fast lookup table to detect invalid faces */
+    mvector<int8_t> invalid_face;
+
+    /*! test if face i is invalid in timestep j */
+    __forceinline       int8_t& invalidFace(size_t i, size_t j = 0)       { return invalid_face[i*numTimeSteps+j]; }
+    __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { return invalid_face[i*numTimeSteps+j]; }
+
+    /*! interpolation cache */
+  public:
+    static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15)/16; }
+    static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31)/32; }
+    static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) {
+      const size_t slots = numInterpolationSlots4(stride); 
+      assert(slot < slots); 
+      return slots*prim+slot;
+    }
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags;
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags;
+    std::vector<Patch3fa::Ref> patch_eval_trees;
+    
+    /*! the following data is only required during construction of the
+     *  half edge structure and can be cleared for static scenes */
+  private:
+
+    /*! map with all vertex creases */
+    parallel_map<uint32_t,float> vertexCreaseMap;
+    
+    /*! map with all edge creases */
+    parallel_map<uint64_t,float> edgeCreaseMap;
+
+  protected:
+    
+    /*! counts number of geometry commits */
+    size_t commitCounter;
+  };
+
+  namespace isa
+  {
+    struct SubdivMeshISA : public SubdivMesh
+    {
+      SubdivMeshISA (Device* device)
+        : SubdivMesh(device) {}
+
+      void interpolate(const RTCInterpolateArguments* const args);
+      void interpolateN(const RTCInterpolateNArguments* const args);
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*);
+};
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
new file mode 100644
index 0000000000..d1c2750f14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
@@ -0,0 +1,243 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene_triangle_mesh.h"
+#include "scene.h"
+
+namespace embree
+{
+#if defined(EMBREE_LOWEST_ISA)
+
+  TriangleMesh::TriangleMesh (Device* device)
+    : Geometry(device,GTY_TRIANGLE_MESH,0,1)
+  {
+    vertices.resize(numTimeSteps);
+  }
+
+  void TriangleMesh::setMask (unsigned mask) 
+  {
+    this->mask = mask; 
+    Geometry::update();
+  }
+
+  void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps)
+  {
+    vertices.resize(numTimeSteps);
+    Geometry::setNumTimeSteps(numTimeSteps);
+  }
+
+  void TriangleMesh::setVertexAttributeCount (unsigned int N)
+  {
+    vertexAttribs.resize(N);
+    Geometry::update();
+  }
+  
+  void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num)
+  {
+    /* start address and stride both have to be multiples of 4 bytes */
+    if (((size_t(buffer->getPtr()) + offset) & 0x3) || (stride & 0x3))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned");
+
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (format != RTC_FORMAT_FLOAT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format");
+
+      /* if buffer is larger than 16GB the premultiplied index optimization does not work */
+      if (stride*num > 16ll*1024ll*1024ll*1024ll)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large");
+
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot");
+
+      vertices[slot].set(buffer, offset, stride, num, format);
+      vertices[slot].checkPadding16();
+      vertices0 = vertices[0]; // refresh the shortcut to the first vertex buffer
+      break;
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format");
+
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot");
+
+      vertexAttribs[slot].set(buffer, offset, stride, num, format);
+      vertexAttribs[slot].checkPadding16();
+      break;
+    case RTC_BUFFER_TYPE_INDEX:
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      if (format != RTC_FORMAT_UINT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format");
+
+      triangles.set(buffer, offset, stride, num, format);
+      setNumPrimitives(num); // one primitive per index triple
+      break;
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+    }
+  }
+
+  void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot)
+  {
+    /* each handled buffer type checks its slot and then returns the buffer's data pointer */
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_INDEX:
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return triangles.getPtr();
+
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertices[slot].getPtr();
+
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertexAttribs[slot].getPtr();
+
+    default:
+      /* no other buffer type is handled here */
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+      return nullptr;
+    }
+  }
+
+  void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot)
+  {
+    /* each handled buffer type checks its slot and then marks that buffer as modified */
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_INDEX:
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      triangles.setModified();
+      break;
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertices[slot].setModified();
+      break;
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertexAttribs[slot].setModified();
+      break;
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+    }
+
+    /* finally forward the change to the Geometry base class */
+    Geometry::update();
+  }
+
+  void TriangleMesh::commit()
+  {
+    /* the stride of every time step has to match the stride of time step 0 */
+    for (unsigned int step=1; step<numTimeSteps; step++) // step 0 trivially matches itself
+      if (vertices[step].getStride() != vertices[0].getStride())
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step");
+
+    Geometry::commit();
+  }
+
+  void TriangleMesh::addElementsToCount (GeometryCounts & counts) const
+  {
+    if (numTimeSteps != 1) counts.numMBTriangles += numPrimitives; // anything but a single time step
+    else                   counts.numTriangles   += numPrimitives; // exactly one time step
+  }
+
+  bool TriangleMesh::verify()
+  {
+    /*! at least one vertex buffer is required and all of them need numVertices() entries */
+    if (vertices.size() == 0) return false;
+    for (const auto& timeStep : vertices)
+      if (timeStep.size() != numVertices())
+        return false;
+
+    /*! the user vertex attribute buffers need numVertices() entries as well */
+    for (const auto& attrib : vertexAttribs)
+      if (attrib.size() != numVertices())
+        return false;
+
+    /*! every triangle index has to address an existing vertex */
+    for (size_t primID=0; primID<size(); primID++) {
+      const Triangle& tri = triangles[primID];
+      for (size_t c=0; c<3; c++)
+        if (tri.v[c] >= numVertices()) return false;
+    }
+
+    /*! every vertex of every time step has to pass isvalid */
+    for (const auto& timeStep : vertices)
+      for (size_t vtx=0; vtx<timeStep.size(); vtx++)
+        if (!isvalid(timeStep[vtx]))
+          return false;
+
+    return true;
+  }
+  
+  /*! interpolates a vertex or vertex attribute buffer at (u,v) of triangle primID: P, dPdu/dPdv and (all-zero) second derivatives */
+  void TriangleMesh::interpolate(const RTCInterpolateArguments* const args)
+  {
+    unsigned int primID = args->primID;
+    float u = args->u;
+    float v = args->v;
+    RTCBufferType bufferType = args->bufferType;
+    unsigned int bufferSlot = args->bufferSlot;
+    float* P = args->P;
+    float* dPdu = args->dPdu;
+    float* dPdv = args->dPdv;
+    float* ddPdudu = args->ddPdudu;
+    float* ddPdvdv = args->ddPdvdv;
+    float* ddPdudv = args->ddPdudv;
+    unsigned int valueCount = args->valueCount;
+
+    /* calculate base pointer and stride; a usable slot is strictly below the number of buffers */
+    assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+           (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot < vertexAttribs.size()));
+    const char* src = nullptr; 
+    size_t stride = 0;
+    if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+      src    = vertexAttribs[bufferSlot].getPtr();
+      stride = vertexAttribs[bufferSlot].getStride();
+    } else {
+      src    = vertices[bufferSlot].getPtr();
+      stride = vertices[bufferSlot].getStride();
+    }
+    
+    for (unsigned int i=0; i<valueCount; i+=4) // up to four floats per iteration
+    {
+      size_t ofs = i*sizeof(float);
+      const float w = 1.0f-u-v; // weight of the first triangle vertex
+      const Triangle& tri = triangle(primID);
+      const vbool4 valid = vint4((int)i)+vint4(step) < vint4(int(valueCount)); // masks lanes at or beyond valueCount
+      const vfloat4 p0 = vfloat4::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]);
+      const vfloat4 p1 = vfloat4::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]);
+      const vfloat4 p2 = vfloat4::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]);
+      if (P) {
+        vfloat4::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2)));
+      }
+      if (dPdu) { // dPdv is expected to be supplied together with dPdu
+        assert(dPdu); vfloat4::storeu(valid,dPdu+i,p1-p0);
+        assert(dPdv); vfloat4::storeu(valid,dPdv+i,p2-p0);
+      }
+      if (ddPdudu) { // all three second derivatives are expected together; they are written as zero
+        assert(ddPdudu); vfloat4::storeu(valid,ddPdudu+i,vfloat4(zero));
+        assert(ddPdvdv); vfloat4::storeu(valid,ddPdvdv+i,vfloat4(zero));
+        assert(ddPdudv); vfloat4::storeu(valid,ddPdudv+i,vfloat4(zero));
+      }
+    }
+  }
+  
+#endif
+  
+  namespace isa
+  {
+    TriangleMesh* createTriangleMesh(Device* device) {
+      return new TriangleMeshISA(device);
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
new file mode 100644
index 0000000000..eaf2e1799a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
@@ -0,0 +1,264 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Triangle mesh geometry: one index buffer plus one vertex buffer per time step */
+  struct TriangleMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH;
+
+    /*! the three vertex indices of one triangle */
+    struct Triangle 
+    {
+      uint32_t v[3]; // used below as indices into the vertex buffers
+
+      /*! outputs triangle indices */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t) {
+        return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }";
+      }
+    };
+
+  public:
+
+    /*! triangle mesh construction */
+    TriangleMesh (Device* device); 
+
+    /* geometry interface (declared here, defined outside this header) */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns number of vertices (size of the vertex buffer in slot 0) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th triangle (index is not range-checked here) */
+    __forceinline const Triangle& triangle(size_t i) const {
+      return triangles[i];
+    }
+
+    /*! returns i'th vertex of the first time step  */
+    __forceinline const Vec3fa vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns raw pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns raw pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! calculates the bounds of the i'th triangle (first time step) */
+    __forceinline BBox3fa bounds(size_t i) const 
+    {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0]);
+      const Vec3fa v1 = vertex(tri.v[1]);
+      const Vec3fa v2 = vertex(tri.v[2]);
+      return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+    }
+
+    /*! calculates the bounds of the i'th triangle at the itime'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0],itime);
+      const Vec3fa v1 = vertex(tri.v[1],itime);
+      const Vec3fa v2 = vertex(tri.v[2],itime);
+      return BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep (forwards a one-element range) */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid at every time step of itime_range (end is inclusive) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false; // index outside the vertex buffer
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        if (!isvalid(vertex(tri.v[0],itime))) return false;
+        if (!isvalid(vertex(tri.v[1],itime))) return false;
+        if (!isvalid(vertex(tri.v[2],itime))) return false;
+      }
+
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive over the itime'th time segment (time steps itime and itime+1) */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid at all time steps; bbox may be null */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        const Vec3fa v0 = vertex(tri.v[0],t);
+        const Vec3fa v1 = vertex(tri.v[1],t);
+        const Vec3fa v2 = vertex(tri.v[2],t);
+        if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2)))
+          return false;
+      }
+
+      if (likely(bbox)) 
+        *bbox = bounds(i); // bounds of the first time step only
+
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Triangle& tri = triangle(i);
+      if (unlikely(tri.v[0] >= numVertices())) return false;
+      if (unlikely(tri.v[1] >= numVertices())) return false;
+      if (unlikely(tri.v[2] >= numVertices())) return false;
+
+      assert(itime+1 < numTimeSteps); // both end points of the segment must exist
+      const Vec3fa a0 = vertex(tri.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+      const Vec3fa a1 = vertex(tri.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+      const Vec3fa a2 = vertex(tri.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+      const Vec3fa b0 = vertex(tri.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+      const Vec3fa b1 = vertex(tri.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+      const Vec3fa b2 = vertex(tri.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+      
+      /* use bounds of first time step in builder */
+      bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2));
+      return true;
+    }
+
+    /*! calculates the linear bounds of the primID'th primitive for the specified time range */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! like linearBounds(i,dt), but writes to bbox and returns false (bbox untouched) if the primitive is invalid in that range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const  {
+      if (!valid(i, timeSegmentRange(dt))) return false;
+      bbox = linearBounds(i, dt);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer (the const-ness of the buffer pointer is cast away) */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+    /* gets version info of topology (modification counter of the index buffer) */
+    unsigned int getTopologyVersion() const {
+      return triangles.modCounter;
+    }
+    
+    /* returns true if topology changed relative to otherVersion */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return triangles.isModified(otherVersion); // || numPrimitivesChanged;
+    }
+
+    /* returns the projected area of the i'th triangle (first time step) */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      const Triangle& tri = triangle(i);
+      const Vec3fa v0 = vertex(tri.v[0]);
+      const Vec3fa v1 = vertex(tri.v[1]);
+      const Vec3fa v2 = vertex(tri.v[2]);      
+      return areaProjectedTriangle(v0,v1,v2);
+    }
+
+  public:
+    BufferView<Triangle> triangles;      //!< array of triangles
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    struct TriangleMeshISA : public TriangleMesh
+    {
+      TriangleMeshISA (Device* device)
+        : TriangleMesh(device) {}
+
+      /*! writes a PrimRef for every buildable triangle of r into prims (starting at slot k) and returns the accumulated PrimInfo */
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); ++primID) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,&box)) {
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      /*! same as createPrimRefArray, but validity and bounds are taken from the time segment that starts at time step itime */
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); ++primID) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,itime,box)) {
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      
+      /*! writes a PrimRefMB with linear bounds over t0t1 for every triangle that is valid in that time range */
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); ++primID) {
+          if (!valid(primID, timeSegmentRange(t0t1))) continue;
+          const LBBox3fa lbounds = linearBounds(primID,t0t1);
+          const PrimRefMB ref(lbounds,this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+          info.add_primref(ref); prims[k++] = ref;
+        }
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
new file mode 100644
index 0000000000..8d11ed6986
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
@@ -0,0 +1,77 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accelset.h"
+
+namespace embree
+{
+  /*! User geometry with user defined intersection functions */
+  struct UserGeometry : public AccelSet
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY;
+
+  public:
+    UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1); // defaults: zero items, one time step
+    virtual void setMask (unsigned mask);
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr); // callback plus an opaque user pointer
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect);
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded);
+    virtual void build() {} // no-op in this class
+    virtual void addElementsToCount (GeometryCounts & counts) const;
+  };
+
+  namespace isa
+  {
+    struct UserGeometryISA : public UserGeometry
+    {
+      UserGeometryISA (Device* device)
+        : UserGeometry(device) {}
+
+      /*! writes a PrimRef for every item of r that buildBounds accepts into prims (starting at slot k) and returns the accumulated PrimInfo */
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t item=r.begin(); item<r.end(); ++item) {
+          BBox3fa box = empty;
+          if (buildBounds(item,&box)) {
+            const PrimRef ref(box,geomID,unsigned(item));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+
+      /*! same as createPrimRefArray, but uses the buildBounds overload that additionally takes itime */
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t item=r.begin(); item<r.end(); ++item) {
+          BBox3fa box = empty;
+          if (buildBounds(item,itime,box)) {
+            const PrimRef ref(box,geomID,unsigned(item));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      
+      /*! writes a PrimRefMB with linear bounds over t0t1 for every item that is valid in that time range */
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t item=r.begin(); item<r.end(); ++item) {
+          const bool usable = valid(item, timeSegmentRange(t0t1));
+          if (!usable) continue;
+          const PrimRefMB ref(linearBounds(item,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(item));
+          info.add_primref(ref); prims[k++] = ref;
+        }
+        return info;
+      }
+    };
+  }
+  
+  DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h
new file mode 100644
index 0000000000..533c385365
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stack_item.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! An item on the stack holds the node ID and distance of that node. */
+  template<typename T>
+  struct __aligned(16) StackItemT
+  {
+    /*! items are moved as one 4-float vector (see xchg), so ptr must leave room for the 4-byte dist */
+    static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed");
+
+    __forceinline StackItemT() {} // leaves ptr and dist uninitialized
+
+    __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {}
+
+    /*! swaps two stack items, moving each one with a single vfloat4 load/store */
+    __forceinline static void xchg(StackItemT& a, StackItemT& b) 
+    { 
+      const vfloat4 sse_a = vfloat4::load((float*)&a); 
+      const vfloat4 sse_b = vfloat4::load((float*)&b);
+      vfloat4::store(&a,sse_b);
+      vfloat4::store(&b,sse_a);
+    }
+
+    /*! Sort 2 stack items by ascending dist. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2) {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+    }
+    
+    /*! Sort 3 stack items by ascending dist. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3)
+    {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+      if (s3.dist < s2.dist) xchg(s3,s2);
+      if (s2.dist < s1.dist) xchg(s2,s1);
+    }
+    
+    /*! Sort 4 stack items by ascending dist (5 compare-exchange steps). */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4)
+    {
+      if (s2.dist < s1.dist) xchg(s2,s1);
+      if (s4.dist < s3.dist) xchg(s4,s3);
+      if (s3.dist < s1.dist) xchg(s3,s1);
+      if (s4.dist < s2.dist) xchg(s4,s2);
+      if (s3.dist < s2.dist) xchg(s3,s2);
+    }
+
+    /*! compares lane 2 of a and b (presumably the lane holding dist) and swaps the whole vectors if b's lane is smaller */
+    __forceinline static void cmp_xchg(vint4& a, vint4& b) 
+    { 
+#if defined(__AVX512VL__)
+      const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a));
+#else
+      const vboolf4 mask0(b < a);
+      const vboolf4 mask(shuffle<2,2,2,2>(mask0));
+#endif
+      const vint4 c = select(mask,b,a);
+      const vint4 d = select(mask,a,b);
+      a = c;
+      b = d;
+    }
+    
+    /*! Sort 3 stack items held in vint4 form via cmp_xchg; note the argument order differs from the scalar sort. */
+    __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3)
+    {
+      cmp_xchg(s2,s1);
+      cmp_xchg(s3,s2);
+      cmp_xchg(s2,s1);
+    }
+    
+    /*! Sort 4 stack items held in vint4 form via cmp_xchg (same 5-step pattern as the scalar version). */
+    __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4)
+    {
+      cmp_xchg(s2,s1);
+      cmp_xchg(s4,s3);
+      cmp_xchg(s3,s1);
+      cmp_xchg(s4,s2);
+      cmp_xchg(s3,s2);
+    }
+
+    /*! Sort N stack items in [begin,end) by descending dist (insertion sort); an empty range is a no-op. */
+    __forceinline friend void sort(StackItemT* begin, StackItemT* end)
+    {
+      if (begin == end) return; // without this guard begin+1 would already be past end and the loop would never terminate
+      for (StackItemT* i = begin+1; i != end; ++i)
+      {
+        const vfloat4 item = vfloat4::load((float*)i);
+        const unsigned dist = i->dist;
+        StackItemT* j = i;
+
+        while ((j != begin) && ((j-1)->dist < dist))
+        {
+          vfloat4::store(j, vfloat4::load((float*)(j-1)));
+          --j;
+        }
+
+        vfloat4::store(j, item);
+      }
+    }
+    
+  public:
+    T ptr;
+    unsigned dist;
+  };
+
+  /*! An item on the stack holds the node ID and active ray mask. */
+  template<typename T>
+  struct __aligned(8) StackItemMaskT
+  {
+    T ptr;       // node reference
+    size_t mask; // active ray mask; presumably one bit per ray - confirm at the use site
+  };
+
+  struct __aligned(8) StackItemMaskCoherent // NOTE(review): no user is visible here; field meanings below are inferred from names
+  {
+    size_t mask;   // presumably the active ray mask - confirm at the use site
+    size_t parent; // presumably a reference to the parent node - confirm
+    size_t child;  // presumably a reference to the child node - confirm
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.cpp b/thirdparty/embree-aarch64/kernels/common/stat.cpp
new file mode 100644
index 0000000000..b73c3a8c76
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.cpp
@@ -0,0 +1,128 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stat.h"
+
+namespace embree
+{
+  Stat Stat::instance; 
+  
+  Stat::Stat () { // nothing to do: the Counters member zeroes itself in its own constructor
+  }
+
+  Stat::~Stat () 
+  {
+#ifdef EMBREE_STAT_COUNTERS
+    Stat::print(std::cout); // dump the gathered counters to stdout when the Stat object is destroyed
+#endif
+  }
+
+  void Stat::print(std::ostream& cout) // writes three sections to cout: ABSOLUTE, PER TRAVERSAL and USER
+  {
+    Counters& cntrs = instance.cntrs;
+    Counters::Data& data = instance.cntrs.code; // the ABSOLUTE section reports the 'code' counters
+    //Counters::Data& data = instance.cntrs.active;
+
+    /* print absolute numbers (scaled to millions) */
+    cout << "--------- ABSOLUTE ---------" << std::endl;
+    cout << "  #normal_travs   = " << float(data.normal.travs            )*1E-6 << "M" << std::endl;
+    cout << "    #nodes        = " << float(data.normal.trav_nodes       )*1E-6 << "M" << std::endl;
+    cout << "    #nodes_xfm    = " << float(data.normal.trav_xfm_nodes   )*1E-6 << "M" << std::endl;
+    cout << "    #leaves       = " << float(data.normal.trav_leaves      )*1E-6 << "M" << std::endl;
+    cout << "    #prims        = " << float(data.normal.trav_prims       )*1E-6 << "M" << std::endl;
+    cout << "    #prim_hits    = " << float(data.normal.trav_prim_hits   )*1E-6 << "M" << std::endl;
+
+    cout << "    #stack nodes  = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl;
+    cout << "    #stack pop    = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+    size_t normal_box_hits = 0;   // sum over all histogram bins
+    size_t weighted_box_hits = 0; // sum of bin count times bin index
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) { 
+      normal_box_hits += data.normal.trav_hit_boxes[i];
+      weighted_box_hits += data.normal.trav_hit_boxes[i]*i;
+    }
+    cout << "    #hit_boxes    = " << normal_box_hits << " (total) distribution: ";
+    float average = 0.0f;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) 
+    {
+      float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits; // percentage per bin; NaN if no box hits were recorded
+      cout << "[" << i << "] " << value << " ";
+      average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits;
+    }
+    cout << "    average = " << average << std::endl;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " "; // index-weighted share; NaN if the weighted sum is 0
+    cout << std::endl;
+
+    if (data.shadow.travs) { // shadow numbers only when the 'code' counter saw shadow traversals
+      cout << "  #shadow_travs = " << float(data.shadow.travs         )*1E-6 << "M" << std::endl;
+      cout << "    #nodes      = " << float(data.shadow.trav_nodes    )*1E-6 << "M" << std::endl;
+      cout << "    #nodes_xfm  = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl;
+      cout << "    #leaves     = " << float(data.shadow.trav_leaves   )*1E-6 << "M" << std::endl;
+      cout << "    #prims      = " << float(data.shadow.trav_prims    )*1E-6 << "M" << std::endl;
+      cout << "    #prim_hits  = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl;
+
+      cout << "    #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl;
+      cout << "    #stack pop   = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+      size_t shadow_box_hits = 0;
+      size_t weighted_shadow_box_hits = 0;
+
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) {        
+        shadow_box_hits += data.shadow.trav_hit_boxes[i];
+        weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i;
+      }
+      cout << "    #hit_boxes    = ";
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " "; // NaN if no shadow box hits
+      cout << std::endl;
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " ";
+      cout << std::endl;
+    }
+    cout << std::endl;
+
+    /* print per traversal numbers: 'code' counters divided by code travs, plus the active/all ratio */
+    cout << "--------- PER TRAVERSAL ---------" << std::endl;
+    float active_normal_travs       = float(cntrs.active.normal.travs      )/float(cntrs.all.normal.travs      ); // each ratio is NaN/inf when its 'all' counter is 0
+    float active_normal_trav_nodes  = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes );
+    float active_normal_trav_xfm_nodes  = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes );
+    float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves);
+    float active_normal_trav_prims   = float(cntrs.active.normal.trav_prims  )/float(cntrs.all.normal.trav_prims  );
+    float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits  )/float(cntrs.all.normal.trav_prim_hits  );
+    float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop  )/float(cntrs.all.normal.trav_stack_pop  );
+
+    cout << "  #normal_travs   = " << float(cntrs.code.normal.travs      )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs       << "% active" << std::endl; // x/x: 1, or NaN without traversals
+    cout << "    #nodes        = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes  << "% active" << std::endl;
+    cout << "    #node_xfm     = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes  << "% active" << std::endl;
+    cout << "    #leaves       = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl;
+    cout << "    #prims        = " << float(cntrs.code.normal.trav_prims  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims   << "% active" << std::endl;
+    cout << "    #prim_hits    = " << float(cntrs.code.normal.trav_prim_hits  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits   << "% active" << std::endl;
+    cout << "    #stack_pop    = " << float(cntrs.code.normal.trav_stack_pop  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop   << "% active" << std::endl;
+
+    if (cntrs.all.shadow.travs) { // note: guarded by the 'all' counter here, by the 'code' counter in the ABSOLUTE section
+      float active_shadow_travs       = float(cntrs.active.shadow.travs      )/float(cntrs.all.shadow.travs      );
+      float active_shadow_trav_nodes  = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes );
+      float active_shadow_trav_xfm_nodes  = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes );
+      float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves);
+      float active_shadow_trav_prims   = float(cntrs.active.shadow.trav_prims  )/float(cntrs.all.shadow.trav_prims  );
+      float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits  )/float(cntrs.all.shadow.trav_prim_hits  );
+
+      cout << "  #shadow_travs = " << float(cntrs.code.shadow.travs      )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs       << "% active" << std::endl;
+      cout << "    #nodes      = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes  << "% active" << std::endl;
+      cout << "    #nodes_xfm  = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes  << "% active" << std::endl;
+      cout << "    #leaves     = " << float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl;
+      cout << "    #prims      = " << float(cntrs.code.shadow.trav_prims  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims   << "% active" << std::endl;
+      cout << "    #prim_hits  = " << float(cntrs.code.shadow.trav_prim_hits  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits   << "% active" << std::endl;
+
+    }
+    cout << std::endl;
+
+     /* print user counters for performance tuning, normalized by normal plus shadow traversals ('all' counters) */
+    cout << "--------- USER ---------" << std::endl;
+    for (size_t i=0; i<10; i++) // 10 matches the size of Counters::user
+      cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl;
+
+    cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl; // ratios relative to user[3]; NaN/inf if user[3] is 0
+    cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << std::endl;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.h b/thirdparty/embree-aarch64/kernels/common/stat.h
new file mode 100644
index 0000000000..3cda2bd014
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.h
@@ -0,0 +1,116 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+/* Macros to gather statistics; all expand to nothing unless EMBREE_STAT_COUNTERS is defined. STAT3 expands to several statements, so brace it under if/else. */
+#ifdef EMBREE_STAT_COUNTERS
+#  define STAT(x) x
+#  define STAT3(s,x,y,z) \
+  STAT(Stat::get().code  .s+=x);               \
+  STAT(Stat::get().active.s+=y);               \
+  STAT(Stat::get().all   .s+=z);
+#  define STAT_USER(i,x) Stat::get().user[i]+=x;
+#else
+#  define STAT(x)
+#  define STAT3(s,x,y,z)
+#  define STAT_USER(i,x) 
+#endif
+
+namespace embree
+{
+  /*! Gathers ray tracing statistics. We count 1) how often a code
+   *  location is reached, 2) how many SIMD lanes are active, 3) how
+   *  many SIMD lanes reach the code location (presumably the code, active and all counters below) */
+  class Stat
+  { 
+  public:
+
+    static const size_t SIZE_HISTOGRAM = 64+1; // number of hit-box histogram bins that Stat::print walks
+
+    /*! constructs stat counter class */
+    Stat ();
+
+    /*! destructs stat counter class */
+    ~Stat ();
+
+    class Counters 
+    {
+    public:
+      Counters () { 
+        clear(); 
+      }
+      
+      void clear() // resets every counter to 0
+      { 
+        all.clear();
+        active.clear();
+        code.clear();
+        for (auto& u : user) u.store(0);
+      }
+
+    public:
+
+	/* per packet and per ray statistics */
+	struct Data
+        {
+          void clear () {
+            normal.clear();
+            shadow.clear();
+            point_query.clear();
+          }
+
+	  /* normal, shadow and point query statistics */
+	  struct 
+          {
+            void clear() 
+            {
+              travs.store(0);
+              trav_nodes.store(0);
+              trav_leaves.store(0);
+              trav_prims.store(0);
+              trav_prim_hits.store(0);
+              for (auto& v : trav_hit_boxes) v.store(0);
+              trav_stack_pop.store(0);
+              trav_stack_nodes.store(0); 
+              trav_xfm_nodes.store(0); 
+            }
+
+          public:
+	    std::atomic<size_t> travs;
+	    std::atomic<size_t> trav_nodes;
+	    std::atomic<size_t> trav_leaves;
+	    std::atomic<size_t> trav_prims;
+	    std::atomic<size_t> trav_prim_hits;
+	    std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1]; // one slot more than the SIZE_HISTOGRAM bins Stat::print reads
+	    std::atomic<size_t> trav_stack_pop;
+	    std::atomic<size_t> trav_stack_nodes; 
+            std::atomic<size_t> trav_xfm_nodes; 
+            
+	  } normal, shadow, point_query;
+	} all, active, code; 
+
+        std::atomic<size_t> user[10]; // free-form counters, incremented through STAT_USER
+    };
+
+  public:
+
+    static __forceinline Counters& get() { // counters of the single static instance
+      return instance.cntrs;
+    }
+    
+    static void clear() {
+      instance.cntrs.clear();
+    }
+    
+    static void print(embree_ostream cout);
+
+  private: 
+    Counters cntrs;
+
+  private:
+    static Stat instance;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/state.cpp b/thirdparty/embree-aarch64/kernels/common/state.cpp
new file mode 100644
index 0000000000..51fc9b7826
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.cpp
@@ -0,0 +1,543 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "state.h"
+#include "../../common/lexers/streamfilters.h"
+
+namespace embree
+{
+  MutexSys g_printMutex; // definition of the mutex that state.h declares for thread safe printing to cout
+
+  State::ErrorHandler State::g_errorHandler; // definition of the static State::g_errorHandler member
+
+  State::ErrorHandler::ErrorHandler()
+    : thread_error(createTls()) {}
+
+  /* frees every RTCError recorded in thread_errors and destroys the TLS key */
+  State::ErrorHandler::~ErrorHandler()
+  {
+    Lock<MutexSys> guard(errors_mutex);
+    for (RTCError* stored : thread_errors) delete stored;
+    destroyTls(thread_error);
+    thread_errors.clear();
+  }
+
+  /* returns the RTCError stored under the thread_error TLS key, creating it as RTC_ERROR_NONE on first use */
+  RTCError* State::ErrorHandler::error()
+  {
+    if (RTCError* existing = (RTCError*) getTls(thread_error)) return existing;
+
+    Lock<MutexSys> guard(errors_mutex); // held while thread_errors is extended
+    RTCError* created = new RTCError(RTC_ERROR_NONE);
+    thread_errors.push_back(created);
+    setTls(thread_error,created);
+    return created;
+  }
+
+  State::State ()
+    : enabled_cpu_features(getCPUFeatures()),
+      enabled_builder_cpu_features(enabled_cpu_features),
+      frequency_level(FREQUENCY_SIMD256)
+  {
+    const char* const dflt = "default"; // initial value of every accel/builder/traverser selector
+
+    tri_accel = dflt;
+    tri_builder = dflt;
+    tri_traverser = dflt;
+
+    tri_accel_mb = dflt;
+    tri_builder_mb = dflt;
+    tri_traverser_mb = dflt;
+
+    quad_accel = dflt;
+    quad_builder = dflt;
+    quad_traverser = dflt;
+
+    quad_accel_mb = dflt;
+    quad_builder_mb = dflt;
+    quad_traverser_mb = dflt;
+
+    line_accel = dflt;
+    line_builder = dflt;
+    line_traverser = dflt;
+
+    line_accel_mb = dflt;
+    line_builder_mb = dflt;
+    line_traverser_mb = dflt;
+
+    hair_accel = dflt;
+    hair_builder = dflt;
+    hair_traverser = dflt;
+
+    hair_accel_mb = dflt;
+    hair_builder_mb = dflt;
+    hair_traverser_mb = dflt;
+
+    object_accel = dflt;
+    object_builder = dflt;
+    object_accel_min_leaf_size = object_accel_max_leaf_size = 1;
+
+    object_accel_mb = dflt;
+    object_builder_mb = dflt;
+    object_accel_mb_min_leaf_size = object_accel_mb_max_leaf_size = 1;
+
+    max_spatial_split_replications = 1.2f;
+    useSpatialPreSplits = false;
+
+    tessellation_cache_size = 128*1024*1024;
+
+    subdiv_accel = dflt;
+    subdiv_accel_mb = dflt;
+
+    grid_accel = dflt;
+    grid_builder = dflt;
+    grid_accel_mb = dflt;
+    grid_builder_mb = dflt;
+
+    instancing_open_min = 0;
+    instancing_block_size = 0;
+    instancing_open_factor = 8.0f;
+    instancing_open_max_depth = 32;
+    instancing_open_max = 50000000;
+
+    ignore_config_files = false;
+    float_exceptions = false;
+    quality_flags = -1;
+    scene_flags = -1;
+    verbose = 0;
+    benchmark = 0;
+
+    numThreads = 0;
+    numUserThreads = 0;
+
+    /* affinity is enabled per default with the internal tasking system and on KNL */
+    set_affinity = false;
+#if TASKING_INTERNAL
+    set_affinity = true;
+#endif
+    if (hasISA(AVX512KNL)) set_affinity = true;
+
+    start_threads = false;
+    enable_selockmemoryprivilege = false;
+
+    /* huge pages are requested per default on Linux only */
+    hugepages = false;
+#if defined(__LINUX__)
+    hugepages = true;
+#endif
+    hugepages_success = true;
+
+    alloc_main_block_size = 0;
+    alloc_num_main_slots = 0;
+    alloc_thread_block_size = 0;
+    alloc_single_thread_alloc = -1;
+
+    error_function = nullptr;
+    error_function_userptr = nullptr;
+
+    memory_monitor_function = nullptr;
+    memory_monitor_userptr = nullptr;
+  }
+
+  State::~State() // nothing is released explicitly here
+  {}
+
+  /* true only when every feature bit contained in isa is also set in enabled_cpu_features */
+  bool State::hasISA(const int isa)
+  { return (isa & ~enabled_cpu_features) == 0; }
+
+  /* checks the enabled ISA features against the features reported by getCPUFeatures() */
+  bool State::checkISASupport()
+  {
+    const int usable = getCPUFeatures() & enabled_cpu_features;
+#if defined(__ARM_NEON)
+    /* NEON CPU type is a mixture of NEON and SSE2, so either of the two bits is accepted */
+    const bool sse2_usable = (usable & CPU_FEATURE_SSE2) != 0;
+
+    /* NOTE(review): originally described as set by an `isa=neon` config, but
+     * string_to_cpufeatures() has no "neon" entry - confirm how this bit gets enabled */
+    const bool neon_usable = (usable & CPU_FEATURE_NEON) != 0;
+    return sse2_usable || neon_usable;
+#else
+    return usable == enabled_cpu_features;
+#endif
+  }
+  
+  void State::verify()
+  {
+    /* check that the calculation below stays in range */
+    assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX);
+
+    /* debug builds additionally verify that CPP files compiled for
+     * a specific ISA only call that same or a lower ISA version of
+     * non-inlined class member functions */
+#ifdef DEBUG
+    /* the SSE2 check is left out when compiling for ARM NEON */
+#if defined(EMBREE_TARGET_SSE2) && !defined(__ARM_NEON)
+    assert(sse2::getISA() <= SSE2);
+#endif
+
+#ifdef EMBREE_TARGET_SSE42
+    assert(sse42::getISA() <= SSE42);
+#endif
+#ifdef EMBREE_TARGET_AVX
+    assert(avx::getISA() <= AVX);
+#endif
+#ifdef EMBREE_TARGET_AVX2
+    assert(avx2::getISA() <= AVX2);
+#endif
+#ifdef EMBREE_TARGET_AVX512KNL
+    assert(avx512knl::getISA() <= AVX512KNL);
+#endif
+#ifdef EMBREE_TARGET_AVX512SKX
+    assert(avx512skx::getISA() <= AVX512SKX);
+#endif
+#endif // DEBUG
+  }
+
+  const char* symbols[3] = { "=", ",", "|" }; // symbol strings that parseFile/parseString pass to the TokenStream
+
+  /* parses settings from the file fileName; returns false if it cannot be opened, true otherwise */
+  bool State::parseFile(const FileName& fileName)
+  {
+    FILE* const handle = fopen(fileName.c_str(),"r");
+    if (handle == nullptr) return false;
+    Ref<Stream<int> > chars = new FileStream(handle,fileName);
+
+    std::vector<std::string> syms;
+    for (const char* sym : symbols) syms.push_back(sym);
+
+    const auto idChars = TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.";
+    Ref<TokenStream> cin = new TokenStream(new LineCommentFilter(chars,"#"),
+                                           idChars,TokenStream::separators,syms);
+    parse(cin);
+    return true;
+  }
+
+  /* parses settings from the configuration string cfg; a null cfg is ignored */
+  void State::parseString(const char* cfg)
+  {
+    if (!cfg) return;
+
+    std::vector<std::string> syms;
+    for (const char* sym : symbols) syms.push_back(sym);
+
+    const auto idChars = TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.";
+    Ref<TokenStream> cin = new TokenStream(new StrStream(cfg),
+                                           idChars,TokenStream::separators,syms);
+    parse(cin);
+  }
+  
+  /* maps an ISA name (compared against lower-case spellings) to its feature set; any other name yields SSE2 */
+  int string_to_cpufeatures(const std::string& isa)
+  {
+    if (isa == "sse")                      return SSE;
+    if (isa == "sse2")                     return SSE2;
+    if (isa == "sse3")                     return SSE3;
+    if (isa == "ssse3")                    return SSSE3;
+    if (isa == "sse41" || isa == "sse4.1") return SSE41;
+    if (isa == "sse42" || isa == "sse4.2") return SSE42;
+    if (isa == "avx")                      return AVX;
+    if (isa == "avxi")                     return AVXI;
+    if (isa == "avx2")                     return AVX2;
+    if (isa == "avx512knl")                return AVX512KNL;
+    if (isa == "avx512skx")                return AVX512SKX;
+
+    return SSE2; // fallback for every name not listed above
+  }
+
+  void State::parse(Ref<TokenStream> cin)
+  {
+    /* parse until end of stream; each iteration consumes one token and, when it names a known option followed by "=", also the value */
+    while (cin->peek() != Token::Eof())
+    {
+      const Token tok = cin->get(); // a token that matches no branch below is consumed without any effect
+
+      if (tok == Token::Id("threads") && cin->trySymbol("=")) 
+        numThreads = cin->get().Int();
+
+      else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) 
+        numUserThreads = cin->get().Int();
+
+      else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) 
+        set_affinity = cin->get().Int();
+
+      else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) // second spelling that also sets set_affinity
+        set_affinity = cin->get().Int();
+      
+      else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) 
+        start_threads = cin->get().Int();
+      
+      else if (tok == Token::Id("isa") && cin->trySymbol("=")) { // replaces both feature sets
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features = string_to_cpufeatures(isa);
+        enabled_builder_cpu_features = enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) { // bitwise and: can only remove features
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features &= string_to_cpufeatures(isa);
+        enabled_builder_cpu_features &= enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) { // restricts the builder feature set only
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_builder_cpu_features &= string_to_cpufeatures(isa);
+      }
+
+      else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) {
+        std::string freq = cin->get().Identifier();
+        if      (freq == "simd128") frequency_level = FREQUENCY_SIMD128;
+        else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256;
+        else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512; // any other value leaves frequency_level unchanged
+      }
+
+      else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) {
+        enable_selockmemoryprivilege = cin->get().Int();
+      }
+      else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) {
+        hugepages = cin->get().Int();
+      }
+
+      else if (tok == Token::Id("ignore_config_files") && cin->trySymbol("="))
+        ignore_config_files = cin->get().Int();
+      else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) 
+        float_exceptions = cin->get().Int();
+
+      else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("=")) // the unprefixed names address the triangle settings
+        tri_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("="))
+        tri_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("="))
+        tri_traverser = cin->get().Identifier();
+     
+      else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("="))
+        tri_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("="))
+        tri_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("="))
+        tri_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("="))
+        quad_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("="))
+        quad_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("="))
+        quad_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("="))
+        quad_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("="))
+        quad_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("="))
+        quad_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel")) && cin->trySymbol("="))
+        line_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder")) && cin->trySymbol("="))
+        line_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("="))
+        line_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("="))
+        line_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("="))
+        line_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("="))
+        line_traverser_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("hair_accel") && cin->trySymbol("="))
+        hair_accel = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder") && cin->trySymbol("="))
+        hair_builder = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser") && cin->trySymbol("="))
+        hair_traverser = cin->get().Identifier();
+
+      else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("="))
+        hair_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("="))
+        hair_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("="))
+        hair_traverser_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("object_accel") && cin->trySymbol("="))
+        object_accel = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder") && cin->trySymbol("="))
+        object_builder = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("="))
+        object_accel_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("="))
+        object_accel_max_leaf_size = cin->get().Int();
+
+      else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("="))
+        object_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("="))
+        object_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_max_leaf_size = cin->get().Int();
+
+      else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("="))
+        instancing_open_min = cin->get().Int();
+      else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) { // choosing a block size zeroes the open factor
+        instancing_block_size = cin->get().Int();
+        instancing_open_factor = 0.0f;
+      }
+      else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("="))
+        instancing_open_max_depth = cin->get().Int();
+      else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) { // choosing an open factor zeroes the block size
+        instancing_block_size = 0;
+        instancing_open_factor = cin->get().Float();
+      }
+      else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("="))
+        instancing_open_max = cin->get().Int();
+
+      else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("="))
+        subdiv_accel = cin->get().Identifier();
+      else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("="))
+        subdiv_accel_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("grid_accel") && cin->trySymbol("=")) // NOTE(review): grid_builder and grid_builder_mb have no option here although State() defaults them and print() shows them
+        grid_accel = cin->get().Identifier();
+      else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("="))
+        grid_accel_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("verbose") && cin->trySymbol("="))
+        verbose = cin->get().Int();
+      else if (tok == Token::Id("benchmark") && cin->trySymbol("="))
+        benchmark = cin->get().Int();
+      
+      else if (tok == Token::Id("quality")) {
+        if (cin->trySymbol("=")) {
+          Token flag = cin->get(); // a value other than low/medium/high leaves quality_flags unchanged
+          if      (flag == Token::Id("low"))    quality_flags = RTC_BUILD_QUALITY_LOW;
+          else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM;
+          else if (flag == Token::Id("high"))   quality_flags = RTC_BUILD_QUALITY_HIGH;
+        }
+      }
+
+      else if (tok == Token::Id("scene_flags")) {
+        scene_flags = 0; // cleared as soon as the option name is seen, even if no "=" follows
+        if (cin->trySymbol("=")) {
+          do { // flags are separated by "|"; unknown flag names are skipped
+            Token flag = cin->get();
+            if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC;
+            else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT;
+            else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST;
+          } while (cin->trySymbol("|"));
+        }
+      }
+      
+      else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("="))
+        max_spatial_split_replications = cin->get().Float();
+
+      else if (tok == Token::Id("presplits") && cin->trySymbol("="))
+        useSpatialPreSplits = cin->get().Int() != 0 ? true : false;
+
+      else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("=")) // value is scaled by 1024*1024 to obtain bytes
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+      else if (tok == Token::Id("cache_size") && cin->trySymbol("=")) // second spelling that also sets tessellation_cache_size
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+
+      else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("="))
+        alloc_main_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("="))
+        alloc_num_main_slots = cin->get().Int();
+       else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("="))
+         alloc_thread_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("="))
+         alloc_single_thread_alloc = cin->get().Int();
+
+      cin->trySymbol(","); // optional , separator
+    }
+  }
+
+  /* true if the configured verbose level is at least N */
+  bool State::verbosity(size_t N)
+  { return verbose >= N; }
+
+  /* writes the current settings to std::cout */
+  void State::print()
+  {
+    std::ostream& out = std::cout;
+
+    const char* freq_name = "error";
+    if      (frequency_level == FREQUENCY_SIMD128) freq_name = "simd128";
+    else if (frequency_level == FREQUENCY_SIMD256) freq_name = "simd256";
+    else if (frequency_level == FREQUENCY_SIMD512) freq_name = "simd512";
+
+    const char* hugepages_state = !hugepages ? "disabled" : (hugepages_success ? "enabled" : "failed");
+
+    out << "general:" << std::endl;
+    out << "  build threads      = " << numThreads   << std::endl;
+    out << "  build user threads = " << numUserThreads   << std::endl;
+    out << "  start_threads      = " << start_threads << std::endl;
+    out << "  affinity           = " << set_affinity << std::endl;
+    out << "  frequency_level    = " << freq_name << std::endl;
+    out << "  hugepages          = " << hugepages_state << std::endl;
+
+    out << "  verbosity          = " << verbose << std::endl;
+    out << "  cache_size         = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl;
+    out << "  max_spatial_split_replications = " << max_spatial_split_replications << std::endl;
+
+    out << "triangles:" << std::endl;
+    out << "  accel              = " << tri_accel << std::endl;
+    out << "  builder            = " << tri_builder << std::endl;
+    out << "  traverser          = " << tri_traverser << std::endl;
+
+    out << "motion blur triangles:" << std::endl;
+    out << "  accel              = " << tri_accel_mb << std::endl;
+    out << "  builder            = " << tri_builder_mb << std::endl;
+    out << "  traverser          = " << tri_traverser_mb << std::endl;
+
+    out << "quads:" << std::endl;
+    out << "  accel              = " << quad_accel << std::endl;
+    out << "  builder            = " << quad_builder << std::endl;
+    out << "  traverser          = " << quad_traverser << std::endl;
+
+    out << "motion blur quads:" << std::endl;
+    out << "  accel              = " << quad_accel_mb << std::endl;
+    out << "  builder            = " << quad_builder_mb << std::endl;
+    out << "  traverser          = " << quad_traverser_mb << std::endl;
+
+    out << "line segments:" << std::endl;
+    out << "  accel              = " << line_accel << std::endl;
+    out << "  builder            = " << line_builder << std::endl;
+    out << "  traverser          = " << line_traverser << std::endl;
+
+    out << "motion blur line segments:" << std::endl;
+    out << "  accel              = " << line_accel_mb << std::endl;
+    out << "  builder            = " << line_builder_mb << std::endl;
+    out << "  traverser          = " << line_traverser_mb << std::endl;
+
+    out << "hair:" << std::endl;
+    out << "  accel              = " << hair_accel << std::endl;
+    out << "  builder            = " << hair_builder << std::endl;
+    out << "  traverser          = " << hair_traverser << std::endl;
+
+    out << "motion blur hair:" << std::endl;
+    out << "  accel              = " << hair_accel_mb << std::endl;
+    out << "  builder            = " << hair_builder_mb << std::endl;
+    out << "  traverser          = " << hair_traverser_mb << std::endl;
+
+    out << "subdivision surfaces:" << std::endl;
+    out << "  accel              = " << subdiv_accel << std::endl;
+
+    out << "grids:" << std::endl;
+    out << "  accel              = " << grid_accel << std::endl;
+    out << "  builder            = " << grid_builder << std::endl;
+
+    out << "motion blur grids:" << std::endl;
+    out << "  accel              = " << grid_accel_mb << std::endl;
+    out << "  builder            = " << grid_builder_mb << std::endl;
+
+    out << "object_accel:" << std::endl;
+    out << "  min_leaf_size      = " << object_accel_min_leaf_size << std::endl;
+    out << "  max_leaf_size      = " << object_accel_max_leaf_size << std::endl;
+
+    out << "object_accel_mb:" << std::endl;
+    out << "  min_leaf_size      = " << object_accel_mb_min_leaf_size << std::endl;
+    out << "  max_leaf_size      = " << object_accel_mb_max_leaf_size << std::endl;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/state.h b/thirdparty/embree-aarch64/kernels/common/state.h
new file mode 100644
index 0000000000..d0fccc023f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.h
@@ -0,0 +1,197 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* mutex to make printing to cout thread safe */
+  extern MutexSys g_printMutex;
+
+  struct State : public RefCount
+  {
+  public:
+    /*! state construction, initializes every setting with its default */
+    State ();
+
+    /*! state destruction */
+    ~State();
+
+    /*! verifies that state is correct */
+    void verify();
+
+    /*! parses state from a configuration file, returns false if the file cannot be opened */
+    bool parseFile(const FileName& fileName);
+
+    /*! parses the state from a string, a null pointer is ignored */
+    void parseString(const char* cfg);
+
+    /*! parses the state from a stream */
+    void parse(Ref<TokenStream> cin);
+
+    /*! prints the state */
+    void print();
+
+    /*! checks if verbosity level is at least N */
+    bool verbosity(size_t N);
+
+    /*! checks if some particular ISA is enabled */
+    bool hasISA(const int isa);
+
+    /*! checks whether the selected ISA is supported by the HW */
+    bool checkISASupport();
+    
+  public:
+    std::string tri_accel;                 //!< acceleration structure to use for triangles
+    std::string tri_builder;               //!< builder to use for triangles
+    std::string tri_traverser;             //!< traverser to use for triangles
+    
+  public:
+    std::string tri_accel_mb;              //!< acceleration structure to use for motion blur triangles
+    std::string tri_builder_mb;            //!< builder to use for motion blur triangles
+    std::string tri_traverser_mb;          //!< traverser to use for motion blur triangles
+
+  public:
+    std::string quad_accel;                 //!< acceleration structure to use for quads
+    std::string quad_builder;               //!< builder to use for quads
+    std::string quad_traverser;             //!< traverser to use for quads
+
+  public:
+    std::string quad_accel_mb;             //!< acceleration structure to use for motion blur quads
+    std::string quad_builder_mb;           //!< builder to use for motion blur quads
+    std::string quad_traverser_mb;         //!< traverser to use for motion blur quads
+
+  public:
+    std::string line_accel;                 //!< acceleration structure to use for line segments
+    std::string line_builder;               //!< builder to use for line segments
+    std::string line_traverser;             //!< traverser to use for line segments
+
+  public:
+    std::string line_accel_mb;             //!< acceleration structure to use for motion blur line segments
+    std::string line_builder_mb;           //!< builder to use for motion blur line segments
+    std::string line_traverser_mb;         //!< traverser to use for motion blur line segments
+
+  public:
+    std::string hair_accel;                //!< hair acceleration structure to use
+    std::string hair_builder;              //!< builder to use for hair
+    std::string hair_traverser;            //!< traverser to use for hair
+
+  public:
+    std::string hair_accel_mb;             //!< acceleration structure to use for motion blur hair
+    std::string hair_builder_mb;           //!< builder to use for motion blur hair
+    std::string hair_traverser_mb;         //!< traverser to use for motion blur hair
+
+  public:
+    std::string object_accel;               //!< acceleration structure for user geometries
+    std::string object_builder;             //!< builder for user geometries
+    int object_accel_min_leaf_size;         //!< minimum leaf size for object acceleration structure
+    int object_accel_max_leaf_size;         //!< maximum leaf size for object acceleration structure
+
+  public:
+    std::string object_accel_mb;            //!< acceleration structure for motion blur user geometries
+    std::string object_builder_mb;          //!< builder for motion blur user geometries
+    int object_accel_mb_min_leaf_size;      //!< minimum leaf size for mblur object acceleration structure
+    int object_accel_mb_max_leaf_size;      //!< maximum leaf size for mblur object acceleration structure
+
+  public:
+    std::string subdiv_accel;              //!< acceleration structure to use for subdivision surfaces
+    std::string subdiv_accel_mb;           //!< acceleration structure to use for motion blur subdivision surfaces
+
+  public:
+    std::string grid_accel;              //!< acceleration structure to use for grids
+    std::string grid_builder;            //!< builder for grids
+    std::string grid_accel_mb;           //!< acceleration structure to use for motion blur grids
+    std::string grid_builder_mb;         //!< builder for motion blur grids
+
+  public:
+    float max_spatial_split_replications;  //!< maximally replications*N many primitives in accel for spatial splits
+    bool useSpatialPreSplits;              //!< use spatial pre-splits instead of the full spatial split builder
+    size_t tessellation_cache_size;        //!< size of the shared tessellation cache 
+
+  public:
+    size_t instancing_open_min;            //!< instancing opens tree to minimally that number of subtrees
+    size_t instancing_block_size;          //!< instancing opens tree up to average block size of primitives
+    float  instancing_open_factor;         //!< instancing opens tree up to x times the number of instances
+    size_t instancing_open_max_depth;      //!< maximum open depth for geometries
+    size_t instancing_open_max;            //!< instancing opens tree to maximally that number of subtrees
+
+  public:
+    bool ignore_config_files;              //!< if true no more config files get parsed
+    bool float_exceptions;                 //!< enable floating point exceptions
+    int quality_flags;                     //!< RTC_BUILD_QUALITY_* value set by the "quality" option, -1 until set
+    int scene_flags;                       //!< RTC_SCENE_FLAG_* bits set by the "scene_flags" option, -1 until set
+    size_t verbose;                        //!< verbosity of output
+    size_t benchmark;                      //!< value of the "benchmark" option, 0 per default
+    
+  public:
+    size_t numThreads;                     //!< number of threads to use in builders
+    size_t numUserThreads;                 //!< number of user provided threads to use in builders
+    bool set_affinity;                     //!< sets affinity for worker threads
+    bool start_threads;                    //!< true when threads should be started at device creation time
+    int enabled_cpu_features;              //!< CPU ISA features to use
+    int enabled_builder_cpu_features;      //!< CPU ISA features to use for builders only
+    enum FREQUENCY_LEVEL {
+      FREQUENCY_SIMD128,
+      FREQUENCY_SIMD256,
+      FREQUENCY_SIMD512
+    } frequency_level;                     //!< frequency level the app wants to run on (default is SIMD256)
+    bool enable_selockmemoryprivilege;     //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages
+    bool hugepages;                        //!< true if huge pages should get used
+    bool hugepages_success;                //!< status for enabling huge pages
+
+  public:
+    size_t alloc_main_block_size;          //!< main allocation block size (shared between threads)
+    int alloc_num_main_slots;              //!< number of such shared blocks to be used to allocate
+    size_t alloc_thread_block_size;        //!< size of thread local allocator block size
+    int alloc_single_thread_alloc;         //!< in single mode nodes and leaves use same thread local allocator
+
+  public:
+
+    /*! checks if we can use AVX: requires the AVX ISA and a frequency level other than SIMD128 */
+    bool canUseAVX() {
+      return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128;
+    }
+
+    /*! checks if we can use AVX2: requires the AVX2 ISA and a frequency level other than SIMD128 */
+    bool canUseAVX2() {
+      return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128;
+    }
+    
+    struct ErrorHandler //!< hands out RTCError objects through a TLS key and owns them
+    {
+    public:
+      ErrorHandler();
+      ~ErrorHandler();
+      RTCError* error(); //!< returns the RTCError stored under thread_error, creating it on first use
+
+    public:
+      tls_t thread_error;                   //!< TLS key under which the RTCError pointer is stored
+      std::vector<RTCError*> thread_errors; //!< all RTCError objects created by error(), deleted in the destructor
+      MutexSys errors_mutex;                //!< locked while thread_errors is modified
+    };
+    ErrorHandler errorHandler;
+    static ErrorHandler g_errorHandler;
+
+  public:
+    void setErrorFunction(RTCErrorFunction fptr, void* uptr) //!< stores the error callback and its user pointer
+    {
+      error_function = fptr;
+      error_function_userptr = uptr;
+    }
+
+    RTCErrorFunction error_function; //!< error callback, nullptr per default
+    void* error_function_userptr;    //!< user pointer stored together with error_function
+
+  public:
+    void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) //!< stores the memory monitor callback and its user pointer
+    {
+      memory_monitor_function = fptr;
+      memory_monitor_userptr = uptr;
+    }
+      
+    RTCMemoryMonitorFunction memory_monitor_function; //!< memory monitor callback, nullptr per default
+    void* memory_monitor_userptr;                     //!< user pointer stored together with memory_monitor_function
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/vector.h b/thirdparty/embree-aarch64/kernels/common/vector.h
new file mode 100644
index 0000000000..b478762240
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/vector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "default.h"
+
+namespace embree
+{
+  /*! Interface used to report changes in memory consumption.
+   *
+   *  Implementations receive a signed byte count and a flag. Within this
+   *  header, aligned_monitored_allocator passes a positive count with
+   *  post == false when allocating and a negative count with post == true
+   *  when deallocating. */
+  struct MemoryMonitorInterface {
+    /*! virtual destructor: this is a polymorphic base, so destroying an
+     *  implementation through a MemoryMonitorInterface pointer must be
+     *  well defined */
+    virtual ~MemoryMonitorInterface() = default;
+
+    /*! called with the number of bytes by which memory consumption changes */
+    virtual void memoryMonitor(ssize_t bytes, bool post) = 0;
+  };
+
+  /*! allocator that performs aligned monitored allocations
+   *
+   *  Every non-empty allocation and deallocation is reported to the
+   *  MemoryMonitorInterface handed to the constructor. Requests of at least
+   *  14 * PAGE_SIZE_2M bytes are served by os_malloc/os_free, all smaller
+   *  requests by alignedMalloc/alignedFree with the template alignment. */
+  template<typename T, size_t alignment = 64>
+    struct aligned_monitored_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+      
+      /*! device receives the memory monitor notifications; it must be non-null
+       *  whenever a non-empty allocation or deallocation is performed */
+      __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) 
+        : device(device), hugepages(false) {}
+
+      /*! returns uninitialized storage for n objects of type T; for n != 0 the
+       *  monitor is first notified with +n*sizeof(T) bytes and post == false */
+      __forceinline pointer allocate( size_type n ) 
+      {
+        const size_type bytes = n*sizeof(value_type);
+        if (n) {
+          assert(device);
+          device->memoryMonitor(ssize_t(bytes),false);
+        }
+        if (usesOSAllocation(bytes))
+        {
+          pointer p = (pointer) os_malloc(bytes,hugepages);
+          assert(p);
+          return p;
+        }
+        return (pointer) alignedMalloc(bytes,alignment);
+      }
+
+      /*! releases storage obtained from allocate(n); n must be the element
+       *  count that was passed to allocate, as it selects the matching free
+       *  routine. For n != 0 the monitor is notified afterwards with
+       *  -n*sizeof(T) bytes and post == true */
+      __forceinline void deallocate( pointer p, size_type n ) 
+      {
+        const size_type bytes = n*sizeof(value_type);
+        if (p)
+        {
+          if (usesOSAllocation(bytes))
+            os_free(p,bytes,hugepages); 
+          else
+            alignedFree(p);
+        }
+        else assert(n == 0);
+
+        if (n) {
+          assert(device);
+          // negate after converting the byte count to a signed type, so the
+          // negative value is not produced by unsigned wrap-around
+          device->memoryMonitor(-ssize_t(bytes),true);
+        }
+      }
+
+      /*! copy constructs val into the storage at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+
+      /*! runs the destructor of the object at p without releasing its storage */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+
+    private:
+
+      /*! single definition of the size threshold shared by allocate and
+       *  deallocate, so that both always pick the same allocation routine */
+      static __forceinline bool usesOSAllocation( size_type bytes ) {
+        return bytes >= 14 * PAGE_SIZE_2M;
+      }
+
+    private:
+      MemoryMonitorInterface* device; //!< receiver of the memory monitor notifications
+      bool hugepages;                 //!< passed to os_malloc and os_free; initialized to false
+    };
+
+  /*! vector_t whose storage comes from aligned_monitored_allocator, using
+   *  the natural alignment of the element type */
+  template<typename T>
+    using mvector = vector_t<T,aligned_monitored_allocator<T,alignof(T)> >;
+}