51 files changed, 575 insertions, 330 deletions
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_morton.h b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
index 8f21e3254f..cba32ca73c 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_morton.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_morton.h
@@ -411,7 +411,7 @@ namespace embree
           ReductionTy bounds[MAX_BRANCHING_FACTOR];
           if (current.size() > singleThreadThreshold)
           {
-            /*! parallel_for is faster than spawing sub-tasks */
+            /*! parallel_for is faster than spawning sub-tasks */
             parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
                 for (size_t i=r.begin(); i<r.end(); i++) {
                   bounds[i] = recurse(depth+1,children[i],nullptr,true);
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
index f9a08d65cd..6e73c0d250 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_msmblur.h
@@ -374,7 +374,7 @@ namespace embree
 
             const size_t begin = set.begin();
             const size_t end   = set.end();
-            const size_t center = (begin + end)/2;
+            const size_t center = (begin + end + 1) / 2;
 
             PrimInfoMB linfo = empty;
             for (size_t i=begin; i<center; i++)
@@ -594,7 +594,7 @@ namespace embree
             /* spawn tasks */
             if (unlikely(current.size() > cfg.singleThreadThreshold))
             {
-              /*! parallel_for is faster than spawing sub-tasks */
+              /*! parallel_for is faster than spawning sub-tasks */
               parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
                   for (size_t i=r.begin(); i<r.end(); i++) {
                     values[i] = recurse(children[i],nullptr,true);
diff --git a/thirdparty/embree/kernels/builders/bvh_builder_sah.h b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
index fff4bf2a35..24c5faf8be 100644
--- a/thirdparty/embree/kernels/builders/bvh_builder_sah.h
+++ b/thirdparty/embree/kernels/builders/bvh_builder_sah.h
@@ -298,7 +298,7 @@ namespace embree
             /* spawn tasks */
             if (current.size() > cfg.singleThreadThreshold)
             {
-              /*! parallel_for is faster than spawing sub-tasks */
+              /*! parallel_for is faster than spawning sub-tasks */
               parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
                   for (size_t i=r.begin(); i<r.end(); i++) {
                     values[i] = recurse(children[i],nullptr,true);
diff --git a/thirdparty/embree/kernels/builders/heuristic_binning.h b/thirdparty/embree/kernels/builders/heuristic_binning.h
index ee29d09ac9..41be6183b8 100644
--- a/thirdparty/embree/kernels/builders/heuristic_binning.h
+++ b/thirdparty/embree/kernels/builders/heuristic_binning.h
@@ -57,14 +57,12 @@ namespace embree
         __forceinline Vec3ia bin(const Vec3fa& p) const 
         {
           const vint4 i = floori((vfloat4(p)-ofs)*scale);
-#if 1
           assert(i[0] >= 0 && (size_t)i[0] < num); 
           assert(i[1] >= 0 && (size_t)i[1] < num);
           assert(i[2] >= 0 && (size_t)i[2] < num);
-          return Vec3ia(i);
-#else
+          
+          // clamp so that corner cases which would compute an out-of-bounds bin still map to a valid bin
           return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
-#endif
         }
 
         /*! faster but unsafe binning */
diff --git a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
index 4249d16ea1..354e283557 100644
--- a/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
+++ b/thirdparty/embree/kernels/builders/heuristic_openmerge_array.h
@@ -275,7 +275,7 @@ namespace embree
               openNodesBasedOnExtend(set);
 #endif
 
-            /* disable opening when unsufficient space for opening a node available */
+            /* disable opening when insufficient space for opening a node available */
             if (set.ext_range_size() < max_open_size-1) 
               set.set_ext_range(set.end()); /* disable opening */
           }
diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial.h b/thirdparty/embree/kernels/builders/heuristic_spatial.h
index a6939ba258..8b3499ac8d 100644
--- a/thirdparty/embree/kernels/builders/heuristic_spatial.h
+++ b/thirdparty/embree/kernels/builders/heuristic_spatial.h
@@ -159,27 +159,25 @@ namespace embree
         assert(binID < BINS);
         bounds  [binID][dim].extend(b);        
       }
-      
-      /*! bins an array of triangles */
-      template<typename SplitPrimitive>
-        __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
+
+      /*! bins an array of primitives */
+      template<typename PrimitiveSplitterFactory>
+        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
       {
-        for (size_t i=0; i<N; i++)
+        for (size_t i=begin; i<end; i++)
         {
-          const PrimRef prim = prims[i];
+          const PrimRef& prim = source[i];
           unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
-
-          if (unlikely(splits == 1))
+          
+          if (unlikely(splits <= 1))
           {
             const vint4 bin = mapping.bin(center(prim.bounds()));
             for (size_t dim=0; dim<3; dim++) 
             {
               assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
-              numBegin[bin[dim]][dim]++;
-              numEnd  [bin[dim]][dim]++;
-              bounds  [bin[dim]][dim].extend(prim.bounds());
+              add(dim,bin[dim],bin[dim],bin[dim],prim.bounds());
             }
-          } 
+          }
           else
           {
             const vint4 bin0 = mapping.bin(prim.bounds().lower);
@@ -187,89 +185,44 @@ namespace embree
             
             for (size_t dim=0; dim<3; dim++) 
             {
+              if (unlikely(mapping.invalid(dim))) 
+                continue;
+              
               size_t bin;
-              PrimRef rest = prim;
               size_t l = bin0[dim];
               size_t r = bin1[dim];
-
+              
               // same bin optimization
               if (likely(l == r)) 
               {
-                numBegin[l][dim]++;
-                numEnd  [l][dim]++;
-                bounds  [l][dim].extend(prim.bounds());
+                add(dim,l,l,l,prim.bounds());
                 continue;
               }
-
-              for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) 
+              size_t bin_start = bin0[dim];
+              size_t bin_end   = bin1[dim];
+              BBox3fa rest = prim.bounds();
+              
+              /* ensure that the split position always overlaps the primitive bounds */
+              while (bin_start < bin_end && mapping.pos(bin_start+1,dim) <= rest.lower[dim]) bin_start++;
+              while (bin_start < bin_end && mapping.pos(bin_end    ,dim) >= rest.upper[dim]) bin_end--;
+              
+              const auto splitter = splitterFactory(prim);
+              for (bin=bin_start; bin<bin_end; bin++) 
               {
                 const float pos = mapping.pos(bin+1,dim);
+                BBox3fa left,right;
+                splitter(rest,dim,pos,left,right);
                 
-                PrimRef left,right;
-                splitPrimitive(rest,(int)dim,pos,left,right);
-                if (unlikely(left.bounds().empty())) l++;                
-                bounds[bin][dim].extend(left.bounds());
+                if (unlikely(left.empty())) l++;                
+                extend(dim,bin,left);
                 rest = right;
               }
-              if (unlikely(rest.bounds().empty())) r--;
-              numBegin[l][dim]++;
-              numEnd  [r][dim]++;
-              bounds  [bin][dim].extend(rest.bounds());
+              if (unlikely(rest.empty())) r--;
+              add(dim,l,r,bin,rest);
             }
-          }
+          }              
         }
       }
-      
-      /*! bins a range of primitives inside an array */
-      template<typename SplitPrimitive>
-        void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
-	bin(splitPrimitive,prims+begin,end-begin,mapping);
-      }
-
-      /*! bins an array of primitives */
-      template<typename PrimitiveSplitterFactory>
-        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
-      {
-        for (size_t i=begin; i<end; i++)
-        {
-          const PrimRef &prim = source[i];
-          const vint4 bin0 = mapping.bin(prim.bounds().lower);
-          const vint4 bin1 = mapping.bin(prim.bounds().upper);
-          
-          for (size_t dim=0; dim<3; dim++) 
-          {
-            if (unlikely(mapping.invalid(dim))) 
-              continue;
-            
-            size_t bin;
-            size_t l = bin0[dim];
-            size_t r = bin1[dim];
-            
-            // same bin optimization
-            if (likely(l == r)) 
-            {
-              add(dim,l,l,l,prim.bounds());
-              continue;
-            }
-            const size_t bin_start = bin0[dim];
-            const size_t bin_end   = bin1[dim];
-            BBox3fa rest = prim.bounds();
-            const auto splitter = splitterFactory(prim);
-            for (bin=bin_start; bin<bin_end; bin++) 
-            {
-              const float pos = mapping.pos(bin+1,dim);
-              BBox3fa left,right;
-              splitter(rest,dim,pos,left,right);
-              if (unlikely(left.empty())) l++;                
-              extend(dim,bin,left);
-              rest = right;
-            }
-            if (unlikely(rest.empty())) r--;
-            add(dim,l,r,bin,rest);
-          }
-        }              
-      }
-
 
 
       /*! bins an array of primitives */
diff --git a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
index 60d235f48d..2584c19bda 100644
--- a/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
+++ b/thirdparty/embree/kernels/builders/heuristic_spatial_array.h
@@ -241,7 +241,7 @@ namespace embree
           SpatialBinner binner(empty); 
           const SpatialBinMapping<SPATIAL_BINS> mapping(set);
           binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping);
-          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
           return binner.best(mapping,logBlockSize); //,set.ext_size());
         }
 
@@ -256,7 +256,7 @@ namespace embree
                                      binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping);
                                      return binner; },
                                    [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); });
-          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
           return binner.best(mapping,logBlockSize); //,set.ext_size());
         }
 
@@ -286,6 +286,7 @@ namespace embree
                 //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
                 //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
                 //if (unlikely(bin0 < split.pos && bin1 >= split.pos))
+
                 if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
                 {
                   assert(splits > 1);
@@ -384,8 +385,8 @@ namespace embree
           new (&lset) PrimInfoExtRange(begin,center,center,local_left);
           new (&rset) PrimInfoExtRange(center,end,end,local_right);
 
-          assert(area(lset.geomBounds) >= 0.0f);
-          assert(area(rset.geomBounds) >= 0.0f);
+          assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f);
+          assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f);
           return std::pair<size_t,size_t>(left_weight,right_weight);
         }
 
@@ -410,7 +411,7 @@ namespace embree
                                               begin,end,local_left,local_right,
                                               [&] (const PrimRef& ref) {
                                                 const Vec3fa c = ref.bounds().center();
-                                                return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); 
+                                                return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask);
                                               },
                                               [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
 
@@ -419,8 +420,8 @@ namespace embree
           
           new (&lset) PrimInfoExtRange(begin,center,center,local_left);
           new (&rset) PrimInfoExtRange(center,end,end,local_right);
-          assert(area(lset.geomBounds) >= 0.0f);
-          assert(area(rset.geomBounds) >= 0.0f);
+          assert(!lset.geomBounds.empty() && area(lset.geomBounds) >= 0.0f);
+          assert(!rset.geomBounds.empty() && area(rset.geomBounds) >= 0.0f);
           return std::pair<size_t,size_t>(left_weight,right_weight);
         }
 
diff --git a/thirdparty/embree/kernels/builders/primrefgen.cpp b/thirdparty/embree/kernels/builders/primrefgen.cpp
index d279dc4993..e2d7c27bd8 100644
--- a/thirdparty/embree/kernels/builders/primrefgen.cpp
+++ b/thirdparty/embree/kernels/builders/primrefgen.cpp
@@ -184,9 +184,7 @@ namespace embree
 
     // special variants for grid meshes
 
-// -- GODOT start --
 #if defined(EMBREE_GEOMETRY_GRID)
-// -- GODOT end --
     PrimInfo createPrimRefArrayGrids(Scene* scene, mvector<PrimRef>& prims, mvector<SubGridBuildData>& sgrids)
     {
       PrimInfo pinfo(empty);
@@ -296,9 +294,7 @@ namespace embree
 
       return pinfo;
     }
-// -- GODOT start --
 #endif
-// -- GODOT end --
     
     // ====================================================================================================
     // ====================================================================================================
diff --git a/thirdparty/embree/kernels/builders/primrefgen_presplit.h b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
index 8cd251ddd2..aa2026a85e 100644
--- a/thirdparty/embree/kernels/builders/primrefgen_presplit.h
+++ b/thirdparty/embree/kernels/builders/primrefgen_presplit.h
@@ -266,7 +266,7 @@ namespace embree
       /* anything to split ? */
       if (center < numPrimitives)
       {
-        const size_t numPrimitivesToSplit = numPrimitives - center;
+        size_t numPrimitivesToSplit = numPrimitives - center;
         assert(presplitItem[center].priority >= 1.0f);
 
         /* sort presplit items in ascending order */
@@ -279,8 +279,8 @@ namespace embree
             });
           );
 
-        unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem;
-        unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
+        unsigned int* primOffset0 = (unsigned int*)tmp_presplitItem;
+        unsigned int* primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
 
         /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
         const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
@@ -317,11 +317,16 @@ namespace embree
             sum += numSubPrims;
           }
           new_center++;
+
+          primOffset0 += new_center - center;
+          numPrimitivesToSplit -= new_center - center;
           center = new_center;
+          assert(numPrimitivesToSplit == (numPrimitives - center));
         }
 
         /* parallel prefix sum to compute offsets for storing sub-primitives */
         const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+        assert(numPrimitives+offset <= alloc_numPrimitives);
 
         /* iterate over range, and split primitives into sub primitives and append them to prims array */		    
         parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
@@ -338,7 +343,7 @@ namespace embree
               unsigned int numSubPrims = 0;
               splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
               const size_t newID = numPrimitives + primOffset1[j-center];              
-              assert(newID+numSubPrims <= alloc_numPrimitives);
+              assert(newID+numSubPrims-1 <= alloc_numPrimitives);
               prims[primrefID] = subPrims[0];
               for (size_t i=1;i<numSubPrims;i++)
                 prims[newID+i-1] = subPrims[i];
diff --git a/thirdparty/embree/kernels/builders/splitter.h b/thirdparty/embree/kernels/builders/splitter.h
index f7720bd284..da89d0b178 100644
--- a/thirdparty/embree/kernels/builders/splitter.h
+++ b/thirdparty/embree/kernels/builders/splitter.h
@@ -128,28 +128,30 @@ namespace embree
         const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
         const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask );  
         QuadMesh::Quad quad = mesh->quad(prim.primID());
-        v[0] = mesh->vertex(quad.v[0]);
-        v[1] = mesh->vertex(quad.v[1]);
-        v[2] = mesh->vertex(quad.v[2]);
-        v[3] = mesh->vertex(quad.v[3]);
-        v[4] = mesh->vertex(quad.v[0]);
-        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
-        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
-        inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]);
-        inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]);
+        v[0] = mesh->vertex(quad.v[1]);
+        v[1] = mesh->vertex(quad.v[2]);
+        v[2] = mesh->vertex(quad.v[3]);
+        v[3] = mesh->vertex(quad.v[0]);
+        v[4] = mesh->vertex(quad.v[1]);
+        v[5] = mesh->vertex(quad.v[3]);
+        inv_length[0] = Vec3fa(1.0f) / (v[1] - v[0]);
+        inv_length[1] = Vec3fa(1.0f) / (v[2] - v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[3] - v[2]);
+        inv_length[3] = Vec3fa(1.0f) / (v[4] - v[3]);
+        inv_length[4] = Vec3fa(1.0f) / (v[5] - v[4]);
       }
       
       __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
-        splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
+        splitPolygon<5>(prim,dim,pos,v,left_o,right_o);
       }
       
       __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
-        splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
+        splitPolygon<5>(prim,dim,pos,v,inv_length,left_o,right_o);
       }
       
     private:
-      Vec3fa v[5];
-      Vec3fa inv_length[4];
+      Vec3fa v[6];
+      Vec3fa inv_length[5];
     };
     
     struct QuadSplitterFactory
diff --git a/thirdparty/embree/kernels/bvh/bvh.cpp b/thirdparty/embree/kernels/bvh/bvh.cpp
index a84295f0da..f6cf626465 100644
--- a/thirdparty/embree/kernels/bvh/bvh.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh.cpp
@@ -183,7 +183,7 @@ namespace embree
   template class BVHN<8>;
 #endif
 
-#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
+#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
   template class BVHN<4>;
 #endif
 }
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
index 6e9a5a538e..1d393fd06b 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
@@ -230,7 +230,7 @@ namespace embree
             continue;
 
           /* switch to single ray traversal */
-#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
+#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__))
 #if FORCE_SINGLE_MODE == 0
           if (single)
 #endif
@@ -676,7 +676,7 @@ namespace embree
           continue;
 
         /* switch to single ray traversal */
-#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
+#if (!defined(__WIN32__) || defined(__X86_64__)) && ((defined(__aarch64__)) || defined(__SSE4_2__))
 #if FORCE_SINGLE_MODE == 0
         if (single)
 #endif
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
index 717f559677..c7e040fadb 100644
--- a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.h
@@ -170,12 +170,23 @@ namespace embree
           TravRayKStream<K,robust> &p = packets[rayID / K];
           const size_t i = rayID % K;
           const vint<N> bitmask(shiftTable[rayID]);
+
+#if defined (__aarch64__)
+          const vfloat<N> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<N> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<N> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+          const vfloat<N> tFarX  = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<N> tFarY  = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<N> tFarZ  = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+#else
           const vfloat<N> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
           const vfloat<N> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
           const vfloat<N> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
           const vfloat<N> tFarX  = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
           const vfloat<N> tFarY  = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
           const vfloat<N> tFarZ  = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); 
+#endif
+
           const vfloat<N> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<N>(p.tnear[i]));
           const vfloat<N> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<N>(p.tfar[i]));      
 
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
index 57530692bc..3fd9fc7d18 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb.h
@@ -46,6 +46,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i] == NodeRef::emptyNode);
+          assert(emptyChild == (children[i] == NodeRef::emptyNode));
+        }
+#endif
         AABBNode_t* node = ref.getAABBNode();
         for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
         return ref;
@@ -60,6 +68,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i] == NodeRef::emptyNode);
+          assert(emptyChild == (children[i] == NodeRef::emptyNode));
+        }
+#endif
         AABBNode_t* node = ref.getAABBNode();
         for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
         
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
index c4cea7d8ba..001f526c25 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb.h
@@ -31,6 +31,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i].ref == NodeRef::emptyNode);
+          assert(emptyChild == (children[i].ref == NodeRef::emptyNode));
+        }
+#endif
         AABBNodeMB_t* node = ref.getAABBNodeMB();
         
         LBBox3fa bounds = empty;
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
index 46a81d7581..3b966fd054 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_aabb_mb4d.h
@@ -41,6 +41,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i].ref == NodeRef::emptyNode);
+          assert(emptyChild == (children[i].ref == NodeRef::emptyNode));
+        }
+#endif
         if (likely(ref.isAABBNodeMB())) {
           for (size_t i=0; i<num; i++)
             ref.getAABBNodeMB()->set(i, children[i]);
diff --git a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
index 2afc8c98e7..99671ddc5a 100644
--- a/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
+++ b/thirdparty/embree/kernels/bvh/bvh_node_qaabb.h
@@ -190,6 +190,14 @@ namespace embree
       template<typename BuildRecord>
       __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
       {
+#if defined(DEBUG)
+        // check that empty children are only at the end of the child list
+        bool emptyChild = false;
+        for (size_t i=0; i<num; i++) {
+          emptyChild |= (children[i] == NodeRef::emptyNode);
+          assert(emptyChild == (children[i] == NodeRef::emptyNode));
+        }
+#endif
         QuantizedNode_t* node = ref.quantizedNode();
         for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
         return ref;
diff --git a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
index d857ff7d95..57f75bfd7e 100644
--- a/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
+++ b/thirdparty/embree/kernels/bvh/bvh_statistics.cpp
@@ -162,7 +162,7 @@ namespace embree
   template class BVHNStatistics<8>;
 #endif
 
-#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)
+#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
   template class BVHNStatistics<4>;
 #endif
 }
diff --git a/thirdparty/embree/kernels/bvh/node_intersector1.h b/thirdparty/embree/kernels/bvh/node_intersector1.h
index 1ec4fc63fc..17641fa888 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector1.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector1.h
@@ -5,6 +5,15 @@
 
 #include "node_intersector.h"
 
+#if defined(__AVX2__)
+#define __FMA_X4__
+#endif
+
+#if defined(__aarch64__)
+#define __FMA_X4__
+#endif
+
+
 namespace embree
 {
   namespace isa
@@ -29,9 +38,15 @@ namespace embree
         org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
         dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
         rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
         const Vec3fa ray_org_rdir = ray_org*ray_rdir;
+#if !defined(__aarch64__)
         org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
+#else
+          // aarch64 has no instruction equivalent to msub, so we store the negated org*rdir and use madd;
+          // x86 uses msub
+        neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
+#endif
 #endif
         nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
         nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
@@ -49,8 +64,12 @@ namespace embree
         org  = Vec3vf<N>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
         dir  = Vec3vf<N>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
         rdir = Vec3vf<N>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
-#if defined(__AVX2__) || defined(__ARM_NEON)
-	org_rdir = org*rdir;
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+        org_rdir = org*rdir;
+#else
+        neg_org_rdir = -(org*rdir);
+#endif
 #endif
 	nearX = nearXYZ.x[k];
 	nearY = nearXYZ.y[k];
@@ -62,8 +81,14 @@ namespace embree
 
       Vec3fa org_xyz, dir_xyz;
       Vec3vf<N> org, dir, rdir;
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
       Vec3vf<N> org_rdir;
+#else
+        // the aarch64 version keeps the negation of org_rdir and uses madd;
+        // x86 uses msub
+      Vec3vf<N> neg_org_rdir;
+#endif
 #endif
       size_t nearX, nearY, nearZ;
       size_t farX, farY, farZ;
@@ -404,13 +429,22 @@ namespace embree
     template<>
       __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist)
     {
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
       const vfloat4 tFarX  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tFarY  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tFarZ  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
       const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@@ -450,13 +484,23 @@ namespace embree
     template<>
       __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist)
     {
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+      const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
       const vfloat8 tFarX  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tFarY  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tFarZ  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+
 #else
       const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
       const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
@@ -522,13 +566,22 @@ namespace embree
       const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
       const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
       const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
       const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
       const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@@ -537,7 +590,7 @@ namespace embree
       const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
       const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
 #endif
-#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
       const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
       const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
       const vbool<N> vmask = asInt(tNear) > asInt(tFar);
@@ -598,13 +651,22 @@ namespace embree
       const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
       const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
       const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
-#if defined (__AVX2__) || defined(__ARM_NEON)
+#if defined (__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
       const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
       const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
@@ -613,7 +675,7 @@ namespace embree
       const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
       const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
 #endif
-#if defined(__AVX2__) && !defined(__AVX512F__)
+#if defined(__FMA_X4__) && !defined(__AVX512F__)
       const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
       const vfloat<N> tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
 #else
@@ -687,13 +749,22 @@ namespace embree
       const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
       const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ  >> 2),scale_z,start_z);
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
       const vfloat4 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat4 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat4 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
       const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@@ -703,7 +774,7 @@ namespace embree
       const vfloat4 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
 #endif
       
-#if defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+#if defined(__aarch64__) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
       const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
       const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
       const vbool4 vmask = asInt(tNear) > asInt(tFar);
@@ -775,13 +846,22 @@ namespace embree
       const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
       const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+      const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
       const vfloat8 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat8 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat8 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
       const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
@@ -857,13 +937,22 @@ namespace embree
       const vfloat<N> upper_y   = node->dequantizeUpperY(time);
       const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
       const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
       const vfloat<N> tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<N> tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<N> tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
 #else
       const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
       const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
index 1f7215e5df..cad4e6de2d 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector_frustum.h
@@ -75,9 +75,13 @@ namespace embree
         min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
         max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
 
+#if defined (__aarch64__)
+        neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org));
+        neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
+#else
         min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org);
         max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
-
+#endif
         min_dist = reduced_min_dist;
         max_dist = reduced_max_dist;
 
@@ -95,9 +99,13 @@ namespace embree
       Vec3fa min_rdir;
       Vec3fa max_rdir;
 
+#if defined (__aarch64__)
+      Vec3fa neg_min_org_rdir;
+      Vec3fa neg_max_org_rdir;
+#else
       Vec3fa min_org_rdir;
       Vec3fa max_org_rdir;
-
+#endif
       float min_dist;
       float max_dist;
     };
@@ -191,13 +199,21 @@ namespace embree
       const vfloat<N> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
       const vfloat<N> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
 
+#if defined (__aarch64__)
+      const vfloat<N> fminX = madd(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.neg_min_org_rdir.x));
+      const vfloat<N> fminY = madd(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.neg_min_org_rdir.y));
+      const vfloat<N> fminZ = madd(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.neg_min_org_rdir.z));
+      const vfloat<N> fmaxX = madd(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.neg_max_org_rdir.x));
+      const vfloat<N> fmaxY = madd(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.neg_max_org_rdir.y));
+      const vfloat<N> fmaxZ = madd(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.neg_max_org_rdir.z));
+#else
       const vfloat<N> fminX = msub(bminX, vfloat<N>(frustum.min_rdir.x), vfloat<N>(frustum.min_org_rdir.x));
       const vfloat<N> fminY = msub(bminY, vfloat<N>(frustum.min_rdir.y), vfloat<N>(frustum.min_org_rdir.y));
       const vfloat<N> fminZ = msub(bminZ, vfloat<N>(frustum.min_rdir.z), vfloat<N>(frustum.min_org_rdir.z));
       const vfloat<N> fmaxX = msub(bmaxX, vfloat<N>(frustum.max_rdir.x), vfloat<N>(frustum.max_org_rdir.x));
       const vfloat<N> fmaxY = msub(bmaxY, vfloat<N>(frustum.max_rdir.y), vfloat<N>(frustum.max_org_rdir.y));
       const vfloat<N> fmaxZ = msub(bmaxZ, vfloat<N>(frustum.max_rdir.z), vfloat<N>(frustum.max_org_rdir.z));
-
+#endif
       const vfloat<N> fmin  = maxi(fminX, fminY, fminZ, vfloat<N>(frustum.min_dist));
       dist = fmin;
       const vfloat<N> fmax  = mini(fmaxX, fmaxY, fmaxZ, vfloat<N>(frustum.max_dist));
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet.h b/thirdparty/embree/kernels/bvh/node_intersector_packet.h
index d5498fc5db..4deacd620d 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector_packet.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector_packet.h
@@ -39,7 +39,9 @@ namespace embree
         org = ray_org;
         dir = ray_dir;
         rdir = rcp_safe(ray_dir);
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+        neg_org_rdir = -(org * rdir);
+#elif defined(__AVX2__)
         org_rdir = org * rdir;
 #endif
 
@@ -55,7 +57,9 @@ namespace embree
       Vec3vf<K> org;
       Vec3vf<K> dir;
       Vec3vf<K> rdir;
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir;
+#elif defined(__AVX2__)
       Vec3vf<K> org_rdir;
 #endif
       Vec3vi<K> nearXYZ;
@@ -119,7 +123,14 @@ namespace embree
                                          const TravRayKFast<K>& ray, vfloat<K>& dist)
 
     {
-  #if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
@@ -199,7 +210,14 @@ namespace embree
       const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
       const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@@ -302,7 +320,14 @@ namespace embree
       const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
       const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
@@ -464,7 +489,14 @@ namespace embree
       const vfloat<N> lower_z = node->dequantizeLowerZ();
       const vfloat<N> upper_z = node->dequantizeUpperZ();
 
-  #if defined(__AVX2__) || defined(__ARM_NEON)
+  #if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+  #elif defined(__AVX2__)
       const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
@@ -549,7 +581,14 @@ namespace embree
         const vfloat<K> lower_z = node->template dequantizeLowerZ<K>(i,time);
         const vfloat<K> upper_z = node->template dequantizeUpperZ<K>(i,time);
         
-#if defined(__AVX2__) || defined(__ARM_NEON)
+#if defined(__aarch64__)
+        const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+        const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
         const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
         const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
         const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
diff --git a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
index 55b2c27231..943fd7043f 100644
--- a/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
+++ b/thirdparty/embree/kernels/bvh/node_intersector_packet_stream.h
@@ -32,11 +32,19 @@ namespace embree
       __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
       {
         rdir = rcp_safe(ray_dir);
+#if defined(__aarch64__)
+        neg_org_rdir = -(ray_org * rdir);
+#else
         org_rdir = ray_org * rdir;
+#endif
       }
 
       Vec3vf<K> rdir;
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir;
+#else
       Vec3vf<K> org_rdir;
+#endif
       vfloat<K> tnear;
       vfloat<K> tfar;
     };
@@ -87,12 +95,21 @@ namespace embree
       const vfloat<N> bmaxY = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
       const vfloat<N> bmaxZ = vfloat<N>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
 
+#if defined (__aarch64__)
+      const vfloat<N> rminX = madd(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
+      const vfloat<N> rminY = madd(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
+      const vfloat<N> rminZ = madd(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
+      const vfloat<N> rmaxX = madd(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.neg_org_rdir.x[k]));
+      const vfloat<N> rmaxY = madd(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.neg_org_rdir.y[k]));
+      const vfloat<N> rmaxZ = madd(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.neg_org_rdir.z[k]));
+#else
       const vfloat<N> rminX = msub(bminX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
       const vfloat<N> rminY = msub(bminY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
       const vfloat<N> rminZ = msub(bminZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
       const vfloat<N> rmaxX = msub(bmaxX, vfloat<N>(ray.rdir.x[k]), vfloat<N>(ray.org_rdir.x[k]));
       const vfloat<N> rmaxY = msub(bmaxY, vfloat<N>(ray.rdir.y[k]), vfloat<N>(ray.org_rdir.y[k]));
       const vfloat<N> rmaxZ = msub(bmaxZ, vfloat<N>(ray.rdir.z[k]), vfloat<N>(ray.org_rdir.z[k]));
+#endif
       const vfloat<N> rmin  = maxi(rminX, rminY, rminZ, vfloat<N>(ray.tnear[k]));
       const vfloat<N> rmax  = mini(rmaxX, rmaxY, rmaxZ, vfloat<N>(ray.tfar[k]));
 
@@ -113,12 +130,21 @@ namespace embree
       const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
       const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
 
+#if defined (__aarch64__)
+      const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
+#else
       const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
       const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
       const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
       const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
+#endif
 
       const vfloat<K> rmin  = maxi(rminX, rminY, rminZ, ray.tnear);
       const vfloat<K> rmax  = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
diff --git a/thirdparty/embree/kernels/common/accel.h b/thirdparty/embree/kernels/common/accel.h
index cc4ea1805b..d24326ce92 100644
--- a/thirdparty/embree/kernels/common/accel.h
+++ b/thirdparty/embree/kernels/common/accel.h
@@ -332,7 +332,7 @@ namespace embree
         intersectorN.intersect(this,rayN,N,context);
       }
       
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
       __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
         const vint<4> mask = valid.mask32();
         intersect4(&mask,(RTCRayHit4&)ray,context);
@@ -388,7 +388,7 @@ namespace embree
         intersectorN.occluded(this,rayN,N,context);
       }
       
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
       __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
         const vint<4> mask = valid.mask32();
         occluded4(&mask,(RTCRay4&)ray,context);
diff --git a/thirdparty/embree/kernels/common/acceln.cpp b/thirdparty/embree/kernels/common/acceln.cpp
index 32a27c560a..111c62083d 100644
--- a/thirdparty/embree/kernels/common/acceln.cpp
+++ b/thirdparty/embree/kernels/common/acceln.cpp
@@ -97,7 +97,7 @@ namespace embree
     for (size_t i=0; i<This->accels.size(); i++) {
       if (This->accels[i]->isEmpty()) continue;
       This->accels[i]->intersectors.occluded4(valid,ray,context);
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
       vbool4 valid0 = asBool(((vint4*)valid)[0]);
       vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
       if (unlikely(none(valid0 & hit0))) break;
@@ -111,7 +111,7 @@ namespace embree
     for (size_t i=0; i<This->accels.size(); i++) {
       if (This->accels[i]->isEmpty()) continue;
       This->accels[i]->intersectors.occluded8(valid,ray,context);
-#if defined(__SSE2__) // FIXME: use higher ISA
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
       vbool4 valid0 = asBool(((vint4*)valid)[0]);
       vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
       vbool4 valid1 = asBool(((vint4*)valid)[1]);
@@ -127,7 +127,7 @@ namespace embree
     for (size_t i=0; i<This->accels.size(); i++) {
       if (This->accels[i]->isEmpty()) continue;
       This->accels[i]->intersectors.occluded16(valid,ray,context);
-#if defined(__SSE2__) // FIXME: use higher ISA
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
       vbool4 valid0 = asBool(((vint4*)valid)[0]);
       vbool4 hit0   = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
       vbool4 valid1 = asBool(((vint4*)valid)[1]);
diff --git a/thirdparty/embree/kernels/common/accelset.h b/thirdparty/embree/kernels/common/accelset.h
index 90b184a07b..1b67120c97 100644
--- a/thirdparty/embree/kernels/common/accelset.h
+++ b/thirdparty/embree/kernels/common/accelset.h
@@ -14,21 +14,14 @@ namespace embree
   struct IntersectFunctionNArguments;
   struct OccludedFunctionNArguments;
   
-  typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
-  typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
-  
   struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
   {
-    IntersectContext* internal_context;
     Geometry* geometry;
-    ReportIntersectionFunc report;
   };
 
   struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
   {
-    IntersectContext* internal_context;
     Geometry* geometry;
-    ReportOcclusionFunc report;
   };
 
   /*! Base class for set of acceleration structures. */
@@ -145,7 +138,7 @@ namespace embree
   public:
 
       /*! Intersects a single ray with the scene. */
-      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) 
       {
         assert(primID < size());
         assert(intersectorN.intersect);
@@ -159,15 +152,13 @@ namespace embree
         args.N = 1;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
         
         intersectorN.intersect(&args);
       }
 
       /*! Tests if single ray is occluded by the scene. */
-      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
       {
         assert(primID < size());
         assert(intersectorN.occluded);
@@ -181,16 +172,14 @@ namespace embree
         args.N = 1;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
         
         intersectorN.occluded(&args);
       }
    
       /*! Intersects a packet of K rays with the scene. */
       template<int K>
-        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report) 
+        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context) 
       {
         assert(primID < size());
         assert(intersectorN.intersect);
@@ -204,16 +193,14 @@ namespace embree
         args.N = K;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
          
         intersectorN.intersect(&args);
       }
 
       /*! Tests if a packet of K rays is occluded by the scene. */
       template<int K>
-        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context)
       {
         assert(primID < size());
         assert(intersectorN.occluded);
@@ -227,9 +214,7 @@ namespace embree
         args.N = K;
         args.geomID = geomID;
         args.primID = primID;
-        args.internal_context = context;
         args.geometry = this;
-        args.report = report;
         
         intersectorN.occluded(&args);
       }
diff --git a/thirdparty/embree/kernels/common/alloc.cpp b/thirdparty/embree/kernels/common/alloc.cpp
index 1a0e1aeed3..38a76225f4 100644
--- a/thirdparty/embree/kernels/common/alloc.cpp
+++ b/thirdparty/embree/kernels/common/alloc.cpp
@@ -3,6 +3,9 @@
 
 #include "alloc.h"
 #include "../../common/sys/thread.h"
+#if defined(APPLE) && defined(__aarch64__)
+#include "../../common/sys/barrier.h"
+#endif
 
 namespace embree
 {
diff --git a/thirdparty/embree/kernels/common/alloc.h b/thirdparty/embree/kernels/common/alloc.h
index 4458e35c24..12769df2c8 100644
--- a/thirdparty/embree/kernels/common/alloc.h
+++ b/thirdparty/embree/kernels/common/alloc.h
@@ -8,6 +8,10 @@
 #include "scene.h"
 #include "primref.h"
 
+#if defined(APPLE) && defined(__aarch64__)
+#include <mutex>
+#endif
+
 namespace embree
 {
   class FastAllocator
@@ -26,7 +30,7 @@ namespace embree
   public:
 
     struct ThreadLocal2;
-    enum AllocationType { ALIGNED_MALLOC, OS_MALLOC, SHARED, ANY_TYPE };
+    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
 
     /*! Per thread structure holding the current memory block. */
     struct __aligned(64) ThreadLocal
@@ -132,7 +136,11 @@ namespace embree
       {
         assert(alloc_i);
         if (alloc.load() == alloc_i) return;
+#if defined(APPLE) && defined(__aarch64__)
+        std::scoped_lock lock(mutex);
+#else
         Lock<SpinLock> lock(mutex);
+#endif
         //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
         if (alloc.load()) {
           alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
@@ -150,7 +158,11 @@ namespace embree
       {
         assert(alloc_i);
         if (alloc.load() != alloc_i) return;
+#if defined(APPLE) && defined(__aarch64__)
+        std::scoped_lock lock(mutex);
+#else
         Lock<SpinLock> lock(mutex);
+#endif
         if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
         alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
         alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
@@ -161,7 +173,11 @@ namespace embree
       }
 
     public:
+#if defined(APPLE) && defined(__aarch64__)
+      std::mutex mutex;
+#else
       SpinLock mutex;        //!< required as unbind is called from other threads
+#endif
       std::atomic<FastAllocator*> alloc;  //!< parent allocator
       ThreadLocal alloc0;
       ThreadLocal alloc1;
@@ -169,7 +185,7 @@ namespace embree
 
     FastAllocator (Device* device, bool osAllocation) 
       : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
-        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? OS_MALLOC : ALIGNED_MALLOC),
+        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
         primrefarray(device,0)
     {
       for (size_t i=0; i<MAX_THREAD_USED_BLOCK_SLOTS; i++)
@@ -206,7 +222,7 @@ namespace embree
 
     void setOSallocation(bool flag)
     {
-      atype = flag ? OS_MALLOC : ALIGNED_MALLOC;
+      atype = flag ? EMBREE_OS_MALLOC : ALIGNED_MALLOC;
     }
 
   private:
@@ -217,7 +233,11 @@ namespace embree
       ThreadLocal2* alloc = thread_local_allocator2;
       if (alloc == nullptr) {
         thread_local_allocator2 = alloc = new ThreadLocal2;
+#if defined(APPLE) && defined(__aarch64__)
+        std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
         Lock<SpinLock> lock(s_thread_local_allocators_lock);
+#endif
         s_thread_local_allocators.push_back(make_unique(alloc));
       }
       return alloc;
@@ -227,7 +247,11 @@ namespace embree
 
     __forceinline void join(ThreadLocal2* alloc)
     {
+#if defined(APPLE) && defined(__aarch64__)
+      std::scoped_lock lock(thread_local_allocators_lock); // per-instance lock guarding thread_local_allocators, same lock as the #else branch
+#else
       Lock<SpinLock> lock(thread_local_allocators_lock);
+#endif
       thread_local_allocators.push_back(alloc);
     }
 
@@ -492,7 +516,11 @@ namespace embree
         /* parallel block creation in case of no freeBlocks, avoids single global mutex */
         if (likely(freeBlocks.load() == nullptr))
         {
+#if defined(APPLE) && defined(__aarch64__)
+          std::scoped_lock lock(slotMutex[slot]);
+#else
           Lock<SpinLock> lock(slotMutex[slot]);
+#endif
           if (myUsedBlocks == threadUsedBlocks[slot]) {
             const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
             const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
@@ -505,7 +533,11 @@ namespace embree
 
         /* if this fails allocate new block */
         {
-          Lock<SpinLock> lock(mutex);
+#if defined(APPLE) && defined(__aarch64__)
+            std::scoped_lock lock(mutex);
+#else
+            Lock<SpinLock> lock(mutex);
+#endif
 	  if (myUsedBlocks == threadUsedBlocks[slot])
 	  {
             if (freeBlocks.load() != nullptr) {
@@ -527,7 +559,11 @@ namespace embree
     /*! add new block */
     void addBlock(void* ptr, ssize_t bytes)
     {
+#if defined(APPLE) && defined(__aarch64__)
+      std::scoped_lock lock(mutex);
+#else
       Lock<SpinLock> lock(mutex);
+#endif
       const size_t sizeof_Header = offsetof(Block,data[0]);
       void* aptr = (void*) ((((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1));
       size_t ofs = (size_t) aptr - (size_t) ptr;
@@ -613,8 +649,8 @@ namespace embree
         bytesWasted(alloc->bytesWasted),
         stat_all(alloc,ANY_TYPE),
         stat_malloc(alloc,ALIGNED_MALLOC),
-        stat_4K(alloc,OS_MALLOC,false),
-        stat_2M(alloc,OS_MALLOC,true),
+        stat_4K(alloc,EMBREE_OS_MALLOC,false),
+        stat_2M(alloc,EMBREE_OS_MALLOC,true),
         stat_shared(alloc,SHARED) {}
 
       AllStatistics (size_t bytesUsed,
@@ -707,7 +743,7 @@ namespace embree
         /* We avoid using os_malloc for small blocks as this could
          * cause a risk of fragmenting the virtual address space and
          * reach the limit of vm.max_map_count = 65k under Linux. */
-        if (atype == OS_MALLOC && bytesAllocate < maxAllocationSize)
+        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
           atype = ALIGNED_MALLOC;
 
         /* we need to additionally allocate some header */
@@ -716,7 +752,7 @@ namespace embree
         bytesReserve  = sizeof_Header+bytesReserve;
 
         /* consume full 4k pages with using os_malloc */
-        if (atype == OS_MALLOC) {
+        if (atype == EMBREE_OS_MALLOC) {
           bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
           bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
         }
@@ -748,11 +784,11 @@ namespace embree
             return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
           }
         }
-        else if (atype == OS_MALLOC)
+        else if (atype == EMBREE_OS_MALLOC)
         {
           if (device) device->memoryMonitor(bytesAllocate,false);
           bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
-          return new (ptr) Block(OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
         }
         else
           assert(false);
@@ -796,7 +832,7 @@ namespace embree
           if (device) device->memoryMonitor(-sizeof_Alloced,true);
         }
 
-        else if (atype == OS_MALLOC) {
+        else if (atype == EMBREE_OS_MALLOC) {
          size_t sizeof_This = sizeof_Header+reserveEnd;
          os_free(this,sizeof_This,huge_pages);
          if (device) device->memoryMonitor(-sizeof_Alloced,true);
@@ -857,7 +893,7 @@ namespace embree
       bool hasType(AllocationType atype_i, bool huge_pages_i) const
       {
         if      (atype_i == ANY_TYPE ) return true;
-        else if (atype   == OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+        else if (atype   == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
         else                           return atype_i == atype;
       }
 
@@ -906,7 +942,7 @@ namespace embree
       void print_block() const
       {
         if (atype == ALIGNED_MALLOC) std::cout << "A";
-        else if (atype == OS_MALLOC) std::cout << "O";
+        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
         else if (atype == SHARED) std::cout << "S";
         if (huge_pages) std::cout << "H";
         size_t bytesUsed = getBlockUsedBytes();
@@ -936,7 +972,11 @@ namespace embree
     std::atomic<Block*> freeBlocks;
 
     std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS];
-    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#if defined(APPLE) && defined(__aarch64__)
+    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#else
+    PaddedSpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#endif
 
     bool use_single_mode;
     size_t defaultBlockSize;
@@ -950,7 +990,11 @@ namespace embree
     static __thread ThreadLocal2* thread_local_allocator2;
     static SpinLock s_thread_local_allocators_lock;
     static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators;
+#if defined(APPLE) && defined(__aarch64__)
+    std::mutex thread_local_allocators_lock;
+#else
     SpinLock thread_local_allocators_lock;
+#endif
     std::vector<ThreadLocal2*> thread_local_allocators;
     AllocationType atype;
     mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
diff --git a/thirdparty/embree/kernels/common/device.cpp b/thirdparty/embree/kernels/common/device.cpp
index 068e0c2983..833ec65139 100644
--- a/thirdparty/embree/kernels/common/device.cpp
+++ b/thirdparty/embree/kernels/common/device.cpp
@@ -66,7 +66,11 @@ namespace embree
     case CPU::CORE1:           frequency_level = FREQUENCY_SIMD128; break;
     case CPU::XEON_PHI_KNIGHTS_MILL   : frequency_level = FREQUENCY_SIMD512; break;
     case CPU::XEON_PHI_KNIGHTS_LANDING: frequency_level = FREQUENCY_SIMD512; break;
+#if defined(__APPLE__)
+    case CPU::ARM:             frequency_level = FREQUENCY_SIMD256; break; // Apple M1 supports high throughput for SIMD4
+#else
     case CPU::ARM:             frequency_level = FREQUENCY_SIMD128; break;
+#endif
     }
 
     /* initialize global state */
diff --git a/thirdparty/embree/kernels/common/geometry.h b/thirdparty/embree/kernels/common/geometry.h
index 2f9f2e7c94..593990f5b1 100644
--- a/thirdparty/embree/kernels/common/geometry.h
+++ b/thirdparty/embree/kernels/common/geometry.h
@@ -91,7 +91,7 @@ namespace embree
 
     size_t numFilterFunctions;       //!< number of geometries with filter functions enabled
     size_t numTriangles;             //!< number of enabled triangles
-    size_t numMBTriangles;           //!< number of enabled motion blured triangles
+    size_t numMBTriangles;           //!< number of enabled motion blurred triangles
     size_t numQuads;                 //!< number of enabled quads
     size_t numMBQuads;               //!< number of enabled motion blurred quads
     size_t numBezierCurves;          //!< number of enabled curves
@@ -99,7 +99,7 @@ namespace embree
     size_t numLineSegments;          //!< number of enabled line segments
     size_t numMBLineSegments;        //!< number of enabled line motion blurred segments
     size_t numSubdivPatches;         //!< number of enabled subdivision patches
-    size_t numMBSubdivPatches;       //!< number of enabled motion blured subdivision patches
+    size_t numMBSubdivPatches;       //!< number of enabled motion blurred subdivision patches
     size_t numUserGeometries;        //!< number of enabled user geometries
     size_t numMBUserGeometries;      //!< number of enabled motion blurred user geometries
     size_t numInstancesCheap;        //!< number of enabled cheap instances
diff --git a/thirdparty/embree/kernels/common/isa.h b/thirdparty/embree/kernels/common/isa.h
index ae6556336c..9e1132e1a0 100644
--- a/thirdparty/embree/kernels/common/isa.h
+++ b/thirdparty/embree/kernels/common/isa.h
@@ -44,7 +44,7 @@ namespace embree
 #define SELECT_SYMBOL_DEFAULT(features,intersector) \
   intersector = isa::intersector;
 
-#if defined(__SSE__)
+#if defined(__SSE__) || defined(__ARM_NEON)
 #if !defined(EMBREE_TARGET_SIMD4)
 #define EMBREE_TARGET_SIMD4
 #endif
diff --git a/thirdparty/embree/kernels/common/ray.h b/thirdparty/embree/kernels/common/ray.h
index 7b951cc1e8..3c8ee3989c 100644
--- a/thirdparty/embree/kernels/common/ray.h
+++ b/thirdparty/embree/kernels/common/ray.h
@@ -6,7 +6,7 @@
 #include "default.h"
 #include "instance_stack.h"
 
-// FIXME: if ray gets seperated into ray* and hit, uload4 needs to be adjusted
+// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted
 
 namespace embree
 {
diff --git a/thirdparty/embree/kernels/common/rtcore.cpp b/thirdparty/embree/kernels/common/rtcore.cpp
index 94b3819e42..a6ea55bfc4 100644
--- a/thirdparty/embree/kernels/common/rtcore.cpp
+++ b/thirdparty/embree/kernels/common/rtcore.cpp
@@ -7,6 +7,7 @@
 #include "device.h"
 #include "scene.h"
 #include "context.h"
+#include "../geometry/filter.h"
 #include "../../include/embree3/rtcore_ray.h"
 using namespace embree;
 
@@ -482,7 +483,7 @@ RTC_NAMESPACE_BEGIN;
 
     IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-    Ray4* ray4 = (Ray4*) rayhit;
+    RayHit4* ray4 = (RayHit4*) rayhit;
     for (size_t i=0; i<4; i++) {
       if (!valid[i]) continue;
       RayHit ray1; ray4->get(i,ray1);
@@ -513,7 +514,7 @@ RTC_NAMESPACE_BEGIN;
 
     IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-    Ray8* ray8 = (Ray8*) rayhit;
+    RayHit8* ray8 = (RayHit8*) rayhit;
     for (size_t i=0; i<8; i++) {
       if (!valid[i]) continue;
       RayHit ray1; ray8->get(i,ray1);
@@ -546,7 +547,7 @@ RTC_NAMESPACE_BEGIN;
 
     IntersectContext context(scene,user_context);
 #if !defined(EMBREE_RAY_PACKETS)
-    Ray16* ray16 = (Ray16*) rayhit;
+    RayHit16* ray16 = (RayHit16*) rayhit;
     for (size_t i=0; i<16; i++) {
       if (!valid[i]) continue;
       RayHit ray1; ray16->get(i,ray1);
@@ -1097,13 +1098,13 @@ RTC_NAMESPACE_BEGIN;
   RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
   {
     IntersectFunctionNArguments* args = (IntersectFunctionNArguments*) args_i;
-    args->report(args,filter_args);
+    isa::reportIntersection1(args, filter_args);
   }
 
   RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
   {
     OccludedFunctionNArguments* args = (OccludedFunctionNArguments*) args_i;
-    args->report(args,filter_args);
+    isa::reportOcclusion1(args,filter_args);
   }
   
   RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
@@ -1763,4 +1764,19 @@ RTC_NAMESPACE_BEGIN;
     return nullptr;
   }
 
+  RTC_API RTCGeometry rtcGetGeometryThreadSafe (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryThreadSafe);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+#endif
+    Ref<Geometry> geom = scene->get_locked(geomID);
+    return (RTCGeometry) geom.ptr; 
+    RTC_CATCH_END2(scene);
+    return nullptr;
+  }
+
 RTC_NAMESPACE_END
diff --git a/thirdparty/embree/kernels/common/rtcore.h b/thirdparty/embree/kernels/common/rtcore.h
index f8aad7c7cb..ac58a84d6f 100644
--- a/thirdparty/embree/kernels/common/rtcore.h
+++ b/thirdparty/embree/kernels/common/rtcore.h
@@ -26,56 +26,59 @@ namespace embree
 
 /*! Macros used in the rtcore API implementation */
 // -- GODOT start --
-// #define RTC_CATCH_BEGIN try {
 #define RTC_CATCH_BEGIN
-  
-// #define RTC_CATCH_END(device)                                                \
-//   } catch (std::bad_alloc&) {                                                   \
-//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-//   } catch (rtcore_error& e) {                                                   \
-//     Device::process_error(device,e.error,e.what());                             \
-//   } catch (std::exception& e) {                                                 \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-//   } catch (...) {                                                               \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-//   }
 #define RTC_CATCH_END(device)
-  
-// #define RTC_CATCH_END2(scene)                                                \
-//   } catch (std::bad_alloc&) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-//   } catch (rtcore_error& e) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,e.error,e.what());                             \
-//   } catch (std::exception& e) {                                                 \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-//   } catch (...) {                                                               \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-//   }
 #define RTC_CATCH_END2(scene)
-
-// #define RTC_CATCH_END2_FALSE(scene)                                             \
-//   } catch (std::bad_alloc&) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
-//     return false;                                                               \
-//   } catch (rtcore_error& e) {                                                   \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,e.error,e.what());                             \
-//     return false;                                                               \
-//   } catch (std::exception& e) {                                                 \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
-//     return false;                                                               \
-//   } catch (...) {                                                               \
-//     Device* device = scene ? scene->device : nullptr;                           \
-//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
-//     return false;                                                               \
-//   }
 #define RTC_CATCH_END2_FALSE(scene) return false;
+
+#if 0
+#define RTC_CATCH_BEGIN try {
+  
+#define RTC_CATCH_END(device)                                                \
+  } catch (std::bad_alloc&) {                                                   \
+    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+  } catch (rtcore_error& e) {                                                   \
+    Device::process_error(device,e.error,e.what());                             \
+  } catch (std::exception& e) {                                                 \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+  } catch (...) {                                                               \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+  }
+  
+#define RTC_CATCH_END2(scene)                                                \
+  } catch (std::bad_alloc&) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+  } catch (rtcore_error& e) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,e.error,e.what());                             \
+  } catch (std::exception& e) {                                                 \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+  } catch (...) {                                                               \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+  }
+
+#define RTC_CATCH_END2_FALSE(scene)                                             \
+  } catch (std::bad_alloc&) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+    return false;                                                               \
+  } catch (rtcore_error& e) {                                                   \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,e.error,e.what());                             \
+    return false;                                                               \
+  } catch (std::exception& e) {                                                 \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+    return false;                                                               \
+  } catch (...) {                                                               \
+    Device* device = scene ? scene->device : nullptr;                           \
+    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+    return false;                                                               \
+  }
+#endif
 // -- GODOT end --
 
 #define RTC_VERIFY_HANDLE(handle)                               \
@@ -103,39 +106,35 @@ namespace embree
 #define RTC_TRACE(x) 
 #endif
 
-// -- GODOT begin --
-//   /*! used to throw embree API errors */
-//   struct rtcore_error : public std::exception
-//   {
-//     __forceinline rtcore_error(RTCError error, const std::string& str)
-//       : error(error), str(str) {}
-//     
-//     ~rtcore_error() throw() {}
-//     
-//     const char* what () const throw () {
-//       return str.c_str();
-//     }
-//     
-//     RTCError error;
-//     std::string str;
-//   };
-// -- GODOT end --
+// -- GODOT start --
+#if 0
+  /*! used to throw embree API errors */
+  struct rtcore_error : public std::exception
+  {
+    __forceinline rtcore_error(RTCError error, const std::string& str)
+      : error(error), str(str) {}
+    
+    ~rtcore_error() throw() {}
+    
+    const char* what () const throw () {
+      return str.c_str();
+    }
+    
+    RTCError error;
+    std::string str;
+  };
+#endif
 
 #if defined(DEBUG) // only report file and line in debug mode
-  // -- GODOT begin --
-  // #define throw_RTCError(error,str) \
-  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
   #define throw_RTCError(error,str) \
     printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
-  // -- GODOT end --
+    // throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
 #else
-  // -- GODOT begin --
-  // #define throw_RTCError(error,str) \
-  //   throw rtcore_error(error,str);
   #define throw_RTCError(error,str) \
     abort();
-  // -- GODOT end --
+    // throw rtcore_error(error,str);
 #endif
+// -- GODOT end --
 
 #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
   (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
diff --git a/thirdparty/embree/kernels/common/rtcore_builder.cpp b/thirdparty/embree/kernels/common/rtcore_builder.cpp
index 1f1b6f6ddf..29e3bdca20 100644
--- a/thirdparty/embree/kernels/common/rtcore_builder.cpp
+++ b/thirdparty/embree/kernels/common/rtcore_builder.cpp
@@ -371,7 +371,7 @@ RTC_NAMESPACE_BEGIN
       bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
       bvh->allocator.reset();
 
-      /* switch between differnet builders based on quality level */
+      /* switch between different builders based on quality level */
       if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
         return rtcBuildBVHMorton(arguments);
       else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
diff --git a/thirdparty/embree/kernels/common/scene.cpp b/thirdparty/embree/kernels/common/scene.cpp
index 408d7eae6f..65d31d0f81 100644
--- a/thirdparty/embree/kernels/common/scene.cpp
+++ b/thirdparty/embree/kernels/common/scene.cpp
@@ -629,9 +629,7 @@ namespace embree
     if (geometry == null)
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
     
-    if (geometry->isEnabled()) {
-      setModified ();
-    }
+    setModified ();
     accels_deleteGeometry(unsigned(geomID));
     id_pool.deallocate((unsigned)geomID);
     geometries[geomID] = null;
diff --git a/thirdparty/embree/kernels/common/scene_curves.h b/thirdparty/embree/kernels/common/scene_curves.h
index a5a39e42d4..a1ea45d3c7 100644
--- a/thirdparty/embree/kernels/common/scene_curves.h
+++ b/thirdparty/embree/kernels/common/scene_curves.h
@@ -452,6 +452,10 @@ namespace embree
           const Vec3fa n1 = normal(index+1,itime);
           if (!isvalid(n0) || !isvalid(n1))
             return false;
+
+	  const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds();
+	  if (!isvalid(b))
+	    return false;
         }
       }
       
@@ -612,6 +616,10 @@ namespace embree
           const Vec3fa dn1 = dnormal(index+1,itime);
           if (!isvalid(dn0) || !isvalid(dn1))
             return false;
+
+	  const BBox3fa b = getOrientedCurveScaledRadius(i,itime).accurateBounds();
+	  if (!isvalid(b))
+	    return false;
         }
       }
       
diff --git a/thirdparty/embree/kernels/common/state.cpp b/thirdparty/embree/kernels/common/state.cpp
index 01c862da0c..db6b803041 100644
--- a/thirdparty/embree/kernels/common/state.cpp
+++ b/thirdparty/embree/kernels/common/state.cpp
@@ -144,7 +144,20 @@ namespace embree
   }
 
   bool State::checkISASupport() {
+#if defined(__ARM_NEON)
+    /*
+     * NEON CPU type is a mixture of NEON and SSE2
+     */
+
+    bool hasSSE2 = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_SSE2;
+
+    /* this will be true when the Device is explicitly initialized with the `isa=neon` config */
+    bool hasNEON = (getCPUFeatures() & enabled_cpu_features) & CPU_FEATURE_NEON;
+
+    return hasSSE2 || hasNEON;
+#else
     return (getCPUFeatures() & enabled_cpu_features) == enabled_cpu_features;
+#endif
   }
   
   void State::verify()
@@ -157,8 +170,10 @@ namespace embree
      * functions */
 #if defined(DEBUG)
 #if defined(EMBREE_TARGET_SSE2)
+#if !defined(__ARM_NEON)
     assert(sse2::getISA() <= SSE2);
 #endif
+#endif
 #if defined(EMBREE_TARGET_SSE42)
     assert(sse42::getISA() <= SSE42);
 #endif
diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h
index 2bf7e93587..84ac27d103 100644
--- a/thirdparty/embree/kernels/config.h
+++ b/thirdparty/embree/kernels/config.h
@@ -1,5 +1,4 @@
-
-// Copyright 2009-2020 Intel Corporation
+// Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
 /* #undef EMBREE_RAY_MASK */
@@ -20,6 +19,7 @@
 /* #undef EMBREE_COMPACT_POLYS */
 
 #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+#define EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE
 
 #if defined(EMBREE_GEOMETRY_TRIANGLE)
   #define IF_ENABLED_TRIS(x) x
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
index 3d8900c2aa..75532f5ae0 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_oriented.h
@@ -225,7 +225,7 @@ namespace embree
           /* exit if convergence cannot get proven, but terminate if we are very small */
           if (unlikely(!subset(K,x) && !very_small)) return false;
 
-          /* solve using newton raphson iteration of convergence is guarenteed */
+          /* solve using newton raphson iteration if convergence is guaranteed */
           solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J);
           return true;
         }
diff --git a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
index 2d4abd73ac..ed827d583f 100644
--- a/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
+++ b/thirdparty/embree/kernels/geometry/curve_intersector_sweep.h
@@ -60,7 +60,7 @@ namespace embree
       const Vec3fa dir = ray.dir;
       const float length_ray_dir = length(dir);
 
-      /* error of curve evaluations is propertional to largest coordinate */
+      /* error of curve evaluations is proportional to largest coordinate */
       const BBox3ff box = curve.bounds();
       const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper)));
      
diff --git a/thirdparty/embree/kernels/geometry/disc_intersector.h b/thirdparty/embree/kernels/geometry/disc_intersector.h
index 816c066899..ec6fa9c4f3 100644
--- a/thirdparty/embree/kernels/geometry/disc_intersector.h
+++ b/thirdparty/embree/kernels/geometry/disc_intersector.h
@@ -68,15 +68,15 @@ namespace embree
         const Vec3vf<M> center = v0.xyz();
         const vfloat<M> radius = v0.w;
 
+        /* compute ray distance projC0 to hit point with ray oriented plane */
         const Vec3vf<M> c0     = center - ray_org;
         const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
 
         valid &= (vfloat<M>(ray.tnear()) <= projC0) & (projC0 <= vfloat<M>(ray.tfar));
-        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
-          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale;  // ignore self intersections
         if (unlikely(none(valid)))
           return false;
-        
+
+        /* check if hit point lies inside disc */
         const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
         const vfloat<M> l2     = dot(perp, perp);
         const vfloat<M> r2     = radius * radius;
@@ -84,6 +84,15 @@ namespace embree
         if (unlikely(none(valid)))
           return false;
 
+        /* We reject hits where the ray origin lies inside the ray
+         * oriented disc to avoid self intersections. */
+#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE)
+        const vfloat<M> m2 = dot(c0, c0);
+        valid &= (m2 > r2);
+        if (unlikely(none(valid)))
+          return false;
+#endif
+        
         DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
         return epilog(valid, hit);
       }
@@ -152,15 +161,15 @@ namespace embree
         const Vec3vf<M> center = v0.xyz();
         const vfloat<M> radius = v0.w;
 
+        /* compute ray distance projC0 to hit point with ray oriented plane */
         const Vec3vf<M> c0     = center - ray_org;
         const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;
 
         valid &= (vfloat<M>(ray.tnear()[k]) <= projC0) & (projC0 <= vfloat<M>(ray.tfar[k]));
-        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
-          valid &= projC0 > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k];  // ignore self intersections
         if (unlikely(none(valid)))
           return false;
 
+        /* check if hit point lies inside disc */
         const Vec3vf<M> perp   = c0 - projC0 * ray_dir;
         const vfloat<M> l2     = dot(perp, perp);
         const vfloat<M> r2     = radius * radius;
@@ -168,6 +177,15 @@ namespace embree
         if (unlikely(none(valid)))
           return false;
 
+        /* Reject hits where the ray origin lies within one radius of the
+         * ray-oriented disc's center (m2 <= r2) to avoid self-intersections. */
+#if defined(EMBREE_DISC_POINT_SELF_INTERSECTION_AVOIDANCE)
+        const vfloat<M> m2 = dot(c0, c0);
+        valid &= (m2 > r2);
+        if (unlikely(none(valid)))
+          return false;
+#endif
+
         DiscIntersectorHitM<M> hit(zero, zero, projC0, -ray_dir);
         return epilog(valid, hit);
       }
diff --git a/thirdparty/embree/kernels/geometry/filter.h b/thirdparty/embree/kernels/geometry/filter.h
index 3b4d924ea7..d64320bf78 100644
--- a/thirdparty/embree/kernels/geometry/filter.h
+++ b/thirdparty/embree/kernels/geometry/filter.h
@@ -51,20 +51,11 @@ namespace embree
     __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
     {
 #if defined(EMBREE_FILTER_FUNCTION)
-      IntersectContext* MAYBE_UNUSED context = args->internal_context;
-      const Geometry* const geometry = args->geometry;
-      if (geometry->intersectionFilterN) {
-        assert(context->scene->hasGeometryFilterFunction());
-        geometry->intersectionFilterN(filter_args);
-      }
+      if (args->geometry->intersectionFilterN)
+        args->geometry->intersectionFilterN(filter_args);
       
-      //if (args->valid[0] == 0)
-      //  return;
-
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(filter_args);
-      }
+      if (args->context->filter)
+        args->context->filter(filter_args);
 #endif
     }
     
@@ -105,20 +96,11 @@ namespace embree
     __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
     {
 #if defined(EMBREE_FILTER_FUNCTION)
-      IntersectContext* MAYBE_UNUSED context = args->internal_context;
-      const Geometry* const geometry = args->geometry;
-      if (geometry->occlusionFilterN) {
-        assert(context->scene->hasGeometryFilterFunction());
-        geometry->occlusionFilterN(filter_args);
-      }
-      
-      //if (args->valid[0] == 0)
-      //  return false;
+      if (args->geometry->occlusionFilterN)
+        args->geometry->occlusionFilterN(filter_args);
       
-      if (context->user->filter) {
-        assert(context->scene->hasContextFilterFunction());
-        context->user->filter(filter_args);
-      }
+      if (args->context->filter)
+        args->context->filter(filter_args);
 #endif
     }
 
diff --git a/thirdparty/embree/kernels/geometry/object_intersector.h b/thirdparty/embree/kernels/geometry/object_intersector.h
index 11ceb2f7fe..e4ad01852f 100644
--- a/thirdparty/embree/kernels/geometry/object_intersector.h
+++ b/thirdparty/embree/kernels/geometry/object_intersector.h
@@ -32,7 +32,7 @@ namespace embree
           return;
 #endif
 
-        accel->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1);
+        accel->intersect(ray,prim.geomID(),prim.primID(),context);
       }
       
       static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
@@ -44,7 +44,7 @@ namespace embree
           return false;
 #endif
 
-        accel->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        accel->occluded(ray,prim.geomID(),prim.primID(),context);
         return ray.tfar < 0.0f;
       }
       
@@ -89,7 +89,7 @@ namespace embree
         valid &= (ray.mask & accel->mask) != 0;
         if (none(valid)) return;
 #endif
-        accel->intersect(valid,ray,prim.geomID(),prim.primID(),context,&reportIntersection1);
+        accel->intersect(valid,ray,prim.geomID(),prim.primID(),context);
       }
 
       static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
@@ -102,7 +102,7 @@ namespace embree
         valid &= (ray.mask & accel->mask) != 0;
         if (none(valid)) return false;
 #endif
-        accel->occluded(valid,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        accel->occluded(valid,ray,prim.geomID(),prim.primID(),context);
         return ray.tfar < 0.0f;
       }
       
diff --git a/thirdparty/embree/kernels/geometry/quadv.h b/thirdparty/embree/kernels/geometry/quadv.h
index 2137356ff2..514e519b0c 100644
--- a/thirdparty/embree/kernels/geometry/quadv.h
+++ b/thirdparty/embree/kernels/geometry/quadv.h
@@ -152,7 +152,7 @@ namespace embree
     Vec3vf<M> v0;      // 1st vertex of the quads
     Vec3vf<M> v1;      // 2nd vertex of the quads
     Vec3vf<M> v2;      // 3rd vertex of the quads
-    Vec3vf<M> v3;      // 4rd vertex of the quads
+    Vec3vf<M> v3;      // 4th vertex of the quads
   private:
     vuint<M> geomIDs; // geometry ID
     vuint<M> primIDs; // primitive ID
diff --git a/thirdparty/embree/kernels/geometry/roundline_intersector.h b/thirdparty/embree/kernels/geometry/roundline_intersector.h
index 0e9393442b..764ff93fec 100644
--- a/thirdparty/embree/kernels/geometry/roundline_intersector.h
+++ b/thirdparty/embree/kernels/geometry/roundline_intersector.h
@@ -19,7 +19,7 @@
 
   For multiple connected round linear curve segments this construction
   yield a proper shape when viewed from the outside. Using the
-  following CSG we can also handle the interiour in most common cases:
+  following CSG we can also handle the interior in most common cases:
 
      round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) =
        cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr)
@@ -431,7 +431,7 @@ namespace embree
              Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP
            
            Inserting the definition of w0 and dw and refactoring
-           yield a furhter scaled Ng'':
+           yield a further scaled Ng'':
            
              Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP
            
diff --git a/thirdparty/embree/kernels/geometry/subgrid_intersector.h b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
index ad5fee2e4e..e241073812 100644
--- a/thirdparty/embree/kernels/geometry/subgrid_intersector.h
+++ b/thirdparty/embree/kernels/geometry/subgrid_intersector.h
@@ -264,8 +264,8 @@ namespace embree
           const Vec3vf<K> p2 = vtx[i*4+2];
           const Vec3vf<K> p3 = vtx[i*4+3];
           STAT3(shadow.trav_prims,1,popcnt(valid0),K);
-          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
-            break;
+          pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i));
+          if (none(valid0)) break;
         }
         return !valid0;
       }
@@ -408,10 +408,8 @@ namespace embree
           const Vec3vf<K> p2 = vtx[i*4+2];
           const Vec3vf<K> p3 = vtx[i*4+3];
           STAT3(shadow.trav_prims,1,popcnt(valid0),K);
-          //if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
-          if (pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
-	    
-            break;
+          pre.occludedK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i));
+          if (none(valid0)) break;
         }
         return !valid0;
       }
diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h
index 470e15f03e..39d50e2354 100644
--- a/thirdparty/embree/kernels/hash.h
+++ b/thirdparty/embree/kernels/hash.h
@@ -1,5 +1,4 @@
-
-// Copyright 2009-2020 Intel Corporation
+// Copyright 2009-2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd"
+#define RTC_HASH "698442324ccddd11725fb8875275dc1384f7fb40"
diff --git a/thirdparty/embree/kernels/subdiv/bezier_patch.h b/thirdparty/embree/kernels/subdiv/bezier_patch.h
index 2ff03902a7..0a2aef321f 100644
--- a/thirdparty/embree/kernels/subdiv/bezier_patch.h
+++ b/thirdparty/embree/kernels/subdiv/bezier_patch.h
@@ -94,7 +94,7 @@ namespace embree
       matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1);
       matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); 
       
-      /* compute buttom edge control points */
+      /* compute bottom edge control points */
       matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1);
       matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2);
       
diff --git a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
index e5ad5dadfe..eab91d9ee6 100644
--- a/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
+++ b/thirdparty/embree/kernels/subdiv/catmullclark_ring.h
@@ -388,7 +388,7 @@ namespace embree
       return (Vertex_t)(n*n*vtx+4.0f*E+F) / ((n+5.0f)*n);      
     }
     
-    /* gets limit tangent in the direction of egde vtx -> ring[0] */
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
     __forceinline Vertex getLimitTangent() const 
     {
       if (unlikely(std::isinf(vertex_crease_weight)))
@@ -429,7 +429,7 @@ namespace embree
       return sigma * (alpha + beta);
     }
     
-    /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */
+    /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */
     __forceinline Vertex getSecondLimitTangent() const 
     {
       if (unlikely(std::isinf(vertex_crease_weight)))
@@ -763,7 +763,7 @@ namespace embree
     }
 
 
-    /* gets limit tangent in the direction of egde vtx -> ring[0] */
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
     __forceinline Vertex getLimitTangent() const 
     {
       CatmullClark1Ring cc_vtx;
@@ -779,7 +779,7 @@ namespace embree
       return 2.0f * cc_vtx.getLimitTangent();
     }
 
-    /* gets limit tangent in the direction of egde vtx -> ring[edge_valence-2] */
+    /* gets limit tangent in the direction of edge vtx -> ring[edge_valence-2] */
     __forceinline Vertex getSecondLimitTangent() const 
     {
       CatmullClark1Ring cc_vtx;
diff --git a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
index 74fc4c1230..9532287d98 100644
--- a/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
+++ b/thirdparty/embree/kernels/subdiv/catmullrom_curve.h
@@ -8,7 +8,7 @@
 
 /*
 
-  Implements Catmul Rom curves with control points p0, p1, p2, p3. At
+  Implements Catmull-Rom curves with control points p0, p1, p2, p3. At
   t=0 the curve goes through p1, with tangent (p2-p0)/3, and for t=1
   the curve goes through p2 with tangent (p3-p2)/2.
 
@@ -91,11 +91,11 @@ namespace embree
         : v0(v0), v1(v1), v2(v2), v3(v3) {}
 
       __forceinline Vertex begin() const {
-        return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2));
+        return v1;
       }
 
       __forceinline Vertex end() const {
-        return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3));
+        return v2;
       }
 
       __forceinline Vertex center() const {
diff --git a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
index f8e8a25f35..dcdb101d7c 100644
--- a/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
+++ b/thirdparty/embree/kernels/subdiv/linear_bezier_patch.h
@@ -81,29 +81,29 @@ namespace embree
         {
           SourceCurve<Vec3ff> vcurve = center;
           SourceCurve<Vec3fa> ncurve = normal;
-          
+
           /* here we construct a patch which follows the curve l(t) =
            * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */
           
           const Vec3ff p0   = vcurve.eval(0.0f);
           const Vec3ff dp0  = vcurve.eval_du(0.0f);
-          const Vec3ff ddp0 = vcurve.eval_dudu(0.0f);
+          //const Vec3ff ddp0 = vcurve.eval_dudu(0.0f); // ddp0 is assumed to be 0
 
           const Vec3fa n0   = ncurve.eval(0.0f);
           const Vec3fa dn0  = ncurve.eval_du(0.0f);
 
           const Vec3ff p1   = vcurve.eval(1.0f);
           const Vec3ff dp1  = vcurve.eval_du(1.0f);
-          const Vec3ff ddp1 = vcurve.eval_dudu(1.0f);
+          //const Vec3ff ddp1 = vcurve.eval_dudu(1.0f);  // ddp1 is assumed to be 0
 
           const Vec3fa n1   = ncurve.eval(1.0f);
           const Vec3fa dn1  = ncurve.eval_du(1.0f);
 
           const Vec3fa bt0  = cross(n0,dp0);
-          const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0);
+          const Vec3fa dbt0 = cross(dn0,dp0);// + cross(n0,ddp0);
 
           const Vec3fa bt1  = cross(n1,dp1);
-          const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1);
+          const Vec3fa dbt1 = cross(dn1,dp1);// + cross(n1,ddp1);
             
           const Vec3fa k0  = normalize(bt0);
           const Vec3fa dk0 = dnormalize(bt0,dbt0);