18 files changed, 7286 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
new file mode 100644
index 0000000000..755ce255fb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
@@ -0,0 +1,411 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_strand_array.h"
+
+#define NUM_HAIR_OBJECT_BINS 32
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHair
+    {
+      /*! settings for builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {}
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        size_t finished_range_threshold;  //!< finished range threshold
+      };
+
+      template<typename NodeRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          friend struct BVHBuilderHair;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH;
+          typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH;
+          typedef HeuristicStrandSplit HeuristicStrandSplitSAH;
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          static const size_t travCostAligned = 1;
+          static const size_t travCostUnaligned = 5;
+          static const size_t intCost = 6;
+
+          BuilderT (Scene* scene,
+                    PrimRef* prims,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateAABBNodeFunc& createAABBNode,
+                    const SetAABBNodeFunc& setAABBNode,
+                    const CreateOBBNodeFunc& createOBBNode,
+                    const SetOBBNodeFunc& setOBBNode,
+                    const CreateLeafFunc& createLeaf,
+                    const ProgressMonitor& progressMonitor,
+                    const ReportFinishedRangeFunc& reportFinishedRange,
+                    const Settings settings)
+
+            : cfg(settings),
+            prims(prims),
+            createAlloc(createAlloc),
+            createAABBNode(createAABBNode),
+            setAABBNode(setAABBNode),
+            createOBBNode(createOBBNode),
+            setOBBNode(setOBBNode),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            reportFinishedRange(reportFinishedRange),
+            alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {}
+
+          /*! checks if all primitives are from the same geometry */
+          __forceinline bool sameGeometry(const PrimInfoRange& range)
+          {
+            if (range.size() == 0) return true;
+            unsigned int firstGeomID = prims[range.begin()].geomID();
+            for (size_t i=range.begin()+1; i<range.end(); i++) {
+              if (prims[i].geomID() != firstGeomID){
+                return false;
+              }
+            }
+            return true;
+          }
+
+          /*! creates a large leaf that could be larger than supported by the BVH */
+          NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create leaf for few primitives */
+            if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo))
+              return createLeaf(prims,pinfo,alloc);
+
+            /* fill all children by always splitting the largest one */
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+            unsigned numChildren = 1;
+            children[0] = pinfo;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              int bestChild = -1;
+              size_t bestSize = 0;
+              for (unsigned i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i]))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              __aligned(64) PrimInfoRange left, right;
+              if (!sameGeometry(children[bestChild])) {
+                alignedHeuristic.splitByGeometry(children[bestChild],left,right);
+              } else {
+                alignedHeuristic.splitFallback(children[bestChild],left,right);
+              }
+
+              /* add new children left and right */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* create node */
+            auto node = createAABBNode(alloc);
+
+            for (size_t i=0; i<numChildren; i++) {
+              const NodeRef child = createLargeLeaf(depth+1,children[i],alloc);
+              setAABBNode(node,i,child,children[i].geomBounds);
+            }
+
+            return node;
+          }
+
+          /*! performs split */
+          __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize;
+            const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds);
+
+            /* try standard binning in aligned space */
+            float alignedObjectSAH = inf;
+            HeuristicBinningSAH::Split alignedObjectSplit;
+            if (aligned) {
+              alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize);
+              alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH();
+              bestSAH = min(alignedObjectSAH,bestSAH);
+            }
+
+            /* try standard binning in unaligned space */
+            UnalignedHeuristicBinningSAH::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (bestSAH > 0.7f*leafSAH) {
+              uspace = unalignedHeuristic.computeAlignedSpace(pinfo);
+              const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH();
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* try splitting into two strands */
+            HeuristicStrandSplitSAH::Split strandSplit;
+            float strandSAH = inf;
+            if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) {
+              strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize);
+              strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH();
+              bestSAH = min(strandSAH,bestSAH);
+            }
+
+            /* fallback if SAH heuristics failed */
+            if (unlikely(!std::isfinite(bestSAH)))
+            {
+              alignedHeuristic.deterministic_order(pinfo);
+              alignedHeuristic.splitFallback(pinfo,linfo,rinfo);
+            }
+
+            /* perform aligned split if this is best */
+            else if (bestSAH == alignedObjectSAH) {
+              alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo);
+            }
+
+            /* perform unaligned split if this is best */
+            else if (bestSAH == unalignedObjectSAH) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* perform strand split if this is best */
+            else if (bestSAH == strandSAH) {
+              strandHeuristic.split(strandSplit,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* can never happen */
+            else
+              assert(false);
+          }
+
+          /*! recursive build */
+          NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(pinfo.size());
+
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+
+            /* create leaf node */
+            if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) {
+              alignedHeuristic.deterministic_order(pinfo);
+              return createLargeLeaf(depth,pinfo,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            size_t numChildren = 1;
+            children[0] = pinfo;
+            bool aligned = true;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1;
+              float bestArea = neg_inf;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                if (area(children[i].geomBounds) > bestArea) {
+                  bestArea = area(children[i].geomBounds);
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              PrimInfoRange left, right;
+              split(children[bestChild],left,right,aligned);
+
+              /* add new children left and right */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            NodeRef node;
+
+            /* create aligned node */
+            if (aligned)
+            {
+              node = createAABBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                      setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else {
+                for (size_t i=0; i<numChildren; i++) {
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds);
+                }
+              }
+            }
+
+            /* create unaligned node */
+            else
+            {
+              node = createOBBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                      const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                      const OBBox3fa obounds(space,sinfo.geomBounds);
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                      setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<numChildren; i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                  const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                  const OBBox3fa obounds(space,sinfo.geomBounds);
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds);
+                }
+              }
+            }
+
+            /* reports a finished range of primrefs */
+            if (unlikely(alloc_barrier))
+              reportFinishedRange(pinfo);
+
+            return node;
+          }
+
+        private:
+          Settings cfg;
+          PrimRef* prims;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeFunc& createAABBNode;
+          const SetAABBNodeFunc& setAABBNode;
+          const CreateOBBNodeFunc& createOBBNode;
+          const SetOBBNodeFunc& setOBBNode;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+          const ReportFinishedRangeFunc& reportFinishedRange;
+
+        private:
+          HeuristicBinningSAH alignedHeuristic;
+          UnalignedHeuristicBinningSAH unalignedHeuristic;
+          HeuristicStrandSplitSAH strandHeuristic;
+        };
+
+      template<typename NodeRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+
+        static NodeRef build (const CreateAllocFunc& createAlloc,
+                              const CreateAABBNodeFunc& createAABBNode,
+                              const SetAABBNodeFunc& setAABBNode,
+                              const CreateOBBNodeFunc& createOBBNode,
+                              const SetOBBNodeFunc& setOBBNode,
+                              const CreateLeafFunc& createLeaf,
+                              const ProgressMonitor& progressMonitor,
+                              const ReportFinishedRangeFunc& reportFinishedRange,
+                              Scene* scene,
+                              PrimRef* prims,
+                              const PrimInfo& pinfo,
+                              const Settings settings)
+        {
+          typedef BuilderT<NodeRef,
+            CreateAllocFunc,
+            CreateAABBNodeFunc,SetAABBNodeFunc,
+            CreateOBBNodeFunc,SetOBBNodeFunc,
+            CreateLeafFunc,ProgressMonitor,
+            ReportFinishedRangeFunc> Builder;
+
+          Builder builder(scene,prims,createAlloc,
+                          createAABBNode,setAABBNode,
+                          createOBBNode,setOBBNode,
+                          createLeaf,progressMonitor,reportFinishedRange,settings);
+
+          NodeRef root = builder.recurse(1,pinfo,nullptr,true,false);
+          _mm_mfence(); // to allow non-temporal stores during build
+          return root;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
new file mode 100644
index 0000000000..92be2f7e65
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
@@ -0,0 +1,501 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/builder.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderMorton
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 8;          //!< maximum supported BVH branching factor
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree of we are that many levels before the maximum tree depth
+
+      /*! settings for morton builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
+
+        /*! initialize settings from API settings */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+        Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
+        : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
+        {
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      /*! Build primitive consisting of morton code and primitive ID. */
+      struct __aligned(8) BuildPrim
+      {
+        union {
+          struct {
+            unsigned int code;     //!< morton code
+            unsigned int index;    //!< i'th primitive
+          };
+          uint64_t t;
+        };
+
+        /*! interface for radix sort */
+        __forceinline operator unsigned() const { return code; }
+
+        /*! interface for standard sort */
+        __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
+      };
+
+      /*! maps bounding box to morton code */
+      struct MortonCodeMapping
+      {
+        static const size_t LATTICE_BITS_PER_DIM = 10;
+        static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
+
+        vfloat4 base;
+        vfloat4 scale;
+
+        __forceinline MortonCodeMapping(const BBox3fa& bounds)
+        {
+          base  = (vfloat4)bounds.lower;
+          const vfloat4 diag  = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
+          scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
+        }
+
+        __forceinline const vint4 bin (const BBox3fa& box) const
+        {
+          const vfloat4 lower = (vfloat4)box.lower;
+          const vfloat4 upper = (vfloat4)box.upper;
+          const vfloat4 centroid = lower+upper;
+          return vint4((centroid-base)*scale);
+        }
+
+        __forceinline unsigned int code (const BBox3fa& box) const
+        {
+          const vint4 binID = bin(box);
+          const unsigned int x = extract<0>(binID);
+          const unsigned int y = extract<1>(binID);
+          const unsigned int z = extract<2>(binID);
+          const unsigned int xyz = bitInterleave(x,y,z);
+          return xyz;
+        }
+      };
+
+#if defined (__AVX2__)
+
+      /*! for AVX2 there is a fast scalar bitInterleave */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest) {}
+
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)
+        {
+          dest->index = index;
+          dest->code = mapping.code(b);
+          dest++;
+        }
+
+      public:
+        const MortonCodeMapping mapping;
+        BuildPrim* dest;
+        size_t currentID;
+      };
+
+#else
+
+      /*! before AVX2 is it better to use the SSE version of bitInterleave */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
+
+        __forceinline ~MortonCodeGenerator()
+        {
+          if (slots != 0)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            for (size_t i=0; i<slots; i++) {
+              dest[currentID-slots+i].index = ai[i];
+              dest[currentID-slots+i].code = code[i];
+            }
+          }
+        }
+
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)
+        {
+          const vint4 binID = mapping.bin(b);
+          ax[slots] = extract<0>(binID);
+          ay[slots] = extract<1>(binID);
+          az[slots] = extract<2>(binID);
+          ai[slots] = index;
+          slots++;
+          currentID++;
+
+          if (slots == 4)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
+            vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
+            slots = 0;
+          }
+        }
+
+      public:
+        const MortonCodeMapping mapping;
+        BuildPrim* dest;
+        size_t currentID;
+        size_t slots;
+        vint4 ax, ay, az, ai;
+      };
+
+#endif
+
+      template<
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocator,
+        typename CreateNodeFunc,
+        typename SetNodeBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBounds,
+        typename ProgressMonitor>
+
+        class BuilderT : private Settings
+      {
+        ALIGNED_CLASS_(16);
+
+      public:
+
+        BuilderT (CreateAllocator& createAllocator,
+                  CreateNodeFunc& createNode,
+                  SetNodeBoundsFunc& setBounds,
+                  CreateLeafFunc& createLeaf,
+                  CalculateBounds& calculateBounds,
+                  ProgressMonitor& progressMonitor,
+                  const Settings& settings)
+
+          : Settings(settings),
+          createAllocator(createAllocator),
+          createNode(createNode),
+          setBounds(setBounds),
+          createLeaf(createLeaf),
+          calculateBounds(calculateBounds),
+          progressMonitor(progressMonitor),
+          morton(nullptr) {}
+
+        ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
+        {
+          /* this should never occur but is a fatal error */
+          if (depth > maxDepth)
+            throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+          /* create leaf for few primitives */
+          if (current.size() <= maxLeafSize)
+            return createLeaf(current,alloc);
+
+          /* fill all children by always splitting the largest one */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          size_t numChildren = 1;
+          children[0] = current;
+
+          do {
+
+            /* find best child with largest number of primitives */
+            size_t bestChild = -1;
+            size_t bestSize = 0;
+            for (size_t i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= maxLeafSize)
+                continue;
+
+              /* remember child with largest size */
+              if (children[i].size() > bestSize) {
+                bestSize = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == size_t(-1)) break;
+
+            /*! split best child into left and right child */
+            auto split = children[bestChild].split();
+
+            /* add new children left and right */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = split.first;
+            children[numChildren+0] = split.second;
+            numChildren++;
+
+          } while (numChildren < branchingFactor);
+
+          /* create node */
+          auto node = createNode(alloc,numChildren);
+
+          /* recurse into each child */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<numChildren; i++)
+            bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
+
+          return setBounds(node,bounds,numChildren);
+        }
+
+        /*! recreates morton codes when reaching a region where all codes are identical */
+        __noinline void recreateMortonCodes(const range<unsigned>& current) const
+        {
+          /* fast path for small ranges */
+          if (likely(current.size() < 1024))
+          {
+            /*! recalculate centroid bounds */
+            BBox3fa centBounds(empty);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              centBounds.extend(center2(calculateBounds(morton[i])));
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              morton[i].code = mapping.code(calculateBounds(morton[i]));
+
+            /* sort morton codes */
+            std::sort(morton+current.begin(),morton+current.end());
+          }
+          else
+          {
+            /*! recalculate centroid bounds */
+            auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
+              BBox3fa centBounds = empty;
+              for (size_t i=r.begin(); i<r.end(); i++)
+                centBounds.extend(center2(calculateBounds(morton[i])));
+              return centBounds;
+            };
+            const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
+                                                       BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  morton[i].code = mapping.code(calculateBounds(morton[i]));
+                }
+              });
+
+            /*! sort morton codes */
+#if defined(TASKING_TBB)
+            tbb::parallel_sort(morton+current.begin(),morton+current.end());
+#else
+            radixsort32(morton+current.begin(),current.size());
+#endif
+          }
+        }
+
+        __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
+        {
+          const unsigned int code_start = morton[current.begin()].code;
+          const unsigned int code_end   = morton[current.end()-1].code;
+          unsigned int bitpos = lzcnt(code_start^code_end);
+
+          /* if all items mapped to same morton code, then re-create new morton codes for the items */
+          if (unlikely(bitpos == 32))
+          {
+            recreateMortonCodes(current);
+            const unsigned int code_start = morton[current.begin()].code;
+            const unsigned int code_end   = morton[current.end()-1].code;
+            bitpos = lzcnt(code_start^code_end);
+
+            /* if the morton code is still the same, goto fall back split */
+            if (unlikely(bitpos == 32)) {
+              current.split(left,right);
+              return;
+            }
+          }
+
+          /* split the items at the topmost different morton code bit */
+          const unsigned int bitpos_diff = 31-bitpos;
+          const unsigned int bitmask = 1 << bitpos_diff;
+
+          /* find location where bit differs using binary search */
+          unsigned begin = current.begin();
+          unsigned end   = current.end();
+          while (begin + 1 != end) {
+            const unsigned mid = (begin+end)/2;
+            const unsigned bit = morton[mid].code & bitmask;
+            if (bit == 0) begin = mid; else end = mid;
+          }
+          unsigned center = end;
+#if defined(DEBUG)
+          for (unsigned int i=begin;  i<center; i++) assert((morton[i].code & bitmask) == 0);
+          for (unsigned int i=center; i<end;    i++) assert((morton[i].code & bitmask) == bitmask);
+#endif
+
+          left = make_range(current.begin(),center);
+          right = make_range(center,current.end());
+        }
+
+        ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
+        {
+          /* get thread local allocator */
+          if (!alloc)
+            alloc = createAllocator();
+
+          /* call memory monitor function to signal progress */
+          if (toplevel && current.size() <= singleThreadThreshold)
+            progressMonitor(current.size());
+
+          /* create leaf node */
+          if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
+            return createLargeLeaf(depth,current,alloc);
+
+          /* fill all children by always splitting the one with the largest surface area */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          split(current,children[0],children[1]);
+          size_t numChildren = 2;
+
+          while (numChildren < branchingFactor)
+          {
+            /* find best child with largest number of primitives */
+            int bestChild = -1;
+            unsigned bestItems = 0;
+            for (unsigned int i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= minLeafSize)
+                continue;
+
+              /* remember child with largest area */
+              if (children[i].size() > bestItems) {
+                bestItems = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == -1) break;
+
+            /*! split best child into left and right child */
+            range<unsigned> left, right;
+            split(children[bestChild],left,right);
+
+            /* add new children left and right */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = left;
+            children[numChildren+0] = right;
+            numChildren++;
+          }
+
+          /* create leaf node if no split is possible */
+          if (unlikely(numChildren == 1))
+            return createLeaf(current,alloc);
+
+          /* allocate node */
+          auto node = createNode(alloc,numChildren);
+
+          /* process top parts of tree parallel */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          if (current.size() > singleThreadThreshold)
+          {
+            /*! parallel_for is faster than spawing sub-tasks */
+            parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  bounds[i] = recurse(depth+1,children[i],nullptr,true);
+                  _mm_mfence(); // to allow non-temporal stores during build
+                }
+              });
+          }
+
+          /* finish tree sequentially */
+          else
+          {
+            for (size_t i=0; i<numChildren; i++)
+              bounds[i] = recurse(depth+1,children[i],alloc,false);
+          }
+
+          return setBounds(node,bounds,numChildren);
+        }
+
+        /* build function */
+        ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
+        {
+          /* sort morton codes */
+          morton = src;
+          radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
+
+          /* build BVH */
+          const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
+          _mm_mfence(); // to allow non-temporal stores during build
+          return root;
+        }
+
+      public:
+        CreateAllocator& createAllocator;
+        CreateNodeFunc& createNode;
+        SetNodeBoundsFunc& setBounds;
+        CreateLeafFunc& createLeaf;
+        CalculateBounds& calculateBounds;
+        ProgressMonitor& progressMonitor;
+
+      public:
+        BuildPrim* morton;
+      };
+
+
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBoundsFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAllocator,
+                                 CreateNodeFunc createNode,
+                                 SetBoundsFunc setBounds,
+                                 CreateLeafFunc createLeaf,
+                                 CalculateBoundsFunc calculateBounds,
+                                 ProgressMonitor progressMonitor,
+                                 BuildPrim* src,
+                                 BuildPrim* tmp,
+                                 size_t numPrimitives,
+                                 const Settings& settings)
+        {
+          typedef BuilderT<
+            ReductionTy,
+            decltype(createAllocator()),
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetBoundsFunc,
+            CreateLeafFunc,
+            CalculateBoundsFunc,
+            ProgressMonitor> Builder;
+
+          Builder builder(createAllocator,
+                          createNode,
+                          setBounds,
+                          createLeaf,
+                          calculateBounds,
+                          progressMonitor,
+                          settings);
+
+          return builder.build(src,tmp,numPrimitives);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
new file mode 100644
index 0000000000..4c138dacdb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
@@ -0,0 +1,692 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define MBLUR_NUM_TEMPORAL_BINS 2
+#define MBLUR_NUM_OBJECT_BINS   32
+
+#include "../bvh/bvh.h"
+#include "../common/primref_mb.h"
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename T>
+      struct SharedVector
+      {
+        __forceinline SharedVector() {}
+
+        __forceinline SharedVector(T* ptr, size_t refCount = 1)
+          : prims(ptr), refCount(refCount) {}
+
+        __forceinline void incRef() {
+          refCount++;
+        }
+
+        __forceinline void decRef()
+        {
+          if (--refCount == 0)
+            delete prims;
+        }
+
+        T* prims;
+        size_t refCount;
+      };
+
+    template<typename BuildRecord, int MAX_BRANCHING_FACTOR>
+      struct LocalChildListT
+      {
+        typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+
+        __forceinline LocalChildListT (const BuildRecord& record)
+          : numChildren(1), numSharedPrimVecs(1)
+        {
+          /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */
+          children[0] = record;
+          primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2);
+        }
+
+        __forceinline ~LocalChildListT()
+        {
+          for (size_t i = 0; i < numChildren; i++)
+            primvecs[i]->decRef();
+        }
+
+        __forceinline BuildRecord& operator[] ( const size_t i ) {
+          return children[i];
+        }
+
+        __forceinline size_t size() const {
+          return numChildren;
+        }
+
+        __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector)
+        {
+          SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild];
+          if (lrecord.prims.prims == bsharedPrimVec->prims) {
+            primvecs[bestChild] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else {
+            primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims);
+          }
+
+          if (rrecord.prims.prims == bsharedPrimVec->prims) {
+            primvecs[numChildren] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else {
+            primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims);
+          }
+          bsharedPrimVec->decRef();
+          new_vector.release();
+
+          children[bestChild] = lrecord;
+          children[numChildren] = rrecord;
+          numChildren++;
+        }
+
+      public:
+        array_t<BuildRecord,MAX_BRANCHING_FACTOR> children;
+        array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs;
+        size_t numChildren;
+
+        array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs;
+        size_t numSharedPrimVecs;
+      };
+
+    template<typename Mesh>
+      struct RecalculatePrimRef
+      {
+        Scene* scene;
+
+        __forceinline RecalculatePrimRef (Scene* scene)
+          : scene(scene) {}
+
+        __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+        {
+          const unsigned geomID = prim.geomID();
+          const unsigned primID = prim.primID();
+          const Mesh* mesh = scene->get<Mesh>(geomID);
+          const LBBox3fa lbounds = mesh->linearBounds(primID, time_range);
+          const range<int> tbounds = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+        }
+
+        // __noinline is workaround for ICC16 bug under MacOSX
+        __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+        {
+          const unsigned geomID = prim.geomID();
+          const unsigned primID = prim.primID();
+          const Mesh* mesh = scene->get<Mesh>(geomID);
+          const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range);
+          const range<int> tbounds = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+        }
+
+        __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+          return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range);
+        }
+
+        // __noinline is workaround for ICC16 bug under MacOSX
+        __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+          return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range);
+        }
+      };
+
+    struct VirtualRecalculatePrimRef
+    {
+      Scene* scene;
+      
+      __forceinline VirtualRecalculatePrimRef (Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+      {
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const Geometry* mesh = scene->get(geomID);
+        const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range);
+        const range<int> tbounds = mesh->timeSegmentRange(time_range);
+        return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+      }
+      
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+      {
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const Geometry* mesh = scene->get(geomID);
+        const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range);
+        const range<int> tbounds = mesh->timeSegmentRange(time_range);
+        return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+      }
+      
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+        return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range);
+      }
+      
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+        return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range);
+      }
+    };
+
+    struct BVHBuilderMSMBlur
+    {
+      /*! settings for msmblur builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8),
+          travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false),
+          singleThreadThreshold(1024) {}
+
+
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold)
+        {
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        bool singleLeafTimeSegment; //!< split time to single time range
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      struct BuildRecord
+      {
+      public:
+	__forceinline BuildRecord () {}
+
+        __forceinline BuildRecord (size_t depth)
+          : depth(depth) {}
+
+        __forceinline BuildRecord (const SetMB& prims, size_t depth)
+          : depth(depth), prims(prims) {}
+
+        __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) {
+          return a.prims.size() < b.prims.size();
+        }
+
+        __forceinline size_t size() const {
+          return prims.size();
+        }
+
+      public:
+	size_t depth;                     //!< Depth of the root of this subtree.
+	SetMB prims;                      //!< The list of primitives.
+      };
+
+      struct BuildRecordSplit : public BuildRecord
+      {
+        __forceinline BuildRecordSplit () {}
+
+        __forceinline BuildRecordSplit (size_t depth) 
+          : BuildRecord(depth) {}
+
+        __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split)
+          : BuildRecord(record), split(split) {}
+        
+        BinSplit<MBLUR_NUM_OBJECT_BINS> split;
+      };
+
+      template<
+        typename NodeRef,
+        typename RecalculatePrimRef,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor	  
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;        //!< create balanced tree if we are that many levels before the maximum tree depth
+
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+          typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+          typedef mvector<PrimRefMB>* PrimRefVector;
+          typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+          typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit;
+
+        public:
+
+          BuilderT (MemoryMonitorInterface* device,
+                    const RecalculatePrimRef recalculatePrimRef,
+                    const CreateAllocFunc createAlloc,
+                    const CreateNodeFunc createNode,
+                    const SetNodeFunc setNode,
+                    const CreateLeafFunc createLeaf,
+                    const ProgressMonitor progressMonitor,
+                    const Settings& settings)
+            : cfg(settings),
+            heuristicObjectSplit(),
+            heuristicTemporalSplit(device, recalculatePrimRef),
+            recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf),
+            progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          /*! finds the best split */
+          const Split find(const SetMB& set)
+          {
+            /* first try standard object split */
+            const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize);
+            const float object_split_sah = object_split.splitSAH();
+
+            /* test temporal splits only when object split was bad */
+            const float leaf_sah = set.leafSAH(cfg.logBlockSize);
+            if (object_split_sah < 0.50f*leaf_sah)
+              return object_split;
+
+            /* do temporal splits only if the the time range is big enough */
+            if (set.time_range.size() > 1.01f/float(set.max_num_time_segments))
+            {
+              const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize);
+              const float temporal_split_sah = temporal_split.splitSAH();
+
+              /* take temporal split if it improved SAH */
+              if (temporal_split_sah < object_split_sah)
+                return temporal_split;
+            }
+
+            return object_split;
+          }
+
+          /*! array partitioning */
+          __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            /* perform object split */
+            if (likely(split.data == Split::SPLIT_OBJECT)) {
+              heuristicObjectSplit.split(split,set,lset,rset);
+            }
+            /* perform temporal split */
+            else if (likely(split.data == Split::SPLIT_TEMPORAL)) {
+              return heuristicTemporalSplit.split(split,set,lset,rset);
+            }
+            /* perform fallback split */
+            else if (unlikely(split.data == Split::SPLIT_FALLBACK)) {
+              set.deterministic_order();
+              splitFallback(set,lset,rset);
+            }
+            /* split by geometry */
+            else if (unlikely(split.data == Split::SPLIT_GEOMID)) {
+              set.deterministic_order();
+              splitByGeometry(set,lset,rset);
+            }
+            else
+              assert(false);
+
+            return std::unique_ptr<mvector<PrimRefMB>>();
+          }
+
+          /*! finds the best fallback split */
+          __noinline Split findFallback(const SetMB& set)
+          {
+            /* split if primitives are not from same geometry */
+            if (!sameGeometry(set))
+              return Split(0.0f,Split::SPLIT_GEOMID);
+            
+            /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */
+            if (cfg.singleLeafTimeSegment)
+            {
+              /* test if one primitive has more than one time segment in time range, if so split time */
+              for (size_t i=set.begin(); i<set.end(); i++)
+              {
+                const PrimRefMB& prim = (*set.prims)[i];
+                const range<int> itime_range = prim.timeSegmentRange(set.time_range);
+                const int localTimeSegments = itime_range.size();
+                assert(localTimeSegments > 0);
+                if (localTimeSegments > 1) {
+                  const int icenter = (itime_range.begin() + itime_range.end())/2;
+                  const float splitTime = prim.timeStep(icenter);
+                  return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime);
+                }
+              }
+            }        
+
+            /* otherwise return fallback split */
+            return Split(0.0f,Split::SPLIT_FALLBACK);
+          }
+
+          /*! performs fallback split */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            mvector<PrimRefMB>& prims = *set.prims;
+
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            const size_t center = (begin + end)/2;
+
+            PrimInfoMB linfo = empty;
+            for (size_t i=begin; i<center; i++)
+              linfo.add_primref(prims[i]);
+
+            PrimInfoMB rinfo = empty;
+            for (size_t i=center; i<end; i++)
+              rinfo.add_primref(prims[i]);
+
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          /*! checks if all primitives are from the same geometry */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            if (set.size() == 0) return true;
+            mvector<PrimRefMB>& prims = *set.prims;
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            unsigned int firstGeomID = prims[begin].geomID();
+            for (size_t i=begin+1; i<end; i++) {
+              if (prims[i].geomID() != firstGeomID){
+                return false;
+              }
+            }
+            return true;
+          }
+
+          /* split by geometry ID */
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            assert(set.size() > 1);
+
+            mvector<PrimRefMB>& prims = *set.prims;
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            
+            PrimInfoMB left(empty);
+            PrimInfoMB right(empty);
+            unsigned int geomID = prims[begin].geomID();
+            size_t center = serial_partitioning(prims.data(),begin,end,left,right,
+                                                [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; },
+                                                [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); });
+            
+            new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (in.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* replace already found split by fallback split */
+            const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims));
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0)
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+              
+              force_split = c.lower > p.lower || c.upper < p.upper;
+            }
+	    
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split)
+              return createLeaf(current,alloc);
+	  
+            /* fill all children by always splitting the largest one */
+            bool hasTimeSplits = false;
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildListSplit children(current);
+
+            do {
+              /* find best child with largest bounding box area */
+              size_t bestChild = -1;
+              size_t bestSize = 0;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split)
+                  continue;
+
+                force_split = false;
+                
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform best found split */
+              BuildRecordSplit& brecord = children[bestChild];
+              BuildRecordSplit lrecord(current.depth+1);
+              BuildRecordSplit rrecord(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims);
+              hasTimeSplits |= new_vector != nullptr;
+
+              /* find new splits */
+              lrecord.split = findFallback(lrecord.prims);
+              rrecord.split = findFallback(rrecord.prims);
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = in.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create node */
+            auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits);
+
+            /* recurse into each child and perform reduction */
+            LBBox3fa gbounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              gbounds.extend(values[i].lbounds);
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node */
+            if (hasTimeSplits)
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+
+          const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            const Split csplit = find(current.prims);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH();
+            assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when threshold reached or SAH tells us to stop */
+            if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            SetMB lprims,rprims;
+            std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims);
+            bool hasTimeSplits = new_vector != nullptr;
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            {
+              BuildRecord lrecord(lprims,current.depth+1);
+              BuildRecord rrecord(rprims,current.depth+1);
+              children.split(0,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /*! split until node is full or SAH tells us to stop */
+            while (children.size() < cfg.branchingFactor) 
+            {
+              /*! find best child to split */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                if (children[i].size() <= cfg.minLeafSize) continue;
+                if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform split */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              Split csplit = find(brecord.prims);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims);
+              hasTimeSplits |= new_vector != nullptr;
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* sort buildrecords for simpler shadow ray traversal */
+            //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !!
+
+            /*! create an inner node */
+            auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits);
+            LBBox3fa gbounds = empty;
+
+            /* spawn tasks */
+            if (unlikely(current.size() > cfg.singleThreadThreshold))
+            {
+              /*! parallel_for is faster than spawing sub-tasks */
+              parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true);
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              /*! merge bounding boxes */
+              for (size_t i=0; i<children.size(); i++)
+                gbounds.extend(values[i].lbounds);
+            }
+            /* recurse into each child */
+            else
+            {
+              //for (size_t i=0; i<children.size(); i++)
+              for (ssize_t i=children.size()-1; i>=0; i--) {
+                values[i] = recurse(children[i],alloc,false);
+                gbounds.extend(values[i].lbounds);
+              }
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node */
+            if (unlikely(hasTimeSplits))
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+
+          /*! builder entry function */
+          __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+          {
+            const SetMB set(pinfo,&prims);
+            auto ret = recurse(BuildRecord(set,1),nullptr,true);
+            _mm_mfence(); // to allow non-temporal stores during build
+            return ret;
+          }
+
+        private:
+          Settings cfg;
+          HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit;
+          HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit;
+          const RecalculatePrimRef recalculatePrimRef;
+          const CreateAllocFunc createAlloc;
+          const CreateNodeFunc createNode;
+          const SetNodeFunc setNode;
+          const CreateLeafFunc createLeaf;
+          const ProgressMonitor progressMonitor;
+        };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitorFunc>
+
+        static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims,
+                                                      const PrimInfoMB& pinfo,
+                                                      MemoryMonitorInterface* device,
+                                                      const RecalculatePrimRef recalculatePrimRef,
+                                                      const CreateAllocFunc createAlloc,
+                                                      const CreateNodeFunc createNode,
+                                                      const SetNodeFunc setNode,
+                                                      const CreateLeafFunc createLeaf,
+                                                      const ProgressMonitorFunc progressMonitor,
+                                                      const Settings& settings)
+      {
+          typedef BuilderT<
+            NodeRef,
+            RecalculatePrimRef,
+            decltype(createAlloc()),
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetNodeFunc,
+            CreateLeafFunc,
+            ProgressMonitorFunc> Builder;
+
+          Builder builder(device,
+                          recalculatePrimRef,
+                          createAlloc,
+                          createNode,
+                          setNode,
+                          createLeaf,
+                          progressMonitor,
+                          settings);
+
+
+          return builder(prims,pinfo);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
new file mode 100644
index 0000000000..e477c313a3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
@@ -0,0 +1,526 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_msmblur.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHairMSMBlur
+    {
+      /*! settings for msmblur builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {}
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+      };
+
+      struct BuildRecord
+      {
+      public:
+	__forceinline BuildRecord () {}
+
+        __forceinline BuildRecord (size_t depth)
+          : depth(depth) {}
+
+        __forceinline BuildRecord (const SetMB& prims, size_t depth)
+          : depth(depth), prims(prims) {}
+
+        __forceinline size_t size() const {
+          return prims.size();
+        }
+
+      public:
+	size_t depth;       //!< depth of the root of this subtree
+	SetMB prims;        //!< the list of primitives
+      };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+
+          typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal;
+          typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning;
+          typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning;
+
+        public:
+
+          BuilderT (Scene* scene,
+                    const RecalculatePrimRef& recalculatePrimRef,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateAABBNodeMBFunc& createAABBNodeMB,
+                    const SetAABBNodeMBFunc& setAABBNodeMB,
+                    const CreateOBBNodeMBFunc& createOBBNodeMB,
+                    const SetOBBNodeMBFunc& setOBBNodeMB,
+                    const CreateLeafFunc& createLeaf,
+                    const ProgressMonitor& progressMonitor,
+                    const Settings settings)
+
+            : cfg(settings),
+            scene(scene),
+            recalculatePrimRef(recalculatePrimRef),
+            createAlloc(createAlloc),
+            createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB),
+            createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            unalignedHeuristic(scene),
+            temporalSplitHeuristic(scene->device,recalculatePrimRef) {}
+
+        private:
+
+          /*! checks if all primitives are from the same geometry */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            mvector<PrimRefMB>& prims = *set.prims;
+            unsigned int firstGeomID = prims[set.begin()].geomID();
+            for (size_t i=set.begin()+1; i<set.end(); i++) {
+              if (prims[i].geomID() != firstGeomID){
+                return false;
+              }
+            }
+            return true;
+          }
+          
+          /*! performs some split if SAH approaches fail */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            mvector<PrimRefMB>& prims = *set.prims;
+
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            const size_t center = (begin + end)/2;
+
+            PrimInfoMB linfo = empty;
+            for (size_t i=begin; i<center; i++)
+              linfo.add_primref(prims[i]);
+
+            PrimInfoMB rinfo = empty;
+            for (size_t i=center; i<end; i++)
+              rinfo.add_primref(prims[i]);
+
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            assert(set.size() > 1);
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            PrimInfoMB linfo(empty);
+            PrimInfoMB rinfo(empty);
+            unsigned int geomID = (*set.prims)[begin].geomID();
+            size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo,
+                                                [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; },
+                                                [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); });
+
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          /*! creates a large leaf that could be larger than supported by the BVH */
+          NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0)
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+              
+              force_split = c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split)
+              return createLeaf(current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            LocalChildList children(current);
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+
+            do {
+
+              /* find best child with largest bounding box area */
+              int bestChild = -1;
+              size_t bestSize = 0;
+              for (unsigned i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split)
+                  continue;
+
+                force_split = false;
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!sameGeometry(children[bestChild].prims)) {
+                splitByGeometry(children[bestChild].prims,left.prims,right.prims);
+              } else {
+                splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+              children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>());
+
+            } while (children.size() < cfg.branchingFactor);
+
+
+            /* detect time_ranges that have shrunken */
+            bool timesplit = false;
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+            
+            /* create node */
+            NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit);
+
+            LBBox3fa bounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              bounds.extend(values[i].lbounds);
+            }
+
+            setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+            if (timesplit)
+              bounds = current.prims.linearBounds(recalculatePrimRef);
+              
+            return NodeRecordMB4D(node,bounds,current.prims.time_range);
+          }
+
+          /*! performs split */
+          std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit)
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const float leafSAH = current.prims.leafSAH(cfg.logBlockSize);
+
+            /* perform standard binning in aligned space */
+            HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize);
+            float alignedObjectSAH = alignedObjectSplit.splitSAH();
+            bestSAH = min(alignedObjectSAH,bestSAH);
+
+            /* perform standard binning in unaligned space */
+            UnalignedHeuristicBinning::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (alignedObjectSAH > 0.7f*leafSAH) {
+              uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims);
+              const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */
+            float temporal_split_sah = inf;
+            typename HeuristicTemporal::Split temporal_split;
+            if (bestSAH > 0.5f*leafSAH) {
+              if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) {
+                temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize);
+                temporal_split_sah = temporal_split.splitSAH();
+                bestSAH = min(temporal_split_sah,bestSAH);
+              }
+            }
+
+            /* perform fallback split if SAH heuristics failed */
+            if (unlikely(!std::isfinite(bestSAH))) {
+              current.prims.deterministic_order();
+              splitFallback(current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform aligned split if this is best */
+            else if (likely(bestSAH == alignedObjectSAH)) {
+              alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform unaligned split if this is best */
+            else if (likely(bestSAH == unalignedObjectSAH)) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims);
+              aligned = false;
+            }
+            /* perform temporal split if this is best */
+            else if (likely(bestSAH == temporal_split_sah)) {
+              timesplit = true;
+              return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims);
+            }
+            else
+              assert(false);
+
+            return std::unique_ptr<mvector<PrimRefMB>>();
+          }
+
+          /*! recursive build */
+          NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(current.size());
+
+            /* create leaf node */
+            if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            bool aligned = true;
+            bool timesplit = false;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1;
+              float bestArea = neg_inf;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                const float A = children[i].prims.halfArea();
+                if (A > bestArea) {
+                  bestArea = children[i].prims.halfArea();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit);
+              children.split(bestChild,left,right,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create time split node */
+            if (timesplit)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequential */
+              else {
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                }
+              }
+
+              setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+
+            /* create aligned node */
+            else if (aligned)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                LBBox3fa cbounds[MAX_BRANCHING_FACTOR];
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      cbounds[i] = values[i].lbounds;
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++)
+                  bounds.extend(cbounds[i]);
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+              /* ... continue sequentially */
+              else
+              {
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                  bounds.extend(values[i].lbounds);
+                }
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+            }
+
+            /* create unaligned node */
+            else
+            {
+              const NodeRef node = createOBBNodeMB(alloc);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                      const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                      const auto child = recurse(children[i],nullptr,true);
+                      setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<children.size(); i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                  const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                  const auto child = recurse(children[i],alloc,false);
+                  setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                }
+              }
+
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+          }
+
+        public:
+
+          /*! entry point into builder */
+          NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+          {
+            BuildRecord record(SetMB(pinfo,&prims),1);
+            auto root = recurse(record,nullptr,true);
+            _mm_mfence(); // to allow non-temporal stores during build
+            return root;
+          }
+
+        private:
+          Settings cfg;
+          Scene* scene;
+          const RecalculatePrimRef& recalculatePrimRef;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeMBFunc& createAABBNodeMB;
+          const SetAABBNodeMBFunc& setAABBNodeMB;
+          const CreateOBBNodeMBFunc& createOBBNodeMB;
+          const SetOBBNodeMBFunc& setOBBNodeMB;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+
+        private:
+          HeuristicBinning alignedHeuristic;
+          UnalignedHeuristicBinning unalignedHeuristic;
+          HeuristicTemporal temporalSplitHeuristic;
+        };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo,
+                                               const RecalculatePrimRef& recalculatePrimRef,
+                                               const CreateAllocFunc& createAlloc,
+                                               const CreateAABBNodeMBFunc& createAABBNodeMB,
+                                               const SetAABBNodeMBFunc& setAABBNodeMB,
+                                               const CreateOBBNodeMBFunc& createOBBNodeMB,
+                                               const SetOBBNodeMBFunc& setOBBNodeMB,
+                                               const CreateLeafFunc& createLeaf,
+                                               const ProgressMonitor& progressMonitor,
+                                               const Settings settings)
+        {
+          typedef BuilderT<NodeRef,RecalculatePrimRef,CreateAllocFunc,
+            CreateAABBNodeMBFunc,SetAABBNodeMBFunc,
+            CreateOBBNodeMBFunc,SetOBBNodeMBFunc,
+            CreateLeafFunc,ProgressMonitor> Builder;
+
+          Builder builder(scene,recalculatePrimRef,createAlloc,
+                          createAABBNodeMB,setAABBNodeMB,
+                          createOBBNodeMB,setOBBNodeMB,
+                          createLeaf,progressMonitor,settings);
+
+          return builder(prims,pinfo);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
new file mode 100644
index 0000000000..3f7e678a10
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
@@ -0,0 +1,669 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_spatial_array.h"
+#include "heuristic_openmerge_array.h"
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+#  define NUM_OBJECT_BINS 16
+#  define NUM_SPATIAL_BINS 16
+#else
+#  define NUM_OBJECT_BINS 32
+#  define NUM_SPATIAL_BINS 16
+#endif
+
+namespace embree
+{
+  namespace isa
+  {
+    MAYBE_UNUSED static const float travCost = 1.0f;
+    MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    struct GeneralBVHBuilder
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor      
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;       //!< create balanced tree of we are that many levels before the maximum tree depth
+      
+
+      /*! settings for SAH builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {}
+
+        /*! initialize settings from API settings */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize      )) logBlockSize    = bsr(static_cast<size_t>(settings.sahBlockSize));
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost     )) travCost        = settings.traversalCost;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost  )) intCost         = settings.intersectionCost;
+
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc)
+        {
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+        size_t primrefarrayalloc;  //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished
+      };
+
+      /*! recursive state of builder */
+      template<typename Set, typename Split>
+        struct BuildRecordT
+        {
+        public:
+          __forceinline BuildRecordT () {}
+
+          __forceinline BuildRecordT (size_t depth)
+            : depth(depth), alloc_barrier(false), prims(empty) {}
+
+          __forceinline BuildRecordT (size_t depth, const Set& prims)
+            : depth(depth), alloc_barrier(false), prims(prims) {}
+
+          __forceinline BBox3fa bounds() const { return prims.geomBounds; }
+
+          __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); }
+          __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size();  }
+
+          __forceinline size_t size() const { return prims.size(); }
+
+        public:
+          size_t depth;       //!< Depth of the root of this subtree.
+          bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes
+          Set prims;          //!< The list of primitives.
+        };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafFunc
+      {
+        __forceinline bool operator()(const PrimRef*, const Set&) const { return true; }
+      };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafSplitFunc
+      {
+        __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { }
+      };
+
+      template<typename BuildRecord,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          friend struct GeneralBVHBuilder;
+
+          BuilderT (PrimRef* prims,
+                    Heuristic& heuristic,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateNodeFunc& createNode,
+                    const UpdateNodeFunc& updateNode,
+                    const CreateLeafFunc& createLeaf,
+                    const CanCreateLeafFunc& canCreateLeaf,
+                    const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                    const ProgressMonitor& progressMonitor,
+                    const Settings& settings) :
+                    cfg(settings),
+                    prims(prims),
+                    heuristic(heuristic),
+                    createAlloc(createAlloc),
+                    createNode(createNode),
+                    updateNode(updateNode),
+                    createLeaf(createLeaf),
+                    canCreateLeaf(canCreateLeaf),
+                    canCreateLeafSplit(canCreateLeafSplit),
+                    progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create leaf for few primitives */
+            if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims))
+              return createLeaf(prims,current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            size_t numChildren = 1;
+            children[0] = current;
+            do {
+
+              /* find best child with largest bounding box area */
+              size_t bestChild = -1;
+              size_t bestSize = 0;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].prims.size() > bestSize) {
+                  bestSize = children[i].prims.size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == (size_t)-1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!canCreateLeaf(prims,children[bestChild].prims)) {
+                canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims);
+              } else {
+                heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+
+              /* add new children left and right */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* set barrier for primrefarrayalloc */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* create node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* recurse into each child  and perform reduction */
+            for (size_t i=0; i<numChildren; i++)
+              values[i] = createLargeLeaf(children[i],alloc);
+
+            /* perform reduction */
+            return updateNode(current,children,node,values,numChildren);
+          }
+
+          const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            auto split = heuristic.find(current.prims,cfg.logBlockSize);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH();
+            assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when threshold reached or SAH tells us to stop */
+            if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              heuristic.deterministic_order(current.prims);
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            Set lprims,rprims;
+            heuristic.split(split,current.prims,lprims,rprims);
+	    
+            /*! initialize child list with initial split */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            children[0] = BuildRecord(current.depth+1,lprims);
+            children[1] = BuildRecord(current.depth+1,rprims);
+            size_t numChildren = 2;
+
+            /*! split until node is full or SAH tells us to stop */
+            while (numChildren < cfg.branchingFactor)
+            {
+              /*! find best child to split */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].prims.size() <= cfg.minLeafSize) continue;
+
+                /* find child with largest surface area */
+                if (halfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i;
+                  bestArea = halfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform best found split */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              auto split = heuristic.find(brecord.prims,cfg.logBlockSize);
+              heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims);
+              children[bestChild  ] = lrecord;
+              children[numChildren] = rrecord;
+              numChildren++;
+            }
+
+            /* set barrier for primrefarrayalloc */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* sort buildrecords for faster shadow ray traversal */
+            std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>());
+
+            /*! create an inner node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* spawn tasks */
+            if (current.size() > cfg.singleThreadThreshold)
+            {
+              /*! parallel_for is faster than spawing sub-tasks */
+              parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true);
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+            /* recurse into each child */
+            else
+            {
+              for (size_t i=0; i<numChildren; i++)
+                values[i] = recurse(children[i],alloc,false);
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+          }
+
+        private:
+          Settings cfg;
+          PrimRef* prims;
+          Heuristic& heuristic;
+          const CreateAllocFunc& createAlloc;
+          const CreateNodeFunc& createNode;
+          const UpdateNodeFunc& updateNode;
+          const CreateLeafFunc& createLeaf;
+          const CanCreateLeafFunc& canCreateLeaf;
+          const CanCreateLeafSplitFunc& canCreateLeafSplit;
+          const ProgressMonitor& progressMonitor;
+        };
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          DefaultCanCreateLeafFunc<PrimRef, Set>,
+          DefaultCanCreateLeafSplitFunc<PrimRef, Set>,
+          ProgressMonitor> Builder;
+
+        /* instantiate builder */
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        DefaultCanCreateLeafFunc<PrimRef, Set>(),
+                        DefaultCanCreateLeafSplitFunc<PrimRef, Set>(),
+                        progressMonitor,
+                        settings);
+
+        /* build hierarchy */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const CanCreateLeafFunc& canCreateLeaf,
+                                            const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          CanCreateLeafFunc,
+          CanCreateLeafSplitFunc,
+          ProgressMonitor> Builder;
+
+        /* instantiate builder */
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        canCreateLeaf,
+                        canCreateLeafSplit,
+                        progressMonitor,
+                        settings);
+
+        /* build hierarchy */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+    };
+
+    /* SAH builder that operates on an array of BuildRecords */
+    struct BVHBuilderBinnedSAH
+    {
+      typedef PrimInfoRange Set;
+      typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      /*! special builder that propagates reduction over the tree */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+
+      /*! special builder that propagates reduction over the tree */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const CanCreateLeafFunc& canCreateLeaf,
+                                 const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          canCreateLeaf,
+          canCreateLeafSplit,
+          progressMonitor,
+          settings);
+      }
+    };
+
+    /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */
+    struct BVHBuilderBinnedFastSpatialSAH
+    {
+      typedef PrimInfoExtRange Set;
+      typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      static const unsigned int GEOMID_MASK = 0xFFFFFFFF >>     RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+      static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+      template<typename ReductionTy, typename UserCreateLeaf>
+      struct CreateLeafExt
+      {
+        __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf)
+          : userCreateLeaf(userCreateLeaf) {}
+
+        // __noinline is workaround for ICC2016 compiler bug
+        template<typename Allocator>
+        __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const
+        {
+          for (size_t i=range.begin(); i<range.end(); i++)
+            prims[i].lower.u &= GEOMID_MASK;
+
+          return userCreateLeaf(prims,range,alloc);
+        }
+
+        const UserCreateLeaf userCreateLeaf;
+      };
+
+      /*! special builder that propagates reduction over the tree */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename SplitPrimitiveFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode,
+                                 UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 SplitPrimitiveFunc splitPrimitive,
+                                 ProgressMonitor progressMonitor,
+                                 PrimRef* prims,
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo,
+                                 const Settings& settings)
+        {
+          typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic;
+          Heuristic heuristic(splitPrimitive,prims,pinfo);
+
+          /* calculate total surface area */ // FIXME: this sum is not deterministic
+          const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double {
+
+              double A = 0.0f;
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                A += area(prim.bounds());
+              }
+              return A;
+            },std::plus<double>());
+
+
+          /* calculate maximum number of spatial splits per primitive */
+          const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1;
+          const float f = 10.0f;
+
+          const float invA = 1.0f / A;
+          parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) {
+
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                assert((prim.geomID() & SPLITS_MASK) == 0);
+                // FIXME: is there a better general heuristic ?
+                const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA);
+                unsigned int n = 4+min((int)maxSplits-4, max(1, (int)(nf)));
+                prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+              }
+            });
+
+          return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+            heuristic,
+            prims,
+            PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+            createAlloc,
+            createNode,
+            updateNode,
+            CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf),
+            progressMonitor,
+            settings);
+        }
+    };
+
+    /* Open/Merge SAH builder that operates on an array of BuildRecords */
+    struct BVHBuilderBinnedOpenMergeSAH
+    {
+      static const size_t NUM_OBJECT_BINS_HQ = 32;
+      typedef PrimInfoExtRange Set;
+      typedef BinSplit<NUM_OBJECT_BINS_HQ> Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+      
+      /*! special builder that propagates reduction over the tree */
+      template<
+        typename ReductionTy, 
+        typename BuildRef,
+        typename CreateAllocFunc, 
+        typename CreateNodeFunc, 
+        typename UpdateNodeFunc, 
+        typename CreateLeafFunc, 
+        typename NodeOpenerFunc, 
+        typename ProgressMonitor>
+        
+        static ReductionTy build(CreateAllocFunc createAlloc, 
+                                 CreateNodeFunc createNode, 
+                                 UpdateNodeFunc updateNode, 
+                                 const CreateLeafFunc& createLeaf, 
+                                 NodeOpenerFunc nodeOpenerFunc,
+                                 ProgressMonitor progressMonitor,
+                                 BuildRef* prims, 
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo, 
+                                 const Settings& settings)
+      {
+        typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic;
+        Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor);
+
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>(
+          heuristic,
+          prims,
+          PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
new file mode 100644
index 0000000000..a4d3b68e46
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
@@ -0,0 +1,972 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! mapping into bins */
+    template<size_t BINS>
+      struct BinMapping
+      {
+      public:
+        __forceinline BinMapping() {}
+        
+        /*! calculates the mapping */
+        __forceinline BinMapping(size_t N, const BBox3fa& centBounds) 
+        {
+          num = min(BINS,size_t(4.0f + 0.05f*N));
+          assert(num >= 1);
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping */
+        __forceinline BinMapping(const BBox3fa& centBounds) 
+        {
+          num = BINS;
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping */
+        template<typename PrimInfo>
+        __forceinline BinMapping(const PrimInfo& pinfo) 
+        {
+          const vfloat4 eps = 1E-34f;
+          num = min(BINS,size_t(4.0f + 0.05f*pinfo.size()));
+          const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) pinfo.centBounds.lower;
+        }
+
+        /*! returns number of bins */
+        __forceinline size_t size() const { return num; }
+        
+        /*! slower but safe binning */
+        __forceinline Vec3ia bin(const Vec3fa& p) const 
+        {
+          const vint4 i = floori((vfloat4(p)-ofs)*scale);
+#if 1
+          assert(i[0] >= 0 && (size_t)i[0] < num); 
+          assert(i[1] >= 0 && (size_t)i[1] < num);
+          assert(i[2] >= 0 && (size_t)i[2] < num);
+          return Vec3ia(i);
+#else
+          return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
+#endif
+        }
+
+        /*! faster but unsafe binning */
+        __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const {
+          return Vec3ia(floori((vfloat4(p)-ofs)*scale));
+        }
+
+        /*! faster but unsafe binning */
+        template<typename PrimRef>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p) const {
+          return bin_unsafe(p.binCenter());
+        }
+
+        /*! faster but unsafe binning */
+        template<typename PrimRef, typename BinBoundsAndCenter>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const {
+          return bin_unsafe(binBoundsAndCenter.binCenter(p));
+        }
+
+        template<typename PrimRef>
+        __forceinline bool bin_unsafe(const PrimRef& ref,
+                                      const vint4&   vSplitPos,
+                                      const vbool4&  splitDimMask) const // FIXME: rename to isLeft
+        {
+          return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask);
+        }
+        /*! calculates left spatial position of bin */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),1.0f / scale[dim],ofs[dim]);
+        }
+
+        /*! returns true if the mapping is invalid in some dimension */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) {
+          return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}";
+        }
+        
+      public:
+        size_t num;
+        vfloat4 ofs,scale;        //!< linear function that maps to bin ID
+      };
+    
+    /*! stores all information to perform some split */
+    template<size_t BINS>
+      struct BinSplit
+      {
+        enum
+        {
+          SPLIT_OBJECT   = 0,
+          SPLIT_FALLBACK = 1,
+          SPLIT_ENFORCE  = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already
+          SPLIT_TEMPORAL = 2,
+          SPLIT_GEOMID   = 3,
+        };
+
+        /*! construct an invalid split by default */
+        __forceinline BinSplit()
+          : sah(inf), dim(-1), pos(0), data(0) {}
+
+        __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0)
+          : sah(sah), dim(dim), fpos(fpos), data(data) {}
+        
+        /*! constructs specified split */
+        __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {}
+        
+        /*! tests if this split is valid */
+        __forceinline bool valid() const { return dim != -1; }
+        
+        /*! calculates surface area heuristic for performing the split */
+        __forceinline float splitSAH() const { return sah; }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) {
+          return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}";
+        }
+        
+      public:
+        float sah;                //!< SAH cost of the split
+        int dim;                  //!< split dimension
+        union { int pos; float fpos; };                  //!< bin index for splitting
+        unsigned int data;        //!< extra optional split data
+        BinMapping<BINS> mapping; //!< mapping into bins
+      };
+    
+    /*! stores extended information about the split */
+    template<typename BBox>
+      struct SplitInfoT
+    {
+
+      __forceinline SplitInfoT () {}
+      
+      __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds)
+	: leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {}
+      
+    public:
+      size_t leftCount,rightCount;
+      BBox leftBounds,rightBounds;
+    };
+
+    typedef SplitInfoT<BBox3fa> SplitInfo;
+    typedef SplitInfoT<LBBox3fa> SplitInfo2;
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef, typename BBox>
+      struct __aligned(64) BinInfoT
+    {		  
+      typedef BinSplit<BINS> Split;
+      typedef vbool4 vbool;
+      typedef vint4 vint;
+      typedef vfloat4 vfloat;
+      
+      __forceinline BinInfoT() {
+      }
+      
+      __forceinline BinInfoT(EmptyTy) {
+	clear();
+      }
+
+      /*! bin access function */
+      __forceinline BBox &bounds(const size_t binID, const size_t dimID)             { return _bounds[binID][dimID]; }
+      __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; }
+
+      __forceinline unsigned int &counts(const size_t binID, const size_t dimID)             { return _counts[binID][dimID]; }
+      __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; }
+
+      __forceinline vuint4 &counts(const size_t binID)             { return _counts[binID]; }
+      __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; }
+
+      /*! clears the bin info */
+      __forceinline void clear() 
+      {
+	for (size_t i=0; i<BINS; i++) {
+	  bounds(i,0) = bounds(i,1) = bounds(i,2) = empty;
+	  counts(i) = vuint4(zero);
+	}
+      }
+      
+      /*! bins an array of primitives */
+      __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping)
+      {
+	if (unlikely(N == 0)) return;
+	size_t i; 
+	for (i=0; i<N-1; i+=2)
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i+0].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          BBox prim1; Vec3fa center1;
+          prims[i+1].binBoundsAndCenter(prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds for bins for even primitive */
+          const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0); 
+          const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); 
+          const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); 
+          const unsigned int s0 = (unsigned int)prims[i+0].size();
+          counts(b00,0)+=s0;
+          counts(b01,1)+=s0;
+          counts(b02,2)+=s0;
+
+          /*! increase bounds of bins for odd primitive */
+          const unsigned int b10 = extract<0>(bin1);  bounds(b10,0).extend(prim1); 
+          const unsigned int b11 = extract<1>(bin1);  bounds(b11,1).extend(prim1); 
+          const unsigned int b12 = extract<2>(bin1);  bounds(b12,2).extend(prim1); 
+          const unsigned int s1 = (unsigned int)prims[i+1].size();
+          counts(b10,0)+=s1;
+          counts(b11,1)+=s1;
+          counts(b12,2)+=s1;
+        }
+	/*! for uneven number of primitives */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds of bins */
+          const unsigned int s0 = (unsigned int)prims[i].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+
+      /*! bins an array of primitives */
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+      {
+	if (N == 0) return;
+        
+	size_t i; 
+	for (i=0; i<N-1; i+=2)
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds for bins for even primitive */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+          
+          /*! increase bounds of bins for odd primitive */
+          const unsigned int s1 = prims[i+1].size();
+          const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1);
+          const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1);
+          const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1);
+        }
+	
+	/*! for uneven number of primitives */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds of bins */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+      
+      __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) {
+	bin(prims+begin,end-begin,mapping);
+      }
+
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) {
+	bin<BinBoundsAndCenter>(prims+begin,end-begin,mapping,binBoundsAndCenter);
+      }
+
+      /*! merges in other binning information */
+      __forceinline void merge (const BinInfoT& other, size_t numBins)
+      {
+		
+	for (size_t i=0; i<numBins; i++) 
+        {
+          counts(i) += other.counts(i);
+          bounds(i,0).extend(other.bounds(i,0));
+          bounds(i,1).extend(other.bounds(i,1));
+          bounds(i,2).extend(other.bounds(i,2));
+        }
+      }
+
+      /*! reduces binning information */
+      static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS)
+      {
+        BinInfoT c;
+	for (size_t i=0; i<numBins; i++) 
+        {
+          c.counts(i) = a.counts(i)+b.counts(i);
+          c.bounds(i,0) = embree::merge(a.bounds(i,0),b.bounds(i,0));
+          c.bounds(i,1) = embree::merge(a.bounds(i,1),b.bounds(i,1));
+          c.bounds(i,2) = embree::merge(a.bounds(i,2),b.bounds(i,2));
+        }
+        return c;
+      }
+      
+      /*! finds the best split by scanning binning information */
+      __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const
+      {
+	/* sweep from right to left and compute parallel prefix of merged bounds */
+	vfloat4 rAreas[BINS];
+	vuint4 rCounts[BINS];
+	vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty;
+	for (size_t i=mapping.size()-1; i>0; i--)
+        {
+          count += counts(i);
+          rCounts[i] = count;
+          bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx);
+          by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by);
+          bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+	/* sweep from left to right and compute SAH */
+	vuint4 blocks_add = (1 << blocks_shift)-1;
+	vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; 
+	count = 0; bx = empty; by = empty; bz = empty;
+	for (size_t i=1; i<mapping.size(); i++, ii+=1)
+        {
+          count += counts(i-1);
+          bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx);
+          by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by);
+          bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions.
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+          //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+
+          vbestPos = select(sah < vbestSAH,ii ,vbestPos);
+          vbestSAH = select(sah < vbestSAH,sah,vbestSAH);
+        }
+	
+	/* find best dimension */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+          }
+        }
+	return Split(bestSAH,bestDim,bestPos,mapping);
+      }
+      
+      /*! calculates extended split information */
+      __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const 
+      {
+	if (split.dim == -1) {
+	  new (&info) SplitInfoT<BBox>(0,empty,0,empty);
+	  return;
+	}
+	
+	size_t leftCount = 0;
+	BBox leftBounds = empty;
+	for (size_t i=0; i<(size_t)split.pos; i++) {
+	  leftCount += counts(i,split.dim);
+	  leftBounds.extend(bounds(i,split.dim));
+	}
+	size_t rightCount = 0;
+	BBox rightBounds = empty;
+	for (size_t i=split.pos; i<mapping.size(); i++) {
+	  rightCount += counts(i,split.dim);
+	  rightBounds.extend(bounds(i,split.dim));
+	}
+	new (&info) SplitInfoT<BBox>(leftCount,leftBounds,rightCount,rightBounds);
+      }
+
+      /*! gets the number of primitives left of the split */
+      __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t leftCount = 0;
+        for (size_t i = 0; i < (size_t)split.pos; i++) {
+          leftCount += counts(i, split.dim);
+        }
+        return leftCount;
+      }
+
+      /*! gets the number of primitives right of the split */
+      __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t rightCount = 0;
+        for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
+          rightCount += counts(i, split.dim);
+        }
+        return rightCount;
+      }
+
+    private:
+      BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension
+      vuint4   _counts[BINS];    //!< counts number of primitives that map into the bins
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+   /*! mapping into bins */
+   template<>
+     struct BinMapping<16>
+   {
+   public:
+     __forceinline BinMapping() {}
+      
+     /*! calculates the mapping */
+     template<typename PrimInfo>
+     __forceinline BinMapping(const PrimInfo& pinfo)
+     {
+       num = 16;
+       const vfloat4 eps = 1E-34f;
+       const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+       scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+       ofs  = (vfloat4) pinfo.centBounds.lower;
+       scale16 = scale;
+       ofs16 = ofs;
+     }
+
+     /*! returns number of bins */
+     __forceinline size_t size() const { return num; }
+
+     __forceinline vint16 bin16(const Vec3fa& p) const {
+       return vint16(vint4(floori((vfloat4(p)-ofs)*scale)));
+     }
+
+     __forceinline vint16 bin16(const vfloat16& p) const {
+       return floori((p-ofs16)*scale16);
+     }
+
+     __forceinline int bin_unsafe(const PrimRef& ref,
+                                  const vint16&  vSplitPos,
+                                  const vbool16& splitDimMask) const // FIXME: rename to isLeft
+     {
+       const vfloat16 lower(*(vfloat4*)&ref.lower);
+       const vfloat16 upper(*(vfloat4*)&ref.upper);
+       const vfloat16 p = lower + upper;
+       const vint16 i = floori((p-ofs16)*scale16);
+       return lt(splitDimMask,i,vSplitPos);
+     }
+
+     /*! returns true if the mapping is invalid in some dimension */
+     __forceinline bool invalid(const size_t dim) const {
+       return scale[dim] == 0.0f;
+     }
+        
+    public:
+      size_t num;
+      vfloat4 ofs,scale;         //!< linear function that maps to bin ID
+      vfloat16 ofs16,scale16;    //!< linear function that maps to bin ID
+    };
+
+    /* 16 bins in-register binner */
+    template<typename PrimRef>
+      struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa>
+    {
+      typedef BinSplit<16> Split;
+      typedef vbool16 vbool;
+      typedef vint16 vint;
+      typedef vfloat16 vfloat;
+      
+      __forceinline BinInfoT() {
+      }
+      
+      __forceinline BinInfoT(EmptyTy) {
+	clear();
+      }
+      
+      /*! clears the bin info */
+      __forceinline void clear() 
+      {
+        lower[0] = lower[1] = lower[2] = pos_inf;
+        upper[0] = upper[1] = upper[2] = neg_inf;
+        count[0] = count[1] = count[2] = 0;
+      }
+
+
+      static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x,
+                                                   const vfloat16 min_y,
+                                                   const vfloat16 min_z,
+                                                   const vfloat16 max_x,
+                                                   const vfloat16 max_y,
+                                                   const vfloat16 max_z)
+      {
+        const vfloat16 r_min_x = reverse_prefix_min(min_x);
+        const vfloat16 r_min_y = reverse_prefix_min(min_y);
+        const vfloat16 r_min_z = reverse_prefix_min(min_z);
+        const vfloat16 r_max_x = reverse_prefix_max(max_x);
+        const vfloat16 r_max_y = reverse_prefix_max(max_y);
+        const vfloat16 r_max_z = reverse_prefix_max(max_z);
+        const vfloat16 dx = r_max_x - r_min_x;
+        const vfloat16 dy = r_max_y - r_min_y;
+        const vfloat16 dz = r_max_z - r_min_z;
+        const vfloat16 area_rl = madd(dx,dy,madd(dx,dz,dy*dz));
+        return area_rl;
+      }
+
+      static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x,
+                                                   const vfloat16 min_y,
+                                                   const vfloat16 min_z,
+                                                   const vfloat16 max_x,
+                                                   const vfloat16 max_y,
+                                                   const vfloat16 max_z)
+      {
+        const vfloat16 r_min_x = prefix_min(min_x);
+        const vfloat16 r_min_y = prefix_min(min_y);
+        const vfloat16 r_min_z = prefix_min(min_z);
+        const vfloat16 r_max_x = prefix_max(max_x);
+        const vfloat16 r_max_y = prefix_max(max_y);
+        const vfloat16 r_max_z = prefix_max(max_z);
+        const vfloat16 dx = r_max_x - r_min_x;
+        const vfloat16 dy = r_max_y - r_min_y;
+        const vfloat16 dz = r_max_z - r_min_z;
+        const vfloat16 area_lr = madd(dx,dy,madd(dx,dz,dy*dz));
+        return area_lr;
+      }
+
+
+      /*! bins an array of primitives */
+      __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping)
+      {
+        if (unlikely(N == 0)) return;
+
+        const vfloat16 init_min(pos_inf);
+        const vfloat16 init_max(neg_inf);
+
+        vfloat16 min_x0,min_x1,min_x2;
+        vfloat16 min_y0,min_y1,min_y2;
+        vfloat16 min_z0,min_z1,min_z2;
+        vfloat16 max_x0,max_x1,max_x2;
+        vfloat16 max_y0,max_y1,max_y2;
+        vfloat16 max_z0,max_z1,max_z2;
+        vuint16 count0,count1,count2;
+
+        min_x0 = init_min;
+        min_x1 = init_min;
+        min_x2 = init_min;
+        min_y0 = init_min;
+        min_y1 = init_min;
+        min_y2 = init_min;
+        min_z0 = init_min;
+        min_z1 = init_min;
+        min_z2 = init_min;
+
+        max_x0 = init_max;
+        max_x1 = init_max;
+        max_x2 = init_max;
+        max_y0 = init_max;
+        max_y1 = init_max;
+        max_y2 = init_max;
+        max_z0 = init_max;
+        max_z1 = init_max;
+        max_z2 = init_max;
+
+        count0 = zero;
+        count1 = zero;
+        count2 = zero;
+
+        const vint16 step16(step);
+        size_t i;
+	for (i=0; i<N-1; i+=2)
+        {
+          /*! map even and odd primitive to bin */
+          const BBox3fa primA = prims[i+0].bounds();
+          const vfloat16 centerA = vfloat16((vfloat4)primA.lower) + vfloat16((vfloat4)primA.upper);
+          const vint16 binA = mapping.bin16(centerA);
+
+          const BBox3fa primB = prims[i+1].bounds();
+          const vfloat16 centerB = vfloat16((vfloat4)primB.lower) + vfloat16((vfloat4)primB.upper); 
+          const vint16 binB = mapping.bin16(centerB);
+
+          /* A */
+          {
+            const vfloat16 b_min_x = prims[i+0].lower.x;
+            const vfloat16 b_min_y = prims[i+0].lower.y;
+            const vfloat16 b_min_z = prims[i+0].lower.z;
+            const vfloat16 b_max_x = prims[i+0].upper.x;
+            const vfloat16 b_max_y = prims[i+0].upper.y;
+            const vfloat16 b_max_z = prims[i+0].upper.z;
+
+            const vint16 bin0 = shuffle<0>(binA);
+            const vint16 bin1 = shuffle<1>(binA);
+            const vint16 bin2 = shuffle<2>(binA);
+
+            const vbool16 m_update_x = step16 == bin0;
+            const vbool16 m_update_y = step16 == bin1;
+            const vbool16 m_update_z = step16 == bin2;
+
+            assert(popcnt((size_t)m_update_x) == 1);
+            assert(popcnt((size_t)m_update_y) == 1);
+            assert(popcnt((size_t)m_update_z) == 1);
+
+            min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+            min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+            min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+            max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+            max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+            min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+            min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+            // ------------------------------------------------------------------------      
+            max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+            max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+            max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+            min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+            min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+            max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+            max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+            // ------------------------------------------------------------------------
+            count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+            count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+            count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+          }
+
+
+          /* B */
+          {
+            const vfloat16 b_min_x = prims[i+1].lower.x;
+            const vfloat16 b_min_y = prims[i+1].lower.y;
+            const vfloat16 b_min_z = prims[i+1].lower.z;
+            const vfloat16 b_max_x = prims[i+1].upper.x;
+            const vfloat16 b_max_y = prims[i+1].upper.y;
+            const vfloat16 b_max_z = prims[i+1].upper.z;
+
+            const vint16 bin0 = shuffle<0>(binB);
+            const vint16 bin1 = shuffle<1>(binB);
+            const vint16 bin2 = shuffle<2>(binB);
+
+            const vbool16 m_update_x = step16 == bin0;
+            const vbool16 m_update_y = step16 == bin1;
+            const vbool16 m_update_z = step16 == bin2;
+
+            assert(popcnt((size_t)m_update_x) == 1);
+            assert(popcnt((size_t)m_update_y) == 1);
+            assert(popcnt((size_t)m_update_z) == 1);
+
+            min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+            min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+            min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+            max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+            max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+            min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+            min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+            // ------------------------------------------------------------------------      
+            max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+            max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+            max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+            min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+            min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+            max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+            max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+            // ------------------------------------------------------------------------
+            count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+            count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+            count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+          }
+
+        }
+
+        if (i < N)
+        {
+          const BBox3fa prim0 = prims[i].bounds();
+          const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper); 
+          const vint16 bin = mapping.bin16(center0);
+
+          const vfloat16 b_min_x = prims[i].lower.x;
+          const vfloat16 b_min_y = prims[i].lower.y;
+          const vfloat16 b_min_z = prims[i].lower.z;
+          const vfloat16 b_max_x = prims[i].upper.x;
+          const vfloat16 b_max_y = prims[i].upper.y;
+          const vfloat16 b_max_z = prims[i].upper.z;
+
+          const vint16 bin0 = shuffle<0>(bin);
+          const vint16 bin1 = shuffle<1>(bin);
+          const vint16 bin2 = shuffle<2>(bin);
+
+          const vbool16 m_update_x = step16 == bin0;
+          const vbool16 m_update_y = step16 == bin1;
+          const vbool16 m_update_z = step16 == bin2;
+
+          assert(popcnt((size_t)m_update_x) == 1);
+          assert(popcnt((size_t)m_update_y) == 1);
+          assert(popcnt((size_t)m_update_z) == 1);
+
+          min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+          min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+          min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+          // ------------------------------------------------------------------------      
+          max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+          max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+          max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+          // ------------------------------------------------------------------------
+          min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+          min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+          min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+          // ------------------------------------------------------------------------      
+          max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+          max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+          max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+          // ------------------------------------------------------------------------
+          min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+          min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+          min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+          // ------------------------------------------------------------------------      
+          max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+          max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+          max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+          // ------------------------------------------------------------------------
+          count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+          count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+          count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+        }
+
+        lower[0] = Vec3vf16( min_x0, min_y0, min_z0 );
+        lower[1] = Vec3vf16( min_x1, min_y1, min_z1 );
+        lower[2] = Vec3vf16( min_x2, min_y2, min_z2 );
+
+        upper[0] = Vec3vf16( max_x0, max_y0, max_z0 );
+        upper[1] = Vec3vf16( max_x1, max_y1, max_z1 );
+        upper[2] = Vec3vf16( max_x2, max_y2, max_z2 );
+
+        count[0] = count0;
+        count[1] = count1;
+        count[2] = count2;
+      }
+
+      __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) {
+	bin(prims+begin,end-begin,mapping);
+      }
+
+      /*! merges in other binning information */
+      __forceinline void merge (const BinInfoT& other, size_t numBins)
+      {
+        for (size_t i=0; i<3; i++)
+        {
+          lower[i]  = min(lower[i],other.lower[i]);
+          upper[i]  = max(upper[i],other.upper[i]);
+          count[i] += other.count[i];
+        }
+      }
+
+      /*! reducesr binning information */
+      static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b)
+      {
+        BinInfoT c;
+	for (size_t i=0; i<3; i++) 
+        {
+          c.counts[i] = a.counts[i] + b.counts[i];
+          c.lower[i]  = min(a.lower[i],b.lower[i]);
+          c.upper[i]  = max(a.upper[i],b.upper[i]);
+        }
+        return c;
+      }
+
+      /*! finds the best split by scanning binning information */
+      __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const
+      {
+	/* find best dimension */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	const vuint16 blocks_add = (1 << blocks_shift)-1;
+        const vfloat16 inf(pos_inf);
+	for (size_t dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+
+          const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+          const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+          const vuint16  lCount16 = prefix_sum(count[dim]);
+          const vuint16  rCount16 = reverse_prefix_sum(count[dim]); 
+
+          /* compute best split in this dimension */
+          const vfloat16 leftArea  = lArea16;
+          const vfloat16 rightArea = align_shift_right<1>(zero,rArea16);
+          const vuint16 lC = lCount16;
+          const vuint16 rC = align_shift_right<1>(zero,rCount16);
+          const vuint16 leftCount  = ( lC + blocks_add) >> blocks_shift;
+          const vuint16 rightCount = ( rC + blocks_add) >> blocks_shift;
+          const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries
+          const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf));
+          /* test if this is a better dimension */
+          if (any(sah < vfloat16(bestSAH))) 
+          {
+            const size_t index = select_min(sah);            
+            assert(index < 15);
+            assert(sah[index] < bestSAH);
+            bestDim = dim;
+            bestPos = index+1;
+            bestSAH = sah[index];
+          }
+        }
+	
+	return Split(bestSAH,bestDim,bestPos,mapping);
+
+      }
+
+      /*! calculates extended split information */
+      __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const 
+      {
+	if (split.dim == -1) {
+	  new (&info) SplitInfo(0,empty,0,empty);
+	  return;
+	}
+	// FIXME: horizontal reduction!
+
+	size_t leftCount = 0;
+	BBox3fa leftBounds = empty;
+	for (size_t i=0; i<(size_t)split.pos; i++) {
+	  leftCount += count[split.dim][i];
+          Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
+          Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
+	  leftBounds.extend(BBox3fa(bounds_lower,bounds_upper));
+	}
+	size_t rightCount = 0;
+	BBox3fa rightBounds = empty;
+	for (size_t i=split.pos; i<mapping.size(); i++) {
+	  rightCount += count[split.dim][i];
+          Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
+          Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
+	  rightBounds.extend(BBox3fa(bounds_lower,bounds_upper));
+	}
+	new (&info) SplitInfo(leftCount,leftBounds,rightCount,rightBounds);
+      }
+
+      /*! gets the number of primitives left of the split */
+      __forceinline size_t getLeftCount(const BinMapping<16>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t leftCount = 0;
+        for (size_t i = 0; i < (size_t)split.pos; i++) {
+          leftCount += count[split.dim][i];
+        }
+        return leftCount;
+      }
+
+      /*! gets the number of primitives right of the split */
+      __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const
+      {
+        if (unlikely(split.dim == -1)) return -1;
+
+        size_t rightCount = 0;
+        for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
+          rightCount += count[split.dim][i];
+        }
+        return rightCount;
+      }
+            
+    private:
+      Vec3vf16 lower[3];
+      Vec3vf16 upper[3];
+      vuint16   count[3];
+    };
+#endif
+  }
+
+  template<typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping)
+  {
+    if (likely(end-begin < parallelThreshold)) {
+      binner.bin(prims,begin,end,mapping);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+
+  template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    if (likely(end-begin < parallelThreshold)) {
+      binner.bin(prims,begin,end,mapping,binBoundsAndCenter);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+
+  template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping)
+  {
+    if (!parallel) {
+      binner.bin(prims,begin,end,mapping);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+
+  template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    if (!parallel) {
+      binner.bin(prims,begin,end,mapping,binBoundsAndCenter);
+    } else {
+      binner = parallel_reduce(begin,end,blockSize,binner,
+                              [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; },
+                              [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
new file mode 100644
index 0000000000..a4c272f015
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
@@ -0,0 +1,205 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t>
+    {
+      __forceinline PrimInfoRange () {
+      }
+
+      __forceinline PrimInfoRange(const PrimInfo& pinfo)
+        : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {}
+
+      __forceinline PrimInfoRange(EmptyTy)
+        : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {}
+
+      __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds)
+        : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {}
+      
+      __forceinline float leafSAH() const { 
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+    };
+    
+    /*! Performs standard object binning */
+    template<typename PrimRef, size_t BINS>
+      struct HeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+#if defined(__AVX512ER__) // KNL
+        static const size_t PARALLEL_THRESHOLD = 4*768; 
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 768;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 768;
+#else
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#endif
+        __forceinline HeuristicArrayBinningSAH ()
+          : prims(nullptr) {}
+
+        /*! remember prim array */
+        __forceinline HeuristicArrayBinningSAH (PrimRef* prims)
+          : prims(prims) {}
+
+        /*! finds the best split */
+        __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+            return find_template<false>(pinfo,logBlockSize);
+          else
+            return find_template<true>(pinfo,logBlockSize);
+        }
+
+        template<bool parallel>
+        __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          Binner binner(empty);
+          const BinMapping<BINS> mapping(pinfo);
+          bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping);
+          return binner.best(mapping,logBlockSize);
+        }
+
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+            split_template<false>(split,pinfo,linfo,rinfo);
+          else
+            split_template<true>(split,pinfo,linfo,rinfo);
+        }
+
+        template<bool parallel>
+        __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) {
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          const typename Binner::vint vSplitPos(splitPos);
+          const typename Binner::vbool vSplitMask(splitDimMask);
+          auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          size_t center = 0;
+          if (!parallel)
+            center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft,
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(
+              prims,begin,end,EmptyTy(),local_left,local_right,isLeft,
+              [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+              [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+              PARALLEL_PARTITION_BLOCK_SIZE);
+          
+          new (&lset) PrimInfoRange(begin,center,local_left);
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+
+        void deterministic_order(const PrimInfoRange& pinfo)
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]);
+        }
+
+        void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          const size_t begin = pinfo.begin();
+          const size_t end   = pinfo.end();
+          const size_t center = (begin + end)/2;
+
+          CentGeomBBox3fa left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.extend_center2(prims[i]);
+          new (&linfo) PrimInfoRange(begin,center,left);
+
+          CentGeomBBox3fa right(empty);
+          for (size_t i=center; i<end; i++)
+            right.extend_center2(prims[i]);
+          new (&rinfo) PrimInfoRange(center,end,right);
+        }
+
+        void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          assert(range.size() > 1);
+          CentGeomBBox3fa left(empty);
+          CentGeomBBox3fa right(empty);
+          unsigned int geomID = prims[range.begin()].geomID();
+          size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right,
+                                              [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; },
+                                              [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); });
+
+          new (&linfo) PrimInfoRange(range.begin(),center,left);
+          new (&rinfo) PrimInfoRange(center,range.end(),right);
+        }
+
+      private:
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, size_t BINS>
+      struct HeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        /*! finds the best split */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size();
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim);
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); };
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range);
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
new file mode 100644
index 0000000000..1370244586
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
@@ -0,0 +1,302 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    template<typename PrimRef, size_t BINS>
+      struct UnalignedHeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+        __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required?
+          : scene(nullptr), prims(nullptr) {}
+        
+        /*! remember prim array */
+        __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims)
+          : scene(scene), prims(prims) {}
+
+        const LinearSpace3fa computeAlignedSpace(const range<size_t>& set)
+        {
+          Vec3fa axis(0,0,1);
+          uint64_t bestGeomPrimID = -1;
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const unsigned int geomID = prims[i].geomID();
+            const unsigned int primID = prims[i].primID();
+            const uint64_t geomprimID = prims[i].ID64();
+            if (geomprimID >= bestGeomPrimID) continue;
+            const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID);
+            if (sqr_length(axis1) > 1E-18f) {
+              axis = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+          return frame(axis).transposed();
+        }
+        
+        const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space)
+        {
+          auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+            {
+              CentGeomBBox3fa bounds(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                Geometry* mesh = scene->get(prims[i].geomID());
+                bounds.extend(mesh->vbounds(space,prims[i].primID()));
+              }
+              return bounds;
+            };
+          
+          const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), 
+                                                         CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+          return PrimInfo(set.begin(),set.end(),bounds);
+        }
+
+        struct BinBoundsAndCenter
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space)
+            : scene(scene), space(space) {}
+          
+            /*! returns center for binning */
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            return embree::center2(bounds);
+          }
+          
+          /*! returns bounds and centroid used for binning */
+          __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            bounds_o = bounds;
+            center_o = embree::center2(bounds);
+          }
+
+        private:
+          Scene* scene;
+          const LinearSpace3fa space;
+        };
+        
+        /*! finds the best split */
+        __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          if (likely(pinfo.size() < 10000))
+            return find_template<false>(pinfo,logBlockSize,space);
+          else
+            return find_template<true>(pinfo,logBlockSize,space);
+        }
+
+        /*! finds the best split */
+        template<bool parallel>
+        const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          Binner binner(empty);
+          const BinMapping<BINS> mapping(set);
+          BinBoundsAndCenter binBoundsAndCenter(scene,space);
+          bin_serial_or_parallel<parallel>(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter);
+          return binner.best(mapping,logBlockSize);
+        }
+        
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (likely(set.size() < 10000))
+            split_template<false>(split,space,set,lset,rset);
+          else
+            split_template<true>(split,space,set,lset,rset);
+        }
+
+        /*! array partitioning */
+        template<bool parallel>
+        __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) {
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const int splitPos = split.pos;
+          const int splitDim = split.dim;
+          BinBoundsAndCenter binBoundsAndCenter(scene,space);
+
+          size_t center = 0;
+          if (likely(set.size() < 10000))
+            center = serial_partitioning(prims,begin,end,local_left,local_right,
+                                         [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right,
+                                           [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                           [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+                                           [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+                                           128);
+          
+          new (&lset) PrimInfoRange(begin,center,local_left);
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+        
+        void deterministic_order(const range<size_t>& set) 
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims[set.begin()],&prims[set.end()]);
+        }
+        
+        void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+          
+          CentGeomBBox3fa left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.extend_center2(prims[i]);
+          new (&lset) PrimInfoRange(begin,center,left);
+          
+          CentGeomBBox3fa right(empty);
+          for (size_t i=center; i<end; i++)
+            right.extend_center2(prims[i]);
+          new (&rset) PrimInfoRange(center,end,right);
+        }
+        
+      private:
+        Scene* const scene;
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, size_t BINS>
+      struct UnalignedHeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        UnalignedHeuristicArrayBinningMB(Scene* scene)
+        : scene(scene) {}
+
+        const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set)
+        {
+          Vec3fa axis0(0,0,1);
+          uint64_t bestGeomPrimID = -1;
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const PrimRefMB& prim = (*set.prims)[i];
+            const unsigned int geomID = prim.geomID();
+            const unsigned int primID = prim.primID();
+            const uint64_t geomprimID = prim.ID64();
+            if (geomprimID >= bestGeomPrimID) continue;
+            
+            const Geometry* mesh = scene->get(geomID);
+            const range<int> tbounds = mesh->timeSegmentRange(set.time_range);
+            if (tbounds.size() == 0) continue;
+
+            const size_t t = (tbounds.begin()+tbounds.end())/2;
+            const Vec3fa axis1 = mesh->computeDirection(primID,t);
+            if (sqr_length(axis1) > 1E-18f) {
+              axis0 = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+
+          return frame(axis0).transposed();
+        }
+
+        struct BinBoundsAndCenter
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space)
+            : scene(scene), time_range(time_range), space(space) {}
+          
+          /*! returns center for binning */
+          template<typename PrimRef>
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            return center2(lbounds.interpolate(0.5f));
+          }
+
+          /*! returns bounds and centroid used for binning */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds.interpolate(0.5f);
+            center_o = center2(bounds_o);
+          }
+
+          /*! returns bounds and centroid used for binning */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds;
+            center_o = center2(lbounds.interpolate(0.5f));
+          }
+          
+        private:
+          Scene* scene;
+          BBox1f time_range;
+          const LinearSpace3fa space;
+        };
+
+        /*! finds the best split */
+        const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size();
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim);
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); };
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range);
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range);
+        }
+
+      private:
+        Scene* scene;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
new file mode 100644
index 0000000000..21f18c0208
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
@@ -0,0 +1,443 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+// TODO: 
+//       - adjust parallel build thresholds
+//       - openNodesBasedOnExtend should consider max extended size
+  
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+/* stop opening of all bref.geomIDs are the same */
+#define EQUAL_GEOMID_STOP_CRITERIA 1
+
+/* 10% spatial extend threshold */
+#define MAX_EXTEND_THRESHOLD   0.1f
+
+/* maximum is 8 children */
+#define MAX_OPENED_CHILD_NODES 8
+
+/* open until all build refs are below threshold size in one step */
+#define USE_LOOP_OPENING 0
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS>
+      struct HeuristicArrayOpenMergeSAH
+      {
+        typedef BinSplit<OBJECT_BINS> Split;
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner;
+        
+        static const size_t PARALLEL_THRESHOLD = 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 512;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        static const size_t MOVE_STEP_SIZE = 64;
+        static const size_t CREATE_SPLITS_STEP_SIZE = 128;
+
+        __forceinline HeuristicArrayOpenMergeSAH ()
+          : prims0(nullptr) {}
+        
+        /*! remember prim array */
+        __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size)
+          : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) 
+        {
+          assert(max_open_size <= MAX_OPENED_CHILD_NODES);
+        }
+
+        struct OpenHeuristic
+        {
+          __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo )
+          {
+            const Vec3fa diag = pinfo.geomBounds.size();
+            dim = maxDim(diag);
+            assert(diag[dim] > 0.0f);
+            inv_max_extend = 1.0f / diag[dim];
+          }
+
+          __forceinline bool operator () ( PrimRef& prim ) const {
+            return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD;
+          }
+
+        private:
+          size_t dim;
+          float inv_max_extend;
+        };
+
+        /*! compute extended ranges */
+        __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        {
+          assert(set.ext_range_size() > 0);
+          const float left_factor           = (float)lweight / (lweight + rweight);
+          const size_t ext_range_size       = set.ext_range_size();
+          const size_t left_ext_range_size  = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size);
+          const size_t right_ext_range_size = ext_range_size - left_ext_range_size;
+          lset.set_ext_range(lset.end() + left_ext_range_size);
+          rset.set_ext_range(rset.end() + right_ext_range_size);
+        }
+
+        /*! move ranges */
+        __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t left_ext_range_size = lset.ext_range_size();
+          const size_t right_size = rset.size();
+
+          /* has the left child an extended range? */
+          if (left_ext_range_size > 0)
+          {
+            /* left extended range smaller than right range ? */
+            if (left_ext_range_size < right_size)
+            {
+              /* only move a small part of the beginning of the right range to the end */
+              parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {                  
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+right_size] = prims0[i];
+                });
+            }
+            else
+            {
+              /* no overlap, move entire right range to new location, can be made fully parallel */
+              parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE,  [&](const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+left_ext_range_size] = prims0[i];
+                });
+            }
+            /* update right range */
+            assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+            rset.move_right(left_ext_range_size);
+          }
+        }
+
+        /* estimates the extra space required when opening, and checks if all primitives are from same geometry */
+        __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set)
+        {
+          const OpenHeuristic heuristic(set);
+          const unsigned int geomID = prims0[set.begin()].geomID();
+          
+          auto body = [&] (const range<size_t>& r) -> std::pair<size_t,bool> { 
+            bool commonGeomID = true;
+            size_t opens = 0;
+            for (size_t i=r.begin(); i<r.end(); i++) {
+              commonGeomID &= prims0[i].geomID() == geomID; 
+              if (heuristic(prims0[i]))
+                opens += prims0[i].node.getN()-1; // coarse approximation
+            }
+            return std::pair<size_t,bool>(opens,commonGeomID); 
+          };
+          auto reduction = [&] (const std::pair<size_t,bool>& b0, const std::pair<size_t,bool>& b1) -> std::pair<size_t,bool> { 
+            return std::pair<size_t,bool>(b0.first+b1.first,b0.second && b1.second); 
+          };
+          return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair<size_t,bool>(0,true),body,reduction);
+        }
+
+        // FIXME: should consider maximum available extended size 
+        __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set)
+        {
+          const OpenHeuristic heuristic(set);
+          const size_t ext_range_start = set.end();
+
+          if (false && set.size() < PARALLEL_THRESHOLD) 
+          {
+            size_t extra_elements = 0;
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0; j<n; j++)
+                  set.extend_center2(tmp[j]);
+
+                prims0[i] = tmp[0];
+                for (size_t j=1; j<n; j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+              }
+            }
+            set._end += extra_elements;
+          }
+          else 
+          {
+            std::atomic<size_t> ext_elements;
+            ext_elements.store(0);
+            PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo {
+                PrimInfo info(empty);
+                for (size_t i=r.begin(); i<r.end(); i++)
+                  if (heuristic(prims0[i]))
+                  {
+                    PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                    const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                    const size_t ID = ext_elements.fetch_add(n-1);
+                    assert(ID + n-1 <= set.ext_range_size());
+
+                    for (size_t j=0; j<n; j++)
+                      info.extend_center2(tmp[j]);
+
+                    prims0[i] = tmp[0];
+                    for (size_t j=1; j<n; j++)
+                      prims0[ext_range_start+ID+j-1] = tmp[j]; 
+                  }
+                return info;
+              }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+            set.centBounds.extend(info.centBounds);
+            assert(ext_elements.load() <= set.ext_range_size());
+            set._end += ext_elements.load();
+          }
+        } 
+
+        __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements)
+        {
+          const OpenHeuristic heuristic(set);
+          size_t next_iteration_extra_elements = est_new_elements;          
+          
+          while (next_iteration_extra_elements <= set.ext_range_size()) 
+          {
+            next_iteration_extra_elements = 0;
+            size_t extra_elements = 0;
+            const size_t ext_range_start = set.end();
+
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0;j<n;j++)
+                  set.extend_center2(tmp[j]);
+                  
+                prims0[i] = tmp[0];
+                for (size_t j=1;j<n;j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+
+                for (size_t j=0; j<n; j++)
+                  if (heuristic(tmp[j]))
+                    next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation
+
+              }
+            }
+            assert( extra_elements <= set.ext_range_size());
+            set._end += extra_elements;
+
+            for (size_t i=set.begin();i<set.end();i++)
+              assert(prims0[i].numPrimitives() > 0);
+
+            if (unlikely(next_iteration_extra_elements == 0)) break;
+          }
+        } 
+
+        __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          /* single element */
+          if (set.size() <= 1)
+            return Split();
+
+          /* disable opening if there is no overlap */
+          const size_t D = 4;
+          if (unlikely(set.has_ext_range() && set.size() <= D))
+          {
+            bool disjoint = true;
+            for (size_t j=set.begin(); j<set.end()-1; j++) {
+              for (size_t i=set.begin()+1; i<set.end(); i++) {
+                if (conjoint(prims0[j].bounds(),prims0[i].bounds())) { 
+                  disjoint = false; break; 
+                }
+              }
+            }
+            if (disjoint) set.set_ext_range(set.end()); /* disables opening */
+          }
+
+          std::pair<size_t,bool> p(0,false);
+
+          /* disable opening when all primitives are from same geometry */
+          if (unlikely(set.has_ext_range()))
+          {
+            p =  getProperties(set);
+#if EQUAL_GEOMID_STOP_CRITERIA == 1
+            if (p.second) set.set_ext_range(set.end()); /* disable opening */
+#endif         
+          }
+
+          /* open nodes when we have sufficient space available */
+          if (unlikely(set.has_ext_range()))
+          {
+#if USE_LOOP_OPENING == 1
+            openNodesBasedOnExtendLoop(set,p.first);
+#else
+            if (p.first <= set.ext_range_size())
+              openNodesBasedOnExtend(set);
+#endif
+
+            /* disable opening when unsufficient space for opening a node available */
+            if (set.ext_range_size() < max_open_size-1) 
+              set.set_ext_range(set.end()); /* disable opening */
+          }
+                    
+          /* find best split */
+          return object_find(set,logBlockSize);
+        }
+
+
+        /*! finds the best object split */
+        __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize)
+        {
+          if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize);
+          else                                 return parallel_object_find  (set,logBlockSize);
+        }
+
+        /*! finds the best object split */
+        __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          Binner binner(empty); 
+          const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+          binner.bin(prims0,set.begin(),set.end(),mapping);
+          return binner.best(mapping,logBlockSize);
+        }
+
+        /*! finds the best split */
+        __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          Binner binner(empty);
+          const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          auto body = [&] (const range<size_t>& r) -> Binner { 
+            Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; 
+          };
+          auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner { 
+            Binner r = b0; r.merge(b1,_mapping.size()); return r; 
+          };
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction);
+          return binner.best(mapping,logBlockSize);
+        }
+        
+        /*! array partitioning */
+        __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          PrimInfoExtRange set = set_i;
+
+          /* valid split */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            splitFallback(set,lset,rset);
+            return;
+          }
+
+          std::pair<size_t,size_t> ext_weights(0,0);
+
+          /* object split */
+          if (likely(set.size() < PARALLEL_THRESHOLD)) 
+            ext_weights = sequential_object_split(split,set,lset,rset);
+          else
+            ext_weights = parallel_object_split(split,set,lset,rset);
+
+          /* if we have an extended range, set extended child ranges and move right split range */
+          if (unlikely(set.has_ext_range())) 
+          {
+            setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+
+        /*! array partitioning */
+        std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); });          
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(local_left.size(),local_right.size());
+        }
+
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+          auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); },
+            [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+
+          return std::pair<size_t,size_t>(left.size(),right.size());
+        }
+
+        void deterministic_order(const extended_range<size_t>& set) 
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims0[set.begin()],&prims0[set.end()]);
+        }
+
+        __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+
+          PrimInfo left(empty);
+          for (size_t i=begin; i<center; i++)
+            left.add_center2(prims0[i]);
+
+          const size_t lweight = left.end;
+          
+          PrimInfo right(empty);
+          for (size_t i=center; i<end; i++)
+            right.add_center2(prims0[i]);	
+
+          const size_t rweight = right.end;
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          /* if we have an extended range */
+          if (set.has_ext_range()) 
+          {
+            setExtentedRanges(set,lset,rset,lweight,rweight);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+        
+      private:
+        PrimRef* const prims0;
+        const NodeOpenerFunc& nodeOpenerFunc;
+        size_t max_open_size;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
new file mode 100644
index 0000000000..d8ca6cb92c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
@@ -0,0 +1,414 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "priminfo.h"
+
+namespace embree
+{
+  static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5;
+
+  namespace isa
+  {
+
+    /*! mapping into bins */
+    template<size_t BINS>
+      struct SpatialBinMapping
+      {
+      public:
+        __forceinline SpatialBinMapping() {}
+        
+        /*! calculates the mapping */
+        __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo)
+        {
+          const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower;
+          const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper;
+          const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper));
+          const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size());
+          scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag);
+          ofs  = (vfloat4) pinfo.geomBounds.lower;
+          inv_scale = 1.0f / scale; 
+        }
+
+        /*! slower but safe binning */
+        __forceinline vint4 bin(const Vec3fa& p) const
+        {
+          const vint4 i = floori((vfloat4(p)-ofs)*scale);
+          return clamp(i,vint4(0),vint4(BINS-1));
+        }
+
+        __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const
+        {
+#if defined(__AVX__)
+          const vfloat8 ofs8(ofs);
+          const vfloat8 scale8(scale);
+          const vint8 lu   = floori((vfloat8::loadu(&b)-ofs8)*scale8);
+          const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1));
+          return std::pair<vint4,vint4>(extract4<0>(c_lu),extract4<1>(c_lu));
+#else
+          const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale);
+          const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale);
+          const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1));
+          const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1));
+          return std::pair<vint4,vint4>(c_lower,c_upper);
+#endif
+        }
+
+        
+        /*! calculates left spatial position of bin */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),inv_scale[dim],ofs[dim]);
+        }
+
+        /*! calculates left spatial position of bin */
+        template<size_t N>
+        __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const {
+          return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim]));
+        }
+        
+        /*! returns true if the mapping is invalid in some dimension */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+        
+      public:
+        vfloat4 ofs,scale,inv_scale;  //!< linear function that maps to bin ID
+      };
+
+    /*! stores all information required to perform some split */
+    template<size_t BINS>
+      struct SpatialBinSplit
+      {
+        /*! construct an invalid split by default */
+        __forceinline SpatialBinSplit() 
+          : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {}
+        
+        /*! constructs specified split */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {}
+
+        /*! constructs specified split */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {}
+        
+        /*! tests if this split is valid */
+        __forceinline bool valid() const { return dim != -1; }
+        
+        /*! calculates surface area heuristic for performing the split */
+        __forceinline float splitSAH() const { return sah; }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) {
+          return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}";
+        }
+        
+      public:
+        float sah;                 //!< SAH cost of the split
+        int   dim;                 //!< split dimension
+        int   pos;                 //!< split position
+        int   left;                //!< number of elements on the left side
+        int   right;               //!< number of elements on the right side
+        float factor;              //!< factor splitting the extended range
+        SpatialBinMapping<BINS> mapping; //!< mapping into bins
+      };    
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef>
+      struct __aligned(64) SpatialBinInfo
+    {
+      SpatialBinInfo() {
+      }
+
+      __forceinline SpatialBinInfo(EmptyTy) {
+	clear();
+      }
+
+      /*! clears the bin info */
+      __forceinline void clear() 
+      {
+        for (size_t i=0; i<BINS; i++) { 
+          bounds[i][0] = bounds[i][1] = bounds[i][2] = empty;
+          numBegin[i] = numEnd[i] = 0;
+        }
+      }
+      
+      /*! adds binning data */
+      __forceinline void add(const size_t dim,
+                             const size_t beginID, 
+                             const size_t endID, 
+                             const size_t binID, 
+                             const BBox3fa &b,
+                             const size_t n = 1) 
+      {
+        assert(beginID < BINS);
+        assert(endID < BINS);
+        assert(binID < BINS);
+
+        numBegin[beginID][dim]+=(unsigned int)n;
+        numEnd  [endID][dim]+=(unsigned int)n;
+        bounds  [binID][dim].extend(b);        
+      }
+
+      /*! extends binning bounds */
+      __forceinline void extend(const size_t dim,
+                                const size_t binID, 
+                                const BBox3fa &b) 
+      {
+        assert(binID < BINS);
+        bounds  [binID][dim].extend(b);        
+      }
+      
+      /*! bins an array of triangles */
+      template<typename SplitPrimitive>
+        __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=0; i<N; i++)
+        {
+          const PrimRef prim = prims[i];
+          unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+          if (unlikely(splits == 1))
+          {
+            const vint4 bin = mapping.bin(center(prim.bounds()));
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
+              numBegin[bin[dim]][dim]++;
+              numEnd  [bin[dim]][dim]++;
+              bounds  [bin[dim]][dim].extend(prim.bounds());
+            }
+          } 
+          else
+          {
+            const vint4 bin0 = mapping.bin(prim.bounds().lower);
+            const vint4 bin1 = mapping.bin(prim.bounds().upper);
+            
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              size_t bin;
+              PrimRef rest = prim;
+              size_t l = bin0[dim];
+              size_t r = bin1[dim];
+
+              // same bin optimization
+              if (likely(l == r)) 
+              {
+                numBegin[l][dim]++;
+                numEnd  [l][dim]++;
+                bounds  [l][dim].extend(prim.bounds());
+                continue;
+              }
+
+              for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) 
+              {
+                const float pos = mapping.pos(bin+1,dim);
+                
+                PrimRef left,right;
+                splitPrimitive(rest,(int)dim,pos,left,right);
+                if (unlikely(left.bounds().empty())) l++;                
+                bounds[bin][dim].extend(left.bounds());
+                rest = right;
+              }
+              if (unlikely(rest.bounds().empty())) r--;
+              numBegin[l][dim]++;
+              numEnd  [r][dim]++;
+              bounds  [bin][dim].extend(rest.bounds());
+            }
+          }
+        }
+      }
+      
+      /*! bins a range of primitives inside an array */
+      template<typename SplitPrimitive>
+        void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
+	bin(splitPrimitive,prims+begin,end-begin,mapping);
+      }
+
+      /*! bins an array of primitives */
+      template<typename PrimitiveSplitterFactory>
+        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+          
+          for (size_t dim=0; dim<3; dim++) 
+          {
+            if (unlikely(mapping.invalid(dim))) 
+              continue;
+            
+            size_t bin;
+            size_t l = bin0[dim];
+            size_t r = bin1[dim];
+            
+            // same bin optimization
+            if (likely(l == r)) 
+            {
+              add(dim,l,l,l,prim.bounds());
+              continue;
+            }
+            const size_t bin_start = bin0[dim];
+            const size_t bin_end   = bin1[dim];
+            BBox3fa rest = prim.bounds();
+            const auto splitter = splitterFactory(prim);
+            for (bin=bin_start; bin<bin_end; bin++) 
+            {
+              const float pos = mapping.pos(bin+1,dim);
+              BBox3fa left,right;
+              splitter(rest,dim,pos,left,right);
+              if (unlikely(left.empty())) l++;                
+              extend(dim,bin,left);
+              rest = right;
+            }
+            if (unlikely(rest.empty())) r--;
+            add(dim,l,r,bin,rest);
+          }
+        }              
+      }
+
+
+
+      /*! bins an array of primitives */
+      __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+          
+          for (size_t dim=0; dim<3; dim++) 
+          {
+            if (unlikely(mapping.invalid(dim))) 
+              continue;
+            
+            const size_t l = bin0[dim];
+            const size_t r = bin1[dim];
+
+            const unsigned int n  = prim.primID();
+            
+            // same bin optimization
+            if (likely(l == r)) 
+            {
+              add(dim,l,l,l,prim.bounds(),n);
+              continue;
+            }
+            const size_t bin_start = bin0[dim];
+            const size_t bin_end   = bin1[dim];
+            for (size_t bin=bin_start; bin<bin_end; bin++) 
+              add(dim,l,r,bin,prim.bounds(),n);
+          }
+        }              
+      }
+      
+      /*! merges in other binning information */
+      void merge (const SpatialBinInfo& other)
+      {
+        for (size_t i=0; i<BINS; i++) 
+        {
+          numBegin[i] += other.numBegin[i];
+          numEnd  [i] += other.numEnd  [i];
+          bounds[i][0].extend(other.bounds[i][0]);
+          bounds[i][1].extend(other.bounds[i][1]);
+          bounds[i][2].extend(other.bounds[i][2]);
+        }
+      }
+
+      /*! merges in other binning information */
+      static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b)
+      {
+        SpatialBinInfo c(empty);
+        for (size_t i=0; i<BINS; i++) 
+        {
+          c.numBegin[i] += a.numBegin[i]+b.numBegin[i];
+          c.numEnd  [i] += a.numEnd  [i]+b.numEnd  [i];
+          c.bounds[i][0] = embree::merge(a.bounds[i][0],b.bounds[i][0]);
+          c.bounds[i][1] = embree::merge(a.bounds[i][1],b.bounds[i][1]);
+          c.bounds[i][2] = embree::merge(a.bounds[i][2],b.bounds[i][2]);
+        }
+        return c;
+      }
+      
+      /*! finds the best split by scanning binning information */
+      SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const 
+      {
+        /* sweep from right to left and compute parallel prefix of merged bounds */
+        vfloat4 rAreas[BINS];
+        vuint4 rCounts[BINS];
+        vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty;
+        for (size_t i=BINS-1; i>0; i--)
+        {
+          count += numEnd[i];
+          rCounts[i] = count;
+          bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx);
+          by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by);
+          bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+        
+        /* sweep from left to right and compute SAH */
+        vuint4 blocks_add = (1 << blocks_shift)-1;
+        vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0;
+        count = 0; bx = empty; by = empty; bz = empty;
+        for (size_t i=1; i<BINS; i++, ii+=1)
+        {
+          count += numBegin[i-1];
+          bx.extend(bounds[i-1][0]); float Ax = halfArea(bx);
+          by.extend(bounds[i-1][1]); float Ay = halfArea(by);
+          bz.extend(bounds[i-1][2]); float Az = halfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift);
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+          // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+          const vbool4 mask = sah < vbestSAH;
+          vbestPos      = select(mask,ii ,vbestPos);
+          vbestSAH      = select(mask,sah,vbestSAH);
+          vbestlCount   = select(mask,count,vbestlCount);
+          vbestrCount   = select(mask,rCounts[i],vbestrCount);
+        }
+        
+        /* find best dimension */
+        float bestSAH = inf;
+        int   bestDim = -1;
+        int   bestPos = 0;
+        unsigned int   bestlCount = 0;
+        unsigned int   bestrCount = 0;
+        for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+            bestlCount = vbestlCount[dim];
+            bestrCount = vbestrCount[dim];
+          }
+        }
+        assert(bestSAH >= 0.0f);
+        
+        /* return invalid split if no split found */
+        if (bestDim == -1) 
+          return SpatialBinSplit<BINS>(inf,-1,0,mapping);
+        
+        /* return best found split */
+        return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping);
+      }
+      
+    private:
+      BBox3fa bounds[BINS][3];  //!< geometry bounds for each bin in each dimension
+      vuint4    numBegin[BINS];   //!< number of primitives starting in bin
+      vuint4    numEnd[BINS];     //!< number of primitives ending in bin
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
new file mode 100644
index 0000000000..911dcf950c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
@@ -0,0 +1,552 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+#if 0
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f
+#else
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f
+#endif
+
+    struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t>
+    {
+      __forceinline PrimInfoExtRange() {
+      }
+
+      __forceinline PrimInfoExtRange(EmptyTy)
+        : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {}
+
+      __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) 
+        : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {}
+      
+      __forceinline float leafSAH() const { 
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+    };
+
+    template<typename ObjectSplit, typename SpatialSplit>
+      struct Split2
+      {
+        __forceinline Split2 () {}
+        
+        __forceinline Split2 (const Split2& other) 
+        {
+          spatial = other.spatial;
+          sah = other.sah;
+          if (spatial) spatialSplit() = other.spatialSplit();
+          else         objectSplit()  = other.objectSplit();
+        }
+        
+        __forceinline Split2& operator= (const Split2& other) 
+        {
+          spatial = other.spatial;
+          sah = other.sah;
+          if (spatial) spatialSplit() = other.spatialSplit();
+          else         objectSplit()  = other.objectSplit();
+          return *this;
+        }
+          
+          __forceinline     ObjectSplit&  objectSplit()        { return *(      ObjectSplit*)data; }
+        __forceinline const ObjectSplit&  objectSplit() const  { return *(const ObjectSplit*)data; }
+        
+        __forceinline       SpatialSplit& spatialSplit()       { return *(      SpatialSplit*)data; }
+        __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; }
+        
+        __forceinline Split2 (const ObjectSplit& objectSplit, float sah)
+          : spatial(false), sah(sah) 
+        {
+          new (data) ObjectSplit(objectSplit);
+        }
+        
+        __forceinline Split2 (const SpatialSplit& spatialSplit, float sah)
+          : spatial(true), sah(sah) 
+        {
+          new (data) SpatialSplit(spatialSplit);
+        }
+        
+        __forceinline float splitSAH() const { 
+          return sah; 
+        }
+        
+        __forceinline bool valid() const {
+          return sah < float(inf);
+        }
+        
+      public:
+        __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)];
+        bool spatial;
+        float sah;
+      };
+    
+    /*! Performs standard object binning */
+    template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS>
+      struct HeuristicArraySpatialSAH
+      {
+        typedef BinSplit<OBJECT_BINS> ObjectSplit;
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner;
+
+        typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit;
+        typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner;
+
+        //typedef extended_range<size_t> Set;
+        typedef Split2<ObjectSplit,SpatialSplit> Split;
+        
+#if defined(__AVX512ER__) // KNL
+        static const size_t PARALLEL_THRESHOLD = 3*1024; 
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 768;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#else
+        static const size_t PARALLEL_THRESHOLD = 3*1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#endif
+
+        static const size_t MOVE_STEP_SIZE = 64;
+        static const size_t CREATE_SPLITS_STEP_SIZE = 64;
+
+        __forceinline HeuristicArraySpatialSAH ()
+          : prims0(nullptr) {}
+        
+        /*! remember prim array */
+        __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info)
+          : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {}
+
+
+        /*! compute extended ranges */
+        __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        {
+          assert(set.ext_range_size() > 0);
+          const float left_factor           = (float)lweight / (lweight + rweight);
+          const size_t ext_range_size       = set.ext_range_size();
+          const size_t left_ext_range_size  = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size);
+          const size_t right_ext_range_size = ext_range_size - left_ext_range_size;
+          lset.set_ext_range(lset.end() + left_ext_range_size);
+          rset.set_ext_range(rset.end() + right_ext_range_size);
+        }
+
+        /*! move ranges */
+        __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t left_ext_range_size = lset.ext_range_size();
+          const size_t right_size = rset.size();
+
+          /* has the left child an extended range? */
+          if (left_ext_range_size > 0)
+          {
+            /* left extended range smaller than right range ? */
+            if (left_ext_range_size < right_size)
+            {
+              /* only move a small part of the beginning of the right range to the end */
+              parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {                  
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+right_size] = prims0[i];
+                });
+            }
+            else
+            {
+              /* no overlap, move entire right range to new location, can be made fully parallel */
+              parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE,  [&](const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+left_ext_range_size] = prims0[i];
+                });
+            }
+            /* update right range */
+            assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+            rset.move_right(left_ext_range_size);
+          }
+        }
+
+        /*! finds the best split */
+        const Split find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SplitInfo oinfo;
+          const ObjectSplit object_split = object_find(set,logBlockSize,oinfo);
+          const float object_split_sah = object_split.splitSAH();
+
+          if (unlikely(set.has_ext_range()))
+          {
+            const BBox3fa overlap = intersect(oinfo.leftBounds, oinfo.rightBounds);
+            
+            /* do only spatial splits if the child bounds overlap */
+            if (safeArea(overlap) >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) &&
+                safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds))
+            {              
+              const SpatialSplit spatial_split = spatial_find(set, logBlockSize);
+              const float spatial_split_sah = spatial_split.splitSAH();
+
+              /* valid spatial split, better SAH and number of splits do not exceed extended range */
+              if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah &&
+                  spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size())
+              {          
+                return Split(spatial_split,spatial_split_sah);
+              }
+            }
+          }
+
+          return Split(object_split,object_split_sah);
+        }
+
+        /*! finds the best object split */
+        __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info);
+          else                                 return parallel_object_find  (set,logBlockSize,info);
+        }
+
+        /*! finds the best object split */
+        __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          ObjectBinner binner(empty); 
+          const BinMapping<OBJECT_BINS> mapping(set);
+          binner.bin(prims0,set.begin(),set.end(),mapping);
+          ObjectSplit s = binner.best(mapping,logBlockSize);
+          binner.getSplitInfo(mapping, s, info);
+          return s;
+        }
+
+        /*! finds the best split */
+        __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          ObjectBinner binner(empty);
+          const BinMapping<OBJECT_BINS> mapping(set);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+                                   [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; },
+                                   [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; });
+          ObjectSplit s = binner.best(mapping,logBlockSize);
+          binner.getSplitInfo(mapping, s, info);
+          return s;
+        }
+
+        /*! finds the best spatial split */
+        __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize);
+          else                                 return parallel_spatial_find  (set, logBlockSize);
+        }
+
+        /*! finds the best spatial split */
+        __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SpatialBinner binner(empty); 
+          const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+          binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping);
+          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          return binner.best(mapping,logBlockSize); //,set.ext_size());
+        }
+
+        __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SpatialBinner binner(empty);
+          const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+          const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+                                   [&] (const range<size_t>& r) -> SpatialBinner { 
+                                     SpatialBinner binner(empty); 
+                                     binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping);
+                                     return binner; },
+                                   [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); });
+          /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+          return binner.best(mapping,logBlockSize); //,set.ext_size());
+        }
+
+
+        /*! subdivides primitives based on a spatial split */
+        __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping)
+        {
+          assert(set.has_ext_range());
+          const size_t max_ext_range_size = set.ext_range_size();
+          const size_t ext_range_start = set.end();
+
+          /* atomic counter for number of primref splits */
+          std::atomic<size_t> ext_elements;
+          ext_elements.store(0);
+          
+          const float fpos = split.mapping.pos(split.pos,split.dim);
+        
+          const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+
+          parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) {
+              for (size_t i=r.begin();i<r.end();i++)
+              {
+                const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+                if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */
+
+                //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
+                //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
+                //if (unlikely(bin0 < split.pos && bin1 >= split.pos))
+                if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
+                {
+                  assert(splits > 1);
+
+                  PrimRef left,right;
+                  const auto splitter = splitterFactory(prims0[i]);
+                  splitter(prims0[i],split.dim,fpos,left,right);
+                
+                  // no empty splits
+                  if (unlikely(left.bounds().empty() || right.bounds().empty())) continue;
+                
+                  left.lower.u  = (left.lower.u  & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+                  right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+
+                  const size_t ID = ext_elements.fetch_add(1);
+
+                  /* break if the number of subdivided elements are greater than the maximum allowed size */
+                  if (unlikely(ID >= max_ext_range_size)) 
+                    break;
+
+                  /* only write within the correct bounds */
+                  assert(ID < max_ext_range_size);
+                  prims0[i] = left;
+                  prims0[ext_range_start+ID] = right;     
+                }
+              }
+            });
+
+          const size_t numExtElements = min(max_ext_range_size,ext_elements.load());          
+          assert(set.end()+numExtElements<=set.ext_end());
+          set._end += numExtElements;
+        }
+        
+        /*! array partitioning */
+        void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          PrimInfoExtRange set = set_i;
+          
+          /* valid split */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+
+          std::pair<size_t,size_t> ext_weights(0,0);
+
+          if (unlikely(split.spatial))
+          {
+            create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping); 
+
+            /* spatial split */
+            if (likely(set.size() < PARALLEL_THRESHOLD)) 
+              ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset);
+            else
+              ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset);
+          }
+          else
+          {
+            /* object split */
+            if (likely(set.size() < PARALLEL_THRESHOLD)) 
+              ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset);
+            else
+              ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset);
+          }
+
+          /* if we have an extended range, set extended child ranges and move right split range */
+          if (unlikely(set.has_ext_range())) 
+          {
+            setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+
+        /*! array partitioning */
+        std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) { 
+                                                return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask);
+                                              },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });          
+          const size_t left_weight  = local_left.end;
+          const size_t right_weight = local_right.end;
+
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) {
+                                                const Vec3fa c = ref.bounds().center();
+                                                return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); 
+                                              },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
+
+          const size_t left_weight  = local_left.end;
+          const size_t right_weight = local_right.end;
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+
+        
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+          auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+            [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          const size_t left_weight  = left.end;
+          const size_t right_weight = right.end;
+          
+          left.begin  = begin;  left.end  = center; 
+          right.begin = center; right.end = end;
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+        /*! array partitioning */
+        __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+
+          auto isLeft = [&] (const PrimRef &ref) { 
+            const Vec3fa c = ref.bounds().center();
+            return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); };
+
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+            [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          const size_t left_weight  = left.end;
+          const size_t right_weight = right.end;
+          
+          left.begin  = begin;  left.end  = center; 
+          right.begin = center; right.end = end;
+          
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+        void deterministic_order(const PrimInfoExtRange& set) 
+        {
+          /* required as parallel partition destroys original primitive order */
+          std::sort(&prims0[set.begin()],&prims0[set.end()]);
+        }
+
+        void splitFallback(const PrimInfoExtRange& set, 
+                           PrimInfoExtRange& lset, 
+                           PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          const size_t center = (begin + end)/2;
+
+          PrimInfo left(empty);
+          for (size_t i=begin; i<center; i++) {
+            left.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+          }
+          const size_t lweight = left.end;
+          
+          PrimInfo right(empty);
+          for (size_t i=center; i<end; i++) {
+            right.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));	
+          }
+          const size_t rweight = right.end;
+
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          /* if we have an extended range */
+          if (set.has_ext_range()) {
+            setExtentedRanges(set,lset,rset,lweight,rweight);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+        
+      private:
+        PrimRef* const prims0;
+        const PrimitiveSplitterFactory& splitterFactory;
+        const CentGeomBBox3fa& root_info;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
new file mode 100644
index 0000000000..ede0d04c78
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
@@ -0,0 +1,188 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    struct HeuristicStrandSplit
+    {
+      typedef range<size_t> Set;
+  
+      static const size_t PARALLEL_THRESHOLD = 10000;
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64;
+
+      /*! stores all information to perform some split */
+      struct Split
+      {    
+	/*! construct an invalid split by default */
+	__forceinline Split()
+	  : sah(inf), axis0(zero), axis1(zero) {}
+	
+	/*! constructs specified split */
+	__forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1)
+	  : sah(sah), axis0(axis0), axis1(axis1) {}
+	
+	/*! calculates standard surface area heuristic for the split */
+	__forceinline float splitSAH() const { return sah; }
+
+        /*! test if this split is valid */
+        __forceinline bool valid() const { return sah != float(inf); }
+		
+      public:
+	float sah;             //!< SAH cost of the split
+	Vec3fa axis0, axis1;   //!< axis the two strands are aligned into
+      };
+
+      __forceinline HeuristicStrandSplit () // FIXME: required?
+        : scene(nullptr), prims(nullptr) {}
+      
+      /*! remember prim array */
+      __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims)
+        : scene(scene), prims(prims) {}
+      
+      __forceinline const Vec3fa direction(const PrimRef& prim) {
+        return scene->get(prim.geomID())->computeDirection(prim.primID());
+      }
+      
+      __forceinline const BBox3fa bounds(const PrimRef& prim) {
+        return scene->get(prim.geomID())->vbounds(prim.primID());
+      }
+
+      __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) {
+        return scene->get(prim.geomID())->vbounds(space,prim.primID());
+      }
+
+      /*! finds the best split */
+      const Split find(const range<size_t>& set, size_t logBlockSize)
+      {
+        Vec3fa axis0(0,0,1);
+        uint64_t bestGeomPrimID = -1;
+
+        /* curve with minimum ID determines first axis */
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          if (geomprimID >= bestGeomPrimID) continue;
+          const Vec3fa axis = direction(prims[i]);
+          if (sqr_length(axis) > 1E-18f) {
+            axis0 = normalize(axis);
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* find 2nd axis that is most misaligned with first axis and has minimum ID */
+        float bestCos = 1.0f;
+        Vec3fa axis1 = axis0;
+        bestGeomPrimID = -1;
+        for (size_t i=set.begin(); i<set.end(); i++) 
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          Vec3fa axisi = direction(prims[i]);
+          float leni = length(axisi);
+          if (leni == 0.0f) continue;
+          axisi /= leni;
+          float cos = abs(dot(axisi,axis0));
+          if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) {
+            bestCos = cos; axis1 = axisi;
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* partition the two strands */
+        size_t lnum = 0, rnum = 0;
+        BBox3fa lbounds = empty, rbounds = empty;
+        const LinearSpace3fa space0 = frame(axis0).transposed();
+        const LinearSpace3fa space1 = frame(axis1).transposed();
+        
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          PrimRef& prim = prims[i];
+          const Vec3fa axisi = normalize(direction(prim));
+          const float cos0 = abs(dot(axisi,axis0));
+          const float cos1 = abs(dot(axisi,axis1));
+          
+          if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); }
+          else             { rnum++; rbounds.extend(bounds(space1,prim)); }
+        }
+      
+        /*! return an invalid split if we do not partition */
+        if (lnum == 0 || rnum == 0) 
+          return Split(inf,axis0,axis1);
+      
+        /*! calculate sah for the split */
+        const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds));
+        return Split(sah,axis0,axis1);
+      }
+
+      /*! array partitioning */
+      void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) 
+      {
+        if (!split.valid()) {
+          deterministic_order(set);
+          return splitFallback(set,lset,rset);
+        }
+        
+        const size_t begin = set.begin();
+        const size_t end   = set.end();
+        CentGeomBBox3fa local_left(empty);
+        CentGeomBBox3fa local_right(empty);
+
+        auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { 
+          const Vec3fa axisi = normalize(direction(prim));
+          const float cos0 = abs(dot(axisi,split.axis0));
+          const float cos1 = abs(dot(axisi,split.axis1));
+          return cos0 > cos1;
+        };
+
+        auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { 
+          pinfo.extend(bounds(ref)); 
+        };
+        
+        size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds);
+        
+        new (&lset) PrimInfoRange(begin,center,local_left);
+        new (&rset) PrimInfoRange(center,end,local_right);
+        assert(area(lset.geomBounds) >= 0.0f);
+        assert(area(rset.geomBounds) >= 0.0f);
+      }
+
+      void deterministic_order(const Set& set) 
+      {
+        /* required as parallel partition destroys original primitive order */
+        std::sort(&prims[set.begin()],&prims[set.end()]);
+      }
+      
+      void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+      {
+        const size_t begin = set.begin();
+        const size_t end   = set.end();
+        const size_t center = (begin + end)/2;
+        
+        CentGeomBBox3fa left(empty);
+        for (size_t i=begin; i<center; i++)
+          left.extend(bounds(prims[i]));
+        new (&lset) PrimInfoRange(begin,center,left);
+        
+        CentGeomBBox3fa right(empty);
+        for (size_t i=center; i<end; i++)
+          right.extend(bounds(prims[i]));	
+        new (&rset) PrimInfoRange(center,end,right);
+      }
+      
+    private:
+      Scene* const scene;
+      PrimRef* const prims;
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
new file mode 100644
index 0000000000..c999941a11
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/primref_mb.h"
+#include "../../common/algorithms/parallel_filter.h"
+
+#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning */
+    template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS>
+      struct HeuristicMBlurTemporalSplit
+      {
+        typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+        typedef mvector<PrimRefMB>* PrimRefVector;
+        typedef typename PrimRefMB::BBox BBox; 
+
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef)
+          : device(device), recalculatePrimRef(recalculatePrimRef) {}
+
+        struct TemporalBinInfo
+        {
+          __forceinline TemporalBinInfo () {
+          }
+
+          __forceinline TemporalBinInfo (EmptyTy)
+          {
+            for (size_t i=0; i<BINS-1; i++)
+            {
+              count0[i] = count1[i] = 0;
+              bounds0[i] = bounds1[i] = empty;
+            }
+          }
+          
+          void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef)
+          {
+            for (int b=0; b<BINS-1; b++)
+            {
+              const float t = float(b+1)/float(BINS);
+              const float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* find linear bounds for both time segments */
+              for (size_t i=begin; i<end; i++) 
+              {
+                if (prims[i].time_range_overlap(dt0))
+                {
+                  const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0);
+#if MBLUR_BIN_LBBOX
+                  bounds0[b].extend(bn0);
+#else
+                  bounds0[b].extend(bn0.interpolate(0.5f));
+#endif
+                  count0[b] += prims[i].timeSegmentRange(dt0).size();
+                }
+
+                if (prims[i].time_range_overlap(dt1))
+                {
+                  const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1);
+#if MBLUR_BIN_LBBOX
+                  bounds1[b].extend(bn1);
+#else
+                  bounds1[b].extend(bn1.interpolate(0.5f));
+#endif
+                  count1[b] += prims[i].timeSegmentRange(dt1).size();
+                }
+              }
+            }
+          }
+
+          __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) 
+          {
+            if (likely(end-begin < parallelThreshold)) {
+              bin(prims,begin,end,time_range,set,recalculatePrimRef);
+            } 
+            else 
+            {
+              auto bin = [&](const range<size_t>& r) -> TemporalBinInfo { 
+                TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; 
+              };
+              *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2);
+            }
+          }
+          
+          /*! merges in other binning information */
+          __forceinline void merge (const TemporalBinInfo& other)
+          {
+            for (size_t i=0; i<BINS-1; i++) 
+            {
+              count0[i] += other.count0[i];
+              count1[i] += other.count1[i];
+              bounds0[i].extend(other.bounds0[i]);
+              bounds1[i].extend(other.bounds1[i]);
+            }
+          }
+
+          static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) {
+            TemporalBinInfo r = a; r.merge(b); return r;
+          }
+                    
+          Split best(int logBlockSize, BBox1f time_range, const SetMB& set)
+          {
+            float bestSAH = inf;
+            float bestPos = 0.0f;
+            for (int b=0; b<BINS-1; b++)
+            {
+              float t = float(b+1)/float(BINS);
+              float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* calculate sah */
+              const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size();
+              float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size();
+              if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time
+              if (unlikely(rCount == 0)) sah1 = 0.0f;
+              const float sah = sah0+sah1;
+              if (sah < bestSAH) {
+                bestSAH = sah;
+                bestPos = center_time;
+              }
+            }
+            return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos);
+          }
+          
+        public:
+          size_t count0[BINS-1];
+          size_t count1[BINS-1];
+          BBox bounds0[BINS-1];
+          BBox bounds1[BINS-1];
+        };
+        
+        /*! finds the best split */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          assert(set.size() > 0);
+          TemporalBinInfo binner(empty);
+          binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef);
+          Split tsplit = binner.best((int)logBlockSize,set.time_range,set);
+          if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return tsplit;
+        }
+
+        __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          assert(tsplit.sah != float(inf));
+          assert(tsplit.fpos > set.time_range.lower);
+          assert(tsplit.fpos < set.time_range.upper);
+
+          float center_time = tsplit.fpos;
+          const BBox1f time_range0(set.time_range.lower,center_time);
+          const BBox1f time_range1(center_time,set.time_range.upper);
+          mvector<PrimRefMB>& prims = *set.prims;
+          
+          /* calculate primrefs for first time range */
+          std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size()));
+          PrimRefVector lprims = new_vector.get();
+          
+          auto reduction_func0 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range0)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0);
+                (*lprims)[i-set.begin()] = prim;
+                pinfo.add_primref(prim);
+              }
+              else
+              {
+                (*lprims)[i-set.begin()] = prims[i];
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2);
+
+          /* primrefs for first time range are in lprims[0 .. set.size()) */
+          /* some primitives may need to be filtered out */
+          if (linfo.size() != set.size())
+            linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); });
+                      
+          lset = SetMB(linfo,lprims,time_range0);
+
+          /* calculate primrefs for second time range */
+          auto reduction_func1 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range1)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1);
+                prims[i] = prim;
+                pinfo.add_primref(prim);
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2);
+          rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size());
+
+          /* primrefs for second time range are in prims[set.begin() .. set.end()) */
+          /* some primitives may need to be filtered out */
+          if (rinfo.size() != set.size())
+            rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); });
+        
+          rset = SetMB(rinfo,&prims,time_range1);
+
+          return new_vector;
+        }
+
+      private:
+        MemoryMonitorInterface* device;              // device to report memory usage to
+        const RecalculatePrimRef recalculatePrimRef;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/priminfo.h b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
new file mode 100644
index 0000000000..06c1388742
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+  // FIXME: maybe there's a better place for this util fct
+  __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2)
+  {
+    const Vec3fa e0 = v1-v0;
+    const Vec3fa e1 = v2-v0;
+    const Vec3fa d = cross(e0,e1);
+    return fabs(d.x) + fabs(d.y) + fabs(d.z);
+  }
+
+  //namespace isa
+  //{
+    template<typename BBox>
+      class CentGeom
+    {
+    public:
+      __forceinline CentGeom () {}
+
+      __forceinline CentGeom (EmptyTy) 
+	: geomBounds(empty), centBounds(empty) {}
+      
+      __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) 
+	: geomBounds(geomBounds), centBounds(centBounds) {}
+      
+      template<typename PrimRef> 
+        __forceinline void extend_primref(const PrimRef& prim) 
+      {
+        BBox bounds; Vec3fa center;
+        prim.binBoundsAndCenter(bounds,center);
+        geomBounds.extend(bounds);
+        centBounds.extend(center);
+      }
+
+       template<typename PrimRef> 
+         __forceinline void extend_center2(const PrimRef& prim) 
+       {
+         BBox3fa bounds = prim.bounds();
+         geomBounds.extend(bounds);
+         centBounds.extend(bounds.center2());
+       }
+       
+      __forceinline void extend(const BBox& geomBounds_) {
+	geomBounds.extend(geomBounds_);
+	centBounds.extend(center2(geomBounds_));
+      }
+
+      __forceinline void merge(const CentGeom& other) 
+      {
+	geomBounds.extend(other.geomBounds);
+	centBounds.extend(other.centBounds);
+      }
+
+      static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) {
+        CentGeom r = a; r.merge(b); return r;
+      }
+
+    public:
+      BBox geomBounds;   //!< geometry bounds of primitives
+      BBox3fa centBounds;   //!< centroid bounds of primitives
+    };
+
+    typedef CentGeom<BBox3fa> CentGeomBBox3fa;
+
+    /*! stores bounding information for a set of primitives */
+    template<typename BBox>
+      class PrimInfoT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      __forceinline PrimInfoT () {}
+
+      __forceinline PrimInfoT (EmptyTy) 
+	: CentGeom<BBox>(empty), begin(0), end(0) {}
+
+      __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) 
+        : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {}
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        end++;
+      }
+
+       template<typename PrimRef> 
+         __forceinline void add_center2(const PrimRef& prim) {
+         CentGeom<BBox>::extend_center2(prim);
+         end++;
+       }
+
+        template<typename PrimRef> 
+          __forceinline void add_center2(const PrimRef& prim, const size_t i) {
+          CentGeom<BBox>::extend_center2(prim);
+          end+=i;
+        }
+
+      /*__forceinline void add(const BBox& geomBounds_) {
+	CentGeom<BBox>::extend(geomBounds_);
+	end++;
+      }
+
+      __forceinline void add(const BBox& geomBounds_, const size_t i) {
+	CentGeom<BBox>::extend(geomBounds_);
+	end+=i;
+        }*/
+
+      __forceinline void merge(const PrimInfoT& other) 
+      {
+	CentGeom<BBox>::merge(other);
+        begin += other.begin;
+	end += other.end;
+      }
+
+      static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) {
+        PrimInfoT r = a; r.merge(b); return r;
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return end-begin; 
+      }
+
+      __forceinline float halfArea() {
+        return expectedApproxHalfArea(geomBounds);
+      }
+
+      __forceinline float leafSAH() const { 
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+	//return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+	//return halfArea(geomBounds)*float((num+3) >> 2);
+	//return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) {
+	return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}";
+      }
+      
+    public:
+      size_t begin,end;          //!< number of primitives
+    };
+
+    typedef PrimInfoT<BBox3fa> PrimInfo;
+    //typedef PrimInfoT<LBBox3fa> PrimInfoMB;
+
+    /*! stores bounding information for a set of primitives */
+    template<typename BBox>
+      class PrimInfoMBT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      __forceinline PrimInfoMBT () {
+      } 
+
+      __forceinline PrimInfoMBT (EmptyTy)
+        : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      __forceinline PrimInfoMBT (size_t begin, size_t end)
+        : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        time_range.extend(prim.time_range);
+        object_range._end++;
+        num_time_segments += prim.size();
+        if (max_num_time_segments < prim.totalTimeSegments()) {
+          max_num_time_segments = prim.totalTimeSegments();
+          max_time_range = prim.time_range;
+        }
+      }
+
+      __forceinline void merge(const PrimInfoMBT& other)
+      {
+        CentGeom<BBox>::merge(other);
+        time_range.extend(other.time_range);
+        object_range._begin += other.object_range.begin();
+        object_range._end += other.object_range.end();
+        num_time_segments += other.num_time_segments;
+        if (max_num_time_segments < other.max_num_time_segments) {
+          max_num_time_segments = other.max_num_time_segments;
+          max_time_range = other.max_time_range;
+        }
+      }
+
+      static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) {
+        PrimInfoMBT r = a; r.merge(b); return r;
+      }
+
+      __forceinline size_t begin() const {
+        return object_range.begin();
+      }
+
+      __forceinline size_t end() const {
+        return object_range.end();
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return object_range.size(); 
+      }
+
+      __forceinline float halfArea() const {
+        return time_range.size()*expectedApproxHalfArea(geomBounds);
+      }
+
+      __forceinline float leafSAH() const { 
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); 
+      }
+      
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+
+      __forceinline float align_time(float ct) const
+      {
+        //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments);
+        float t0 = (ct-max_time_range.lower)/max_time_range.size();
+        float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments);
+        return t1*max_time_range.size()+max_time_range.lower;
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) 
+      {
+	return cout << "PrimInfo { " << 
+          "object_range = " << pinfo.object_range << 
+          ", time_range = " << pinfo.time_range << 
+          ", time_segments = " << pinfo.num_time_segments << 
+          ", geomBounds = " << pinfo.geomBounds << 
+          ", centBounds = " << pinfo.centBounds << 
+          "}";
+      }
+      
+    public:
+      range<size_t> object_range; //!< primitive range
+      size_t num_time_segments;  //!< total number of time segments of all added primrefs
+      size_t max_num_time_segments; //!< maximum number of time segments of a primitive
+      BBox1f max_time_range; //!< time range of primitive with max_num_time_segments
+      BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB
+    };
+
+    typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB;
+
+    struct SetMB : public PrimInfoMB
+    {
+      static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+      typedef mvector<PrimRefMB>* PrimRefVector;
+
+      __forceinline SetMB() {}
+
+       __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims)
+         : PrimInfoMB(pinfo_i), prims(prims) {}
+
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        object_range = object_range_in;
+        time_range = intersect(time_range,time_range_in);
+      }
+      
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        time_range = intersect(time_range,time_range_in);
+      }
+
+      void deterministic_order() const 
+      {
+        /* required as parallel partition destroys original primitive order */
+        PrimRefMB* prim = prims->data();
+        std::sort(&prim[object_range.begin()],&prim[object_range.end()]);
+      }
+
+      template<typename RecalculatePrimRef>
+      __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const
+      {
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      {
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      template<typename RecalculatePrimRef>
+        const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      {
+        auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB
+        {
+          PrimInfoMB pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space);
+            pinfo.add_primref(ref1);
+          };
+          return pinfo;
+        };
+        
+        const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, 
+                                                 PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2);
+
+        return SetMB(pinfo,prims,object_range,time_range);
+      }
+      
+    public:
+      PrimRefVector prims;
+    };
+//}
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
new file mode 100644
index 0000000000..e23de3df28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
@@ -0,0 +1,244 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primrefgen.h"
+#include "primrefgen_presplit.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelPrefixSumState<PrimInfo> pstate;
+      
+      /* first try */
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+          return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+          return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID);
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1)
+    {
+      ParallelForForPrefixSumState<PrimInfoMB> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB {
+          return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID);
+      }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+            return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID);
+        }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+      }
+
+      /* the BVH starts with that time range, even though primitives might have smaller/larger time range */
+      pinfo.time_range = t0t1;
+      return pinfo;
+    }
+
+    template<typename Mesh>
+    size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor)
+    {
+      size_t numPrimitives = morton.size();
+
+      /* compute scene bounds */
+      std::pair<size_t,BBox3fa> cb_empty(0,empty);
+      auto cb = parallel_reduce 
+        ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa>
+          {
+            size_t num = 0;
+            BBox3fa bounds = empty;
+            
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa prim_bounds = empty;
+              if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue;
+              bounds.extend(center2(prim_bounds));
+              num++;
+            }
+            return std::make_pair(num,bounds);
+          }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) {
+          return std::make_pair(a.first + b.first,merge(a.second,b.second)); 
+        });
+      
+      
+      size_t numPrimitivesGen = cb.first;
+      const BBox3fa centBounds = cb.second;
+      
+      /* compute morton codes */
+      if (likely(numPrimitivesGen == numPrimitives))
+      {
+        /* fast path if all primitives were valid */
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void {
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+              generator(mesh->bounds(j),unsigned(j));
+          });
+      }
+      else
+      {
+        /* slow path, fallback in case some primitives were invalid */
+        ParallelPrefixSumState<size_t> pstate;
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (unlikely(!mesh->buildBounds(j,&bounds))) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());
+        
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (!mesh->buildBounds(j,&bounds)) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());          
+      }
+      return numPrimitivesGen;
+    }
+
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    // template for grid meshes
+
+#if 0
+    template<>
+    PrimInfo createPrimRefArray<GridMesh,false>(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      PING;
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator<GridMesh,false> iter(scene);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k) -> PrimInfo
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!mesh->buildBounds(j,&bounds)) continue;
+          const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, const PrimInfo& base) -> PrimInfo
+        {
+          k = base.size();
+          PrimInfo pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            BBox3fa bounds = empty;
+            if (!mesh->buildBounds(j,&bounds)) continue;
+            const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+            pinfo.add_center2(prim);
+            prims[k++] = prim;
+          }
+          return pinfo;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+#endif
+
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
new file mode 100644
index 0000000000..9919c945c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
@@ -0,0 +1,28 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+#include "priminfo.h"
+#include "bvh_builder_morton.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+   
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+   
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0);
+
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f));
+
+    template<typename Mesh>
+      size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor);
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
new file mode 100644
index 0000000000..8bdb38b955
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../builders/primrefgen.h"
+#include "../builders/heuristic_spatial.h"
+#include "../builders/splitter.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define DBG_PRESPLIT(x)   
+#define CHECK_PRESPLIT(x) 
+
+#define GRID_SIZE 1024
+#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5
+#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG)
+#define PRIORITY_CUTOFF_THRESHOLD 1.0f
+#define PRIORITY_SPLIT_POS_WEIGHT 1.5f
+
+namespace embree
+{  
+  namespace isa
+  {
+
+    struct PresplitItem
+    {
+      union {
+        float priority;    
+        unsigned int data;
+      };
+      unsigned int index;
+      
+      __forceinline operator unsigned() const
+      {
+	return reinterpret_cast<const unsigned&>(priority);
+      }
+      __forceinline bool operator < (const PresplitItem& item) const
+      {
+	return (priority < item.priority);
+      }
+
+      template<typename Mesh>
+      __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc)
+      {
+	const unsigned int geomID = ref.geomID();
+	const unsigned int primID = ref.primID();
+	const float area_aabb  = area(ref.bounds());
+	const float area_prim  = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID);
+        const unsigned int diff = 31 - lzcnt(mc.x^mc.y);
+        assert(area_prim <= area_aabb);
+        //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f);   
+        const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) ));
+        assert(priority >= 0.0f && priority < FLT_LARGE);
+	return priority;      
+      }
+
+    
+    };
+
+    inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) {
+      return cout << "index " << item.index << " priority " << item.priority;    
+    };
+
+    template<typename SplitterFactory>    
+      void splitPrimitive(SplitterFactory &Splitter,
+                          const PrimRef &prim,
+                          const unsigned int geomID,
+                          const unsigned int primID,
+                          const unsigned int split_level,
+                          const Vec3fa &grid_base, 
+                          const float grid_scale,
+                          const float grid_extend,
+                          PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],
+                          unsigned int& numSubPrims)
+    {
+      assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+      if (split_level == 0)
+      {
+        assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+        subPrims[numSubPrims++] = prim;
+      }
+      else
+      {
+        const Vec3fa lower = prim.lower;
+        const Vec3fa upper = prim.upper;
+        const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
+        const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+        Vec3ia ilower(floor(glower));
+        Vec3ia iupper(floor(gupper));
+
+        /* this ignores dimensions that are empty */
+        iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)));
+
+        /* compute a morton code for the lower and upper grid coordinates. */
+        const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+        const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+			
+        /* if all bits are equal then we cannot split */
+        if(unlikely(lower_code == upper_code))
+        {
+          assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+          subPrims[numSubPrims++] = prim;
+          return;
+        }
+		    
+        /* compute octree level and dimension to perform the split in */
+        const unsigned int diff = 31 - lzcnt(lower_code^upper_code);
+        const unsigned int level = diff / 3;
+        const unsigned int dim   = diff % 3;
+      
+        /* now we compute the grid position of the split */
+        const unsigned int isplit = iupper[dim] & ~((1<<level)-1);
+			    
+        /* compute world space position of split */
+        const float inv_grid_size = 1.0f / GRID_SIZE;
+        const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend;
+
+        assert(prim.lower[dim] <= fsplit &&
+               prim.upper[dim] >= fsplit);
+		
+        /* split primitive */
+        const auto splitter = Splitter(prim);
+        BBox3fa left,right;
+        splitter(prim.bounds(),dim,fsplit,left,right);
+        assert(!left.empty());
+        assert(!right.empty());
+
+			    
+        splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+        splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+      }
+    }
+    
+    
+    template<typename Mesh, typename SplitterFactory>    
+      PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelPrefixSumState<PrimInfo> pstate;
+      
+      /* first try */
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+	  return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+	{
+	  progressMonitor(0);
+	  pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+	      return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+	}
+      return pinfo;	
+    }
+    
+    __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref)
+    {
+      const Vec3fa lower = ref.lower;
+      const Vec3fa upper = ref.upper;
+      const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
+      const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+      Vec3ia ilower(floor(glower));
+      Vec3ia iupper(floor(gupper));
+      
+      /* this ignores dimensions that are empty */
+      iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper));
+
+      /* compute a morton code for the lower and upper grid coordinates. */
+      const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+      const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+      return Vec2i(lower_code,upper_code);
+    }
+
+    template<typename Mesh, typename SplitterFactory>    
+      PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {	
+      static const size_t MIN_STEP_SIZE = 128;
+
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+	  return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+	}, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+	{
+	  progressMonitor(0);
+	  pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+	      return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+	    }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+	}
+
+      /* use correct number of primitives */
+      size_t numPrimitives = pinfo.size();
+      const size_t alloc_numPrimitives = prims.size(); 
+      const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives;
+
+      /* set up primitive splitter */
+      SplitterFactory Splitter(scene);
+
+
+      DBG_PRESPLIT(
+        const size_t org_numPrimitives = pinfo.size();
+        PRINT(numPrimitives);		
+        PRINT(alloc_numPrimitives);		
+        PRINT(numSplitPrimitivesBudget);
+        );
+
+      /* allocate double buffer presplit items */
+      const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives;
+      PresplitItem *presplitItem     = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+      PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+
+      /* compute grid */
+      const Vec3fa grid_base    = pinfo.geomBounds.lower;
+      const Vec3fa grid_diag    = pinfo.geomBounds.size();
+      const float grid_extend   = max(grid_diag.x,max(grid_diag.y,grid_diag.z));		
+      const float grid_scale    = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend;
+
+      /* init presplit items and get total sum */
+      const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float {
+          float sum = 0.0f;
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {		
+            presplitItem[i].index = (unsigned int)i;
+            const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]);
+            /* if all bits are equal then we cannot split */
+            presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f;    
+            /* FIXME: sum undeterministic */
+            sum += presplitItem[i].priority;
+          }
+          return sum;
+        },[](const float& a, const float& b) -> float { return a+b; });
+
+      /* compute number of splits per primitive */
+      const float inv_psum = 1.0f / psum;
+      parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {
+            if (presplitItem[i].priority > 0.0f)
+            {
+              const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum;
+              if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims
+              {
+                presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f);
+                //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+                assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              }
+              else
+                presplitItem[i].priority = 0.0f;
+            }
+          }
+        });
+
+      auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; };        
+      size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024);
+
+      /* anything to split ? */
+      if (center < numPrimitives)
+      {
+        const size_t numPrimitivesToSplit = numPrimitives - center;
+        assert(presplitItem[center].priority >= 1.0f);
+
+        /* sort presplit items in ascending order */
+        radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024);
+
+        CHECK_PRESPLIT(
+          parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+              for (size_t i=r.begin(); i<r.end(); i++)
+                assert(presplitItem[i-1].priority <= presplitItem[i].priority);
+            });
+          );
+
+        unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem;
+        unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
+
+        /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
+        const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
+            size_t sum = 0;
+            for (size_t i=t.begin(); i<t.end(); i++)
+            {	
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];	
+              assert(presplitItem[i].priority >= 1.0f);
+              const unsigned int  primrefID = presplitItem[i].index;	
+              const float prio              = presplitItem[i].priority;
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = (unsigned int)prio;
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              assert(numSubPrims);
+              numSubPrims--; // can reuse slot 
+              sum+=numSubPrims;
+              presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels;
+              primOffset0[i-center] = numSubPrims;
+            }
+            return sum;
+          },[](const size_t& a, const size_t& b) -> size_t { return a+b; });
+        
+        /* if we are over budget, need to shrink the range */
+        if (totalNumSubPrims > numSplitPrimitivesBudget) 
+        {
+          size_t new_center = numPrimitives-1;
+          size_t sum = 0;
+          for (;new_center>=center;new_center--)
+          {
+            const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG;
+            if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break;
+            sum += numSubPrims;
+          }
+          new_center++;
+          center = new_center;
+        }
+
+        /* parallel prefix sum to compute offsets for storing sub-primitives */
+        const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+
+        /* iterate over range, and split primitives into sub primitives and append them to prims array */		    
+        parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
+            for (size_t j=rn.begin(); j<rn.end(); j++)		    
+            {
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+              const unsigned int  primrefID = presplitItem[j].index;	
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1);
+
+              assert(split_levels);
+              assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              const size_t newID = numPrimitives + primOffset1[j-center];              
+              assert(newID+numSubPrims <= alloc_numPrimitives);
+              prims[primrefID] = subPrims[0];
+              for (size_t i=1;i<numSubPrims;i++)
+                prims[newID+i-1] = subPrims[i];
+            }
+          });
+
+        numPrimitives += offset;
+        DBG_PRESPLIT(
+          PRINT(pinfo.size());
+          PRINT(numPrimitives);
+          PRINT((float)numPrimitives/org_numPrimitives));                
+      }
+                
+      /* recompute centroid bounding boxes */
+      pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo {
+          PrimInfo p(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            p.add_center2(prims[j]);
+          return p;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+  
+      assert(pinfo.size() == numPrimitives);
+      
+      /* free double buffer presplit items */
+      alignedFree(tmp_presplitItem);		
+      alignedFree(presplitItem);
+      return pinfo;	
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/splitter.h b/thirdparty/embree-aarch64/kernels/builders/splitter.h
new file mode 100644
index 0000000000..dbd6cf07c7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/splitter.h
@@ -0,0 +1,169 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<size_t N>
+    __forceinline void splitPolygon(const BBox3fa& bounds, 
+                                    const size_t dim, 
+                                    const float pos, 
+                                    const Vec3fa (&v)[N+1],
+                                    const Vec3fa (&inv_length)[N],
+                                    BBox3fa& left_o, 
+                                    BBox3fa& right_o)
+    {
+      BBox3fa left = empty, right = empty;
+      /* clip triangle to left and right box by processing all edges */
+      for (size_t i=0; i<N; i++)
+      {
+        const Vec3fa &v0 = v[i]; 
+        const Vec3fa &v1 = v[i+1]; 
+        const float v0d = v0[dim];
+        const float v1d = v1[dim];
+        
+        if (v0d <= pos) left. extend(v0); // this point is on left side
+        if (v0d >= pos) right.extend(v0); // this point is on right side
+        
+        if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+        {
+          assert((v1d-v0d) != 0.0f);
+          const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0);
+          left.extend(c);
+          right.extend(c);
+        }
+      }
+      
+      /* clip against current bounds */
+      left_o  = intersect(left,bounds);
+      right_o = intersect(right,bounds);
+    }
+    
+    template<size_t N>
+      __forceinline void splitPolygon(const PrimRef& prim, 
+                                      const size_t dim, 
+                                      const float pos, 
+                                      const Vec3fa (&v)[N+1],
+                                      PrimRef& left_o, 
+                                      PrimRef& right_o)
+    {
+      BBox3fa left = empty, right = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const Vec3fa &v0 = v[i]; 
+        const Vec3fa &v1 = v[i+1]; 
+        const float v0d = v0[dim];
+        const float v1d = v1[dim];
+        
+        if (v0d <= pos) left. extend(v0); // this point is on left side
+        if (v0d >= pos) right.extend(v0); // this point is on right side
+        
+        if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+        {
+          assert((v1d-v0d) != 0.0f);
+          const float inv_length = 1.0f/(v1d-v0d);
+          const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0);
+          left.extend(c);
+          right.extend(c);
+        }
+      }
+      
+      /* clip against current bounds */
+      new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID());
+      new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID());
+    }
+    
+    struct TriangleSplitter
+    {
+      __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim)
+      {
+        const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+        const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask );  
+        TriangleMesh::Triangle tri = mesh->triangle(prim.primID());
+        v[0] = mesh->vertex(tri.v[0]);
+        v[1] = mesh->vertex(tri.v[1]);
+        v[2] = mesh->vertex(tri.v[2]);
+        v[3] = mesh->vertex(tri.v[0]);
+        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
+        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]);
+      }
+      
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,left_o,right_o);
+      }
+      
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+      
+    private:
+      Vec3fa v[4];
+      Vec3fa inv_length[3];
+    };
+    
+    struct TriangleSplitterFactory
+    {
+      __forceinline TriangleSplitterFactory(const Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline TriangleSplitter operator() (const PrimRef& prim) const {
+        return TriangleSplitter(scene,prim);
+      }
+      
+    private:
+      const Scene* scene;
+    };
+    
+    struct QuadSplitter
+    {
+      __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim)
+      {
+        const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+        const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask );  
+        QuadMesh::Quad quad = mesh->quad(prim.primID());
+        v[0] = mesh->vertex(quad.v[0]);
+        v[1] = mesh->vertex(quad.v[1]);
+        v[2] = mesh->vertex(quad.v[2]);
+        v[3] = mesh->vertex(quad.v[3]);
+        v[4] = mesh->vertex(quad.v[0]);
+        inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
+        inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+        inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]);
+        inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]);
+      }
+      
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
+      }
+      
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+      
+    private:
+      Vec3fa v[5];
+      Vec3fa inv_length[4];
+    };
+    
+    struct QuadSplitterFactory
+    {
+      __forceinline QuadSplitterFactory(const Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline QuadSplitter operator() (const PrimRef& prim) const {
+        return QuadSplitter(scene,prim);
+      }
+      
+    private:
+      const Scene* scene;
+    };
+  }
+}
+