summaryrefslogtreecommitdiff
path: root/thirdparty/embree-aarch64/kernels/builders
diff options
context:
space:
mode:
authorRĂ©mi Verschelde <remi@verschelde.fr>2021-04-27 19:07:12 +0200
committerGitHub <noreply@github.com>2021-04-27 19:07:12 +0200
commit95cfce661bbe9700209c09dfe297ab7ef5ebfe09 (patch)
tree9f3c8a98b619e2a0dd171d6cb6d8dc2e791baa14 /thirdparty/embree-aarch64/kernels/builders
parentb999fbc4bd349cce153c2133dd0487694add1a05 (diff)
parent4d9d99bb827967e2bb931eeb8c3f0e079b39ae1a (diff)
Merge pull request #48050 from JFonS/occlusion_culling
Diffstat (limited to 'thirdparty/embree-aarch64/kernels/builders')
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h411
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h501
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h692
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h526
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h669
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h972
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h205
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h302
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h443
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h414
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h552
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h188
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h237
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/priminfo.h362
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp244
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/primrefgen.h28
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h371
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/splitter.h169
18 files changed, 7286 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
new file mode 100644
index 0000000000..755ce255fb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
@@ -0,0 +1,411 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_strand_array.h"
+
+#define NUM_HAIR_OBJECT_BINS 32
+
+namespace embree
+{
+ namespace isa
+ {
+ struct BVHBuilderHair
+ {
+ /*! settings for builder */
+ struct Settings
+ {
+ /*! default settings */
+ Settings ()
+ : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {}
+
+ public:
+ size_t branchingFactor; //!< branching factor of BVH to build
+ size_t maxDepth; //!< maximum depth of BVH to build
+ size_t logBlockSize; //!< log2 of blocksize for SAH heuristic
+ size_t minLeafSize; //!< minimum size of a leaf
+ size_t maxLeafSize; //!< maximum size of a leaf
+ size_t finished_range_threshold; //!< finished range threshold
+ };
+
+ template<typename NodeRef,
+ typename CreateAllocFunc,
+ typename CreateAABBNodeFunc,
+ typename SetAABBNodeFunc,
+ typename CreateOBBNodeFunc,
+ typename SetOBBNodeFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitor,
+ typename ReportFinishedRangeFunc>
+
+ class BuilderT
+ {
+ ALIGNED_CLASS_(16);
+ friend struct BVHBuilderHair;
+
+ typedef FastAllocator::CachedAllocator Allocator;
+ typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH;
+ typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH;
+ typedef HeuristicStrandSplit HeuristicStrandSplitSAH;
+
+ static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
+ static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth
+ static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build
+
+ static const size_t travCostAligned = 1;
+ static const size_t travCostUnaligned = 5;
+ static const size_t intCost = 6;
+
+ BuilderT (Scene* scene,
+ PrimRef* prims,
+ const CreateAllocFunc& createAlloc,
+ const CreateAABBNodeFunc& createAABBNode,
+ const SetAABBNodeFunc& setAABBNode,
+ const CreateOBBNodeFunc& createOBBNode,
+ const SetOBBNodeFunc& setOBBNode,
+ const CreateLeafFunc& createLeaf,
+ const ProgressMonitor& progressMonitor,
+ const ReportFinishedRangeFunc& reportFinishedRange,
+ const Settings settings)
+
+ : cfg(settings),
+ prims(prims),
+ createAlloc(createAlloc),
+ createAABBNode(createAABBNode),
+ setAABBNode(setAABBNode),
+ createOBBNode(createOBBNode),
+ setOBBNode(setOBBNode),
+ createLeaf(createLeaf),
+ progressMonitor(progressMonitor),
+ reportFinishedRange(reportFinishedRange),
+ alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {}
+
+ /*! checks if all primitives are from the same geometry */
+ __forceinline bool sameGeometry(const PrimInfoRange& range)
+ {
+ if (range.size() == 0) return true;
+ unsigned int firstGeomID = prims[range.begin()].geomID();
+ for (size_t i=range.begin()+1; i<range.end(); i++) {
+ if (prims[i].geomID() != firstGeomID){
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /*! creates a large leaf that could be larger than supported by the BVH */
+ NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc)
+ {
+ /* this should never occur but is a fatal error */
+ if (depth > cfg.maxDepth)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+ /* create leaf for few primitives */
+ if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo))
+ return createLeaf(prims,pinfo,alloc);
+
+ /* fill all children by always splitting the largest one */
+ PrimInfoRange children[MAX_BRANCHING_FACTOR];
+ unsigned numChildren = 1;
+ children[0] = pinfo;
+
+ do {
+
+ /* find best child with largest bounding box area */
+ int bestChild = -1;
+ size_t bestSize = 0;
+ for (unsigned i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i]))
+ continue;
+
+ /* remember child with largest size */
+ if (children[i].size() > bestSize) {
+ bestSize = children[i].size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1) break;
+
+ /*! split best child into left and right child */
+ __aligned(64) PrimInfoRange left, right;
+ if (!sameGeometry(children[bestChild])) {
+ alignedHeuristic.splitByGeometry(children[bestChild],left,right);
+ } else {
+ alignedHeuristic.splitFallback(children[bestChild],left,right);
+ }
+
+ /* add new children left and right */
+ children[bestChild] = children[numChildren-1];
+ children[numChildren-1] = left;
+ children[numChildren+0] = right;
+ numChildren++;
+
+ } while (numChildren < cfg.branchingFactor);
+
+ /* create node */
+ auto node = createAABBNode(alloc);
+
+ for (size_t i=0; i<numChildren; i++) {
+ const NodeRef child = createLargeLeaf(depth+1,children[i],alloc);
+ setAABBNode(node,i,child,children[i].geomBounds);
+ }
+
+ return node;
+ }
+
+ /*! performs split */
+ __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack
+ {
+ /* variable to track the SAH of the best splitting approach */
+ float bestSAH = inf;
+ const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize;
+ const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds);
+
+ /* try standard binning in aligned space */
+ float alignedObjectSAH = inf;
+ HeuristicBinningSAH::Split alignedObjectSplit;
+ if (aligned) {
+ alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize);
+ alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH();
+ bestSAH = min(alignedObjectSAH,bestSAH);
+ }
+
+ /* try standard binning in unaligned space */
+ UnalignedHeuristicBinningSAH::Split unalignedObjectSplit;
+ LinearSpace3fa uspace;
+ float unalignedObjectSAH = inf;
+ if (bestSAH > 0.7f*leafSAH) {
+ uspace = unalignedHeuristic.computeAlignedSpace(pinfo);
+ const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace);
+ unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace);
+ unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH();
+ bestSAH = min(unalignedObjectSAH,bestSAH);
+ }
+
+ /* try splitting into two strands */
+ HeuristicStrandSplitSAH::Split strandSplit;
+ float strandSAH = inf;
+ if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) {
+ strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize);
+ strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH();
+ bestSAH = min(strandSAH,bestSAH);
+ }
+
+ /* fallback if SAH heuristics failed */
+ if (unlikely(!std::isfinite(bestSAH)))
+ {
+ alignedHeuristic.deterministic_order(pinfo);
+ alignedHeuristic.splitFallback(pinfo,linfo,rinfo);
+ }
+
+ /* perform aligned split if this is best */
+ else if (bestSAH == alignedObjectSAH) {
+ alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo);
+ }
+
+ /* perform unaligned split if this is best */
+ else if (bestSAH == unalignedObjectSAH) {
+ unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo);
+ aligned = false;
+ }
+
+ /* perform strand split if this is best */
+ else if (bestSAH == strandSAH) {
+ strandHeuristic.split(strandSplit,pinfo,linfo,rinfo);
+ aligned = false;
+ }
+
+ /* can never happen */
+ else
+ assert(false);
+ }
+
+ /*! recursive build */
+ NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier)
+ {
+ /* get thread local allocator */
+ if (!alloc)
+ alloc = createAlloc();
+
+ /* call memory monitor function to signal progress */
+ if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD)
+ progressMonitor(pinfo.size());
+
+ PrimInfoRange children[MAX_BRANCHING_FACTOR];
+
+ /* create leaf node */
+ if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) {
+ alignedHeuristic.deterministic_order(pinfo);
+ return createLargeLeaf(depth,pinfo,alloc);
+ }
+
+ /* fill all children by always splitting the one with the largest surface area */
+ size_t numChildren = 1;
+ children[0] = pinfo;
+ bool aligned = true;
+
+ do {
+
+ /* find best child with largest bounding box area */
+ ssize_t bestChild = -1;
+ float bestArea = neg_inf;
+ for (size_t i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= cfg.minLeafSize)
+ continue;
+
+ /* remember child with largest area */
+ if (area(children[i].geomBounds) > bestArea) {
+ bestArea = area(children[i].geomBounds);
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1) break;
+
+ /*! split best child into left and right child */
+ PrimInfoRange left, right;
+ split(children[bestChild],left,right,aligned);
+
+ /* add new children left and right */
+ children[bestChild] = children[numChildren-1];
+ children[numChildren-1] = left;
+ children[numChildren+0] = right;
+ numChildren++;
+
+ } while (numChildren < cfg.branchingFactor);
+
+ NodeRef node;
+
+ /* create aligned node */
+ if (aligned)
+ {
+ node = createAABBNode(alloc);
+
+ /* spawn tasks or ... */
+ if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+ {
+ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+ setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+ }
+ /* ... continue sequentially */
+ else {
+ for (size_t i=0; i<numChildren; i++) {
+ const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+ setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds);
+ }
+ }
+ }
+
+ /* create unaligned node */
+ else
+ {
+ node = createOBBNode(alloc);
+
+ /* spawn tasks or ... */
+ if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+ {
+ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+ const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+ const OBBox3fa obounds(space,sinfo.geomBounds);
+ const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+ setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+ }
+ /* ... continue sequentially */
+ else
+ {
+ for (size_t i=0; i<numChildren; i++) {
+ const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+ const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+ const OBBox3fa obounds(space,sinfo.geomBounds);
+ const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+ setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds);
+ }
+ }
+ }
+
+ /* reports a finished range of primrefs */
+ if (unlikely(alloc_barrier))
+ reportFinishedRange(pinfo);
+
+ return node;
+ }
+
+ private:
+ Settings cfg;
+ PrimRef* prims;
+ const CreateAllocFunc& createAlloc;
+ const CreateAABBNodeFunc& createAABBNode;
+ const SetAABBNodeFunc& setAABBNode;
+ const CreateOBBNodeFunc& createOBBNode;
+ const SetOBBNodeFunc& setOBBNode;
+ const CreateLeafFunc& createLeaf;
+ const ProgressMonitor& progressMonitor;
+ const ReportFinishedRangeFunc& reportFinishedRange;
+
+ private:
+ HeuristicBinningSAH alignedHeuristic;
+ UnalignedHeuristicBinningSAH unalignedHeuristic;
+ HeuristicStrandSplitSAH strandHeuristic;
+ };
+
+ template<typename NodeRef,
+ typename CreateAllocFunc,
+ typename CreateAABBNodeFunc,
+ typename SetAABBNodeFunc,
+ typename CreateOBBNodeFunc,
+ typename SetOBBNodeFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitor,
+ typename ReportFinishedRangeFunc>
+
+ static NodeRef build (const CreateAllocFunc& createAlloc,
+ const CreateAABBNodeFunc& createAABBNode,
+ const SetAABBNodeFunc& setAABBNode,
+ const CreateOBBNodeFunc& createOBBNode,
+ const SetOBBNodeFunc& setOBBNode,
+ const CreateLeafFunc& createLeaf,
+ const ProgressMonitor& progressMonitor,
+ const ReportFinishedRangeFunc& reportFinishedRange,
+ Scene* scene,
+ PrimRef* prims,
+ const PrimInfo& pinfo,
+ const Settings settings)
+ {
+ typedef BuilderT<NodeRef,
+ CreateAllocFunc,
+ CreateAABBNodeFunc,SetAABBNodeFunc,
+ CreateOBBNodeFunc,SetOBBNodeFunc,
+ CreateLeafFunc,ProgressMonitor,
+ ReportFinishedRangeFunc> Builder;
+
+ Builder builder(scene,prims,createAlloc,
+ createAABBNode,setAABBNode,
+ createOBBNode,setOBBNode,
+ createLeaf,progressMonitor,reportFinishedRange,settings);
+
+ NodeRef root = builder.recurse(1,pinfo,nullptr,true,false);
+ _mm_mfence(); // to allow non-temporal stores during build
+ return root;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
new file mode 100644
index 0000000000..92be2f7e65
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
@@ -0,0 +1,501 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/builder.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct BVHBuilderMorton
+ {
+ static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
+ static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth
+
+ /*! settings for morton builder */
+ struct Settings
+ {
+ /*! default settings */
+ Settings ()
+ : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
+
+ /*! initialize settings from API settings */
+ Settings (const RTCBuildArguments& settings)
+ : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
+ {
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
+
+ minLeafSize = min(minLeafSize,maxLeafSize);
+ }
+
+ Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
+ : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
+ {
+ minLeafSize = min(minLeafSize,maxLeafSize);
+ }
+
+ public:
+ size_t branchingFactor; //!< branching factor of BVH to build
+ size_t maxDepth; //!< maximum depth of BVH to build
+ size_t minLeafSize; //!< minimum size of a leaf
+ size_t maxLeafSize; //!< maximum size of a leaf
+ size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+ };
+
+ /*! Build primitive consisting of morton code and primitive ID. */
+ struct __aligned(8) BuildPrim
+ {
+ union {
+ struct {
+ unsigned int code; //!< morton code
+ unsigned int index; //!< i'th primitive
+ };
+ uint64_t t;
+ };
+
+ /*! interface for radix sort */
+ __forceinline operator unsigned() const { return code; }
+
+ /*! interface for standard sort */
+ __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
+ };
+
+ /*! maps bounding box to morton code */
+ struct MortonCodeMapping
+ {
+ static const size_t LATTICE_BITS_PER_DIM = 10;
+ static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
+
+ vfloat4 base;
+ vfloat4 scale;
+
+ __forceinline MortonCodeMapping(const BBox3fa& bounds)
+ {
+ base = (vfloat4)bounds.lower;
+ const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
+ scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
+ }
+
+ __forceinline const vint4 bin (const BBox3fa& box) const
+ {
+ const vfloat4 lower = (vfloat4)box.lower;
+ const vfloat4 upper = (vfloat4)box.upper;
+ const vfloat4 centroid = lower+upper;
+ return vint4((centroid-base)*scale);
+ }
+
+ __forceinline unsigned int code (const BBox3fa& box) const
+ {
+ const vint4 binID = bin(box);
+ const unsigned int x = extract<0>(binID);
+ const unsigned int y = extract<1>(binID);
+ const unsigned int z = extract<2>(binID);
+ const unsigned int xyz = bitInterleave(x,y,z);
+ return xyz;
+ }
+ };
+
+#if defined (__AVX2__)
+
+ /*! for AVX2 there is a fast scalar bitInterleave */
+ struct MortonCodeGenerator
+ {
+ __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+ : mapping(mapping), dest(dest) {}
+
+ __forceinline void operator() (const BBox3fa& b, const unsigned index)
+ {
+ dest->index = index;
+ dest->code = mapping.code(b);
+ dest++;
+ }
+
+ public:
+ const MortonCodeMapping mapping;
+ BuildPrim* dest;
+ size_t currentID;
+ };
+
+#else
+
+ /*! before AVX2 is it better to use the SSE version of bitInterleave */
+ struct MortonCodeGenerator
+ {
+ __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+ : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
+
+ __forceinline ~MortonCodeGenerator()
+ {
+ if (slots != 0)
+ {
+ const vint4 code = bitInterleave(ax,ay,az);
+ for (size_t i=0; i<slots; i++) {
+ dest[currentID-slots+i].index = ai[i];
+ dest[currentID-slots+i].code = code[i];
+ }
+ }
+ }
+
+ __forceinline void operator() (const BBox3fa& b, const unsigned index)
+ {
+ const vint4 binID = mapping.bin(b);
+ ax[slots] = extract<0>(binID);
+ ay[slots] = extract<1>(binID);
+ az[slots] = extract<2>(binID);
+ ai[slots] = index;
+ slots++;
+ currentID++;
+
+ if (slots == 4)
+ {
+ const vint4 code = bitInterleave(ax,ay,az);
+ vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
+ vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
+ slots = 0;
+ }
+ }
+
+ public:
+ const MortonCodeMapping mapping;
+ BuildPrim* dest;
+ size_t currentID;
+ size_t slots;
+ vint4 ax, ay, az, ai;
+ };
+
+#endif
+
+ template<
+ typename ReductionTy,
+ typename Allocator,
+ typename CreateAllocator,
+ typename CreateNodeFunc,
+ typename SetNodeBoundsFunc,
+ typename CreateLeafFunc,
+ typename CalculateBounds,
+ typename ProgressMonitor>
+
+ class BuilderT : private Settings
+ {
+ ALIGNED_CLASS_(16);
+
+ public:
+
+ BuilderT (CreateAllocator& createAllocator,
+ CreateNodeFunc& createNode,
+ SetNodeBoundsFunc& setBounds,
+ CreateLeafFunc& createLeaf,
+ CalculateBounds& calculateBounds,
+ ProgressMonitor& progressMonitor,
+ const Settings& settings)
+
+ : Settings(settings),
+ createAllocator(createAllocator),
+ createNode(createNode),
+ setBounds(setBounds),
+ createLeaf(createLeaf),
+ calculateBounds(calculateBounds),
+ progressMonitor(progressMonitor),
+ morton(nullptr) {}
+
+ ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
+ {
+ /* this should never occur but is a fatal error */
+ if (depth > maxDepth)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+ /* create leaf for few primitives */
+ if (current.size() <= maxLeafSize)
+ return createLeaf(current,alloc);
+
+ /* fill all children by always splitting the largest one */
+ range<unsigned> children[MAX_BRANCHING_FACTOR];
+ size_t numChildren = 1;
+ children[0] = current;
+
+ do {
+
+ /* find best child with largest number of primitives */
+ size_t bestChild = -1;
+ size_t bestSize = 0;
+ for (size_t i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= maxLeafSize)
+ continue;
+
+ /* remember child with largest size */
+ if (children[i].size() > bestSize) {
+ bestSize = children[i].size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == size_t(-1)) break;
+
+ /*! split best child into left and right child */
+ auto split = children[bestChild].split();
+
+ /* add new children left and right */
+ children[bestChild] = children[numChildren-1];
+ children[numChildren-1] = split.first;
+ children[numChildren+0] = split.second;
+ numChildren++;
+
+ } while (numChildren < branchingFactor);
+
+ /* create node */
+ auto node = createNode(alloc,numChildren);
+
+ /* recurse into each child */
+ ReductionTy bounds[MAX_BRANCHING_FACTOR];
+ for (size_t i=0; i<numChildren; i++)
+ bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
+
+ return setBounds(node,bounds,numChildren);
+ }
+
+ /*! recreates morton codes when reaching a region where all codes are identical */
+ __noinline void recreateMortonCodes(const range<unsigned>& current) const
+ {
+ /* fast path for small ranges */
+ if (likely(current.size() < 1024))
+ {
+ /*! recalculate centroid bounds */
+ BBox3fa centBounds(empty);
+ for (size_t i=current.begin(); i<current.end(); i++)
+ centBounds.extend(center2(calculateBounds(morton[i])));
+
+ /* recalculate morton codes */
+ MortonCodeMapping mapping(centBounds);
+ for (size_t i=current.begin(); i<current.end(); i++)
+ morton[i].code = mapping.code(calculateBounds(morton[i]));
+
+ /* sort morton codes */
+ std::sort(morton+current.begin(),morton+current.end());
+ }
+ else
+ {
+ /*! recalculate centroid bounds */
+ auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
+ BBox3fa centBounds = empty;
+ for (size_t i=r.begin(); i<r.end(); i++)
+ centBounds.extend(center2(calculateBounds(morton[i])));
+ return centBounds;
+ };
+ const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
+ BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
+
+ /* recalculate morton codes */
+ MortonCodeMapping mapping(centBounds);
+ parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ morton[i].code = mapping.code(calculateBounds(morton[i]));
+ }
+ });
+
+ /*! sort morton codes */
+#if defined(TASKING_TBB)
+ tbb::parallel_sort(morton+current.begin(),morton+current.end());
+#else
+ radixsort32(morton+current.begin(),current.size());
+#endif
+ }
+ }
+
+ __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
+ {
+ const unsigned int code_start = morton[current.begin()].code;
+ const unsigned int code_end = morton[current.end()-1].code;
+ unsigned int bitpos = lzcnt(code_start^code_end);
+
+ /* if all items mapped to same morton code, then re-create new morton codes for the items */
+ if (unlikely(bitpos == 32))
+ {
+ recreateMortonCodes(current);
+ const unsigned int code_start = morton[current.begin()].code;
+ const unsigned int code_end = morton[current.end()-1].code;
+ bitpos = lzcnt(code_start^code_end);
+
+ /* if the morton code is still the same, goto fall back split */
+ if (unlikely(bitpos == 32)) {
+ current.split(left,right);
+ return;
+ }
+ }
+
+ /* split the items at the topmost different morton code bit */
+ const unsigned int bitpos_diff = 31-bitpos;
+ const unsigned int bitmask = 1 << bitpos_diff;
+
+ /* find location where bit differs using binary search */
+ unsigned begin = current.begin();
+ unsigned end = current.end();
+ while (begin + 1 != end) {
+ const unsigned mid = (begin+end)/2;
+ const unsigned bit = morton[mid].code & bitmask;
+ if (bit == 0) begin = mid; else end = mid;
+ }
+ unsigned center = end;
+#if defined(DEBUG)
+ for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0);
+ for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);
+#endif
+
+ left = make_range(current.begin(),center);
+ right = make_range(center,current.end());
+ }
+
+ ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
+ {
+ /* get thread local allocator */
+ if (!alloc)
+ alloc = createAllocator();
+
+ /* call memory monitor function to signal progress */
+ if (toplevel && current.size() <= singleThreadThreshold)
+ progressMonitor(current.size());
+
+ /* create leaf node */
+ if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
+ return createLargeLeaf(depth,current,alloc);
+
+ /* fill all children by always splitting the one with the largest surface area */
+ range<unsigned> children[MAX_BRANCHING_FACTOR];
+ split(current,children[0],children[1]);
+ size_t numChildren = 2;
+
+ while (numChildren < branchingFactor)
+ {
+ /* find best child with largest number of primitives */
+ int bestChild = -1;
+ unsigned bestItems = 0;
+ for (unsigned int i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= minLeafSize)
+ continue;
+
+ /* remember child with largest area */
+ if (children[i].size() > bestItems) {
+ bestItems = children[i].size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1) break;
+
+ /*! split best child into left and right child */
+ range<unsigned> left, right;
+ split(children[bestChild],left,right);
+
+ /* add new children left and right */
+ children[bestChild] = children[numChildren-1];
+ children[numChildren-1] = left;
+ children[numChildren+0] = right;
+ numChildren++;
+ }
+
+ /* create leaf node if no split is possible */
+ if (unlikely(numChildren == 1))
+ return createLeaf(current,alloc);
+
+ /* allocate node */
+ auto node = createNode(alloc,numChildren);
+
+ /* process top parts of tree parallel */
+ ReductionTy bounds[MAX_BRANCHING_FACTOR];
+ if (current.size() > singleThreadThreshold)
+ {
+ /*! parallel_for is faster than spawing sub-tasks */
+ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ bounds[i] = recurse(depth+1,children[i],nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+ }
+
+ /* finish tree sequentially */
+ else
+ {
+ for (size_t i=0; i<numChildren; i++)
+ bounds[i] = recurse(depth+1,children[i],alloc,false);
+ }
+
+ return setBounds(node,bounds,numChildren);
+ }
+
+ /* build function */
+ ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
+ {
+ /* sort morton codes */
+ morton = src;
+ radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
+
+ /* build BVH */
+ const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ return root;
+ }
+
+ public:
+ CreateAllocator& createAllocator;
+ CreateNodeFunc& createNode;
+ SetNodeBoundsFunc& setBounds;
+ CreateLeafFunc& createLeaf;
+ CalculateBounds& calculateBounds;
+ ProgressMonitor& progressMonitor;
+
+ public:
+ BuildPrim* morton;
+ };
+
+
+ template<
+ typename ReductionTy,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename SetBoundsFunc,
+ typename CreateLeafFunc,
+ typename CalculateBoundsFunc,
+ typename ProgressMonitor>
+
+ static ReductionTy build(CreateAllocFunc createAllocator,
+ CreateNodeFunc createNode,
+ SetBoundsFunc setBounds,
+ CreateLeafFunc createLeaf,
+ CalculateBoundsFunc calculateBounds,
+ ProgressMonitor progressMonitor,
+ BuildPrim* src,
+ BuildPrim* tmp,
+ size_t numPrimitives,
+ const Settings& settings)
+ {
+ typedef BuilderT<
+ ReductionTy,
+ decltype(createAllocator()),
+ CreateAllocFunc,
+ CreateNodeFunc,
+ SetBoundsFunc,
+ CreateLeafFunc,
+ CalculateBoundsFunc,
+ ProgressMonitor> Builder;
+
+ Builder builder(createAllocator,
+ createNode,
+ setBounds,
+ createLeaf,
+ calculateBounds,
+ progressMonitor,
+ settings);
+
+ return builder.build(src,tmp,numPrimitives);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
new file mode 100644
index 0000000000..4c138dacdb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
@@ -0,0 +1,692 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define MBLUR_NUM_TEMPORAL_BINS 2
+#define MBLUR_NUM_OBJECT_BINS 32
+
+#include "../bvh/bvh.h"
+#include "../common/primref_mb.h"
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_timesplit_array.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<typename T>
+ struct SharedVector
+ {
+ __forceinline SharedVector() {}
+
+ __forceinline SharedVector(T* ptr, size_t refCount = 1)
+ : prims(ptr), refCount(refCount) {}
+
+ __forceinline void incRef() {
+ refCount++;
+ }
+
+ __forceinline void decRef()
+ {
+ if (--refCount == 0)
+ delete prims;
+ }
+
+ T* prims;
+ size_t refCount;
+ };
+
+ template<typename BuildRecord, int MAX_BRANCHING_FACTOR>
+ struct LocalChildListT
+ {
+ typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+
+ __forceinline LocalChildListT (const BuildRecord& record)
+ : numChildren(1), numSharedPrimVecs(1)
+ {
+ /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */
+ children[0] = record;
+ primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2);
+ }
+
+ __forceinline ~LocalChildListT()
+ {
+ for (size_t i = 0; i < numChildren; i++)
+ primvecs[i]->decRef();
+ }
+
+ __forceinline BuildRecord& operator[] ( const size_t i ) {
+ return children[i];
+ }
+
+ __forceinline size_t size() const {
+ return numChildren;
+ }
+
+ __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector)
+ {
+ SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild];
+ if (lrecord.prims.prims == bsharedPrimVec->prims) {
+ primvecs[bestChild] = bsharedPrimVec;
+ bsharedPrimVec->incRef();
+ }
+ else {
+ primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims);
+ }
+
+ if (rrecord.prims.prims == bsharedPrimVec->prims) {
+ primvecs[numChildren] = bsharedPrimVec;
+ bsharedPrimVec->incRef();
+ }
+ else {
+ primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims);
+ }
+ bsharedPrimVec->decRef();
+ new_vector.release();
+
+ children[bestChild] = lrecord;
+ children[numChildren] = rrecord;
+ numChildren++;
+ }
+
+ public:
+ array_t<BuildRecord,MAX_BRANCHING_FACTOR> children;
+ array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs;
+ size_t numChildren;
+
+ array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs;
+ size_t numSharedPrimVecs;
+ };
+
+ template<typename Mesh>
+ struct RecalculatePrimRef
+ {
+ Scene* scene;
+
+ __forceinline RecalculatePrimRef (Scene* scene)
+ : scene(scene) {}
+
+ __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+ {
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const Mesh* mesh = scene->get<Mesh>(geomID);
+ const LBBox3fa lbounds = mesh->linearBounds(primID, time_range);
+ const range<int> tbounds = mesh->timeSegmentRange(time_range);
+ return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+ }
+
+ // __noinline is workaround for ICC16 bug under MacOSX
+ __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+ {
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const Mesh* mesh = scene->get<Mesh>(geomID);
+ const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range);
+ const range<int> tbounds = mesh->timeSegmentRange(time_range);
+ return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+ }
+
+ __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+ return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range);
+ }
+
+ // __noinline is workaround for ICC16 bug under MacOSX
+ __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+ return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range);
+ }
+ };
+
+ struct VirtualRecalculatePrimRef
+ {
+ Scene* scene;
+
+ __forceinline VirtualRecalculatePrimRef (Scene* scene)
+ : scene(scene) {}
+
+ __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+ {
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const Geometry* mesh = scene->get(geomID);
+ const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range);
+ const range<int> tbounds = mesh->timeSegmentRange(time_range);
+ return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+ }
+
+ __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+ {
+ const unsigned geomID = prim.geomID();
+ const unsigned primID = prim.primID();
+ const Geometry* mesh = scene->get(geomID);
+ const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range);
+ const range<int> tbounds = mesh->timeSegmentRange(time_range);
+ return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+ }
+
+ __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+ return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range);
+ }
+
+ __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+ return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range);
+ }
+ };
+
+ struct BVHBuilderMSMBlur
+ {
+ /*! settings for msmblur builder */
+ struct Settings
+ {
+ /*! default settings */
+ Settings ()
+ : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8),
+ travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false),
+ singleThreadThreshold(1024) {}
+
+
+ Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold)
+ : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+ travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold)
+ {
+ minLeafSize = min(minLeafSize,maxLeafSize);
+ }
+
+ public:
+ size_t branchingFactor; //!< branching factor of BVH to build
+ size_t maxDepth; //!< maximum depth of BVH to build
+ size_t logBlockSize; //!< log2 of blocksize for SAH heuristic
+ size_t minLeafSize; //!< minimum size of a leaf
+ size_t maxLeafSize; //!< maximum size of a leaf
+ float travCost; //!< estimated cost of one traversal step
+ float intCost; //!< estimated cost of one primitive intersection
+ bool singleLeafTimeSegment; //!< split time to single time range
+ size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+ };
+
+ struct BuildRecord
+ {
+ public:
+ __forceinline BuildRecord () {}
+
+ __forceinline BuildRecord (size_t depth)
+ : depth(depth) {}
+
+ __forceinline BuildRecord (const SetMB& prims, size_t depth)
+ : depth(depth), prims(prims) {}
+
+ __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) {
+ return a.prims.size() < b.prims.size();
+ }
+
+ __forceinline size_t size() const {
+ return prims.size();
+ }
+
+ public:
+ size_t depth; //!< Depth of the root of this subtree.
+ SetMB prims; //!< The list of primitives.
+ };
+
+ struct BuildRecordSplit : public BuildRecord
+ {
+ __forceinline BuildRecordSplit () {}
+
+ __forceinline BuildRecordSplit (size_t depth)
+ : BuildRecord(depth) {}
+
+ __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split)
+ : BuildRecord(record), split(split) {}
+
+ BinSplit<MBLUR_NUM_OBJECT_BINS> split;
+ };
+
+ template<
+ typename NodeRef,
+ typename RecalculatePrimRef,
+ typename Allocator,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename SetNodeFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitor>
+
+ class BuilderT
+ {
+ ALIGNED_CLASS_(16);
+ static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor
+ static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth
+
+ typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+ typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+ typedef mvector<PrimRefMB>* PrimRefVector;
+ typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+ typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+ typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit;
+
+ public:
+
+ BuilderT (MemoryMonitorInterface* device,
+ const RecalculatePrimRef recalculatePrimRef,
+ const CreateAllocFunc createAlloc,
+ const CreateNodeFunc createNode,
+ const SetNodeFunc setNode,
+ const CreateLeafFunc createLeaf,
+ const ProgressMonitor progressMonitor,
+ const Settings& settings)
+ : cfg(settings),
+ heuristicObjectSplit(),
+ heuristicTemporalSplit(device, recalculatePrimRef),
+ recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf),
+ progressMonitor(progressMonitor)
+ {
+ if (cfg.branchingFactor > MAX_BRANCHING_FACTOR)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+ }
+
+ /*! finds the best split */
+ const Split find(const SetMB& set)
+ {
+ /* first try standard object split */
+ const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize);
+ const float object_split_sah = object_split.splitSAH();
+
+ /* test temporal splits only when object split was bad */
+ const float leaf_sah = set.leafSAH(cfg.logBlockSize);
+ if (object_split_sah < 0.50f*leaf_sah)
+ return object_split;
+
+ /* do temporal splits only if the the time range is big enough */
+ if (set.time_range.size() > 1.01f/float(set.max_num_time_segments))
+ {
+ const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize);
+ const float temporal_split_sah = temporal_split.splitSAH();
+
+ /* take temporal split if it improved SAH */
+ if (temporal_split_sah < object_split_sah)
+ return temporal_split;
+ }
+
+ return object_split;
+ }
+
+ /*! array partitioning */
+ __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ /* perform object split */
+ if (likely(split.data == Split::SPLIT_OBJECT)) {
+ heuristicObjectSplit.split(split,set,lset,rset);
+ }
+ /* perform temporal split */
+ else if (likely(split.data == Split::SPLIT_TEMPORAL)) {
+ return heuristicTemporalSplit.split(split,set,lset,rset);
+ }
+ /* perform fallback split */
+ else if (unlikely(split.data == Split::SPLIT_FALLBACK)) {
+ set.deterministic_order();
+ splitFallback(set,lset,rset);
+ }
+ /* split by geometry */
+ else if (unlikely(split.data == Split::SPLIT_GEOMID)) {
+ set.deterministic_order();
+ splitByGeometry(set,lset,rset);
+ }
+ else
+ assert(false);
+
+ return std::unique_ptr<mvector<PrimRefMB>>();
+ }
+
+ /*! finds the best fallback split */
+ __noinline Split findFallback(const SetMB& set)
+ {
+ /* split if primitives are not from same geometry */
+ if (!sameGeometry(set))
+ return Split(0.0f,Split::SPLIT_GEOMID);
+
+ /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */
+ if (cfg.singleLeafTimeSegment)
+ {
+ /* test if one primitive has more than one time segment in time range, if so split time */
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ const PrimRefMB& prim = (*set.prims)[i];
+ const range<int> itime_range = prim.timeSegmentRange(set.time_range);
+ const int localTimeSegments = itime_range.size();
+ assert(localTimeSegments > 0);
+ if (localTimeSegments > 1) {
+ const int icenter = (itime_range.begin() + itime_range.end())/2;
+ const float splitTime = prim.timeStep(icenter);
+ return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime);
+ }
+ }
+ }
+
+ /* otherwise return fallback split */
+ return Split(0.0f,Split::SPLIT_FALLBACK);
+ }
+
+ /*! performs fallback split */
+ void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ mvector<PrimRefMB>& prims = *set.prims;
+
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ const size_t center = (begin + end)/2;
+
+ PrimInfoMB linfo = empty;
+ for (size_t i=begin; i<center; i++)
+ linfo.add_primref(prims[i]);
+
+ PrimInfoMB rinfo = empty;
+ for (size_t i=center; i<end; i++)
+ rinfo.add_primref(prims[i]);
+
+ new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+ new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range);
+ }
+
+ /*! checks if all primitives are from the same geometry */
+ __forceinline bool sameGeometry(const SetMB& set)
+ {
+ if (set.size() == 0) return true;
+ mvector<PrimRefMB>& prims = *set.prims;
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ unsigned int firstGeomID = prims[begin].geomID();
+ for (size_t i=begin+1; i<end; i++) {
+ if (prims[i].geomID() != firstGeomID){
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /* split by geometry ID */
+ void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ assert(set.size() > 1);
+
+ mvector<PrimRefMB>& prims = *set.prims;
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+
+ PrimInfoMB left(empty);
+ PrimInfoMB right(empty);
+ unsigned int geomID = prims[begin].geomID();
+ size_t center = serial_partitioning(prims.data(),begin,end,left,right,
+ [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; },
+ [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); });
+
+ new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range);
+ new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range);
+ }
+
+ const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc)
+ {
+ /* this should never occur but is a fatal error */
+ if (in.depth > cfg.maxDepth)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+ /* replace already found split by fallback split */
+ const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims));
+
+ /* special case when directly creating leaf without any splits that could shrink time_range */
+ bool force_split = false;
+ if (current.depth == 1 && current.size() > 0)
+ {
+ BBox1f c = empty;
+ BBox1f p = current.prims.time_range;
+ for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+ mvector<PrimRefMB>& prims = *current.prims.prims;
+ c.extend(prims[i].time_range);
+ }
+
+ force_split = c.lower > p.lower || c.upper < p.upper;
+ }
+
+ /* create leaf for few primitives */
+ if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split)
+ return createLeaf(current,alloc);
+
+ /* fill all children by always splitting the largest one */
+ bool hasTimeSplits = false;
+ NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+ LocalChildListSplit children(current);
+
+ do {
+ /* find best child with largest bounding box area */
+ size_t bestChild = -1;
+ size_t bestSize = 0;
+ for (size_t i=0; i<children.size(); i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split)
+ continue;
+
+ force_split = false;
+
+ /* remember child with largest size */
+ if (children[i].size() > bestSize) {
+ bestSize = children[i].size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1) break;
+
+ /* perform best found split */
+ BuildRecordSplit& brecord = children[bestChild];
+ BuildRecordSplit lrecord(current.depth+1);
+ BuildRecordSplit rrecord(current.depth+1);
+ std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims);
+ hasTimeSplits |= new_vector != nullptr;
+
+ /* find new splits */
+ lrecord.split = findFallback(lrecord.prims);
+ rrecord.split = findFallback(rrecord.prims);
+ children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+
+ } while (children.size() < cfg.branchingFactor);
+
+ /* detect time_ranges that have shrunken */
+ for (size_t i=0; i<children.size(); i++) {
+ const BBox1f c = children[i].prims.time_range;
+ const BBox1f p = in.prims.time_range;
+ hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+ }
+
+ /* create node */
+ auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits);
+
+ /* recurse into each child and perform reduction */
+ LBBox3fa gbounds = empty;
+ for (size_t i=0; i<children.size(); i++) {
+ values[i] = createLargeLeaf(children[i],alloc);
+ gbounds.extend(values[i].lbounds);
+ }
+
+ setNode(current,children.children.data(),node,values,children.numChildren);
+
+ /* calculate geometry bounds of this node */
+ if (hasTimeSplits)
+ return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+ else
+ return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+ }
+
+ const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel)
+ {
+ /* get thread local allocator */
+ if (!alloc)
+ alloc = createAlloc();
+
+ /* call memory monitor function to signal progress */
+ if (toplevel && current.size() <= cfg.singleThreadThreshold)
+ progressMonitor(current.size());
+
+ /*! find best split */
+ const Split csplit = find(current.prims);
+
+ /*! compute leaf and split cost */
+ const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+ const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH();
+ assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+ /*! create a leaf node when threshold reached or SAH tells us to stop */
+ if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+ current.prims.deterministic_order();
+ return createLargeLeaf(current,alloc);
+ }
+
+ /*! perform initial split */
+ SetMB lprims,rprims;
+ std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims);
+ bool hasTimeSplits = new_vector != nullptr;
+ NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+ LocalChildList children(current);
+ {
+ BuildRecord lrecord(lprims,current.depth+1);
+ BuildRecord rrecord(rprims,current.depth+1);
+ children.split(0,lrecord,rrecord,std::move(new_vector));
+ }
+
+ /*! split until node is full or SAH tells us to stop */
+ while (children.size() < cfg.branchingFactor)
+ {
+ /*! find best child to split */
+ float bestArea = neg_inf;
+ ssize_t bestChild = -1;
+ for (size_t i=0; i<children.size(); i++)
+ {
+ if (children[i].size() <= cfg.minLeafSize) continue;
+ if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) {
+ bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds);
+ }
+ }
+ if (bestChild == -1) break;
+
+ /* perform split */
+ BuildRecord& brecord = children[bestChild];
+ BuildRecord lrecord(current.depth+1);
+ BuildRecord rrecord(current.depth+1);
+ Split csplit = find(brecord.prims);
+ std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims);
+ hasTimeSplits |= new_vector != nullptr;
+ children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+ }
+
+ /* detect time_ranges that have shrunken */
+ for (size_t i=0; i<children.size(); i++) {
+ const BBox1f c = children[i].prims.time_range;
+ const BBox1f p = current.prims.time_range;
+ hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+ }
+
+ /* sort buildrecords for simpler shadow ray traversal */
+ //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !!
+
+ /*! create an inner node */
+ auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits);
+ LBBox3fa gbounds = empty;
+
+ /* spawn tasks */
+ if (unlikely(current.size() > cfg.singleThreadThreshold))
+ {
+ /*! parallel_for is faster than spawing sub-tasks */
+ parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ values[i] = recurse(children[i],nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+
+ /*! merge bounding boxes */
+ for (size_t i=0; i<children.size(); i++)
+ gbounds.extend(values[i].lbounds);
+ }
+ /* recurse into each child */
+ else
+ {
+ //for (size_t i=0; i<children.size(); i++)
+ for (ssize_t i=children.size()-1; i>=0; i--) {
+ values[i] = recurse(children[i],alloc,false);
+ gbounds.extend(values[i].lbounds);
+ }
+ }
+
+ setNode(current,children.children.data(),node,values,children.numChildren);
+
+ /* calculate geometry bounds of this node */
+ if (unlikely(hasTimeSplits))
+ return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+ else
+ return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+ }
+
+ /*! builder entry function */
+ __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+ {
+ const SetMB set(pinfo,&prims);
+ auto ret = recurse(BuildRecord(set,1),nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ return ret;
+ }
+
+ private:
+ Settings cfg;
+ HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit;
+ HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit;
+ const RecalculatePrimRef recalculatePrimRef;
+ const CreateAllocFunc createAlloc;
+ const CreateNodeFunc createNode;
+ const SetNodeFunc setNode;
+ const CreateLeafFunc createLeaf;
+ const ProgressMonitor progressMonitor;
+ };
+
+ template<typename NodeRef,
+ typename RecalculatePrimRef,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename SetNodeFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitorFunc>
+
+ static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims,
+ const PrimInfoMB& pinfo,
+ MemoryMonitorInterface* device,
+ const RecalculatePrimRef recalculatePrimRef,
+ const CreateAllocFunc createAlloc,
+ const CreateNodeFunc createNode,
+ const SetNodeFunc setNode,
+ const CreateLeafFunc createLeaf,
+ const ProgressMonitorFunc progressMonitor,
+ const Settings& settings)
+ {
+ typedef BuilderT<
+ NodeRef,
+ RecalculatePrimRef,
+ decltype(createAlloc()),
+ CreateAllocFunc,
+ CreateNodeFunc,
+ SetNodeFunc,
+ CreateLeafFunc,
+ ProgressMonitorFunc> Builder;
+
+ Builder builder(device,
+ recalculatePrimRef,
+ createAlloc,
+ createNode,
+ setNode,
+ createLeaf,
+ progressMonitor,
+ settings);
+
+
+ return builder(prims,pinfo);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
new file mode 100644
index 0000000000..e477c313a3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
@@ -0,0 +1,526 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_msmblur.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_timesplit_array.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct BVHBuilderHairMSMBlur
+ {
+ /*! settings for msmblur builder */
+ struct Settings
+ {
+ /*! default settings */
+ Settings ()
+ : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {}
+
+ public:
+ size_t branchingFactor; //!< branching factor of BVH to build
+ size_t maxDepth; //!< maximum depth of BVH to build
+ size_t logBlockSize; //!< log2 of blocksize for SAH heuristic
+ size_t minLeafSize; //!< minimum size of a leaf
+ size_t maxLeafSize; //!< maximum size of a leaf
+ };
+
+ struct BuildRecord
+ {
+ public:
+ __forceinline BuildRecord () {}
+
+ __forceinline BuildRecord (size_t depth)
+ : depth(depth) {}
+
+ __forceinline BuildRecord (const SetMB& prims, size_t depth)
+ : depth(depth), prims(prims) {}
+
+ __forceinline size_t size() const {
+ return prims.size();
+ }
+
+ public:
+ size_t depth; //!< depth of the root of this subtree
+ SetMB prims; //!< the list of primitives
+ };
+
+ template<typename NodeRef,
+ typename RecalculatePrimRef,
+ typename CreateAllocFunc,
+ typename CreateAABBNodeMBFunc,
+ typename SetAABBNodeMBFunc,
+ typename CreateOBBNodeMBFunc,
+ typename SetOBBNodeMBFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitor>
+
+ class BuilderT
+ {
+ ALIGNED_CLASS_(16);
+
+ static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
+ static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth
+ static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build
+
+ typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+ typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+ typedef FastAllocator::CachedAllocator Allocator;
+ typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+
+ typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal;
+ typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning;
+ typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning;
+
+ public:
+
+ BuilderT (Scene* scene,
+ const RecalculatePrimRef& recalculatePrimRef,
+ const CreateAllocFunc& createAlloc,
+ const CreateAABBNodeMBFunc& createAABBNodeMB,
+ const SetAABBNodeMBFunc& setAABBNodeMB,
+ const CreateOBBNodeMBFunc& createOBBNodeMB,
+ const SetOBBNodeMBFunc& setOBBNodeMB,
+ const CreateLeafFunc& createLeaf,
+ const ProgressMonitor& progressMonitor,
+ const Settings settings)
+
+ : cfg(settings),
+ scene(scene),
+ recalculatePrimRef(recalculatePrimRef),
+ createAlloc(createAlloc),
+ createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB),
+ createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB),
+ createLeaf(createLeaf),
+ progressMonitor(progressMonitor),
+ unalignedHeuristic(scene),
+ temporalSplitHeuristic(scene->device,recalculatePrimRef) {}
+
+ private:
+
+ /*! checks if all primitives are from the same geometry */
+ __forceinline bool sameGeometry(const SetMB& set)
+ {
+ mvector<PrimRefMB>& prims = *set.prims;
+ unsigned int firstGeomID = prims[set.begin()].geomID();
+ for (size_t i=set.begin()+1; i<set.end(); i++) {
+ if (prims[i].geomID() != firstGeomID){
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /*! performs some split if SAH approaches fail */
+ void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ mvector<PrimRefMB>& prims = *set.prims;
+
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ const size_t center = (begin + end)/2;
+
+ PrimInfoMB linfo = empty;
+ for (size_t i=begin; i<center; i++)
+ linfo.add_primref(prims[i]);
+
+ PrimInfoMB rinfo = empty;
+ for (size_t i=center; i<end; i++)
+ rinfo.add_primref(prims[i]);
+
+ new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+ new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range);
+ }
+
+ void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ assert(set.size() > 1);
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfoMB linfo(empty);
+ PrimInfoMB rinfo(empty);
+ unsigned int geomID = (*set.prims)[begin].geomID();
+ size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo,
+ [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; },
+ [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); });
+
+ new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range);
+ new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range);
+ }
+
+ /*! creates a large leaf that could be larger than supported by the BVH */
+ NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc)
+ {
+ /* this should never occur but is a fatal error */
+ if (current.depth > cfg.maxDepth)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+ /* special case when directly creating leaf without any splits that could shrink time_range */
+ bool force_split = false;
+ if (current.depth == 1 && current.size() > 0)
+ {
+ BBox1f c = empty;
+ BBox1f p = current.prims.time_range;
+ for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+ mvector<PrimRefMB>& prims = *current.prims.prims;
+ c.extend(prims[i].time_range);
+ }
+
+ force_split = c.lower > p.lower || c.upper < p.upper;
+ }
+
+ /* create leaf for few primitives */
+ if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split)
+ return createLeaf(current.prims,alloc);
+
+ /* fill all children by always splitting the largest one */
+ LocalChildList children(current);
+ NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+
+ do {
+
+ /* find best child with largest bounding box area */
+ int bestChild = -1;
+ size_t bestSize = 0;
+ for (unsigned i=0; i<children.size(); i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split)
+ continue;
+
+ force_split = false;
+
+ /* remember child with largest size */
+ if (children[i].size() > bestSize) {
+ bestSize = children[i].size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1) break;
+
+ /*! split best child into left and right child */
+ BuildRecord left(current.depth+1);
+ BuildRecord right(current.depth+1);
+ if (!sameGeometry(children[bestChild].prims)) {
+ splitByGeometry(children[bestChild].prims,left.prims,right.prims);
+ } else {
+ splitFallback(children[bestChild].prims,left.prims,right.prims);
+ }
+ children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>());
+
+ } while (children.size() < cfg.branchingFactor);
+
+
+ /* detect time_ranges that have shrunken */
+ bool timesplit = false;
+ for (size_t i=0; i<children.size(); i++) {
+ const BBox1f c = children[i].prims.time_range;
+ const BBox1f p = current.prims.time_range;
+ timesplit |= c.lower > p.lower || c.upper < p.upper;
+ }
+
+ /* create node */
+ NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit);
+
+ LBBox3fa bounds = empty;
+ for (size_t i=0; i<children.size(); i++) {
+ values[i] = createLargeLeaf(children[i],alloc);
+ bounds.extend(values[i].lbounds);
+ }
+
+ setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+ if (timesplit)
+ bounds = current.prims.linearBounds(recalculatePrimRef);
+
+ return NodeRecordMB4D(node,bounds,current.prims.time_range);
+ }
+
+ /*! performs split */
+ std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit)
+ {
+ /* variable to track the SAH of the best splitting approach */
+ float bestSAH = inf;
+ const float leafSAH = current.prims.leafSAH(cfg.logBlockSize);
+
+ /* perform standard binning in aligned space */
+ HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize);
+ float alignedObjectSAH = alignedObjectSplit.splitSAH();
+ bestSAH = min(alignedObjectSAH,bestSAH);
+
+ /* perform standard binning in unaligned space */
+ UnalignedHeuristicBinning::Split unalignedObjectSplit;
+ LinearSpace3fa uspace;
+ float unalignedObjectSAH = inf;
+ if (alignedObjectSAH > 0.7f*leafSAH) {
+ uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims);
+ const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace);
+ unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace);
+ unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive
+ bestSAH = min(unalignedObjectSAH,bestSAH);
+ }
+
+ /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */
+ float temporal_split_sah = inf;
+ typename HeuristicTemporal::Split temporal_split;
+ if (bestSAH > 0.5f*leafSAH) {
+ if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) {
+ temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize);
+ temporal_split_sah = temporal_split.splitSAH();
+ bestSAH = min(temporal_split_sah,bestSAH);
+ }
+ }
+
+ /* perform fallback split if SAH heuristics failed */
+ if (unlikely(!std::isfinite(bestSAH))) {
+ current.prims.deterministic_order();
+ splitFallback(current.prims,lrecord.prims,rrecord.prims);
+ }
+ /* perform aligned split if this is best */
+ else if (likely(bestSAH == alignedObjectSAH)) {
+ alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims);
+ }
+ /* perform unaligned split if this is best */
+ else if (likely(bestSAH == unalignedObjectSAH)) {
+ unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims);
+ aligned = false;
+ }
+ /* perform temporal split if this is best */
+ else if (likely(bestSAH == temporal_split_sah)) {
+ timesplit = true;
+ return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims);
+ }
+ else
+ assert(false);
+
+ return std::unique_ptr<mvector<PrimRefMB>>();
+ }
+
+ /*! recursive build */
+ NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+ {
+ /* get thread local allocator */
+ if (!alloc)
+ alloc = createAlloc();
+
+ /* call memory monitor function to signal progress */
+ if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD)
+ progressMonitor(current.size());
+
+ /* create leaf node */
+ if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) {
+ current.prims.deterministic_order();
+ return createLargeLeaf(current,alloc);
+ }
+
+ /* fill all children by always splitting the one with the largest surface area */
+ NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+ LocalChildList children(current);
+ bool aligned = true;
+ bool timesplit = false;
+
+ do {
+
+ /* find best child with largest bounding box area */
+ ssize_t bestChild = -1;
+ float bestArea = neg_inf;
+ for (size_t i=0; i<children.size(); i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= cfg.minLeafSize)
+ continue;
+
+ /* remember child with largest area */
+ const float A = children[i].prims.halfArea();
+ if (A > bestArea) {
+ bestArea = children[i].prims.halfArea();
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1) break;
+
+ /*! split best child into left and right child */
+ BuildRecord left(current.depth+1);
+ BuildRecord right(current.depth+1);
+ std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit);
+ children.split(bestChild,left,right,std::move(new_vector));
+
+ } while (children.size() < cfg.branchingFactor);
+
+ /* detect time_ranges that have shrunken */
+ for (size_t i=0; i<children.size(); i++) {
+ const BBox1f c = children[i].prims.time_range;
+ const BBox1f p = current.prims.time_range;
+ timesplit |= c.lower > p.lower || c.upper < p.upper;
+ }
+
+ /* create time split node */
+ if (timesplit)
+ {
+ const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+ /* spawn tasks or ... */
+ if (current.size() > SINGLE_THREADED_THRESHOLD)
+ {
+ parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ values[i] = recurse(children[i],nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+ }
+ /* ... continue sequential */
+ else {
+ for (size_t i=0; i<children.size(); i++) {
+ values[i] = recurse(children[i],alloc,false);
+ }
+ }
+
+ setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+ const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+ return NodeRecordMB4D(node,bounds,current.prims.time_range);
+ }
+
+ /* create aligned node */
+ else if (aligned)
+ {
+ const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+ /* spawn tasks or ... */
+ if (current.size() > SINGLE_THREADED_THRESHOLD)
+ {
+ LBBox3fa cbounds[MAX_BRANCHING_FACTOR];
+ parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ values[i] = recurse(children[i],nullptr,true);
+ cbounds[i] = values[i].lbounds;
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+
+ LBBox3fa bounds = empty;
+ for (size_t i=0; i<children.size(); i++)
+ bounds.extend(cbounds[i]);
+ setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+ return NodeRecordMB4D(node,bounds,current.prims.time_range);
+ }
+ /* ... continue sequentially */
+ else
+ {
+ LBBox3fa bounds = empty;
+ for (size_t i=0; i<children.size(); i++) {
+ values[i] = recurse(children[i],alloc,false);
+ bounds.extend(values[i].lbounds);
+ }
+ setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+ return NodeRecordMB4D(node,bounds,current.prims.time_range);
+ }
+ }
+
+ /* create unaligned node */
+ else
+ {
+ const NodeRef node = createOBBNodeMB(alloc);
+
+ /* spawn tasks or ... */
+ if (current.size() > SINGLE_THREADED_THRESHOLD)
+ {
+ parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+ const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+ const auto child = recurse(children[i],nullptr,true);
+ setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+ }
+ /* ... continue sequentially */
+ else
+ {
+ for (size_t i=0; i<children.size(); i++) {
+ const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+ const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+ const auto child = recurse(children[i],alloc,false);
+ setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+ }
+ }
+
+ const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+ return NodeRecordMB4D(node,bounds,current.prims.time_range);
+ }
+ }
+
+ public:
+
+ /*! entry point into builder */
+ NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+ {
+ BuildRecord record(SetMB(pinfo,&prims),1);
+ auto root = recurse(record,nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ return root;
+ }
+
+ private:
+ Settings cfg;
+ Scene* scene;
+ const RecalculatePrimRef& recalculatePrimRef;
+ const CreateAllocFunc& createAlloc;
+ const CreateAABBNodeMBFunc& createAABBNodeMB;
+ const SetAABBNodeMBFunc& setAABBNodeMB;
+ const CreateOBBNodeMBFunc& createOBBNodeMB;
+ const SetOBBNodeMBFunc& setOBBNodeMB;
+ const CreateLeafFunc& createLeaf;
+ const ProgressMonitor& progressMonitor;
+
+ private:
+ HeuristicBinning alignedHeuristic;
+ UnalignedHeuristicBinning unalignedHeuristic;
+ HeuristicTemporal temporalSplitHeuristic;
+ };
+
+ template<typename NodeRef,
+ typename RecalculatePrimRef,
+ typename CreateAllocFunc,
+ typename CreateAABBNodeMBFunc,
+ typename SetAABBNodeMBFunc,
+ typename CreateOBBNodeMBFunc,
+ typename SetOBBNodeMBFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitor>
+
+ static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo,
+ const RecalculatePrimRef& recalculatePrimRef,
+ const CreateAllocFunc& createAlloc,
+ const CreateAABBNodeMBFunc& createAABBNodeMB,
+ const SetAABBNodeMBFunc& setAABBNodeMB,
+ const CreateOBBNodeMBFunc& createOBBNodeMB,
+ const SetOBBNodeMBFunc& setOBBNodeMB,
+ const CreateLeafFunc& createLeaf,
+ const ProgressMonitor& progressMonitor,
+ const Settings settings)
+ {
+ typedef BuilderT<NodeRef,RecalculatePrimRef,CreateAllocFunc,
+ CreateAABBNodeMBFunc,SetAABBNodeMBFunc,
+ CreateOBBNodeMBFunc,SetOBBNodeMBFunc,
+ CreateLeafFunc,ProgressMonitor> Builder;
+
+ Builder builder(scene,recalculatePrimRef,createAlloc,
+ createAABBNodeMB,setAABBNodeMB,
+ createOBBNodeMB,setOBBNodeMB,
+ createLeaf,progressMonitor,settings);
+
+ return builder(prims,pinfo);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
new file mode 100644
index 0000000000..3f7e678a10
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
@@ -0,0 +1,669 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_spatial_array.h"
+#include "heuristic_openmerge_array.h"
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+# define NUM_OBJECT_BINS 16
+# define NUM_SPATIAL_BINS 16
+#else
+# define NUM_OBJECT_BINS 32
+# define NUM_SPATIAL_BINS 16
+#endif
+
+namespace embree
+{
+ namespace isa
+ {
+ MAYBE_UNUSED static const float travCost = 1.0f;
+ MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+ struct GeneralBVHBuilder
+ {
+ static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor
+ static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth
+
+
+ /*! settings for SAH builder */
+ struct Settings
+ {
+ /*! default settings */
+ Settings ()
+ : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+ travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {}
+
+ /*! initialize settings from API settings */
+ Settings (const RTCBuildArguments& settings)
+ : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+ travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf)
+ {
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(static_cast<size_t>(settings.sahBlockSize));
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost )) intCost = settings.intersectionCost;
+
+ minLeafSize = min(minLeafSize,maxLeafSize);
+ }
+
+ Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf)
+ : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+ travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc)
+ {
+ minLeafSize = min(minLeafSize,maxLeafSize);
+ }
+
+ public:
+ size_t branchingFactor; //!< branching factor of BVH to build
+ size_t maxDepth; //!< maximum depth of BVH to build
+ size_t logBlockSize; //!< log2 of blocksize for SAH heuristic
+ size_t minLeafSize; //!< minimum size of a leaf
+ size_t maxLeafSize; //!< maximum size of a leaf
+ float travCost; //!< estimated cost of one traversal step
+ float intCost; //!< estimated cost of one primitive intersection
+ size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+ size_t primrefarrayalloc; //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished
+ };
+
+ /*! recursive state of builder */
+ template<typename Set, typename Split>
+ struct BuildRecordT
+ {
+ public:
+ __forceinline BuildRecordT () {}
+
+ __forceinline BuildRecordT (size_t depth)
+ : depth(depth), alloc_barrier(false), prims(empty) {}
+
+ __forceinline BuildRecordT (size_t depth, const Set& prims)
+ : depth(depth), alloc_barrier(false), prims(prims) {}
+
+ __forceinline BBox3fa bounds() const { return prims.geomBounds; }
+
+ __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); }
+ __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size(); }
+
+ __forceinline size_t size() const { return prims.size(); }
+
+ public:
+ size_t depth; //!< Depth of the root of this subtree.
+ bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes
+ Set prims; //!< The list of primitives.
+ };
+
+ template<typename PrimRef, typename Set>
+ struct DefaultCanCreateLeafFunc
+ {
+ __forceinline bool operator()(const PrimRef*, const Set&) const { return true; }
+ };
+
+ template<typename PrimRef, typename Set>
+ struct DefaultCanCreateLeafSplitFunc
+ {
+ __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { }
+ };
+
+ template<typename BuildRecord,
+ typename Heuristic,
+ typename Set,
+ typename PrimRef,
+ typename ReductionTy,
+ typename Allocator,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename UpdateNodeFunc,
+ typename CreateLeafFunc,
+ typename CanCreateLeafFunc,
+ typename CanCreateLeafSplitFunc,
+ typename ProgressMonitor>
+
+ class BuilderT
+ {
+ friend struct GeneralBVHBuilder;
+
+ BuilderT (PrimRef* prims,
+ Heuristic& heuristic,
+ const CreateAllocFunc& createAlloc,
+ const CreateNodeFunc& createNode,
+ const UpdateNodeFunc& updateNode,
+ const CreateLeafFunc& createLeaf,
+ const CanCreateLeafFunc& canCreateLeaf,
+ const CanCreateLeafSplitFunc& canCreateLeafSplit,
+ const ProgressMonitor& progressMonitor,
+ const Settings& settings) :
+ cfg(settings),
+ prims(prims),
+ heuristic(heuristic),
+ createAlloc(createAlloc),
+ createNode(createNode),
+ updateNode(updateNode),
+ createLeaf(createLeaf),
+ canCreateLeaf(canCreateLeaf),
+ canCreateLeafSplit(canCreateLeafSplit),
+ progressMonitor(progressMonitor)
+ {
+ if (cfg.branchingFactor > MAX_BRANCHING_FACTOR)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+ }
+
+ const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc)
+ {
+ /* this should never occur but is a fatal error */
+ if (current.depth > cfg.maxDepth)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+ /* create leaf for few primitives */
+ if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims))
+ return createLeaf(prims,current.prims,alloc);
+
+ /* fill all children by always splitting the largest one */
+ ReductionTy values[MAX_BRANCHING_FACTOR];
+ BuildRecord children[MAX_BRANCHING_FACTOR];
+ size_t numChildren = 1;
+ children[0] = current;
+ do {
+
+ /* find best child with largest bounding box area */
+ size_t bestChild = -1;
+ size_t bestSize = 0;
+ for (size_t i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims))
+ continue;
+
+ /* remember child with largest size */
+ if (children[i].prims.size() > bestSize) {
+ bestSize = children[i].prims.size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == (size_t)-1) break;
+
+ /*! split best child into left and right child */
+ BuildRecord left(current.depth+1);
+ BuildRecord right(current.depth+1);
+ if (!canCreateLeaf(prims,children[bestChild].prims)) {
+ canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims);
+ } else {
+ heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims);
+ }
+
+ /* add new children left and right */
+ children[bestChild] = children[numChildren-1];
+ children[numChildren-1] = left;
+ children[numChildren+0] = right;
+ numChildren++;
+
+ } while (numChildren < cfg.branchingFactor);
+
+ /* set barrier for primrefarrayalloc */
+ if (unlikely(current.size() > cfg.primrefarrayalloc))
+ for (size_t i=0; i<numChildren; i++)
+ children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+ /* create node */
+ auto node = createNode(children,numChildren,alloc);
+
+ /* recurse into each child and perform reduction */
+ for (size_t i=0; i<numChildren; i++)
+ values[i] = createLargeLeaf(children[i],alloc);
+
+ /* perform reduction */
+ return updateNode(current,children,node,values,numChildren);
+ }
+
+ const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+ {
+ /* get thread local allocator */
+ if (!alloc)
+ alloc = createAlloc();
+
+ /* call memory monitor function to signal progress */
+ if (toplevel && current.size() <= cfg.singleThreadThreshold)
+ progressMonitor(current.size());
+
+ /*! find best split */
+ auto split = heuristic.find(current.prims,cfg.logBlockSize);
+
+ /*! compute leaf and split cost */
+ const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+ const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH();
+ assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+ /*! create a leaf node when threshold reached or SAH tells us to stop */
+ if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+ heuristic.deterministic_order(current.prims);
+ return createLargeLeaf(current,alloc);
+ }
+
+ /*! perform initial split */
+ Set lprims,rprims;
+ heuristic.split(split,current.prims,lprims,rprims);
+
+ /*! initialize child list with initial split */
+ ReductionTy values[MAX_BRANCHING_FACTOR];
+ BuildRecord children[MAX_BRANCHING_FACTOR];
+ children[0] = BuildRecord(current.depth+1,lprims);
+ children[1] = BuildRecord(current.depth+1,rprims);
+ size_t numChildren = 2;
+
+ /*! split until node is full or SAH tells us to stop */
+ while (numChildren < cfg.branchingFactor)
+ {
+ /*! find best child to split */
+ float bestArea = neg_inf;
+ ssize_t bestChild = -1;
+ for (size_t i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].prims.size() <= cfg.minLeafSize) continue;
+
+ /* find child with largest surface area */
+ if (halfArea(children[i].prims.geomBounds) > bestArea) {
+ bestChild = i;
+ bestArea = halfArea(children[i].prims.geomBounds);
+ }
+ }
+ if (bestChild == -1) break;
+
+ /* perform best found split */
+ BuildRecord& brecord = children[bestChild];
+ BuildRecord lrecord(current.depth+1);
+ BuildRecord rrecord(current.depth+1);
+ auto split = heuristic.find(brecord.prims,cfg.logBlockSize);
+ heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims);
+ children[bestChild ] = lrecord;
+ children[numChildren] = rrecord;
+ numChildren++;
+ }
+
+ /* set barrier for primrefarrayalloc */
+ if (unlikely(current.size() > cfg.primrefarrayalloc))
+ for (size_t i=0; i<numChildren; i++)
+ children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+ /* sort buildrecords for faster shadow ray traversal */
+ std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>());
+
+ /*! create an inner node */
+ auto node = createNode(children,numChildren,alloc);
+
+ /* spawn tasks */
+ if (current.size() > cfg.singleThreadThreshold)
+ {
+ /*! parallel_for is faster than spawing sub-tasks */
+ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ values[i] = recurse(children[i],nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+
+ return updateNode(current,children,node,values,numChildren);
+ }
+ /* recurse into each child */
+ else
+ {
+ for (size_t i=0; i<numChildren; i++)
+ values[i] = recurse(children[i],alloc,false);
+
+ return updateNode(current,children,node,values,numChildren);
+ }
+ }
+
+ private:
+ Settings cfg;
+ PrimRef* prims;
+ Heuristic& heuristic;
+ const CreateAllocFunc& createAlloc;
+ const CreateNodeFunc& createNode;
+ const UpdateNodeFunc& updateNode;
+ const CreateLeafFunc& createLeaf;
+ const CanCreateLeafFunc& canCreateLeaf;
+ const CanCreateLeafSplitFunc& canCreateLeafSplit;
+ const ProgressMonitor& progressMonitor;
+ };
+
+ template<
+ typename ReductionTy,
+ typename Heuristic,
+ typename Set,
+ typename PrimRef,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename UpdateNodeFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitor>
+
+ __noinline static ReductionTy build(Heuristic& heuristic,
+ PrimRef* prims,
+ const Set& set,
+ CreateAllocFunc createAlloc,
+ CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+ const CreateLeafFunc& createLeaf,
+ const ProgressMonitor& progressMonitor,
+ const Settings& settings)
+ {
+ typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+ typedef BuilderT<
+ BuildRecord,
+ Heuristic,
+ Set,
+ PrimRef,
+ ReductionTy,
+ decltype(createAlloc()),
+ CreateAllocFunc,
+ CreateNodeFunc,
+ UpdateNodeFunc,
+ CreateLeafFunc,
+ DefaultCanCreateLeafFunc<PrimRef, Set>,
+ DefaultCanCreateLeafSplitFunc<PrimRef, Set>,
+ ProgressMonitor> Builder;
+
+ /* instantiate builder */
+ Builder builder(prims,
+ heuristic,
+ createAlloc,
+ createNode,
+ updateNode,
+ createLeaf,
+ DefaultCanCreateLeafFunc<PrimRef, Set>(),
+ DefaultCanCreateLeafSplitFunc<PrimRef, Set>(),
+ progressMonitor,
+ settings);
+
+ /* build hierarchy */
+ BuildRecord record(1,set);
+ const ReductionTy root = builder.recurse(record,nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ return root;
+ }
+
+ template<
+ typename ReductionTy,
+ typename Heuristic,
+ typename Set,
+ typename PrimRef,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename UpdateNodeFunc,
+ typename CreateLeafFunc,
+ typename CanCreateLeafFunc,
+ typename CanCreateLeafSplitFunc,
+ typename ProgressMonitor>
+
+ __noinline static ReductionTy build(Heuristic& heuristic,
+ PrimRef* prims,
+ const Set& set,
+ CreateAllocFunc createAlloc,
+ CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+ const CreateLeafFunc& createLeaf,
+ const CanCreateLeafFunc& canCreateLeaf,
+ const CanCreateLeafSplitFunc& canCreateLeafSplit,
+ const ProgressMonitor& progressMonitor,
+ const Settings& settings)
+ {
+ typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+ typedef BuilderT<
+ BuildRecord,
+ Heuristic,
+ Set,
+ PrimRef,
+ ReductionTy,
+ decltype(createAlloc()),
+ CreateAllocFunc,
+ CreateNodeFunc,
+ UpdateNodeFunc,
+ CreateLeafFunc,
+ CanCreateLeafFunc,
+ CanCreateLeafSplitFunc,
+ ProgressMonitor> Builder;
+
+ /* instantiate builder */
+ Builder builder(prims,
+ heuristic,
+ createAlloc,
+ createNode,
+ updateNode,
+ createLeaf,
+ canCreateLeaf,
+ canCreateLeafSplit,
+ progressMonitor,
+ settings);
+
+ /* build hierarchy */
+ BuildRecord record(1,set);
+ const ReductionTy root = builder.recurse(record,nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ return root;
+ }
+ };
+
+ /* SAH builder that operates on an array of BuildRecords */
+ struct BVHBuilderBinnedSAH
+ {
+ typedef PrimInfoRange Set;
+ typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic;
+ typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+ typedef GeneralBVHBuilder::Settings Settings;
+
+ /*! special builder that propagates reduction over the tree */
+ template<
+ typename ReductionTy,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename UpdateNodeFunc,
+ typename CreateLeafFunc,
+ typename ProgressMonitor>
+
+ static ReductionTy build(CreateAllocFunc createAlloc,
+ CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+ const CreateLeafFunc& createLeaf,
+ const ProgressMonitor& progressMonitor,
+ PrimRef* prims, const PrimInfo& pinfo,
+ const Settings& settings)
+ {
+ Heuristic heuristic(prims);
+ return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+ heuristic,
+ prims,
+ PrimInfoRange(0,pinfo.size(),pinfo),
+ createAlloc,
+ createNode,
+ updateNode,
+ createLeaf,
+ progressMonitor,
+ settings);
+ }
+
+ /*! special builder that propagates reduction over the tree */
+ template<
+ typename ReductionTy,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename UpdateNodeFunc,
+ typename CreateLeafFunc,
+ typename CanCreateLeafFunc,
+ typename CanCreateLeafSplitFunc,
+ typename ProgressMonitor>
+
+ static ReductionTy build(CreateAllocFunc createAlloc,
+ CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+ const CreateLeafFunc& createLeaf,
+ const CanCreateLeafFunc& canCreateLeaf,
+ const CanCreateLeafSplitFunc& canCreateLeafSplit,
+ const ProgressMonitor& progressMonitor,
+ PrimRef* prims, const PrimInfo& pinfo,
+ const Settings& settings)
+ {
+ Heuristic heuristic(prims);
+ return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+ heuristic,
+ prims,
+ PrimInfoRange(0,pinfo.size(),pinfo),
+ createAlloc,
+ createNode,
+ updateNode,
+ createLeaf,
+ canCreateLeaf,
+ canCreateLeafSplit,
+ progressMonitor,
+ settings);
+ }
+ };
+
+ /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */
+ struct BVHBuilderBinnedFastSpatialSAH
+ {
+ typedef PrimInfoExtRange Set;
+ typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split;
+ typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+ typedef GeneralBVHBuilder::Settings Settings;
+
+ static const unsigned int GEOMID_MASK = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+ static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+ template<typename ReductionTy, typename UserCreateLeaf>
+ struct CreateLeafExt
+ {
+ __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf)
+ : userCreateLeaf(userCreateLeaf) {}
+
+ // __noinline is workaround for ICC2016 compiler bug
+ template<typename Allocator>
+ __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const
+ {
+ for (size_t i=range.begin(); i<range.end(); i++)
+ prims[i].lower.u &= GEOMID_MASK;
+
+ return userCreateLeaf(prims,range,alloc);
+ }
+
+ const UserCreateLeaf userCreateLeaf;
+ };
+
+ /*! special builder that propagates reduction over the tree */
+ template<
+ typename ReductionTy,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename UpdateNodeFunc,
+ typename CreateLeafFunc,
+ typename SplitPrimitiveFunc,
+ typename ProgressMonitor>
+
+ static ReductionTy build(CreateAllocFunc createAlloc,
+ CreateNodeFunc createNode,
+ UpdateNodeFunc updateNode,
+ const CreateLeafFunc& createLeaf,
+ SplitPrimitiveFunc splitPrimitive,
+ ProgressMonitor progressMonitor,
+ PrimRef* prims,
+ const size_t extSize,
+ const PrimInfo& pinfo,
+ const Settings& settings)
+ {
+ typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic;
+ Heuristic heuristic(splitPrimitive,prims,pinfo);
+
+ /* calculate total surface area */ // FIXME: this sum is not deterministic
+ const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double {
+
+ double A = 0.0f;
+ for (size_t i=r.begin(); i<r.end(); i++)
+ {
+ PrimRef& prim = prims[i];
+ A += area(prim.bounds());
+ }
+ return A;
+ },std::plus<double>());
+
+
+ /* calculate maximum number of spatial splits per primitive */
+ const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1;
+ const float f = 10.0f;
+
+ const float invA = 1.0f / A;
+ parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) {
+
+ for (size_t i=r.begin(); i<r.end(); i++)
+ {
+ PrimRef& prim = prims[i];
+ assert((prim.geomID() & SPLITS_MASK) == 0);
+ // FIXME: is there a better general heuristic ?
+ const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA);
+ unsigned int n = 4+min((int)maxSplits-4, max(1, (int)(nf)));
+ prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+ }
+ });
+
+ return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+ heuristic,
+ prims,
+ PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+ createAlloc,
+ createNode,
+ updateNode,
+ CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf),
+ progressMonitor,
+ settings);
+ }
+ };
+
+ /* Open/Merge SAH builder that operates on an array of BuildRecords */
+ struct BVHBuilderBinnedOpenMergeSAH
+ {
+ static const size_t NUM_OBJECT_BINS_HQ = 32;
+ typedef PrimInfoExtRange Set;
+ typedef BinSplit<NUM_OBJECT_BINS_HQ> Split;
+ typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+ typedef GeneralBVHBuilder::Settings Settings;
+
+ /*! special builder that propagates reduction over the tree */
+ template<
+ typename ReductionTy,
+ typename BuildRef,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename UpdateNodeFunc,
+ typename CreateLeafFunc,
+ typename NodeOpenerFunc,
+ typename ProgressMonitor>
+
+ static ReductionTy build(CreateAllocFunc createAlloc,
+ CreateNodeFunc createNode,
+ UpdateNodeFunc updateNode,
+ const CreateLeafFunc& createLeaf,
+ NodeOpenerFunc nodeOpenerFunc,
+ ProgressMonitor progressMonitor,
+ BuildRef* prims,
+ const size_t extSize,
+ const PrimInfo& pinfo,
+ const Settings& settings)
+ {
+ typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic;
+ Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor);
+
+ return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>(
+ heuristic,
+ prims,
+ PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+ createAlloc,
+ createNode,
+ updateNode,
+ createLeaf,
+ progressMonitor,
+ settings);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
new file mode 100644
index 0000000000..a4d3b68e46
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
@@ -0,0 +1,972 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! mapping into bins */
+ template<size_t BINS>
+ struct BinMapping
+ {
+ public:
+ __forceinline BinMapping() {}
+
+ /*! calculates the mapping */
+ __forceinline BinMapping(size_t N, const BBox3fa& centBounds)
+ {
+ num = min(BINS,size_t(4.0f + 0.05f*N));
+ assert(num >= 1);
+ const vfloat4 eps = 1E-34f;
+ const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+ scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+ ofs = (vfloat4) centBounds.lower;
+ }
+
+ /*! calculates the mapping */
+ __forceinline BinMapping(const BBox3fa& centBounds)
+ {
+ num = BINS;
+ const vfloat4 eps = 1E-34f;
+ const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+ scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+ ofs = (vfloat4) centBounds.lower;
+ }
+
+ /*! calculates the mapping */
+ template<typename PrimInfo>
+ __forceinline BinMapping(const PrimInfo& pinfo)
+ {
+ const vfloat4 eps = 1E-34f;
+ num = min(BINS,size_t(4.0f + 0.05f*pinfo.size()));
+ const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+ scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+ ofs = (vfloat4) pinfo.centBounds.lower;
+ }
+
+ /*! returns number of bins */
+ __forceinline size_t size() const { return num; }
+
+ /*! slower but safe binning */
+ __forceinline Vec3ia bin(const Vec3fa& p) const
+ {
+ const vint4 i = floori((vfloat4(p)-ofs)*scale);
+#if 1
+ assert(i[0] >= 0 && (size_t)i[0] < num);
+ assert(i[1] >= 0 && (size_t)i[1] < num);
+ assert(i[2] >= 0 && (size_t)i[2] < num);
+ return Vec3ia(i);
+#else
+ return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
+#endif
+ }
+
+ /*! faster but unsafe binning */
+ __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const {
+ return Vec3ia(floori((vfloat4(p)-ofs)*scale));
+ }
+
+ /*! faster but unsafe binning */
+ template<typename PrimRef>
+ __forceinline Vec3ia bin_unsafe(const PrimRef& p) const {
+ return bin_unsafe(p.binCenter());
+ }
+
+ /*! faster but unsafe binning */
+ template<typename PrimRef, typename BinBoundsAndCenter>
+ __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const {
+ return bin_unsafe(binBoundsAndCenter.binCenter(p));
+ }
+
+ template<typename PrimRef>
+ __forceinline bool bin_unsafe(const PrimRef& ref,
+ const vint4& vSplitPos,
+ const vbool4& splitDimMask) const // FIXME: rename to isLeft
+ {
+ return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask);
+ }
+ /*! calculates left spatial position of bin */
+ __forceinline float pos(const size_t bin, const size_t dim) const {
+ return madd(float(bin),1.0f / scale[dim],ofs[dim]);
+ }
+
+ /*! returns true if the mapping is invalid in some dimension */
+ __forceinline bool invalid(const size_t dim) const {
+ return scale[dim] == 0.0f;
+ }
+
+ /*! stream output */
+ friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) {
+ return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}";
+ }
+
+ public:
+ size_t num;
+ vfloat4 ofs,scale; //!< linear function that maps to bin ID
+ };
+
+ /*! stores all information to perform some split */
+ template<size_t BINS>
+ struct BinSplit
+ {
+ enum
+ {
+ SPLIT_OBJECT = 0,
+ SPLIT_FALLBACK = 1,
+ SPLIT_ENFORCE = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already
+ SPLIT_TEMPORAL = 2,
+ SPLIT_GEOMID = 3,
+ };
+
+ /*! construct an invalid split by default */
+ __forceinline BinSplit()
+ : sah(inf), dim(-1), pos(0), data(0) {}
+
+ __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0)
+ : sah(sah), dim(dim), fpos(fpos), data(data) {}
+
+ /*! constructs specified split */
+ __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping)
+ : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {}
+
+ /*! tests if this split is valid */
+ __forceinline bool valid() const { return dim != -1; }
+
+ /*! calculates surface area heuristic for performing the split */
+ __forceinline float splitSAH() const { return sah; }
+
+ /*! stream output */
+ friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) {
+ return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}";
+ }
+
+ public:
+ float sah; //!< SAH cost of the split
+ int dim; //!< split dimension
+ union { int pos; float fpos; }; //!< bin index for splitting
+ unsigned int data; //!< extra optional split data
+ BinMapping<BINS> mapping; //!< mapping into bins
+ };
+
+ /*! stores extended information about the split */
+ template<typename BBox>
+ struct SplitInfoT
+ {
+
+ __forceinline SplitInfoT () {}
+
+ __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds)
+ : leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {}
+
+ public:
+ size_t leftCount,rightCount;
+ BBox leftBounds,rightBounds;
+ };
+
+ typedef SplitInfoT<BBox3fa> SplitInfo;
+ typedef SplitInfoT<LBBox3fa> SplitInfo2;
+
+ /*! stores all binning information */
+ template<size_t BINS, typename PrimRef, typename BBox>
+ struct __aligned(64) BinInfoT
+ {
+ typedef BinSplit<BINS> Split;
+ typedef vbool4 vbool;
+ typedef vint4 vint;
+ typedef vfloat4 vfloat;
+
+ __forceinline BinInfoT() {
+ }
+
+ __forceinline BinInfoT(EmptyTy) {
+ clear();
+ }
+
+ /*! bin access function */
+ __forceinline BBox &bounds(const size_t binID, const size_t dimID) { return _bounds[binID][dimID]; }
+ __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; }
+
+ __forceinline unsigned int &counts(const size_t binID, const size_t dimID) { return _counts[binID][dimID]; }
+ __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; }
+
+ __forceinline vuint4 &counts(const size_t binID) { return _counts[binID]; }
+ __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; }
+
+ /*! clears the bin info */
+ __forceinline void clear()
+ {
+ for (size_t i=0; i<BINS; i++) {
+ bounds(i,0) = bounds(i,1) = bounds(i,2) = empty;
+ counts(i) = vuint4(zero);
+ }
+ }
+
+ /*! bins an array of primitives */
+ __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping)
+ {
+ if (unlikely(N == 0)) return;
+ size_t i;
+ for (i=0; i<N-1; i+=2)
+ {
+ /*! map even and odd primitive to bin */
+ BBox prim0; Vec3fa center0;
+ prims[i+0].binBoundsAndCenter(prim0,center0);
+ const vint4 bin0 = (vint4)mapping.bin(center0);
+
+ BBox prim1; Vec3fa center1;
+ prims[i+1].binBoundsAndCenter(prim1,center1);
+ const vint4 bin1 = (vint4)mapping.bin(center1);
+
+ /*! increase bounds for bins for even primitive */
+ const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0);
+ const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0);
+ const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0);
+ const unsigned int s0 = (unsigned int)prims[i+0].size();
+ counts(b00,0)+=s0;
+ counts(b01,1)+=s0;
+ counts(b02,2)+=s0;
+
+ /*! increase bounds of bins for odd primitive */
+ const unsigned int b10 = extract<0>(bin1); bounds(b10,0).extend(prim1);
+ const unsigned int b11 = extract<1>(bin1); bounds(b11,1).extend(prim1);
+ const unsigned int b12 = extract<2>(bin1); bounds(b12,2).extend(prim1);
+ const unsigned int s1 = (unsigned int)prims[i+1].size();
+ counts(b10,0)+=s1;
+ counts(b11,1)+=s1;
+ counts(b12,2)+=s1;
+ }
+ /*! for uneven number of primitives */
+ if (i < N)
+ {
+ /*! map primitive to bin */
+ BBox prim0; Vec3fa center0;
+ prims[i].binBoundsAndCenter(prim0,center0);
+ const vint4 bin0 = (vint4)mapping.bin(center0);
+
+ /*! increase bounds of bins */
+ const unsigned int s0 = (unsigned int)prims[i].size();
+ const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+ const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+ const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+ }
+ }
+
+ /*! bins an array of primitives */
+ template<typename BinBoundsAndCenter>
+ __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+ {
+ if (N == 0) return;
+
+ size_t i;
+ for (i=0; i<N-1; i+=2)
+ {
+ /*! map even and odd primitive to bin */
+ BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0);
+ const vint4 bin0 = (vint4)mapping.bin(center0);
+ BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1);
+ const vint4 bin1 = (vint4)mapping.bin(center1);
+
+ /*! increase bounds for bins for even primitive */
+ const unsigned int s0 = prims[i+0].size();
+ const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+ const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+ const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+
+ /*! increase bounds of bins for odd primitive */
+ const unsigned int s1 = prims[i+1].size();
+ const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1);
+ const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1);
+ const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1);
+ }
+
+ /*! for uneven number of primitives */
+ if (i < N)
+ {
+ /*! map primitive to bin */
+ BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0);
+ const vint4 bin0 = (vint4)mapping.bin(center0);
+
+ /*! increase bounds of bins */
+ const unsigned int s0 = prims[i+0].size();
+ const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+ const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+ const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+ }
+ }
+
+ __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) {
+ bin(prims+begin,end-begin,mapping);
+ }
+
+ template<typename BinBoundsAndCenter>
+ __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) {
+ bin<BinBoundsAndCenter>(prims+begin,end-begin,mapping,binBoundsAndCenter);
+ }
+
+ /*! merges in other binning information */
+ __forceinline void merge (const BinInfoT& other, size_t numBins)
+ {
+
+ for (size_t i=0; i<numBins; i++)
+ {
+ counts(i) += other.counts(i);
+ bounds(i,0).extend(other.bounds(i,0));
+ bounds(i,1).extend(other.bounds(i,1));
+ bounds(i,2).extend(other.bounds(i,2));
+ }
+ }
+
+ /*! reduces binning information */
+ static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS)
+ {
+ BinInfoT c;
+ for (size_t i=0; i<numBins; i++)
+ {
+ c.counts(i) = a.counts(i)+b.counts(i);
+ c.bounds(i,0) = embree::merge(a.bounds(i,0),b.bounds(i,0));
+ c.bounds(i,1) = embree::merge(a.bounds(i,1),b.bounds(i,1));
+ c.bounds(i,2) = embree::merge(a.bounds(i,2),b.bounds(i,2));
+ }
+ return c;
+ }
+
+ /*! finds the best split by scanning binning information */
+ __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const
+ {
+ /* sweep from right to left and compute parallel prefix of merged bounds */
+ vfloat4 rAreas[BINS];
+ vuint4 rCounts[BINS];
+ vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty;
+ for (size_t i=mapping.size()-1; i>0; i--)
+ {
+ count += counts(i);
+ rCounts[i] = count;
+ bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx);
+ by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by);
+ bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz);
+ rAreas[i][3] = 0.0f;
+ }
+ /* sweep from left to right and compute SAH */
+ vuint4 blocks_add = (1 << blocks_shift)-1;
+ vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0;
+ count = 0; bx = empty; by = empty; bz = empty;
+ for (size_t i=1; i<mapping.size(); i++, ii+=1)
+ {
+ count += counts(i-1);
+ bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx);
+ by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by);
+ bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz);
+ const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+ const vfloat4 rArea = rAreas[i];
+ const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions.
+ const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+ const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+ //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+
+ vbestPos = select(sah < vbestSAH,ii ,vbestPos);
+ vbestSAH = select(sah < vbestSAH,sah,vbestSAH);
+ }
+
+ /* find best dimension */
+ float bestSAH = inf;
+ int bestDim = -1;
+ int bestPos = 0;
+ for (int dim=0; dim<3; dim++)
+ {
+ /* ignore zero sized dimensions */
+ if (unlikely(mapping.invalid(dim)))
+ continue;
+
+ /* test if this is a better dimension */
+ if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+ bestDim = dim;
+ bestPos = vbestPos[dim];
+ bestSAH = vbestSAH[dim];
+ }
+ }
+ return Split(bestSAH,bestDim,bestPos,mapping);
+ }
+
+ /*! calculates extended split information */
+ __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const
+ {
+ if (split.dim == -1) {
+ new (&info) SplitInfoT<BBox>(0,empty,0,empty);
+ return;
+ }
+
+ size_t leftCount = 0;
+ BBox leftBounds = empty;
+ for (size_t i=0; i<(size_t)split.pos; i++) {
+ leftCount += counts(i,split.dim);
+ leftBounds.extend(bounds(i,split.dim));
+ }
+ size_t rightCount = 0;
+ BBox rightBounds = empty;
+ for (size_t i=split.pos; i<mapping.size(); i++) {
+ rightCount += counts(i,split.dim);
+ rightBounds.extend(bounds(i,split.dim));
+ }
+ new (&info) SplitInfoT<BBox>(leftCount,leftBounds,rightCount,rightBounds);
+ }
+
+ /*! gets the number of primitives left of the split */
+ __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const
+ {
+ if (unlikely(split.dim == -1)) return -1;
+
+ size_t leftCount = 0;
+ for (size_t i = 0; i < (size_t)split.pos; i++) {
+ leftCount += counts(i, split.dim);
+ }
+ return leftCount;
+ }
+
+ /*! gets the number of primitives right of the split */
+ __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const
+ {
+ if (unlikely(split.dim == -1)) return -1;
+
+ size_t rightCount = 0;
+ for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
+ rightCount += counts(i, split.dim);
+ }
+ return rightCount;
+ }
+
+ private:
+ BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension
+ vuint4 _counts[BINS]; //!< counts number of primitives that map into the bins
+ };
+
+#if defined(__AVX512ER__) // KNL
+
+ /*! mapping into bins */
+ template<>
+ struct BinMapping<16>
+ {
+ public:
+ __forceinline BinMapping() {}
+
+ /*! calculates the mapping */
+ template<typename PrimInfo>
+ __forceinline BinMapping(const PrimInfo& pinfo)
+ {
+ num = 16;
+ const vfloat4 eps = 1E-34f;
+ const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+ scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+ ofs = (vfloat4) pinfo.centBounds.lower;
+ scale16 = scale;
+ ofs16 = ofs;
+ }
+
+ /*! returns number of bins */
+ __forceinline size_t size() const { return num; }
+
+ __forceinline vint16 bin16(const Vec3fa& p) const {
+ return vint16(vint4(floori((vfloat4(p)-ofs)*scale)));
+ }
+
+ __forceinline vint16 bin16(const vfloat16& p) const {
+ return floori((p-ofs16)*scale16);
+ }
+
+ __forceinline int bin_unsafe(const PrimRef& ref,
+ const vint16& vSplitPos,
+ const vbool16& splitDimMask) const // FIXME: rename to isLeft
+ {
+ const vfloat16 lower(*(vfloat4*)&ref.lower);
+ const vfloat16 upper(*(vfloat4*)&ref.upper);
+ const vfloat16 p = lower + upper;
+ const vint16 i = floori((p-ofs16)*scale16);
+ return lt(splitDimMask,i,vSplitPos);
+ }
+
+ /*! returns true if the mapping is invalid in some dimension */
+ __forceinline bool invalid(const size_t dim) const {
+ return scale[dim] == 0.0f;
+ }
+
+ public:
+ size_t num;
+ vfloat4 ofs,scale; //!< linear function that maps to bin ID
+ vfloat16 ofs16,scale16; //!< linear function that maps to bin ID
+ };
+
+ /* 16 bins in-register binner */
+ template<typename PrimRef>
+ struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa>
+ {
+ typedef BinSplit<16> Split;
+ typedef vbool16 vbool;
+ typedef vint16 vint;
+ typedef vfloat16 vfloat;
+
+ __forceinline BinInfoT() {
+ }
+
+ __forceinline BinInfoT(EmptyTy) {
+ clear();
+ }
+
+ /*! clears the bin info */
+ __forceinline void clear()
+ {
+ lower[0] = lower[1] = lower[2] = pos_inf;
+ upper[0] = upper[1] = upper[2] = neg_inf;
+ count[0] = count[1] = count[2] = 0;
+ }
+
+
+ static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x,
+ const vfloat16 min_y,
+ const vfloat16 min_z,
+ const vfloat16 max_x,
+ const vfloat16 max_y,
+ const vfloat16 max_z)
+ {
+ const vfloat16 r_min_x = reverse_prefix_min(min_x);
+ const vfloat16 r_min_y = reverse_prefix_min(min_y);
+ const vfloat16 r_min_z = reverse_prefix_min(min_z);
+ const vfloat16 r_max_x = reverse_prefix_max(max_x);
+ const vfloat16 r_max_y = reverse_prefix_max(max_y);
+ const vfloat16 r_max_z = reverse_prefix_max(max_z);
+ const vfloat16 dx = r_max_x - r_min_x;
+ const vfloat16 dy = r_max_y - r_min_y;
+ const vfloat16 dz = r_max_z - r_min_z;
+ const vfloat16 area_rl = madd(dx,dy,madd(dx,dz,dy*dz));
+ return area_rl;
+ }
+
+ static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x,
+ const vfloat16 min_y,
+ const vfloat16 min_z,
+ const vfloat16 max_x,
+ const vfloat16 max_y,
+ const vfloat16 max_z)
+ {
+ const vfloat16 r_min_x = prefix_min(min_x);
+ const vfloat16 r_min_y = prefix_min(min_y);
+ const vfloat16 r_min_z = prefix_min(min_z);
+ const vfloat16 r_max_x = prefix_max(max_x);
+ const vfloat16 r_max_y = prefix_max(max_y);
+ const vfloat16 r_max_z = prefix_max(max_z);
+ const vfloat16 dx = r_max_x - r_min_x;
+ const vfloat16 dy = r_max_y - r_min_y;
+ const vfloat16 dz = r_max_z - r_min_z;
+ const vfloat16 area_lr = madd(dx,dy,madd(dx,dz,dy*dz));
+ return area_lr;
+ }
+
+
+ /*! bins an array of primitives */
+ __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping)
+ {
+ if (unlikely(N == 0)) return;
+
+ const vfloat16 init_min(pos_inf);
+ const vfloat16 init_max(neg_inf);
+
+ vfloat16 min_x0,min_x1,min_x2;
+ vfloat16 min_y0,min_y1,min_y2;
+ vfloat16 min_z0,min_z1,min_z2;
+ vfloat16 max_x0,max_x1,max_x2;
+ vfloat16 max_y0,max_y1,max_y2;
+ vfloat16 max_z0,max_z1,max_z2;
+ vuint16 count0,count1,count2;
+
+ min_x0 = init_min;
+ min_x1 = init_min;
+ min_x2 = init_min;
+ min_y0 = init_min;
+ min_y1 = init_min;
+ min_y2 = init_min;
+ min_z0 = init_min;
+ min_z1 = init_min;
+ min_z2 = init_min;
+
+ max_x0 = init_max;
+ max_x1 = init_max;
+ max_x2 = init_max;
+ max_y0 = init_max;
+ max_y1 = init_max;
+ max_y2 = init_max;
+ max_z0 = init_max;
+ max_z1 = init_max;
+ max_z2 = init_max;
+
+ count0 = zero;
+ count1 = zero;
+ count2 = zero;
+
+ const vint16 step16(step);
+ size_t i;
+ for (i=0; i<N-1; i+=2)
+ {
+ /*! map even and odd primitive to bin */
+ const BBox3fa primA = prims[i+0].bounds();
+ const vfloat16 centerA = vfloat16((vfloat4)primA.lower) + vfloat16((vfloat4)primA.upper);
+ const vint16 binA = mapping.bin16(centerA);
+
+ const BBox3fa primB = prims[i+1].bounds();
+ const vfloat16 centerB = vfloat16((vfloat4)primB.lower) + vfloat16((vfloat4)primB.upper);
+ const vint16 binB = mapping.bin16(centerB);
+
+ /* A */
+ {
+ const vfloat16 b_min_x = prims[i+0].lower.x;
+ const vfloat16 b_min_y = prims[i+0].lower.y;
+ const vfloat16 b_min_z = prims[i+0].lower.z;
+ const vfloat16 b_max_x = prims[i+0].upper.x;
+ const vfloat16 b_max_y = prims[i+0].upper.y;
+ const vfloat16 b_max_z = prims[i+0].upper.z;
+
+ const vint16 bin0 = shuffle<0>(binA);
+ const vint16 bin1 = shuffle<1>(binA);
+ const vint16 bin2 = shuffle<2>(binA);
+
+ const vbool16 m_update_x = step16 == bin0;
+ const vbool16 m_update_y = step16 == bin1;
+ const vbool16 m_update_z = step16 == bin2;
+
+ assert(popcnt((size_t)m_update_x) == 1);
+ assert(popcnt((size_t)m_update_y) == 1);
+ assert(popcnt((size_t)m_update_z) == 1);
+
+ min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+ min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+ min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+ max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+ max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+ // ------------------------------------------------------------------------
+ min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+ min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+ min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+ max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+ max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+ // ------------------------------------------------------------------------
+ min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+ min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+ min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+ max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+ max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+ // ------------------------------------------------------------------------
+ count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+ count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+ count2 = mask_add(m_update_z,count2,count2,vuint16(1));
+ }
+
+
+ /* B */
+ {
+ const vfloat16 b_min_x = prims[i+1].lower.x;
+ const vfloat16 b_min_y = prims[i+1].lower.y;
+ const vfloat16 b_min_z = prims[i+1].lower.z;
+ const vfloat16 b_max_x = prims[i+1].upper.x;
+ const vfloat16 b_max_y = prims[i+1].upper.y;
+ const vfloat16 b_max_z = prims[i+1].upper.z;
+
+ const vint16 bin0 = shuffle<0>(binB);
+ const vint16 bin1 = shuffle<1>(binB);
+ const vint16 bin2 = shuffle<2>(binB);
+
+ const vbool16 m_update_x = step16 == bin0;
+ const vbool16 m_update_y = step16 == bin1;
+ const vbool16 m_update_z = step16 == bin2;
+
+ assert(popcnt((size_t)m_update_x) == 1);
+ assert(popcnt((size_t)m_update_y) == 1);
+ assert(popcnt((size_t)m_update_z) == 1);
+
+ min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+ min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+ min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+ max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+ max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+ // ------------------------------------------------------------------------
+ min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+ min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+ min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+ max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+ max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+ // ------------------------------------------------------------------------
+ min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+ min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+ min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+ max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+ max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+ // ------------------------------------------------------------------------
+ count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+ count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+ count2 = mask_add(m_update_z,count2,count2,vuint16(1));
+ }
+
+ }
+
+ if (i < N)
+ {
+ const BBox3fa prim0 = prims[i].bounds();
+ const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper);
+ const vint16 bin = mapping.bin16(center0);
+
+ const vfloat16 b_min_x = prims[i].lower.x;
+ const vfloat16 b_min_y = prims[i].lower.y;
+ const vfloat16 b_min_z = prims[i].lower.z;
+ const vfloat16 b_max_x = prims[i].upper.x;
+ const vfloat16 b_max_y = prims[i].upper.y;
+ const vfloat16 b_max_z = prims[i].upper.z;
+
+ const vint16 bin0 = shuffle<0>(bin);
+ const vint16 bin1 = shuffle<1>(bin);
+ const vint16 bin2 = shuffle<2>(bin);
+
+ const vbool16 m_update_x = step16 == bin0;
+ const vbool16 m_update_y = step16 == bin1;
+ const vbool16 m_update_z = step16 == bin2;
+
+ assert(popcnt((size_t)m_update_x) == 1);
+ assert(popcnt((size_t)m_update_y) == 1);
+ assert(popcnt((size_t)m_update_z) == 1);
+
+ min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+ min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+ min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+ max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+ max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+ // ------------------------------------------------------------------------
+ min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+ min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+ min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+ max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+ max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+ // ------------------------------------------------------------------------
+ min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+ min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+ min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+ // ------------------------------------------------------------------------
+ max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+ max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+ max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+ // ------------------------------------------------------------------------
+ count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+ count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+ count2 = mask_add(m_update_z,count2,count2,vuint16(1));
+ }
+
+ lower[0] = Vec3vf16( min_x0, min_y0, min_z0 );
+ lower[1] = Vec3vf16( min_x1, min_y1, min_z1 );
+ lower[2] = Vec3vf16( min_x2, min_y2, min_z2 );
+
+ upper[0] = Vec3vf16( max_x0, max_y0, max_z0 );
+ upper[1] = Vec3vf16( max_x1, max_y1, max_z1 );
+ upper[2] = Vec3vf16( max_x2, max_y2, max_z2 );
+
+ count[0] = count0;
+ count[1] = count1;
+ count[2] = count2;
+ }
+
+ __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) {
+ bin(prims+begin,end-begin,mapping);
+ }
+
+ /*! merges in other binning information */
+ __forceinline void merge (const BinInfoT& other, size_t numBins)
+ {
+ for (size_t i=0; i<3; i++)
+ {
+ lower[i] = min(lower[i],other.lower[i]);
+ upper[i] = max(upper[i],other.upper[i]);
+ count[i] += other.count[i];
+ }
+ }
+
+ /*! reducesr binning information */
+ static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b)
+ {
+ BinInfoT c;
+ for (size_t i=0; i<3; i++)
+ {
+ c.counts[i] = a.counts[i] + b.counts[i];
+ c.lower[i] = min(a.lower[i],b.lower[i]);
+ c.upper[i] = max(a.upper[i],b.upper[i]);
+ }
+ return c;
+ }
+
+ /*! finds the best split by scanning binning information */
+ __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const
+ {
+ /* find best dimension */
+ float bestSAH = inf;
+ int bestDim = -1;
+ int bestPos = 0;
+ const vuint16 blocks_add = (1 << blocks_shift)-1;
+ const vfloat16 inf(pos_inf);
+ for (size_t dim=0; dim<3; dim++)
+ {
+ /* ignore zero sized dimensions */
+ if (unlikely(mapping.invalid(dim)))
+ continue;
+
+ const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+ const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+ const vuint16 lCount16 = prefix_sum(count[dim]);
+ const vuint16 rCount16 = reverse_prefix_sum(count[dim]);
+
+ /* compute best split in this dimension */
+ const vfloat16 leftArea = lArea16;
+ const vfloat16 rightArea = align_shift_right<1>(zero,rArea16);
+ const vuint16 lC = lCount16;
+ const vuint16 rC = align_shift_right<1>(zero,rCount16);
+ const vuint16 leftCount = ( lC + blocks_add) >> blocks_shift;
+ const vuint16 rightCount = ( rC + blocks_add) >> blocks_shift;
+ const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries
+ const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf));
+ /* test if this is a better dimension */
+ if (any(sah < vfloat16(bestSAH)))
+ {
+ const size_t index = select_min(sah);
+ assert(index < 15);
+ assert(sah[index] < bestSAH);
+ bestDim = dim;
+ bestPos = index+1;
+ bestSAH = sah[index];
+ }
+ }
+
+ return Split(bestSAH,bestDim,bestPos,mapping);
+
+ }
+
+ /*! calculates extended split information */
+ __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const
+ {
+ if (split.dim == -1) {
+ new (&info) SplitInfo(0,empty,0,empty);
+ return;
+ }
+ // FIXME: horizontal reduction!
+
+ size_t leftCount = 0;
+ BBox3fa leftBounds = empty;
+ for (size_t i=0; i<(size_t)split.pos; i++) {
+ leftCount += count[split.dim][i];
+ Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
+ Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
+ leftBounds.extend(BBox3fa(bounds_lower,bounds_upper));
+ }
+ size_t rightCount = 0;
+ BBox3fa rightBounds = empty;
+ for (size_t i=split.pos; i<mapping.size(); i++) {
+ rightCount += count[split.dim][i];
+ Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]);
+ Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]);
+ rightBounds.extend(BBox3fa(bounds_lower,bounds_upper));
+ }
+ new (&info) SplitInfo(leftCount,leftBounds,rightCount,rightBounds);
+ }
+
+ /*! gets the number of primitives left of the split */
+ __forceinline size_t getLeftCount(const BinMapping<16>& mapping, const Split& split) const
+ {
+ if (unlikely(split.dim == -1)) return -1;
+
+ size_t leftCount = 0;
+ for (size_t i = 0; i < (size_t)split.pos; i++) {
+ leftCount += count[split.dim][i];
+ }
+ return leftCount;
+ }
+
+ /*! gets the number of primitives right of the split */
+ __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const
+ {
+ if (unlikely(split.dim == -1)) return -1;
+
+ size_t rightCount = 0;
+ for (size_t i = (size_t)split.pos; i<mapping.size(); i++) {
+ rightCount += count[split.dim][i];
+ }
+ return rightCount;
+ }
+
+ private:
+ Vec3vf16 lower[3];
+ Vec3vf16 upper[3];
+ vuint16 count[3];
+ };
+#endif
+ }
+
+ template<typename BinInfoT, typename BinMapping, typename PrimRef>
+ __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping)
+ {
+ if (likely(end-begin < parallelThreshold)) {
+ binner.bin(prims,begin,end,mapping);
+ } else {
+ binner = parallel_reduce(begin,end,blockSize,binner,
+ [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
+ [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+ }
+ }
+
+ template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+ __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+ {
+ if (likely(end-begin < parallelThreshold)) {
+ binner.bin(prims,begin,end,mapping,binBoundsAndCenter);
+ } else {
+ binner = parallel_reduce(begin,end,blockSize,binner,
+ [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; },
+ [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+ }
+ }
+
+ template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef>
+ __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping)
+ {
+ if (!parallel) {
+ binner.bin(prims,begin,end,mapping);
+ } else {
+ binner = parallel_reduce(begin,end,blockSize,binner,
+ [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; },
+ [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+ }
+ }
+
+ template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+ __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+ {
+ if (!parallel) {
+ binner.bin(prims,begin,end,mapping,binBoundsAndCenter);
+ } else {
+ binner = parallel_reduce(begin,end,blockSize,binner,
+ [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; },
+ [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; });
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
new file mode 100644
index 0000000000..a4c272f015
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
@@ -0,0 +1,205 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t>
+ {
+ __forceinline PrimInfoRange () {
+ }
+
+ __forceinline PrimInfoRange(const PrimInfo& pinfo)
+ : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {}
+
+ __forceinline PrimInfoRange(EmptyTy)
+ : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {}
+
+ __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds)
+ : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {}
+
+ __forceinline float leafSAH() const {
+ return expectedApproxHalfArea(geomBounds)*float(size());
+ }
+
+ __forceinline float leafSAH(size_t block_shift) const {
+ return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+ }
+ };
+
+ /*! Performs standard object binning */
+ template<typename PrimRef, size_t BINS>
+ struct HeuristicArrayBinningSAH
+ {
+ typedef BinSplit<BINS> Split;
+ typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+ typedef range<size_t> Set;
+
+#if defined(__AVX512ER__) // KNL
+ static const size_t PARALLEL_THRESHOLD = 4*768;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 768;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 768;
+#else
+ static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#endif
+ __forceinline HeuristicArrayBinningSAH ()
+ : prims(nullptr) {}
+
+ /*! remember prim array */
+ __forceinline HeuristicArrayBinningSAH (PrimRef* prims)
+ : prims(prims) {}
+
+ /*! finds the best split */
+ __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize)
+ {
+ if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+ return find_template<false>(pinfo,logBlockSize);
+ else
+ return find_template<true>(pinfo,logBlockSize);
+ }
+
+ template<bool parallel>
+ __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize)
+ {
+ Binner binner(empty);
+ const BinMapping<BINS> mapping(pinfo);
+ bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping);
+ return binner.best(mapping,logBlockSize);
+ }
+
+ /*! array partitioning */
+ __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+ {
+ if (likely(pinfo.size() < PARALLEL_THRESHOLD))
+ split_template<false>(split,pinfo,linfo,rinfo);
+ else
+ split_template<true>(split,pinfo,linfo,rinfo);
+ }
+
+ template<bool parallel>
+ __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset)
+ {
+ if (!split.valid()) {
+ deterministic_order(set);
+ return splitFallback(set,lset,rset);
+ }
+
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ CentGeomBBox3fa local_left(empty);
+ CentGeomBBox3fa local_right(empty);
+ const unsigned int splitPos = split.pos;
+ const unsigned int splitDim = split.dim;
+ const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+ const typename Binner::vint vSplitPos(splitPos);
+ const typename Binner::vbool vSplitMask(splitDimMask);
+ auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+ size_t center = 0;
+ if (!parallel)
+ center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft,
+ [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+ else
+ center = parallel_partitioning(
+ prims,begin,end,EmptyTy(),local_left,local_right,isLeft,
+ [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+ [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+ PARALLEL_PARTITION_BLOCK_SIZE);
+
+ new (&lset) PrimInfoRange(begin,center,local_left);
+ new (&rset) PrimInfoRange(center,end,local_right);
+ assert(area(lset.geomBounds) >= 0.0f);
+ assert(area(rset.geomBounds) >= 0.0f);
+ }
+
+ void deterministic_order(const PrimInfoRange& pinfo)
+ {
+ /* required as parallel partition destroys original primitive order */
+ std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]);
+ }
+
+ void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+ {
+ const size_t begin = pinfo.begin();
+ const size_t end = pinfo.end();
+ const size_t center = (begin + end)/2;
+
+ CentGeomBBox3fa left(empty);
+ for (size_t i=begin; i<center; i++)
+ left.extend_center2(prims[i]);
+ new (&linfo) PrimInfoRange(begin,center,left);
+
+ CentGeomBBox3fa right(empty);
+ for (size_t i=center; i<end; i++)
+ right.extend_center2(prims[i]);
+ new (&rinfo) PrimInfoRange(center,end,right);
+ }
+
+ void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+ {
+ assert(range.size() > 1);
+ CentGeomBBox3fa left(empty);
+ CentGeomBBox3fa right(empty);
+ unsigned int geomID = prims[range.begin()].geomID();
+ size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right,
+ [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; },
+ [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); });
+
+ new (&linfo) PrimInfoRange(range.begin(),center,left);
+ new (&rinfo) PrimInfoRange(center,range.end(),right);
+ }
+
+ private:
+ PrimRef* const prims;
+ };
+
+ /*! Performs standard object binning */
+ template<typename PrimRefMB, size_t BINS>
+ struct HeuristicArrayBinningMB
+ {
+ typedef BinSplit<BINS> Split;
+ typedef typename PrimRefMB::BBox BBox;
+ typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+ static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+ /*! finds the best split */
+ const Split find(const SetMB& set, const size_t logBlockSize)
+ {
+ ObjectBinner binner(empty);
+ const BinMapping<BINS> mapping(set.size(),set.centBounds);
+ bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping);
+ Split osplit = binner.best(mapping,logBlockSize);
+ osplit.sah *= set.time_range.size();
+ if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+ return osplit;
+ }
+
+ /*! array partitioning */
+ __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfoMB left = empty;
+ PrimInfoMB right = empty;
+ const vint4 vSplitPos(split.pos);
+ const vbool4 vSplitMask(1 << split.dim);
+ auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); };
+ auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+ auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+ size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+ new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range);
+ new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
new file mode 100644
index 0000000000..1370244586
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
@@ -0,0 +1,302 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Performs standard object binning */
+ template<typename PrimRef, size_t BINS>
+ struct UnalignedHeuristicArrayBinningSAH
+ {
+ typedef BinSplit<BINS> Split;
+ typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+ typedef range<size_t> Set;
+
+ __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required?
+ : scene(nullptr), prims(nullptr) {}
+
+ /*! remember prim array */
+ __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims)
+ : scene(scene), prims(prims) {}
+
+ const LinearSpace3fa computeAlignedSpace(const range<size_t>& set)
+ {
+ Vec3fa axis(0,0,1);
+ uint64_t bestGeomPrimID = -1;
+
+ /*! find curve with minimum ID that defines valid direction */
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ const unsigned int geomID = prims[i].geomID();
+ const unsigned int primID = prims[i].primID();
+ const uint64_t geomprimID = prims[i].ID64();
+ if (geomprimID >= bestGeomPrimID) continue;
+ const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID);
+ if (sqr_length(axis1) > 1E-18f) {
+ axis = normalize(axis1);
+ bestGeomPrimID = geomprimID;
+ }
+ }
+ return frame(axis).transposed();
+ }
+
+ const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space)
+ {
+ auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+ {
+ CentGeomBBox3fa bounds(empty);
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ Geometry* mesh = scene->get(prims[i].geomID());
+ bounds.extend(mesh->vbounds(space,prims[i].primID()));
+ }
+ return bounds;
+ };
+
+ const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096),
+ CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+ return PrimInfo(set.begin(),set.end(),bounds);
+ }
+
+ struct BinBoundsAndCenter
+ {
+ __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space)
+ : scene(scene), space(space) {}
+
+ /*! returns center for binning */
+ __forceinline Vec3fa binCenter(const PrimRef& ref) const
+ {
+ Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+ BBox3fa bounds = mesh->vbounds(space,ref.primID());
+ return embree::center2(bounds);
+ }
+
+ /*! returns bounds and centroid used for binning */
+ __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const
+ {
+ Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+ BBox3fa bounds = mesh->vbounds(space,ref.primID());
+ bounds_o = bounds;
+ center_o = embree::center2(bounds);
+ }
+
+ private:
+ Scene* scene;
+ const LinearSpace3fa space;
+ };
+
+ /*! finds the best split */
+ __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space)
+ {
+ if (likely(pinfo.size() < 10000))
+ return find_template<false>(pinfo,logBlockSize,space);
+ else
+ return find_template<true>(pinfo,logBlockSize,space);
+ }
+
+ /*! finds the best split */
+ template<bool parallel>
+ const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space)
+ {
+ Binner binner(empty);
+ const BinMapping<BINS> mapping(set);
+ BinBoundsAndCenter binBoundsAndCenter(scene,space);
+ bin_serial_or_parallel<parallel>(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter);
+ return binner.best(mapping,logBlockSize);
+ }
+
+ /*! array partitioning */
+ __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+ {
+ if (likely(set.size() < 10000))
+ split_template<false>(split,space,set,lset,rset);
+ else
+ split_template<true>(split,space,set,lset,rset);
+ }
+
+ /*! array partitioning */
+ template<bool parallel>
+ __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+ {
+ if (!split.valid()) {
+ deterministic_order(set);
+ return splitFallback(set,lset,rset);
+ }
+
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ CentGeomBBox3fa local_left(empty);
+ CentGeomBBox3fa local_right(empty);
+ const int splitPos = split.pos;
+ const int splitDim = split.dim;
+ BinBoundsAndCenter binBoundsAndCenter(scene,space);
+
+ size_t center = 0;
+ if (likely(set.size() < 10000))
+ center = serial_partitioning(prims,begin,end,local_left,local_right,
+ [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+ [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+ else
+ center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right,
+ [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+ [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+ [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+ 128);
+
+ new (&lset) PrimInfoRange(begin,center,local_left);
+ new (&rset) PrimInfoRange(center,end,local_right);
+ assert(area(lset.geomBounds) >= 0.0f);
+ assert(area(rset.geomBounds) >= 0.0f);
+ }
+
+ void deterministic_order(const range<size_t>& set)
+ {
+ /* required as parallel partition destroys original primitive order */
+ std::sort(&prims[set.begin()],&prims[set.end()]);
+ }
+
+ void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ const size_t center = (begin + end)/2;
+
+ CentGeomBBox3fa left(empty);
+ for (size_t i=begin; i<center; i++)
+ left.extend_center2(prims[i]);
+ new (&lset) PrimInfoRange(begin,center,left);
+
+ CentGeomBBox3fa right(empty);
+ for (size_t i=center; i<end; i++)
+ right.extend_center2(prims[i]);
+ new (&rset) PrimInfoRange(center,end,right);
+ }
+
+ private:
+ Scene* const scene;
+ PrimRef* const prims;
+ };
+
+ /*! Performs standard object binning */
+ template<typename PrimRefMB, size_t BINS>
+ struct UnalignedHeuristicArrayBinningMB
+ {
+ typedef BinSplit<BINS> Split;
+ typedef typename PrimRefMB::BBox BBox;
+ typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+
+ static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+ UnalignedHeuristicArrayBinningMB(Scene* scene)
+ : scene(scene) {}
+
+ const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set)
+ {
+ Vec3fa axis0(0,0,1);
+ uint64_t bestGeomPrimID = -1;
+
+ /*! find curve with minimum ID that defines valid direction */
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ const PrimRefMB& prim = (*set.prims)[i];
+ const unsigned int geomID = prim.geomID();
+ const unsigned int primID = prim.primID();
+ const uint64_t geomprimID = prim.ID64();
+ if (geomprimID >= bestGeomPrimID) continue;
+
+ const Geometry* mesh = scene->get(geomID);
+ const range<int> tbounds = mesh->timeSegmentRange(set.time_range);
+ if (tbounds.size() == 0) continue;
+
+ const size_t t = (tbounds.begin()+tbounds.end())/2;
+ const Vec3fa axis1 = mesh->computeDirection(primID,t);
+ if (sqr_length(axis1) > 1E-18f) {
+ axis0 = normalize(axis1);
+ bestGeomPrimID = geomprimID;
+ }
+ }
+
+ return frame(axis0).transposed();
+ }
+
+ struct BinBoundsAndCenter
+ {
+ __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space)
+ : scene(scene), time_range(time_range), space(space) {}
+
+ /*! returns center for binning */
+ template<typename PrimRef>
+ __forceinline Vec3fa binCenter(const PrimRef& ref) const
+ {
+ Geometry* mesh = scene->get(ref.geomID());
+ LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+ return center2(lbounds.interpolate(0.5f));
+ }
+
+ /*! returns bounds and centroid used for binning */
+ __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+ {
+ Geometry* mesh = scene->get(ref.geomID());
+ LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+ bounds_o = lbounds.interpolate(0.5f);
+ center_o = center2(bounds_o);
+ }
+
+ /*! returns bounds and centroid used for binning */
+ __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+ {
+ Geometry* mesh = scene->get(ref.geomID());
+ LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+ bounds_o = lbounds;
+ center_o = center2(lbounds.interpolate(0.5f));
+ }
+
+ private:
+ Scene* scene;
+ BBox1f time_range;
+ const LinearSpace3fa space;
+ };
+
+ /*! finds the best split */
+ const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space)
+ {
+ BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+ ObjectBinner binner(empty);
+ const BinMapping<BINS> mapping(set.size(),set.centBounds);
+ bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter);
+ Split osplit = binner.best(mapping,logBlockSize);
+ osplit.sah *= set.time_range.size();
+ if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+ return osplit;
+ }
+
+ /*! array partitioning */
+ __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfoMB left = empty;
+ PrimInfoMB right = empty;
+ const vint4 vSplitPos(split.pos);
+ const vbool4 vSplitMask(1 << split.dim);
+ auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); };
+ auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+ auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+ size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+ new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range);
+ new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range);
+ }
+
+ private:
+ Scene* scene;
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
new file mode 100644
index 0000000000..21f18c0208
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
@@ -0,0 +1,443 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+// TODO:
+// - adjust parallel build thresholds
+// - openNodesBasedOnExtend should consider max extended size
+
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+/* stop opening of all bref.geomIDs are the same */
+#define EQUAL_GEOMID_STOP_CRITERIA 1
+
+/* 10% spatial extend threshold */
+#define MAX_EXTEND_THRESHOLD 0.1f
+
+/* maximum is 8 children */
+#define MAX_OPENED_CHILD_NODES 8
+
+/* open until all build refs are below threshold size in one step */
+#define USE_LOOP_OPENING 0
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Performs standard object binning */
+ template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS>
+ struct HeuristicArrayOpenMergeSAH
+ {
+ typedef BinSplit<OBJECT_BINS> Split;
+ typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner;
+
+ static const size_t PARALLEL_THRESHOLD = 1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 512;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+ static const size_t MOVE_STEP_SIZE = 64;
+ static const size_t CREATE_SPLITS_STEP_SIZE = 128;
+
+ __forceinline HeuristicArrayOpenMergeSAH ()
+ : prims0(nullptr) {}
+
+ /*! remember prim array */
+ __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size)
+ : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size)
+ {
+ assert(max_open_size <= MAX_OPENED_CHILD_NODES);
+ }
+
+ struct OpenHeuristic
+ {
+ __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo )
+ {
+ const Vec3fa diag = pinfo.geomBounds.size();
+ dim = maxDim(diag);
+ assert(diag[dim] > 0.0f);
+ inv_max_extend = 1.0f / diag[dim];
+ }
+
+ __forceinline bool operator () ( PrimRef& prim ) const {
+ return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD;
+ }
+
+ private:
+ size_t dim;
+ float inv_max_extend;
+ };
+
+ /*! compute extended ranges */
+ __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+ {
+ assert(set.ext_range_size() > 0);
+ const float left_factor = (float)lweight / (lweight + rweight);
+ const size_t ext_range_size = set.ext_range_size();
+ const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size);
+ const size_t right_ext_range_size = ext_range_size - left_ext_range_size;
+ lset.set_ext_range(lset.end() + left_ext_range_size);
+ rset.set_ext_range(rset.end() + right_ext_range_size);
+ }
+
+ /*! move ranges */
+ __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t left_ext_range_size = lset.ext_range_size();
+ const size_t right_size = rset.size();
+
+ /* has the left child an extended range? */
+ if (left_ext_range_size > 0)
+ {
+ /* left extended range smaller than right range ? */
+ if (left_ext_range_size < right_size)
+ {
+ /* only move a small part of the beginning of the right range to the end */
+ parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ prims0[i+right_size] = prims0[i];
+ });
+ }
+ else
+ {
+ /* no overlap, move entire right range to new location, can be made fully parallel */
+ parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ prims0[i+left_ext_range_size] = prims0[i];
+ });
+ }
+ /* update right range */
+ assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+ rset.move_right(left_ext_range_size);
+ }
+ }
+
+ /* estimates the extra space required when opening, and checks if all primitives are from same geometry */
+ __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set)
+ {
+ const OpenHeuristic heuristic(set);
+ const unsigned int geomID = prims0[set.begin()].geomID();
+
+ auto body = [&] (const range<size_t>& r) -> std::pair<size_t,bool> {
+ bool commonGeomID = true;
+ size_t opens = 0;
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ commonGeomID &= prims0[i].geomID() == geomID;
+ if (heuristic(prims0[i]))
+ opens += prims0[i].node.getN()-1; // coarse approximation
+ }
+ return std::pair<size_t,bool>(opens,commonGeomID);
+ };
+ auto reduction = [&] (const std::pair<size_t,bool>& b0, const std::pair<size_t,bool>& b1) -> std::pair<size_t,bool> {
+ return std::pair<size_t,bool>(b0.first+b1.first,b0.second && b1.second);
+ };
+ return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair<size_t,bool>(0,true),body,reduction);
+ }
+
+ // FIXME: should consider maximum available extended size
+ __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set)
+ {
+ const OpenHeuristic heuristic(set);
+ const size_t ext_range_start = set.end();
+
+ if (false && set.size() < PARALLEL_THRESHOLD)
+ {
+ size_t extra_elements = 0;
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ if (heuristic(prims0[i]))
+ {
+ PrimRef tmp[MAX_OPENED_CHILD_NODES];
+ const size_t n = nodeOpenerFunc(prims0[i],tmp);
+ assert(extra_elements + n-1 <= set.ext_range_size());
+ for (size_t j=0; j<n; j++)
+ set.extend_center2(tmp[j]);
+
+ prims0[i] = tmp[0];
+ for (size_t j=1; j<n; j++)
+ prims0[ext_range_start+extra_elements+j-1] = tmp[j];
+ extra_elements += n-1;
+ }
+ }
+ set._end += extra_elements;
+ }
+ else
+ {
+ std::atomic<size_t> ext_elements;
+ ext_elements.store(0);
+ PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo {
+ PrimInfo info(empty);
+ for (size_t i=r.begin(); i<r.end(); i++)
+ if (heuristic(prims0[i]))
+ {
+ PrimRef tmp[MAX_OPENED_CHILD_NODES];
+ const size_t n = nodeOpenerFunc(prims0[i],tmp);
+ const size_t ID = ext_elements.fetch_add(n-1);
+ assert(ID + n-1 <= set.ext_range_size());
+
+ for (size_t j=0; j<n; j++)
+ info.extend_center2(tmp[j]);
+
+ prims0[i] = tmp[0];
+ for (size_t j=1; j<n; j++)
+ prims0[ext_range_start+ID+j-1] = tmp[j];
+ }
+ return info;
+ }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+ set.centBounds.extend(info.centBounds);
+ assert(ext_elements.load() <= set.ext_range_size());
+ set._end += ext_elements.load();
+ }
+ }
+
+ __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements)
+ {
+ const OpenHeuristic heuristic(set);
+ size_t next_iteration_extra_elements = est_new_elements;
+
+ while (next_iteration_extra_elements <= set.ext_range_size())
+ {
+ next_iteration_extra_elements = 0;
+ size_t extra_elements = 0;
+ const size_t ext_range_start = set.end();
+
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ if (heuristic(prims0[i]))
+ {
+ PrimRef tmp[MAX_OPENED_CHILD_NODES];
+ const size_t n = nodeOpenerFunc(prims0[i],tmp);
+ assert(extra_elements + n-1 <= set.ext_range_size());
+ for (size_t j=0;j<n;j++)
+ set.extend_center2(tmp[j]);
+
+ prims0[i] = tmp[0];
+ for (size_t j=1;j<n;j++)
+ prims0[ext_range_start+extra_elements+j-1] = tmp[j];
+ extra_elements += n-1;
+
+ for (size_t j=0; j<n; j++)
+ if (heuristic(tmp[j]))
+ next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation
+
+ }
+ }
+ assert( extra_elements <= set.ext_range_size());
+ set._end += extra_elements;
+
+ for (size_t i=set.begin();i<set.end();i++)
+ assert(prims0[i].numPrimitives() > 0);
+
+ if (unlikely(next_iteration_extra_elements == 0)) break;
+ }
+ }
+
+ __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize)
+ {
+ /* single element */
+ if (set.size() <= 1)
+ return Split();
+
+ /* disable opening if there is no overlap */
+ const size_t D = 4;
+ if (unlikely(set.has_ext_range() && set.size() <= D))
+ {
+ bool disjoint = true;
+ for (size_t j=set.begin(); j<set.end()-1; j++) {
+ for (size_t i=set.begin()+1; i<set.end(); i++) {
+ if (conjoint(prims0[j].bounds(),prims0[i].bounds())) {
+ disjoint = false; break;
+ }
+ }
+ }
+ if (disjoint) set.set_ext_range(set.end()); /* disables opening */
+ }
+
+ std::pair<size_t,bool> p(0,false);
+
+ /* disable opening when all primitives are from same geometry */
+ if (unlikely(set.has_ext_range()))
+ {
+ p = getProperties(set);
+#if EQUAL_GEOMID_STOP_CRITERIA == 1
+ if (p.second) set.set_ext_range(set.end()); /* disable opening */
+#endif
+ }
+
+ /* open nodes when we have sufficient space available */
+ if (unlikely(set.has_ext_range()))
+ {
+#if USE_LOOP_OPENING == 1
+ openNodesBasedOnExtendLoop(set,p.first);
+#else
+ if (p.first <= set.ext_range_size())
+ openNodesBasedOnExtend(set);
+#endif
+
+ /* disable opening when unsufficient space for opening a node available */
+ if (set.ext_range_size() < max_open_size-1)
+ set.set_ext_range(set.end()); /* disable opening */
+ }
+
+ /* find best split */
+ return object_find(set,logBlockSize);
+ }
+
+
+ /*! finds the best object split */
+ __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize)
+ {
+ if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize);
+ else return parallel_object_find (set,logBlockSize);
+ }
+
+ /*! finds the best object split */
+ __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+ {
+ Binner binner(empty);
+ const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+ binner.bin(prims0,set.begin(),set.end(),mapping);
+ return binner.best(mapping,logBlockSize);
+ }
+
+ /*! finds the best split */
+ __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+ {
+ Binner binner(empty);
+ const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+ const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+ auto body = [&] (const range<size_t>& r) -> Binner {
+ Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner;
+ };
+ auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner {
+ Binner r = b0; r.merge(b1,_mapping.size()); return r;
+ };
+ binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction);
+ return binner.best(mapping,logBlockSize);
+ }
+
+ /*! array partitioning */
+ __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ PrimInfoExtRange set = set_i;
+
+ /* valid split */
+ if (unlikely(!split.valid())) {
+ deterministic_order(set);
+ splitFallback(set,lset,rset);
+ return;
+ }
+
+ std::pair<size_t,size_t> ext_weights(0,0);
+
+ /* object split */
+ if (likely(set.size() < PARALLEL_THRESHOLD))
+ ext_weights = sequential_object_split(split,set,lset,rset);
+ else
+ ext_weights = parallel_object_split(split,set,lset,rset);
+
+ /* if we have an extended range, set extended child ranges and move right split range */
+ if (unlikely(set.has_ext_range()))
+ {
+ setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+ moveExtentedRange(set,lset,rset);
+ }
+ }
+
+ /*! array partitioning */
+ std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfo local_left(empty);
+ PrimInfo local_right(empty);
+ const unsigned int splitPos = split.pos;
+ const unsigned int splitDim = split.dim;
+ const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+ const vint4 vSplitPos(splitPos);
+ const vbool4 vSplitMask( (int)splitDimMask );
+
+ size_t center = serial_partitioning(prims0,
+ begin,end,local_left,local_right,
+ [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); },
+ [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); });
+
+ new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+ new (&rset) PrimInfoExtRange(center,end,end,local_right);
+ assert(area(lset.geomBounds) >= 0.0f);
+ assert(area(rset.geomBounds) >= 0.0f);
+ return std::pair<size_t,size_t>(local_left.size(),local_right.size());
+ }
+
+ /*! array partitioning */
+ __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfo left(empty);
+ PrimInfo right(empty);
+ const unsigned int splitPos = split.pos;
+ const unsigned int splitDim = split.dim;
+ const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+ const vint4 vSplitPos(splitPos);
+ const vbool4 vSplitMask( (int)splitDimMask );
+ auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+ const size_t center = parallel_partitioning(
+ prims0,begin,end,EmptyTy(),left,right,isLeft,
+ [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); },
+ [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); },
+ PARALLEL_PARTITION_BLOCK_SIZE);
+
+ new (&lset) PrimInfoExtRange(begin,center,center,left);
+ new (&rset) PrimInfoExtRange(center,end,end,right);
+ assert(area(lset.geomBounds) >= 0.0f);
+ assert(area(rset.geomBounds) >= 0.0f);
+
+ return std::pair<size_t,size_t>(left.size(),right.size());
+ }
+
+ void deterministic_order(const extended_range<size_t>& set)
+ {
+ /* required as parallel partition destroys original primitive order */
+ std::sort(&prims0[set.begin()],&prims0[set.end()]);
+ }
+
+ __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ const size_t center = (begin + end)/2;
+
+ PrimInfo left(empty);
+ for (size_t i=begin; i<center; i++)
+ left.add_center2(prims0[i]);
+
+ const size_t lweight = left.end;
+
+ PrimInfo right(empty);
+ for (size_t i=center; i<end; i++)
+ right.add_center2(prims0[i]);
+
+ const size_t rweight = right.end;
+ new (&lset) PrimInfoExtRange(begin,center,center,left);
+ new (&rset) PrimInfoExtRange(center,end,end,right);
+
+ /* if we have an extended range */
+ if (set.has_ext_range())
+ {
+ setExtentedRanges(set,lset,rset,lweight,rweight);
+ moveExtentedRange(set,lset,rset);
+ }
+ }
+
+ private:
+ PrimRef* const prims0;
+ const NodeOpenerFunc& nodeOpenerFunc;
+ size_t max_open_size;
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
new file mode 100644
index 0000000000..d8ca6cb92c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
@@ -0,0 +1,414 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "priminfo.h"
+
+namespace embree
+{
+ static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5;
+
+ namespace isa
+ {
+
+ /*! mapping into bins */
+ template<size_t BINS>
+ struct SpatialBinMapping
+ {
+ public:
+ __forceinline SpatialBinMapping() {}
+
+ /*! calculates the mapping */
+ __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo)
+ {
+ const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower;
+ const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper;
+ const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper));
+ const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size());
+ scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag);
+ ofs = (vfloat4) pinfo.geomBounds.lower;
+ inv_scale = 1.0f / scale;
+ }
+
+ /*! slower but safe binning */
+ __forceinline vint4 bin(const Vec3fa& p) const
+ {
+ const vint4 i = floori((vfloat4(p)-ofs)*scale);
+ return clamp(i,vint4(0),vint4(BINS-1));
+ }
+
+ __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const
+ {
+#if defined(__AVX__)
+ const vfloat8 ofs8(ofs);
+ const vfloat8 scale8(scale);
+ const vint8 lu = floori((vfloat8::loadu(&b)-ofs8)*scale8);
+ const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1));
+ return std::pair<vint4,vint4>(extract4<0>(c_lu),extract4<1>(c_lu));
+#else
+ const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale);
+ const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale);
+ const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1));
+ const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1));
+ return std::pair<vint4,vint4>(c_lower,c_upper);
+#endif
+ }
+
+
+ /*! calculates left spatial position of bin */
+ __forceinline float pos(const size_t bin, const size_t dim) const {
+ return madd(float(bin),inv_scale[dim],ofs[dim]);
+ }
+
+ /*! calculates left spatial position of bin */
+ template<size_t N>
+ __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const {
+ return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim]));
+ }
+
+ /*! returns true if the mapping is invalid in some dimension */
+ __forceinline bool invalid(const size_t dim) const {
+ return scale[dim] == 0.0f;
+ }
+
+ public:
+ vfloat4 ofs,scale,inv_scale; //!< linear function that maps to bin ID
+ };
+
+ /*! stores all information required to perform some split */
+ template<size_t BINS>
+ struct SpatialBinSplit
+ {
+ /*! construct an invalid split by default */
+ __forceinline SpatialBinSplit()
+ : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {}
+
+ /*! constructs specified split */
+ __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping)
+ : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {}
+
+ /*! constructs specified split */
+ __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping)
+ : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {}
+
+ /*! tests if this split is valid */
+ __forceinline bool valid() const { return dim != -1; }
+
+ /*! calculates surface area heuristic for performing the split */
+ __forceinline float splitSAH() const { return sah; }
+
+ /*! stream output */
+ friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) {
+ return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}";
+ }
+
+ public:
+ float sah; //!< SAH cost of the split
+ int dim; //!< split dimension
+ int pos; //!< split position
+ int left; //!< number of elements on the left side
+ int right; //!< number of elements on the right side
+ float factor; //!< factor splitting the extended range
+ SpatialBinMapping<BINS> mapping; //!< mapping into bins
+ };
+
+ /*! stores all binning information */
+ template<size_t BINS, typename PrimRef>
+ struct __aligned(64) SpatialBinInfo
+ {
+ SpatialBinInfo() {
+ }
+
+ __forceinline SpatialBinInfo(EmptyTy) {
+ clear();
+ }
+
+ /*! clears the bin info */
+ __forceinline void clear()
+ {
+ for (size_t i=0; i<BINS; i++) {
+ bounds[i][0] = bounds[i][1] = bounds[i][2] = empty;
+ numBegin[i] = numEnd[i] = 0;
+ }
+ }
+
+ /*! adds binning data */
+ __forceinline void add(const size_t dim,
+ const size_t beginID,
+ const size_t endID,
+ const size_t binID,
+ const BBox3fa &b,
+ const size_t n = 1)
+ {
+ assert(beginID < BINS);
+ assert(endID < BINS);
+ assert(binID < BINS);
+
+ numBegin[beginID][dim]+=(unsigned int)n;
+ numEnd [endID][dim]+=(unsigned int)n;
+ bounds [binID][dim].extend(b);
+ }
+
+ /*! extends binning bounds */
+ __forceinline void extend(const size_t dim,
+ const size_t binID,
+ const BBox3fa &b)
+ {
+ assert(binID < BINS);
+ bounds [binID][dim].extend(b);
+ }
+
+ /*! bins an array of triangles */
+ template<typename SplitPrimitive>
+ __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
+ {
+ for (size_t i=0; i<N; i++)
+ {
+ const PrimRef prim = prims[i];
+ unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+ if (unlikely(splits == 1))
+ {
+ const vint4 bin = mapping.bin(center(prim.bounds()));
+ for (size_t dim=0; dim<3; dim++)
+ {
+ assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
+ numBegin[bin[dim]][dim]++;
+ numEnd [bin[dim]][dim]++;
+ bounds [bin[dim]][dim].extend(prim.bounds());
+ }
+ }
+ else
+ {
+ const vint4 bin0 = mapping.bin(prim.bounds().lower);
+ const vint4 bin1 = mapping.bin(prim.bounds().upper);
+
+ for (size_t dim=0; dim<3; dim++)
+ {
+ size_t bin;
+ PrimRef rest = prim;
+ size_t l = bin0[dim];
+ size_t r = bin1[dim];
+
+ // same bin optimization
+ if (likely(l == r))
+ {
+ numBegin[l][dim]++;
+ numEnd [l][dim]++;
+ bounds [l][dim].extend(prim.bounds());
+ continue;
+ }
+
+ for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++)
+ {
+ const float pos = mapping.pos(bin+1,dim);
+
+ PrimRef left,right;
+ splitPrimitive(rest,(int)dim,pos,left,right);
+ if (unlikely(left.bounds().empty())) l++;
+ bounds[bin][dim].extend(left.bounds());
+ rest = right;
+ }
+ if (unlikely(rest.bounds().empty())) r--;
+ numBegin[l][dim]++;
+ numEnd [r][dim]++;
+ bounds [bin][dim].extend(rest.bounds());
+ }
+ }
+ }
+ }
+
+ /*! bins a range of primitives inside an array */
+ template<typename SplitPrimitive>
+ void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
+ bin(splitPrimitive,prims+begin,end-begin,mapping);
+ }
+
+ /*! bins an array of primitives */
+ template<typename PrimitiveSplitterFactory>
+ __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+ {
+ for (size_t i=begin; i<end; i++)
+ {
+ const PrimRef &prim = source[i];
+ const vint4 bin0 = mapping.bin(prim.bounds().lower);
+ const vint4 bin1 = mapping.bin(prim.bounds().upper);
+
+ for (size_t dim=0; dim<3; dim++)
+ {
+ if (unlikely(mapping.invalid(dim)))
+ continue;
+
+ size_t bin;
+ size_t l = bin0[dim];
+ size_t r = bin1[dim];
+
+ // same bin optimization
+ if (likely(l == r))
+ {
+ add(dim,l,l,l,prim.bounds());
+ continue;
+ }
+ const size_t bin_start = bin0[dim];
+ const size_t bin_end = bin1[dim];
+ BBox3fa rest = prim.bounds();
+ const auto splitter = splitterFactory(prim);
+ for (bin=bin_start; bin<bin_end; bin++)
+ {
+ const float pos = mapping.pos(bin+1,dim);
+ BBox3fa left,right;
+ splitter(rest,dim,pos,left,right);
+ if (unlikely(left.empty())) l++;
+ extend(dim,bin,left);
+ rest = right;
+ }
+ if (unlikely(rest.empty())) r--;
+ add(dim,l,r,bin,rest);
+ }
+ }
+ }
+
+
+
+ /*! bins an array of primitives */
+ __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+ {
+ for (size_t i=begin; i<end; i++)
+ {
+ const PrimRef &prim = source[i];
+ const vint4 bin0 = mapping.bin(prim.bounds().lower);
+ const vint4 bin1 = mapping.bin(prim.bounds().upper);
+
+ for (size_t dim=0; dim<3; dim++)
+ {
+ if (unlikely(mapping.invalid(dim)))
+ continue;
+
+ const size_t l = bin0[dim];
+ const size_t r = bin1[dim];
+
+ const unsigned int n = prim.primID();
+
+ // same bin optimization
+ if (likely(l == r))
+ {
+ add(dim,l,l,l,prim.bounds(),n);
+ continue;
+ }
+ const size_t bin_start = bin0[dim];
+ const size_t bin_end = bin1[dim];
+ for (size_t bin=bin_start; bin<bin_end; bin++)
+ add(dim,l,r,bin,prim.bounds(),n);
+ }
+ }
+ }
+
+ /*! merges in other binning information */
+ void merge (const SpatialBinInfo& other)
+ {
+ for (size_t i=0; i<BINS; i++)
+ {
+ numBegin[i] += other.numBegin[i];
+ numEnd [i] += other.numEnd [i];
+ bounds[i][0].extend(other.bounds[i][0]);
+ bounds[i][1].extend(other.bounds[i][1]);
+ bounds[i][2].extend(other.bounds[i][2]);
+ }
+ }
+
+ /*! merges in other binning information */
+ static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b)
+ {
+ SpatialBinInfo c(empty);
+ for (size_t i=0; i<BINS; i++)
+ {
+ c.numBegin[i] += a.numBegin[i]+b.numBegin[i];
+ c.numEnd [i] += a.numEnd [i]+b.numEnd [i];
+ c.bounds[i][0] = embree::merge(a.bounds[i][0],b.bounds[i][0]);
+ c.bounds[i][1] = embree::merge(a.bounds[i][1],b.bounds[i][1]);
+ c.bounds[i][2] = embree::merge(a.bounds[i][2],b.bounds[i][2]);
+ }
+ return c;
+ }
+
+ /*! finds the best split by scanning binning information */
+ SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const
+ {
+ /* sweep from right to left and compute parallel prefix of merged bounds */
+ vfloat4 rAreas[BINS];
+ vuint4 rCounts[BINS];
+ vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty;
+ for (size_t i=BINS-1; i>0; i--)
+ {
+ count += numEnd[i];
+ rCounts[i] = count;
+ bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx);
+ by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by);
+ bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz);
+ rAreas[i][3] = 0.0f;
+ }
+
+ /* sweep from left to right and compute SAH */
+ vuint4 blocks_add = (1 << blocks_shift)-1;
+ vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0;
+ count = 0; bx = empty; by = empty; bz = empty;
+ for (size_t i=1; i<BINS; i++, ii+=1)
+ {
+ count += numBegin[i-1];
+ bx.extend(bounds[i-1][0]); float Ax = halfArea(bx);
+ by.extend(bounds[i-1][1]); float Ay = halfArea(by);
+ bz.extend(bounds[i-1][2]); float Az = halfArea(bz);
+ const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+ const vfloat4 rArea = rAreas[i];
+ const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift);
+ const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+ const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+ // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+ const vbool4 mask = sah < vbestSAH;
+ vbestPos = select(mask,ii ,vbestPos);
+ vbestSAH = select(mask,sah,vbestSAH);
+ vbestlCount = select(mask,count,vbestlCount);
+ vbestrCount = select(mask,rCounts[i],vbestrCount);
+ }
+
+ /* find best dimension */
+ float bestSAH = inf;
+ int bestDim = -1;
+ int bestPos = 0;
+ unsigned int bestlCount = 0;
+ unsigned int bestrCount = 0;
+ for (int dim=0; dim<3; dim++)
+ {
+ /* ignore zero sized dimensions */
+ if (unlikely(mapping.invalid(dim)))
+ continue;
+
+ /* test if this is a better dimension */
+ if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+ bestDim = dim;
+ bestPos = vbestPos[dim];
+ bestSAH = vbestSAH[dim];
+ bestlCount = vbestlCount[dim];
+ bestrCount = vbestrCount[dim];
+ }
+ }
+ assert(bestSAH >= 0.0f);
+
+ /* return invalid split if no split found */
+ if (bestDim == -1)
+ return SpatialBinSplit<BINS>(inf,-1,0,mapping);
+
+ /* return best found split */
+ return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping);
+ }
+
+ private:
+ BBox3fa bounds[BINS][3]; //!< geometry bounds for each bin in each dimension
+ vuint4 numBegin[BINS]; //!< number of primitives starting in bin
+ vuint4 numEnd[BINS]; //!< number of primitives ending in bin
+ };
+ }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
new file mode 100644
index 0000000000..911dcf950c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
@@ -0,0 +1,552 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+namespace embree
+{
+ namespace isa
+ {
+#if 0
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f
+#else
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f
+#endif
+
+ struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t>
+ {
+ __forceinline PrimInfoExtRange() {
+ }
+
+ __forceinline PrimInfoExtRange(EmptyTy)
+ : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {}
+
+ __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds)
+ : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {}
+
+ __forceinline float leafSAH() const {
+ return expectedApproxHalfArea(geomBounds)*float(size());
+ }
+
+ __forceinline float leafSAH(size_t block_shift) const {
+ return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+ }
+ };
+
+ template<typename ObjectSplit, typename SpatialSplit>
+ struct Split2
+ {
+ __forceinline Split2 () {}
+
+ __forceinline Split2 (const Split2& other)
+ {
+ spatial = other.spatial;
+ sah = other.sah;
+ if (spatial) spatialSplit() = other.spatialSplit();
+ else objectSplit() = other.objectSplit();
+ }
+
+ __forceinline Split2& operator= (const Split2& other)
+ {
+ spatial = other.spatial;
+ sah = other.sah;
+ if (spatial) spatialSplit() = other.spatialSplit();
+ else objectSplit() = other.objectSplit();
+ return *this;
+ }
+
+ __forceinline ObjectSplit& objectSplit() { return *( ObjectSplit*)data; }
+ __forceinline const ObjectSplit& objectSplit() const { return *(const ObjectSplit*)data; }
+
+ __forceinline SpatialSplit& spatialSplit() { return *( SpatialSplit*)data; }
+ __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; }
+
+ __forceinline Split2 (const ObjectSplit& objectSplit, float sah)
+ : spatial(false), sah(sah)
+ {
+ new (data) ObjectSplit(objectSplit);
+ }
+
+ __forceinline Split2 (const SpatialSplit& spatialSplit, float sah)
+ : spatial(true), sah(sah)
+ {
+ new (data) SpatialSplit(spatialSplit);
+ }
+
+ __forceinline float splitSAH() const {
+ return sah;
+ }
+
+ __forceinline bool valid() const {
+ return sah < float(inf);
+ }
+
+ public:
+ __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)];
+ bool spatial;
+ float sah;
+ };
+
+ /*! Performs standard object binning */
+ template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS>
+ struct HeuristicArraySpatialSAH
+ {
+ typedef BinSplit<OBJECT_BINS> ObjectSplit;
+ typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner;
+
+ typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit;
+ typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner;
+
+ //typedef extended_range<size_t> Set;
+ typedef Split2<ObjectSplit,SpatialSplit> Split;
+
+#if defined(__AVX512ER__) // KNL
+ static const size_t PARALLEL_THRESHOLD = 3*1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 768;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#else
+ static const size_t PARALLEL_THRESHOLD = 3*1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#endif
+
+ static const size_t MOVE_STEP_SIZE = 64;
+ static const size_t CREATE_SPLITS_STEP_SIZE = 64;
+
+ __forceinline HeuristicArraySpatialSAH ()
+ : prims0(nullptr) {}
+
+ /*! remember prim array */
+ __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info)
+ : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {}
+
+
+ /*! compute extended ranges */
+ __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+ {
+ assert(set.ext_range_size() > 0);
+ const float left_factor = (float)lweight / (lweight + rweight);
+ const size_t ext_range_size = set.ext_range_size();
+ const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size);
+ const size_t right_ext_range_size = ext_range_size - left_ext_range_size;
+ lset.set_ext_range(lset.end() + left_ext_range_size);
+ rset.set_ext_range(rset.end() + right_ext_range_size);
+ }
+
+ /*! move ranges */
+ __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t left_ext_range_size = lset.ext_range_size();
+ const size_t right_size = rset.size();
+
+ /* has the left child an extended range? */
+ if (left_ext_range_size > 0)
+ {
+ /* left extended range smaller than right range ? */
+ if (left_ext_range_size < right_size)
+ {
+ /* only move a small part of the beginning of the right range to the end */
+ parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ prims0[i+right_size] = prims0[i];
+ });
+ }
+ else
+ {
+ /* no overlap, move entire right range to new location, can be made fully parallel */
+ parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ prims0[i+left_ext_range_size] = prims0[i];
+ });
+ }
+ /* update right range */
+ assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+ rset.move_right(left_ext_range_size);
+ }
+ }
+
+ /*! finds the best split */
+ const Split find(const PrimInfoExtRange& set, const size_t logBlockSize)
+ {
+ SplitInfo oinfo;
+ const ObjectSplit object_split = object_find(set,logBlockSize,oinfo);
+ const float object_split_sah = object_split.splitSAH();
+
+ if (unlikely(set.has_ext_range()))
+ {
+ const BBox3fa overlap = intersect(oinfo.leftBounds, oinfo.rightBounds);
+
+ /* do only spatial splits if the child bounds overlap */
+ if (safeArea(overlap) >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) &&
+ safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds))
+ {
+ const SpatialSplit spatial_split = spatial_find(set, logBlockSize);
+ const float spatial_split_sah = spatial_split.splitSAH();
+
+ /* valid spatial split, better SAH and number of splits do not exceed extended range */
+ if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah &&
+ spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size())
+ {
+ return Split(spatial_split,spatial_split_sah);
+ }
+ }
+ }
+
+ return Split(object_split,object_split_sah);
+ }
+
+ /*! finds the best object split */
+ __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+ {
+ if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info);
+ else return parallel_object_find (set,logBlockSize,info);
+ }
+
+ /*! finds the best object split */
+ __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+ {
+ ObjectBinner binner(empty);
+ const BinMapping<OBJECT_BINS> mapping(set);
+ binner.bin(prims0,set.begin(),set.end(),mapping);
+ ObjectSplit s = binner.best(mapping,logBlockSize);
+ binner.getSplitInfo(mapping, s, info);
+ return s;
+ }
+
+ /*! finds the best split */
+ __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+ {
+ ObjectBinner binner(empty);
+ const BinMapping<OBJECT_BINS> mapping(set);
+ const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+ binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+ [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; },
+ [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; });
+ ObjectSplit s = binner.best(mapping,logBlockSize);
+ binner.getSplitInfo(mapping, s, info);
+ return s;
+ }
+
+ /*! finds the best spatial split */
+ __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+ {
+ if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize);
+ else return parallel_spatial_find (set, logBlockSize);
+ }
+
+ /*! finds the best spatial split */
+ __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+ {
+ SpatialBinner binner(empty);
+ const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+ binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping);
+ /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+ return binner.best(mapping,logBlockSize); //,set.ext_size());
+ }
+
+ __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+ {
+ SpatialBinner binner(empty);
+ const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+ const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+ binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,
+ [&] (const range<size_t>& r) -> SpatialBinner {
+ SpatialBinner binner(empty);
+ binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping);
+ return binner; },
+ [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); });
+ /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/
+ return binner.best(mapping,logBlockSize); //,set.ext_size());
+ }
+
+
+ /*! subdivides primitives based on a spatial split */
+ __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping)
+ {
+ assert(set.has_ext_range());
+ const size_t max_ext_range_size = set.ext_range_size();
+ const size_t ext_range_start = set.end();
+
+ /* atomic counter for number of primref splits */
+ std::atomic<size_t> ext_elements;
+ ext_elements.store(0);
+
+ const float fpos = split.mapping.pos(split.pos,split.dim);
+
+ const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+
+ parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) {
+ for (size_t i=r.begin();i<r.end();i++)
+ {
+ const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+ if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */
+
+ //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
+ //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
+ //if (unlikely(bin0 < split.pos && bin1 >= split.pos))
+ if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
+ {
+ assert(splits > 1);
+
+ PrimRef left,right;
+ const auto splitter = splitterFactory(prims0[i]);
+ splitter(prims0[i],split.dim,fpos,left,right);
+
+ // no empty splits
+ if (unlikely(left.bounds().empty() || right.bounds().empty())) continue;
+
+ left.lower.u = (left.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+ right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+
+ const size_t ID = ext_elements.fetch_add(1);
+
+ /* break if the number of subdivided elements are greater than the maximum allowed size */
+ if (unlikely(ID >= max_ext_range_size))
+ break;
+
+ /* only write within the correct bounds */
+ assert(ID < max_ext_range_size);
+ prims0[i] = left;
+ prims0[ext_range_start+ID] = right;
+ }
+ }
+ });
+
+ const size_t numExtElements = min(max_ext_range_size,ext_elements.load());
+ assert(set.end()+numExtElements<=set.ext_end());
+ set._end += numExtElements;
+ }
+
+ /*! array partitioning */
+ void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ PrimInfoExtRange set = set_i;
+
+ /* valid split */
+ if (unlikely(!split.valid())) {
+ deterministic_order(set);
+ return splitFallback(set,lset,rset);
+ }
+
+ std::pair<size_t,size_t> ext_weights(0,0);
+
+ if (unlikely(split.spatial))
+ {
+ create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping);
+
+ /* spatial split */
+ if (likely(set.size() < PARALLEL_THRESHOLD))
+ ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset);
+ else
+ ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset);
+ }
+ else
+ {
+ /* object split */
+ if (likely(set.size() < PARALLEL_THRESHOLD))
+ ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset);
+ else
+ ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset);
+ }
+
+ /* if we have an extended range, set extended child ranges and move right split range */
+ if (unlikely(set.has_ext_range()))
+ {
+ setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+ moveExtentedRange(set,lset,rset);
+ }
+ }
+
+ /*! array partitioning */
+ std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfo local_left(empty);
+ PrimInfo local_right(empty);
+ const unsigned int splitPos = split.pos;
+ const unsigned int splitDim = split.dim;
+ const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+ const typename ObjectBinner::vint vSplitPos(splitPos);
+ const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+ size_t center = serial_partitioning(prims0,
+ begin,end,local_left,local_right,
+ [&] (const PrimRef& ref) {
+ return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask);
+ },
+ [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
+ const size_t left_weight = local_left.end;
+ const size_t right_weight = local_right.end;
+
+ new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+ new (&rset) PrimInfoExtRange(center,end,end,local_right);
+
+ assert(area(lset.geomBounds) >= 0.0f);
+ assert(area(rset.geomBounds) >= 0.0f);
+ return std::pair<size_t,size_t>(left_weight,right_weight);
+ }
+
+
+ /*! array partitioning */
+ __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfo local_left(empty);
+ PrimInfo local_right(empty);
+ const unsigned int splitPos = split.pos;
+ const unsigned int splitDim = split.dim;
+ const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+ /* init spatial mapping */
+ const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping;
+ const vint4 vSplitPos(splitPos);
+ const vbool4 vSplitMask( (int)splitDimMask );
+
+ size_t center = serial_partitioning(prims0,
+ begin,end,local_left,local_right,
+ [&] (const PrimRef& ref) {
+ const Vec3fa c = ref.bounds().center();
+ return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask);
+ },
+ [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
+
+ const size_t left_weight = local_left.end;
+ const size_t right_weight = local_right.end;
+
+ new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+ new (&rset) PrimInfoExtRange(center,end,end,local_right);
+ assert(area(lset.geomBounds) >= 0.0f);
+ assert(area(rset.geomBounds) >= 0.0f);
+ return std::pair<size_t,size_t>(left_weight,right_weight);
+ }
+
+
+
+ /*! array partitioning */
+ __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfo left(empty);
+ PrimInfo right(empty);
+ const unsigned int splitPos = split.pos;
+ const unsigned int splitDim = split.dim;
+ const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+ const typename ObjectBinner::vint vSplitPos(splitPos);
+ const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+ auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+ const size_t center = parallel_partitioning(
+ prims0,begin,end,EmptyTy(),left,right,isLeft,
+ [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+ [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+ PARALLEL_PARTITION_BLOCK_SIZE);
+
+ const size_t left_weight = left.end;
+ const size_t right_weight = right.end;
+
+ left.begin = begin; left.end = center;
+ right.begin = center; right.end = end;
+
+ new (&lset) PrimInfoExtRange(begin,center,center,left);
+ new (&rset) PrimInfoExtRange(center,end,end,right);
+
+ assert(area(left.geomBounds) >= 0.0f);
+ assert(area(right.geomBounds) >= 0.0f);
+ return std::pair<size_t,size_t>(left_weight,right_weight);
+ }
+
+ /*! array partitioning */
+ __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ PrimInfo left(empty);
+ PrimInfo right(empty);
+ const unsigned int splitPos = split.pos;
+ const unsigned int splitDim = split.dim;
+ const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+ /* init spatial mapping */
+ const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping;
+ const vint4 vSplitPos(splitPos);
+ const vbool4 vSplitMask( (int)splitDimMask );
+
+ auto isLeft = [&] (const PrimRef &ref) {
+ const Vec3fa c = ref.bounds().center();
+ return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); };
+
+ const size_t center = parallel_partitioning(
+ prims0,begin,end,EmptyTy(),left,right,isLeft,
+ [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+ [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+ PARALLEL_PARTITION_BLOCK_SIZE);
+
+ const size_t left_weight = left.end;
+ const size_t right_weight = right.end;
+
+ left.begin = begin; left.end = center;
+ right.begin = center; right.end = end;
+
+ new (&lset) PrimInfoExtRange(begin,center,center,left);
+ new (&rset) PrimInfoExtRange(center,end,end,right);
+
+ assert(area(left.geomBounds) >= 0.0f);
+ assert(area(right.geomBounds) >= 0.0f);
+ return std::pair<size_t,size_t>(left_weight,right_weight);
+ }
+
+ void deterministic_order(const PrimInfoExtRange& set)
+ {
+ /* required as parallel partition destroys original primitive order */
+ std::sort(&prims0[set.begin()],&prims0[set.end()]);
+ }
+
+ void splitFallback(const PrimInfoExtRange& set,
+ PrimInfoExtRange& lset,
+ PrimInfoExtRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ const size_t center = (begin + end)/2;
+
+ PrimInfo left(empty);
+ for (size_t i=begin; i<center; i++) {
+ left.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+ }
+ const size_t lweight = left.end;
+
+ PrimInfo right(empty);
+ for (size_t i=center; i<end; i++) {
+ right.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+ }
+ const size_t rweight = right.end;
+
+ new (&lset) PrimInfoExtRange(begin,center,center,left);
+ new (&rset) PrimInfoExtRange(center,end,end,right);
+
+ /* if we have an extended range */
+ if (set.has_ext_range()) {
+ setExtentedRanges(set,lset,rset,lweight,rweight);
+ moveExtentedRange(set,lset,rset);
+ }
+ }
+
+ private:
+ PrimRef* const prims0;
+ const PrimitiveSplitterFactory& splitterFactory;
+ const CentGeomBBox3fa& root_info;
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
new file mode 100644
index 0000000000..ede0d04c78
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
@@ -0,0 +1,188 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Performs standard object binning */
+ struct HeuristicStrandSplit
+ {
+ typedef range<size_t> Set;
+
+ static const size_t PARALLEL_THRESHOLD = 10000;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64;
+
+ /*! stores all information to perform some split */
+ struct Split
+ {
+ /*! construct an invalid split by default */
+ __forceinline Split()
+ : sah(inf), axis0(zero), axis1(zero) {}
+
+ /*! constructs specified split */
+ __forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1)
+ : sah(sah), axis0(axis0), axis1(axis1) {}
+
+ /*! calculates standard surface area heuristic for the split */
+ __forceinline float splitSAH() const { return sah; }
+
+ /*! test if this split is valid */
+ __forceinline bool valid() const { return sah != float(inf); }
+
+ public:
+ float sah; //!< SAH cost of the split
+ Vec3fa axis0, axis1; //!< axis the two strands are aligned into
+ };
+
+ __forceinline HeuristicStrandSplit () // FIXME: required?
+ : scene(nullptr), prims(nullptr) {}
+
+ /*! remember prim array */
+ __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims)
+ : scene(scene), prims(prims) {}
+
+ __forceinline const Vec3fa direction(const PrimRef& prim) {
+ return scene->get(prim.geomID())->computeDirection(prim.primID());
+ }
+
+ __forceinline const BBox3fa bounds(const PrimRef& prim) {
+ return scene->get(prim.geomID())->vbounds(prim.primID());
+ }
+
+ __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) {
+ return scene->get(prim.geomID())->vbounds(space,prim.primID());
+ }
+
+ /*! finds the best split */
+ const Split find(const range<size_t>& set, size_t logBlockSize)
+ {
+ Vec3fa axis0(0,0,1);
+ uint64_t bestGeomPrimID = -1;
+
+ /* curve with minimum ID determines first axis */
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ const uint64_t geomprimID = prims[i].ID64();
+ if (geomprimID >= bestGeomPrimID) continue;
+ const Vec3fa axis = direction(prims[i]);
+ if (sqr_length(axis) > 1E-18f) {
+ axis0 = normalize(axis);
+ bestGeomPrimID = geomprimID;
+ }
+ }
+
+ /* find 2nd axis that is most misaligned with first axis and has minimum ID */
+ float bestCos = 1.0f;
+ Vec3fa axis1 = axis0;
+ bestGeomPrimID = -1;
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ const uint64_t geomprimID = prims[i].ID64();
+ Vec3fa axisi = direction(prims[i]);
+ float leni = length(axisi);
+ if (leni == 0.0f) continue;
+ axisi /= leni;
+ float cos = abs(dot(axisi,axis0));
+ if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) {
+ bestCos = cos; axis1 = axisi;
+ bestGeomPrimID = geomprimID;
+ }
+ }
+
+ /* partition the two strands */
+ size_t lnum = 0, rnum = 0;
+ BBox3fa lbounds = empty, rbounds = empty;
+ const LinearSpace3fa space0 = frame(axis0).transposed();
+ const LinearSpace3fa space1 = frame(axis1).transposed();
+
+ for (size_t i=set.begin(); i<set.end(); i++)
+ {
+ PrimRef& prim = prims[i];
+ const Vec3fa axisi = normalize(direction(prim));
+ const float cos0 = abs(dot(axisi,axis0));
+ const float cos1 = abs(dot(axisi,axis1));
+
+ if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); }
+ else { rnum++; rbounds.extend(bounds(space1,prim)); }
+ }
+
+ /*! return an invalid split if we do not partition */
+ if (lnum == 0 || rnum == 0)
+ return Split(inf,axis0,axis1);
+
+ /*! calculate sah for the split */
+ const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+ const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+ const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds));
+ return Split(sah,axis0,axis1);
+ }
+
+ /*! array partitioning */
+ void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset)
+ {
+ if (!split.valid()) {
+ deterministic_order(set);
+ return splitFallback(set,lset,rset);
+ }
+
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ CentGeomBBox3fa local_left(empty);
+ CentGeomBBox3fa local_right(empty);
+
+ auto primOnLeftSide = [&] (const PrimRef& prim) -> bool {
+ const Vec3fa axisi = normalize(direction(prim));
+ const float cos0 = abs(dot(axisi,split.axis0));
+ const float cos1 = abs(dot(axisi,split.axis1));
+ return cos0 > cos1;
+ };
+
+ auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) {
+ pinfo.extend(bounds(ref));
+ };
+
+ size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds);
+
+ new (&lset) PrimInfoRange(begin,center,local_left);
+ new (&rset) PrimInfoRange(center,end,local_right);
+ assert(area(lset.geomBounds) >= 0.0f);
+ assert(area(rset.geomBounds) >= 0.0f);
+ }
+
+ void deterministic_order(const Set& set)
+ {
+ /* required as parallel partition destroys original primitive order */
+ std::sort(&prims[set.begin()],&prims[set.end()]);
+ }
+
+ void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+ {
+ const size_t begin = set.begin();
+ const size_t end = set.end();
+ const size_t center = (begin + end)/2;
+
+ CentGeomBBox3fa left(empty);
+ for (size_t i=begin; i<center; i++)
+ left.extend(bounds(prims[i]));
+ new (&lset) PrimInfoRange(begin,center,left);
+
+ CentGeomBBox3fa right(empty);
+ for (size_t i=center; i<end; i++)
+ right.extend(bounds(prims[i]));
+ new (&rset) PrimInfoRange(center,end,right);
+ }
+
+ private:
+ Scene* const scene;
+ PrimRef* const prims;
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
new file mode 100644
index 0000000000..c999941a11
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/primref_mb.h"
+#include "../../common/algorithms/parallel_filter.h"
+
+#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Performs standard object binning */
+ template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS>
+ struct HeuristicMBlurTemporalSplit
+ {
+ typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+ typedef mvector<PrimRefMB>* PrimRefVector;
+ typedef typename PrimRefMB::BBox BBox;
+
+ static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+ HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef)
+ : device(device), recalculatePrimRef(recalculatePrimRef) {}
+
+ struct TemporalBinInfo
+ {
+ __forceinline TemporalBinInfo () {
+ }
+
+ __forceinline TemporalBinInfo (EmptyTy)
+ {
+ for (size_t i=0; i<BINS-1; i++)
+ {
+ count0[i] = count1[i] = 0;
+ bounds0[i] = bounds1[i] = empty;
+ }
+ }
+
+ void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef)
+ {
+ for (int b=0; b<BINS-1; b++)
+ {
+ const float t = float(b+1)/float(BINS);
+ const float ct = lerp(time_range.lower,time_range.upper,t);
+ const float center_time = set.align_time(ct);
+ if (center_time <= time_range.lower) continue;
+ if (center_time >= time_range.upper) continue;
+ const BBox1f dt0(time_range.lower,center_time);
+ const BBox1f dt1(center_time,time_range.upper);
+
+ /* find linear bounds for both time segments */
+ for (size_t i=begin; i<end; i++)
+ {
+ if (prims[i].time_range_overlap(dt0))
+ {
+ const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0);
+#if MBLUR_BIN_LBBOX
+ bounds0[b].extend(bn0);
+#else
+ bounds0[b].extend(bn0.interpolate(0.5f));
+#endif
+ count0[b] += prims[i].timeSegmentRange(dt0).size();
+ }
+
+ if (prims[i].time_range_overlap(dt1))
+ {
+ const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1);
+#if MBLUR_BIN_LBBOX
+ bounds1[b].extend(bn1);
+#else
+ bounds1[b].extend(bn1.interpolate(0.5f));
+#endif
+ count1[b] += prims[i].timeSegmentRange(dt1).size();
+ }
+ }
+ }
+ }
+
+ __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef)
+ {
+ if (likely(end-begin < parallelThreshold)) {
+ bin(prims,begin,end,time_range,set,recalculatePrimRef);
+ }
+ else
+ {
+ auto bin = [&](const range<size_t>& r) -> TemporalBinInfo {
+ TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner;
+ };
+ *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2);
+ }
+ }
+
+ /*! merges in other binning information */
+ __forceinline void merge (const TemporalBinInfo& other)
+ {
+ for (size_t i=0; i<BINS-1; i++)
+ {
+ count0[i] += other.count0[i];
+ count1[i] += other.count1[i];
+ bounds0[i].extend(other.bounds0[i]);
+ bounds1[i].extend(other.bounds1[i]);
+ }
+ }
+
+ static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) {
+ TemporalBinInfo r = a; r.merge(b); return r;
+ }
+
+ Split best(int logBlockSize, BBox1f time_range, const SetMB& set)
+ {
+ float bestSAH = inf;
+ float bestPos = 0.0f;
+ for (int b=0; b<BINS-1; b++)
+ {
+ float t = float(b+1)/float(BINS);
+ float ct = lerp(time_range.lower,time_range.upper,t);
+ const float center_time = set.align_time(ct);
+ if (center_time <= time_range.lower) continue;
+ if (center_time >= time_range.upper) continue;
+ const BBox1f dt0(time_range.lower,center_time);
+ const BBox1f dt1(center_time,time_range.upper);
+
+ /* calculate sah */
+ const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+ const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+ float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size();
+ float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size();
+ if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time
+ if (unlikely(rCount == 0)) sah1 = 0.0f;
+ const float sah = sah0+sah1;
+ if (sah < bestSAH) {
+ bestSAH = sah;
+ bestPos = center_time;
+ }
+ }
+ return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos);
+ }
+
+ public:
+ size_t count0[BINS-1];
+ size_t count1[BINS-1];
+ BBox bounds0[BINS-1];
+ BBox bounds1[BINS-1];
+ };
+
+ /*! finds the best split */
+ const Split find(const SetMB& set, const size_t logBlockSize)
+ {
+ assert(set.size() > 0);
+ TemporalBinInfo binner(empty);
+ binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef);
+ Split tsplit = binner.best((int)logBlockSize,set.time_range,set);
+ if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split
+ return tsplit;
+ }
+
+ __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset)
+ {
+ assert(tsplit.sah != float(inf));
+ assert(tsplit.fpos > set.time_range.lower);
+ assert(tsplit.fpos < set.time_range.upper);
+
+ float center_time = tsplit.fpos;
+ const BBox1f time_range0(set.time_range.lower,center_time);
+ const BBox1f time_range1(center_time,set.time_range.upper);
+ mvector<PrimRefMB>& prims = *set.prims;
+
+ /* calculate primrefs for first time range */
+ std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size()));
+ PrimRefVector lprims = new_vector.get();
+
+ auto reduction_func0 = [&] (const range<size_t>& r) {
+ PrimInfoMB pinfo = empty;
+ for (size_t i=r.begin(); i<r.end(); i++)
+ {
+ if (likely(prims[i].time_range_overlap(time_range0)))
+ {
+ const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0);
+ (*lprims)[i-set.begin()] = prim;
+ pinfo.add_primref(prim);
+ }
+ else
+ {
+ (*lprims)[i-set.begin()] = prims[i];
+ }
+ }
+ return pinfo;
+ };
+ PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2);
+
+ /* primrefs for first time range are in lprims[0 .. set.size()) */
+ /* some primitives may need to be filtered out */
+ if (linfo.size() != set.size())
+ linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024),
+ [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); });
+
+ lset = SetMB(linfo,lprims,time_range0);
+
+ /* calculate primrefs for second time range */
+ auto reduction_func1 = [&] (const range<size_t>& r) {
+ PrimInfoMB pinfo = empty;
+ for (size_t i=r.begin(); i<r.end(); i++)
+ {
+ if (likely(prims[i].time_range_overlap(time_range1)))
+ {
+ const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1);
+ prims[i] = prim;
+ pinfo.add_primref(prim);
+ }
+ }
+ return pinfo;
+ };
+ PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2);
+ rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size());
+
+ /* primrefs for second time range are in prims[set.begin() .. set.end()) */
+ /* some primitives may need to be filtered out */
+ if (rinfo.size() != set.size())
+ rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024),
+ [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); });
+
+ rset = SetMB(rinfo,&prims,time_range1);
+
+ return new_vector;
+ }
+
+ private:
+ MemoryMonitorInterface* device; // device to report memory usage to
+ const RecalculatePrimRef recalculatePrimRef;
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/priminfo.h b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
new file mode 100644
index 0000000000..06c1388742
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+ // FIXME: maybe there's a better place for this util fct
+ __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2)
+ {
+ const Vec3fa e0 = v1-v0;
+ const Vec3fa e1 = v2-v0;
+ const Vec3fa d = cross(e0,e1);
+ return fabs(d.x) + fabs(d.y) + fabs(d.z);
+ }
+
+ //namespace isa
+ //{
+ template<typename BBox>
+ class CentGeom
+ {
+ public:
+ __forceinline CentGeom () {}
+
+ __forceinline CentGeom (EmptyTy)
+ : geomBounds(empty), centBounds(empty) {}
+
+ __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds)
+ : geomBounds(geomBounds), centBounds(centBounds) {}
+
+ template<typename PrimRef>
+ __forceinline void extend_primref(const PrimRef& prim)
+ {
+ BBox bounds; Vec3fa center;
+ prim.binBoundsAndCenter(bounds,center);
+ geomBounds.extend(bounds);
+ centBounds.extend(center);
+ }
+
+ template<typename PrimRef>
+ __forceinline void extend_center2(const PrimRef& prim)
+ {
+ BBox3fa bounds = prim.bounds();
+ geomBounds.extend(bounds);
+ centBounds.extend(bounds.center2());
+ }
+
+ __forceinline void extend(const BBox& geomBounds_) {
+ geomBounds.extend(geomBounds_);
+ centBounds.extend(center2(geomBounds_));
+ }
+
+ __forceinline void merge(const CentGeom& other)
+ {
+ geomBounds.extend(other.geomBounds);
+ centBounds.extend(other.centBounds);
+ }
+
+ static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) {
+ CentGeom r = a; r.merge(b); return r;
+ }
+
+ public:
+ BBox geomBounds; //!< geometry bounds of primitives
+ BBox3fa centBounds; //!< centroid bounds of primitives
+ };
+
+ typedef CentGeom<BBox3fa> CentGeomBBox3fa;
+
+ /*! stores bounding information for a set of primitives */
+ template<typename BBox>
+ class PrimInfoT : public CentGeom<BBox>
+ {
+ public:
+ using CentGeom<BBox>::geomBounds;
+ using CentGeom<BBox>::centBounds;
+
+ __forceinline PrimInfoT () {}
+
+ __forceinline PrimInfoT (EmptyTy)
+ : CentGeom<BBox>(empty), begin(0), end(0) {}
+
+ __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds)
+ : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {}
+
+ template<typename PrimRef>
+ __forceinline void add_primref(const PrimRef& prim)
+ {
+ CentGeom<BBox>::extend_primref(prim);
+ end++;
+ }
+
+ template<typename PrimRef>
+ __forceinline void add_center2(const PrimRef& prim) {
+ CentGeom<BBox>::extend_center2(prim);
+ end++;
+ }
+
+ template<typename PrimRef>
+ __forceinline void add_center2(const PrimRef& prim, const size_t i) {
+ CentGeom<BBox>::extend_center2(prim);
+ end+=i;
+ }
+
+ /*__forceinline void add(const BBox& geomBounds_) {
+ CentGeom<BBox>::extend(geomBounds_);
+ end++;
+ }
+
+ __forceinline void add(const BBox& geomBounds_, const size_t i) {
+ CentGeom<BBox>::extend(geomBounds_);
+ end+=i;
+ }*/
+
+ __forceinline void merge(const PrimInfoT& other)
+ {
+ CentGeom<BBox>::merge(other);
+ begin += other.begin;
+ end += other.end;
+ }
+
+ static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) {
+ PrimInfoT r = a; r.merge(b); return r;
+ }
+
+ /*! returns the number of primitives */
+ __forceinline size_t size() const {
+ return end-begin;
+ }
+
+ __forceinline float halfArea() {
+ return expectedApproxHalfArea(geomBounds);
+ }
+
+ __forceinline float leafSAH() const {
+ return expectedApproxHalfArea(geomBounds)*float(size());
+ //return halfArea(geomBounds)*blocks(num);
+ }
+
+ __forceinline float leafSAH(size_t block_shift) const {
+ return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+ //return halfArea(geomBounds)*float((num+3) >> 2);
+ //return halfArea(geomBounds)*blocks(num);
+ }
+
+ /*! stream output */
+ friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) {
+ return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}";
+ }
+
+ public:
+ size_t begin,end; //!< number of primitives
+ };
+
+ typedef PrimInfoT<BBox3fa> PrimInfo;
+ //typedef PrimInfoT<LBBox3fa> PrimInfoMB;
+
+ /*! stores bounding information for a set of primitives */
+ template<typename BBox>
+ class PrimInfoMBT : public CentGeom<BBox>
+ {
+ public:
+ using CentGeom<BBox>::geomBounds;
+ using CentGeom<BBox>::centBounds;
+
+ __forceinline PrimInfoMBT () {
+ }
+
+ __forceinline PrimInfoMBT (EmptyTy)
+ : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+ __forceinline PrimInfoMBT (size_t begin, size_t end)
+ : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+ template<typename PrimRef>
+ __forceinline void add_primref(const PrimRef& prim)
+ {
+ CentGeom<BBox>::extend_primref(prim);
+ time_range.extend(prim.time_range);
+ object_range._end++;
+ num_time_segments += prim.size();
+ if (max_num_time_segments < prim.totalTimeSegments()) {
+ max_num_time_segments = prim.totalTimeSegments();
+ max_time_range = prim.time_range;
+ }
+ }
+
+ __forceinline void merge(const PrimInfoMBT& other)
+ {
+ CentGeom<BBox>::merge(other);
+ time_range.extend(other.time_range);
+ object_range._begin += other.object_range.begin();
+ object_range._end += other.object_range.end();
+ num_time_segments += other.num_time_segments;
+ if (max_num_time_segments < other.max_num_time_segments) {
+ max_num_time_segments = other.max_num_time_segments;
+ max_time_range = other.max_time_range;
+ }
+ }
+
+ static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) {
+ PrimInfoMBT r = a; r.merge(b); return r;
+ }
+
+ __forceinline size_t begin() const {
+ return object_range.begin();
+ }
+
+ __forceinline size_t end() const {
+ return object_range.end();
+ }
+
+ /*! returns the number of primitives */
+ __forceinline size_t size() const {
+ return object_range.size();
+ }
+
+ __forceinline float halfArea() const {
+ return time_range.size()*expectedApproxHalfArea(geomBounds);
+ }
+
+ __forceinline float leafSAH() const {
+ return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments);
+ }
+
+ __forceinline float leafSAH(size_t block_shift) const {
+ return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift);
+ }
+
+ __forceinline float align_time(float ct) const
+ {
+ //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments);
+ float t0 = (ct-max_time_range.lower)/max_time_range.size();
+ float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments);
+ return t1*max_time_range.size()+max_time_range.lower;
+ }
+
+ /*! stream output */
+ friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo)
+ {
+ return cout << "PrimInfo { " <<
+ "object_range = " << pinfo.object_range <<
+ ", time_range = " << pinfo.time_range <<
+ ", time_segments = " << pinfo.num_time_segments <<
+ ", geomBounds = " << pinfo.geomBounds <<
+ ", centBounds = " << pinfo.centBounds <<
+ "}";
+ }
+
+ public:
+ range<size_t> object_range; //!< primitive range
+ size_t num_time_segments; //!< total number of time segments of all added primrefs
+ size_t max_num_time_segments; //!< maximum number of time segments of a primitive
+ BBox1f max_time_range; //!< time range of primitive with max_num_time_segments
+ BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB
+ };
+
+ typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB;
+
+ struct SetMB : public PrimInfoMB
+ {
+ static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+ static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+ static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+ typedef mvector<PrimRefMB>* PrimRefVector;
+
+ __forceinline SetMB() {}
+
+ __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims)
+ : PrimInfoMB(pinfo_i), prims(prims) {}
+
+ __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in)
+ : PrimInfoMB(pinfo_i), prims(prims)
+ {
+ object_range = object_range_in;
+ time_range = intersect(time_range,time_range_in);
+ }
+
+ __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in)
+ : PrimInfoMB(pinfo_i), prims(prims)
+ {
+ time_range = intersect(time_range,time_range_in);
+ }
+
+ void deterministic_order() const
+ {
+ /* required as parallel partition destroys original primitive order */
+ PrimRefMB* prim = prims->data();
+ std::sort(&prim[object_range.begin()],&prim[object_range.end()]);
+ }
+
+ template<typename RecalculatePrimRef>
+ __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const
+ {
+ auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+ {
+ LBBox3fa cbounds(empty);
+ for (size_t j = r.begin(); j < r.end(); j++)
+ {
+ PrimRefMB& ref = (*prims)[j];
+ const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range);
+ cbounds.extend(bn);
+ };
+ return cbounds;
+ };
+
+ return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+ reduce,
+ [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+ }
+
+ template<typename RecalculatePrimRef>
+ __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+ {
+ auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+ {
+ LBBox3fa cbounds(empty);
+ for (size_t j = r.begin(); j < r.end(); j++)
+ {
+ PrimRefMB& ref = (*prims)[j];
+ const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space);
+ cbounds.extend(bn);
+ };
+ return cbounds;
+ };
+
+ return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+ reduce,
+ [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+ }
+
+ template<typename RecalculatePrimRef>
+ const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+ {
+ auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB
+ {
+ PrimInfoMB pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ PrimRefMB& ref = (*prims)[j];
+ PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space);
+ pinfo.add_primref(ref1);
+ };
+ return pinfo;
+ };
+
+ const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD,
+ PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2);
+
+ return SetMB(pinfo,prims,object_range,time_range);
+ }
+
+ public:
+ PrimRefVector prims;
+ };
+//}
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
new file mode 100644
index 0000000000..e23de3df28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
@@ -0,0 +1,244 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primrefgen.h"
+#include "primrefgen_presplit.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+ {
+ ParallelPrefixSumState<PrimInfo> pstate;
+
+ /* first try */
+ progressMonitor(0);
+ PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+ return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ /* if we need to filter out geometry, run again */
+ if (pinfo.size() != prims.size())
+ {
+ progressMonitor(0);
+ pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+ return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ }
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+ {
+ ParallelForForPrefixSumState<PrimInfo> pstate;
+ Scene::Iterator2 iter(scene,types,mblur);
+
+ /* first try */
+ progressMonitor(0);
+ pstate.init(iter,size_t(1024));
+ PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+ return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ /* if we need to filter out geometry, run again */
+ if (pinfo.size() != prims.size())
+ {
+ progressMonitor(0);
+ pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+ return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ }
+ return pinfo;
+ }
+
+ PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+ {
+ ParallelForForPrefixSumState<PrimInfo> pstate;
+ Scene::Iterator2 iter(scene,types,true);
+
+ /* first try */
+ progressMonitor(0);
+ pstate.init(iter,size_t(1024));
+ PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+ return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ /* if we need to filter out geometry, run again */
+ if (pinfo.size() != prims.size())
+ {
+ progressMonitor(0);
+ pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+ return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ }
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1)
+ {
+ ParallelForForPrefixSumState<PrimInfoMB> pstate;
+ Scene::Iterator2 iter(scene,types,true);
+
+ /* first try */
+ progressMonitor(0);
+ pstate.init(iter,size_t(1024));
+ PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB {
+ return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID);
+ }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+
+ /* if we need to filter out geometry, run again */
+ if (pinfo.size() != prims.size())
+ {
+ progressMonitor(0);
+ pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+ return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID);
+ }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+ }
+
+ /* the BVH starts with that time range, even though primitives might have smaller/larger time range */
+ pinfo.time_range = t0t1;
+ return pinfo;
+ }
+
+ template<typename Mesh>
+ size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor)
+ {
+ size_t numPrimitives = morton.size();
+
+ /* compute scene bounds */
+ std::pair<size_t,BBox3fa> cb_empty(0,empty);
+ auto cb = parallel_reduce
+ ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa>
+ {
+ size_t num = 0;
+ BBox3fa bounds = empty;
+
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa prim_bounds = empty;
+ if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue;
+ bounds.extend(center2(prim_bounds));
+ num++;
+ }
+ return std::make_pair(num,bounds);
+ }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) {
+ return std::make_pair(a.first + b.first,merge(a.second,b.second));
+ });
+
+
+ size_t numPrimitivesGen = cb.first;
+ const BBox3fa centBounds = cb.second;
+
+ /* compute morton codes */
+ if (likely(numPrimitivesGen == numPrimitives))
+ {
+ /* fast path if all primitives were valid */
+ BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+ parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void {
+ BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ generator(mesh->bounds(j),unsigned(j));
+ });
+ }
+ else
+ {
+ /* slow path, fallback in case some primitives were invalid */
+ ParallelPrefixSumState<size_t> pstate;
+ BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+ parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+ size_t num = 0;
+ BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (unlikely(!mesh->buildBounds(j,&bounds))) continue;
+ generator(bounds,unsigned(j));
+ num++;
+ }
+ return num;
+ }, std::plus<size_t>());
+
+ parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+ size_t num = 0;
+ BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!mesh->buildBounds(j,&bounds)) continue;
+ generator(bounds,unsigned(j));
+ num++;
+ }
+ return num;
+ }, std::plus<size_t>());
+ }
+ return numPrimitivesGen;
+ }
+
+ // ====================================================================================================
+ // ====================================================================================================
+ // ====================================================================================================
+
+ // template for grid meshes
+
+#if 0
+ template<>
+ PrimInfo createPrimRefArray<GridMesh,false>(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+ {
+ PING;
+ ParallelForForPrefixSumState<PrimInfo> pstate;
+ Scene::Iterator<GridMesh,false> iter(scene);
+
+ /* first try */
+ progressMonitor(0);
+ pstate.init(iter,size_t(1024));
+ PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k) -> PrimInfo
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!mesh->buildBounds(j,&bounds)) continue;
+ const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ /* if we need to filter out geometry, run again */
+ if (pinfo.size() != prims.size())
+ {
+ progressMonitor(0);
+ pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, const PrimInfo& base) -> PrimInfo
+ {
+ k = base.size();
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ BBox3fa bounds = empty;
+ if (!mesh->buildBounds(j,&bounds)) continue;
+ const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+ pinfo.add_center2(prim);
+ prims[k++] = prim;
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ }
+ return pinfo;
+ }
+#endif
+
+ // ====================================================================================================
+ // ====================================================================================================
+ // ====================================================================================================
+
+ IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+ IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+ IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+ IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
new file mode 100644
index 0000000000..9919c945c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
@@ -0,0 +1,28 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+#include "priminfo.h"
+#include "bvh_builder_morton.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+
+ PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+
+ PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0);
+
+ PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f));
+
+ template<typename Mesh>
+ size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor);
+ }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
new file mode 100644
index 0000000000..8bdb38b955
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../builders/primrefgen.h"
+#include "../builders/heuristic_spatial.h"
+#include "../builders/splitter.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define DBG_PRESPLIT(x)
+#define CHECK_PRESPLIT(x)
+
+#define GRID_SIZE 1024
+#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5
+#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG)
+#define PRIORITY_CUTOFF_THRESHOLD 1.0f
+#define PRIORITY_SPLIT_POS_WEIGHT 1.5f
+
+namespace embree
+{
+ namespace isa
+ {
+
+ struct PresplitItem
+ {
+ union {
+ float priority;
+ unsigned int data;
+ };
+ unsigned int index;
+
+ __forceinline operator unsigned() const
+ {
+ return reinterpret_cast<const unsigned&>(priority);
+ }
+ __forceinline bool operator < (const PresplitItem& item) const
+ {
+ return (priority < item.priority);
+ }
+
+ template<typename Mesh>
+ __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc)
+ {
+ const unsigned int geomID = ref.geomID();
+ const unsigned int primID = ref.primID();
+ const float area_aabb = area(ref.bounds());
+ const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID);
+ const unsigned int diff = 31 - lzcnt(mc.x^mc.y);
+ assert(area_prim <= area_aabb);
+ //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f);
+ const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) ));
+ assert(priority >= 0.0f && priority < FLT_LARGE);
+ return priority;
+ }
+
+
+ };
+
+ inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) {
+ return cout << "index " << item.index << " priority " << item.priority;
+ };
+
+ template<typename SplitterFactory>
+ void splitPrimitive(SplitterFactory &Splitter,
+ const PrimRef &prim,
+ const unsigned int geomID,
+ const unsigned int primID,
+ const unsigned int split_level,
+ const Vec3fa &grid_base,
+ const float grid_scale,
+ const float grid_extend,
+ PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],
+ unsigned int& numSubPrims)
+ {
+ assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+ if (split_level == 0)
+ {
+ assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+ subPrims[numSubPrims++] = prim;
+ }
+ else
+ {
+ const Vec3fa lower = prim.lower;
+ const Vec3fa upper = prim.upper;
+ const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
+ const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+ Vec3ia ilower(floor(glower));
+ Vec3ia iupper(floor(gupper));
+
+ /* this ignores dimensions that are empty */
+ iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)));
+
+ /* compute a morton code for the lower and upper grid coordinates. */
+ const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+ const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+
+ /* if all bits are equal then we cannot split */
+ if(unlikely(lower_code == upper_code))
+ {
+ assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+ subPrims[numSubPrims++] = prim;
+ return;
+ }
+
+ /* compute octree level and dimension to perform the split in */
+ const unsigned int diff = 31 - lzcnt(lower_code^upper_code);
+ const unsigned int level = diff / 3;
+ const unsigned int dim = diff % 3;
+
+ /* now we compute the grid position of the split */
+ const unsigned int isplit = iupper[dim] & ~((1<<level)-1);
+
+ /* compute world space position of split */
+ const float inv_grid_size = 1.0f / GRID_SIZE;
+ const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend;
+
+ assert(prim.lower[dim] <= fsplit &&
+ prim.upper[dim] >= fsplit);
+
+ /* split primitive */
+ const auto splitter = Splitter(prim);
+ BBox3fa left,right;
+ splitter(prim.bounds(),dim,fsplit,left,right);
+ assert(!left.empty());
+ assert(!right.empty());
+
+
+ splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+ splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+ }
+ }
+
+
+ template<typename Mesh, typename SplitterFactory>
+ PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+ {
+ ParallelPrefixSumState<PrimInfo> pstate;
+
+ /* first try */
+ progressMonitor(0);
+ PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+ return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ /* if we need to filter out geometry, run again */
+ if (pinfo.size() != numPrimRefs)
+ {
+ progressMonitor(0);
+ pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+ return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ }
+ return pinfo;
+ }
+
+ __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref)
+ {
+ const Vec3fa lower = ref.lower;
+ const Vec3fa upper = ref.upper;
+ const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f);
+ const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+ Vec3ia ilower(floor(glower));
+ Vec3ia iupper(floor(gupper));
+
+ /* this ignores dimensions that are empty */
+ iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper));
+
+ /* compute a morton code for the lower and upper grid coordinates. */
+ const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+ const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+ return Vec2i(lower_code,upper_code);
+ }
+
+ template<typename Mesh, typename SplitterFactory>
+ PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+ {
+ static const size_t MIN_STEP_SIZE = 128;
+
+ ParallelForForPrefixSumState<PrimInfo> pstate;
+ Scene::Iterator2 iter(scene,types,mblur);
+
+ /* first try */
+ progressMonitor(0);
+ pstate.init(iter,size_t(1024));
+ PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+ return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ /* if we need to filter out geometry, run again */
+ if (pinfo.size() != numPrimRefs)
+ {
+ progressMonitor(0);
+ pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+ return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ }
+
+ /* use correct number of primitives */
+ size_t numPrimitives = pinfo.size();
+ const size_t alloc_numPrimitives = prims.size();
+ const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives;
+
+ /* set up primitive splitter */
+ SplitterFactory Splitter(scene);
+
+
+ DBG_PRESPLIT(
+ const size_t org_numPrimitives = pinfo.size();
+ PRINT(numPrimitives);
+ PRINT(alloc_numPrimitives);
+ PRINT(numSplitPrimitivesBudget);
+ );
+
+ /* allocate double buffer presplit items */
+ const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives;
+ PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+ PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+
+ /* compute grid */
+ const Vec3fa grid_base = pinfo.geomBounds.lower;
+ const Vec3fa grid_diag = pinfo.geomBounds.size();
+ const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z));
+ const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend;
+
+ /* init presplit items and get total sum */
+ const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float {
+ float sum = 0.0f;
+ for (size_t i=r.begin(); i<r.end(); i++)
+ {
+ presplitItem[i].index = (unsigned int)i;
+ const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]);
+ /* if all bits are equal then we cannot split */
+ presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f;
+ /* FIXME: sum undeterministic */
+ sum += presplitItem[i].priority;
+ }
+ return sum;
+ },[](const float& a, const float& b) -> float { return a+b; });
+
+ /* compute number of splits per primitive */
+ const float inv_psum = 1.0f / psum;
+ parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ {
+ if (presplitItem[i].priority > 0.0f)
+ {
+ const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum;
+ if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims
+ {
+ presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f);
+ //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+ assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+ }
+ else
+ presplitItem[i].priority = 0.0f;
+ }
+ }
+ });
+
+ auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; };
+ size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024);
+
+ /* anything to split ? */
+ if (center < numPrimitives)
+ {
+ const size_t numPrimitivesToSplit = numPrimitives - center;
+ assert(presplitItem[center].priority >= 1.0f);
+
+ /* sort presplit items in ascending order */
+ radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024);
+
+ CHECK_PRESPLIT(
+ parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+ for (size_t i=r.begin(); i<r.end(); i++)
+ assert(presplitItem[i-1].priority <= presplitItem[i].priority);
+ });
+ );
+
+ unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem;
+ unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit;
+
+ /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
+ const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
+ size_t sum = 0;
+ for (size_t i=t.begin(); i<t.end(); i++)
+ {
+ PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+ assert(presplitItem[i].priority >= 1.0f);
+ const unsigned int primrefID = presplitItem[i].index;
+ const float prio = presplitItem[i].priority;
+ const unsigned int geomID = prims[primrefID].geomID();
+ const unsigned int primID = prims[primrefID].primID();
+ const unsigned int split_levels = (unsigned int)prio;
+ unsigned int numSubPrims = 0;
+ splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+ assert(numSubPrims);
+ numSubPrims--; // can reuse slot
+ sum+=numSubPrims;
+ presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels;
+ primOffset0[i-center] = numSubPrims;
+ }
+ return sum;
+ },[](const size_t& a, const size_t& b) -> size_t { return a+b; });
+
+ /* if we are over budget, need to shrink the range */
+ if (totalNumSubPrims > numSplitPrimitivesBudget)
+ {
+ size_t new_center = numPrimitives-1;
+ size_t sum = 0;
+ for (;new_center>=center;new_center--)
+ {
+ const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG;
+ if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break;
+ sum += numSubPrims;
+ }
+ new_center++;
+ center = new_center;
+ }
+
+ /* parallel prefix sum to compute offsets for storing sub-primitives */
+ const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+
+ /* iterate over range, and split primitives into sub primitives and append them to prims array */
+ parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
+ for (size_t j=rn.begin(); j<rn.end(); j++)
+ {
+ PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+ const unsigned int primrefID = presplitItem[j].index;
+ const unsigned int geomID = prims[primrefID].geomID();
+ const unsigned int primID = prims[primrefID].primID();
+ const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1);
+
+ assert(split_levels);
+ assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+ unsigned int numSubPrims = 0;
+ splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+ const size_t newID = numPrimitives + primOffset1[j-center];
+ assert(newID+numSubPrims <= alloc_numPrimitives);
+ prims[primrefID] = subPrims[0];
+ for (size_t i=1;i<numSubPrims;i++)
+ prims[newID+i-1] = subPrims[i];
+ }
+ });
+
+ numPrimitives += offset;
+ DBG_PRESPLIT(
+ PRINT(pinfo.size());
+ PRINT(numPrimitives);
+ PRINT((float)numPrimitives/org_numPrimitives));
+ }
+
+ /* recompute centroid bounding boxes */
+ pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo {
+ PrimInfo p(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ p.add_center2(prims[j]);
+ return p;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ assert(pinfo.size() == numPrimitives);
+
+ /* free double buffer presplit items */
+ alignedFree(tmp_presplitItem);
+ alignedFree(presplitItem);
+ return pinfo;
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/splitter.h b/thirdparty/embree-aarch64/kernels/builders/splitter.h
new file mode 100644
index 0000000000..dbd6cf07c7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/splitter.h
@@ -0,0 +1,169 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<size_t N>
+ __forceinline void splitPolygon(const BBox3fa& bounds,
+ const size_t dim,
+ const float pos,
+ const Vec3fa (&v)[N+1],
+ const Vec3fa (&inv_length)[N],
+ BBox3fa& left_o,
+ BBox3fa& right_o)
+ {
+ BBox3fa left = empty, right = empty;
+ /* clip triangle to left and right box by processing all edges */
+ for (size_t i=0; i<N; i++)
+ {
+ const Vec3fa &v0 = v[i];
+ const Vec3fa &v1 = v[i+1];
+ const float v0d = v0[dim];
+ const float v1d = v1[dim];
+
+ if (v0d <= pos) left. extend(v0); // this point is on left side
+ if (v0d >= pos) right.extend(v0); // this point is on right side
+
+ if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+ {
+ assert((v1d-v0d) != 0.0f);
+ const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0);
+ left.extend(c);
+ right.extend(c);
+ }
+ }
+
+ /* clip against current bounds */
+ left_o = intersect(left,bounds);
+ right_o = intersect(right,bounds);
+ }
+
+ template<size_t N>
+ __forceinline void splitPolygon(const PrimRef& prim,
+ const size_t dim,
+ const float pos,
+ const Vec3fa (&v)[N+1],
+ PrimRef& left_o,
+ PrimRef& right_o)
+ {
+ BBox3fa left = empty, right = empty;
+ for (size_t i=0; i<N; i++)
+ {
+ const Vec3fa &v0 = v[i];
+ const Vec3fa &v1 = v[i+1];
+ const float v0d = v0[dim];
+ const float v1d = v1[dim];
+
+ if (v0d <= pos) left. extend(v0); // this point is on left side
+ if (v0d >= pos) right.extend(v0); // this point is on right side
+
+ if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location
+ {
+ assert((v1d-v0d) != 0.0f);
+ const float inv_length = 1.0f/(v1d-v0d);
+ const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0);
+ left.extend(c);
+ right.extend(c);
+ }
+ }
+
+ /* clip against current bounds */
+ new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID());
+ new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID());
+ }
+
+ struct TriangleSplitter
+ {
+ __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim)
+ {
+ const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+ const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask );
+ TriangleMesh::Triangle tri = mesh->triangle(prim.primID());
+ v[0] = mesh->vertex(tri.v[0]);
+ v[1] = mesh->vertex(tri.v[1]);
+ v[2] = mesh->vertex(tri.v[2]);
+ v[3] = mesh->vertex(tri.v[0]);
+ inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
+ inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+ inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]);
+ }
+
+ __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+ splitPolygon<3>(prim,dim,pos,v,left_o,right_o);
+ }
+
+ __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+ splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o);
+ }
+
+ private:
+ Vec3fa v[4];
+ Vec3fa inv_length[3];
+ };
+
+ struct TriangleSplitterFactory
+ {
+ __forceinline TriangleSplitterFactory(const Scene* scene)
+ : scene(scene) {}
+
+ __forceinline TriangleSplitter operator() (const PrimRef& prim) const {
+ return TriangleSplitter(scene,prim);
+ }
+
+ private:
+ const Scene* scene;
+ };
+
+ struct QuadSplitter
+ {
+ __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim)
+ {
+ const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+ const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask );
+ QuadMesh::Quad quad = mesh->quad(prim.primID());
+ v[0] = mesh->vertex(quad.v[0]);
+ v[1] = mesh->vertex(quad.v[1]);
+ v[2] = mesh->vertex(quad.v[2]);
+ v[3] = mesh->vertex(quad.v[3]);
+ v[4] = mesh->vertex(quad.v[0]);
+ inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]);
+ inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]);
+ inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]);
+ inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]);
+ }
+
+ __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+ splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
+ }
+
+ __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+ splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
+ }
+
+ private:
+ Vec3fa v[5];
+ Vec3fa inv_length[4];
+ };
+
+ struct QuadSplitterFactory
+ {
+ __forceinline QuadSplitterFactory(const Scene* scene)
+ : scene(scene) {}
+
+ __forceinline QuadSplitter operator() (const PrimRef& prim) const {
+ return QuadSplitter(scene,prim);
+ }
+
+ private:
+ const Scene* scene;
+ };
+ }
+}
+