diff options
author | RĂ©mi Verschelde <remi@verschelde.fr> | 2021-04-27 19:07:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-27 19:07:12 +0200 |
commit | 95cfce661bbe9700209c09dfe297ab7ef5ebfe09 (patch) | |
tree | 9f3c8a98b619e2a0dd171d6cb6d8dc2e791baa14 /thirdparty/embree-aarch64/kernels/builders | |
parent | b999fbc4bd349cce153c2133dd0487694add1a05 (diff) | |
parent | 4d9d99bb827967e2bb931eeb8c3f0e079b39ae1a (diff) |
Merge pull request #48050 from JFonS/occlusion_culling
Diffstat (limited to 'thirdparty/embree-aarch64/kernels/builders')
18 files changed, 7286 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h new file mode 100644 index 0000000000..755ce255fb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h @@ -0,0 +1,411 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_sah.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include "../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_strand_array.h" + +#define NUM_HAIR_OBJECT_BINS 32 + +namespace embree +{ + namespace isa + { + struct BVHBuilderHair + { + /*! settings for builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t finished_range_threshold; //!< finished range threshold + }; + + template<typename NodeRef, + typename CreateAllocFunc, + typename CreateAABBNodeFunc, + typename SetAABBNodeFunc, + typename CreateOBBNodeFunc, + typename SetOBBNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor, + typename ReportFinishedRangeFunc> + + class BuilderT + { + ALIGNED_CLASS_(16); + friend struct BVHBuilderHair; + + typedef FastAllocator::CachedAllocator Allocator; + typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH; + typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH; + typedef HeuristicStrandSplit HeuristicStrandSplitSAH; + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + static const size_t travCostAligned = 1; + static const size_t travCostUnaligned = 5; + static const size_t intCost = 6; + + BuilderT (Scene* scene, + PrimRef* prims, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const ReportFinishedRangeFunc& reportFinishedRange, + const Settings settings) + + : cfg(settings), + prims(prims), + createAlloc(createAlloc), + createAABBNode(createAABBNode), + setAABBNode(setAABBNode), + createOBBNode(createOBBNode), + setOBBNode(setOBBNode), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + reportFinishedRange(reportFinishedRange), + alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {} + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const PrimInfoRange& range) + { + if (range.size() == 0) return true; + unsigned int firstGeomID = prims[range.begin()].geomID(); + for (size_t i=range.begin()+1; i<range.end(); i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /*! creates a large leaf that could be larger than supported by the BVH */ + NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo)) + return createLeaf(prims,pinfo,alloc); + + /* fill all children by always splitting the largest one */ + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + unsigned numChildren = 1; + children[0] = pinfo; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i])) + continue; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + __aligned(64) PrimInfoRange left, right; + if (!sameGeometry(children[bestChild])) { + alignedHeuristic.splitByGeometry(children[bestChild],left,right); + } else { + alignedHeuristic.splitFallback(children[bestChild],left,right); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* create node */ + auto node = createAABBNode(alloc); + + for (size_t i=0; i<numChildren; i++) { + const NodeRef child = createLargeLeaf(depth+1,children[i],alloc); + setAABBNode(node,i,child,children[i].geomBounds); + } + + return node; + } + + /*! performs split */ + __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack + { + /* variable to track the SAH of the best splitting approach */ + float bestSAH = inf; + const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize; + const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds); + + /* try standard binning in aligned space */ + float alignedObjectSAH = inf; + HeuristicBinningSAH::Split alignedObjectSplit; + if (aligned) { + alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize); + alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH(); + bestSAH = min(alignedObjectSAH,bestSAH); + } + + /* try standard binning in unaligned space */ + UnalignedHeuristicBinningSAH::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (bestSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpace(pinfo); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace); + unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH(); + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* try splitting into two strands */ + HeuristicStrandSplitSAH::Split strandSplit; + float strandSAH = inf; + if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) { + strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize); + strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH(); + bestSAH = min(strandSAH,bestSAH); + } + + /* fallback if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) + { + alignedHeuristic.deterministic_order(pinfo); + alignedHeuristic.splitFallback(pinfo,linfo,rinfo); + } + + /* perform aligned split if this is best */ + else if (bestSAH == alignedObjectSAH) { + alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo); + } + + /* perform unaligned split if this is best */ + else if (bestSAH == unalignedObjectSAH) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo); + aligned = false; + } + + /* perform strand split if this is best */ + else if (bestSAH == strandSAH) { + strandHeuristic.split(strandSplit,pinfo,linfo,rinfo); + aligned = false; + } + + /* can never happen */ + else + assert(false); + } + + /*! recursive build */ + NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(pinfo.size()); + + PrimInfoRange children[MAX_BRANCHING_FACTOR]; + + /* create leaf node */ + if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) { + alignedHeuristic.deterministic_order(pinfo); + return createLargeLeaf(depth,pinfo,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + size_t numChildren = 1; + children[0] = pinfo; + bool aligned = true; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.minLeafSize) + continue; + + /* remember child with largest area */ + if (area(children[i].geomBounds) > bestArea) { + bestArea = area(children[i].geomBounds); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + PrimInfoRange left, right; + split(children[bestChild],left,right,aligned); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + NodeRef node; + + /* create aligned node */ + if (aligned) + { + node = createAABBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else { + for (size_t i=0; i<numChildren; i++) { + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds); + } + } + } + + /* create unaligned node */ + else + { + node = createOBBNode(alloc); + + /* spawn tasks or ... */ + if (pinfo.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space); + const OBBox3fa obounds(space,sinfo.geomBounds); + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else + { + for (size_t i=0; i<numChildren; i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]); + const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space); + const OBBox3fa obounds(space,sinfo.geomBounds); + const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; + setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds); + } + } + } + + /* reports a finished range of primrefs */ + if (unlikely(alloc_barrier)) + reportFinishedRange(pinfo); + + return node; + } + + private: + Settings cfg; + PrimRef* prims; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeFunc& createAABBNode; + const SetAABBNodeFunc& setAABBNode; + const CreateOBBNodeFunc& createOBBNode; + const SetOBBNodeFunc& setOBBNode; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + const ReportFinishedRangeFunc& reportFinishedRange; + + private: + HeuristicBinningSAH alignedHeuristic; + UnalignedHeuristicBinningSAH unalignedHeuristic; + HeuristicStrandSplitSAH strandHeuristic; + }; + + template<typename NodeRef, + typename CreateAllocFunc, + typename CreateAABBNodeFunc, + typename SetAABBNodeFunc, + typename CreateOBBNodeFunc, + typename SetOBBNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor, + typename ReportFinishedRangeFunc> + + static NodeRef build (const CreateAllocFunc& createAlloc, + const CreateAABBNodeFunc& createAABBNode, + const SetAABBNodeFunc& setAABBNode, + const CreateOBBNodeFunc& createOBBNode, + const SetOBBNodeFunc& setOBBNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const ReportFinishedRangeFunc& reportFinishedRange, + Scene* scene, + PrimRef* prims, + const PrimInfo& pinfo, + const Settings settings) + { + typedef BuilderT<NodeRef, + CreateAllocFunc, + CreateAABBNodeFunc,SetAABBNodeFunc, + CreateOBBNodeFunc,SetOBBNodeFunc, + CreateLeafFunc,ProgressMonitor, + ReportFinishedRangeFunc> Builder; + + Builder builder(scene,prims,createAlloc, + createAABBNode,setAABBNode, + createOBBNode,setOBBNode, + createLeaf,progressMonitor,reportFinishedRange,settings); + + NodeRef root = builder.recurse(1,pinfo,nullptr,true,false); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h new file mode 100644 index 0000000000..92be2f7e65 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h @@ -0,0 +1,501 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/builder.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderMorton + { + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + /*! settings for morton builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {} + + /*! initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold) + : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + /*! Build primitive consisting of morton code and primitive ID. */ + struct __aligned(8) BuildPrim + { + union { + struct { + unsigned int code; //!< morton code + unsigned int index; //!< i'th primitive + }; + uint64_t t; + }; + + /*! interface for radix sort */ + __forceinline operator unsigned() const { return code; } + + /*! interface for standard sort */ + __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; } + }; + + /*! maps bounding box to morton code */ + struct MortonCodeMapping + { + static const size_t LATTICE_BITS_PER_DIM = 10; + static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM; + + vfloat4 base; + vfloat4 scale; + + __forceinline MortonCodeMapping(const BBox3fa& bounds) + { + base = (vfloat4)bounds.lower; + const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower; + scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); + } + + __forceinline const vint4 bin (const BBox3fa& box) const + { + const vfloat4 lower = (vfloat4)box.lower; + const vfloat4 upper = (vfloat4)box.upper; + const vfloat4 centroid = lower+upper; + return vint4((centroid-base)*scale); + } + + __forceinline unsigned int code (const BBox3fa& box) const + { + const vint4 binID = bin(box); + const unsigned int x = extract<0>(binID); + const unsigned int y = extract<1>(binID); + const unsigned int z = extract<2>(binID); + const unsigned int xyz = bitInterleave(x,y,z); + return xyz; + } + }; + +#if defined (__AVX2__) + + /*! for AVX2 there is a fast scalar bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest) {} + + __forceinline void operator() (const BBox3fa& b, const unsigned index) + { + dest->index = index; + dest->code = mapping.code(b); + dest++; + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + }; + +#else + + /*! before AVX2 is it better to use the SSE version of bitInterleave */ + struct MortonCodeGenerator + { + __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest) + : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {} + + __forceinline ~MortonCodeGenerator() + { + if (slots != 0) + { + const vint4 code = bitInterleave(ax,ay,az); + for (size_t i=0; i<slots; i++) { + dest[currentID-slots+i].index = ai[i]; + dest[currentID-slots+i].code = code[i]; + } + } + } + + __forceinline void operator() (const BBox3fa& b, const unsigned index) + { + const vint4 binID = mapping.bin(b); + ax[slots] = extract<0>(binID); + ay[slots] = extract<1>(binID); + az[slots] = extract<2>(binID); + ai[slots] = index; + slots++; + currentID++; + + if (slots == 4) + { + const vint4 code = bitInterleave(ax,ay,az); + vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); + vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); + slots = 0; + } + } + + public: + const MortonCodeMapping mapping; + BuildPrim* dest; + size_t currentID; + size_t slots; + vint4 ax, ay, az, ai; + }; + +#endif + + template< + typename ReductionTy, + typename Allocator, + typename CreateAllocator, + typename CreateNodeFunc, + typename SetNodeBoundsFunc, + typename CreateLeafFunc, + typename CalculateBounds, + typename ProgressMonitor> + + class BuilderT : private Settings + { + ALIGNED_CLASS_(16); + + public: + + BuilderT (CreateAllocator& createAllocator, + CreateNodeFunc& createNode, + SetNodeBoundsFunc& setBounds, + CreateLeafFunc& createLeaf, + CalculateBounds& calculateBounds, + ProgressMonitor& progressMonitor, + const Settings& settings) + + : Settings(settings), + createAllocator(createAllocator), + createNode(createNode), + setBounds(setBounds), + createLeaf(createLeaf), + calculateBounds(calculateBounds), + progressMonitor(progressMonitor), + morton(nullptr) {} + + ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (depth > maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.size() <= maxLeafSize) + return createLeaf(current,alloc); + + /* fill all children by always splitting the largest one */ + range<unsigned> children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + + do { + + /* find best child with largest number of primitives */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= maxLeafSize) + continue; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == size_t(-1)) break; + + /*! split best child into left and right child */ + auto split = children[bestChild].split(); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = split.first; + children[numChildren+0] = split.second; + numChildren++; + + } while (numChildren < branchingFactor); + + /* create node */ + auto node = createNode(alloc,numChildren); + + /* recurse into each child */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + for (size_t i=0; i<numChildren; i++) + bounds[i] = createLargeLeaf(depth+1,children[i],alloc); + + return setBounds(node,bounds,numChildren); + } + + /*! recreates morton codes when reaching a region where all codes are identical */ + __noinline void recreateMortonCodes(const range<unsigned>& current) const + { + /* fast path for small ranges */ + if (likely(current.size() < 1024)) + { + /*! recalculate centroid bounds */ + BBox3fa centBounds(empty); + for (size_t i=current.begin(); i<current.end(); i++) + centBounds.extend(center2(calculateBounds(morton[i]))); + + /* recalculate morton codes */ + MortonCodeMapping mapping(centBounds); + for (size_t i=current.begin(); i<current.end(); i++) + morton[i].code = mapping.code(calculateBounds(morton[i])); + + /* sort morton codes */ + std::sort(morton+current.begin(),morton+current.end()); + } + else + { + /*! recalculate centroid bounds */ + auto calculateCentBounds = [&] ( const range<unsigned>& r ) { + BBox3fa centBounds = empty; + for (size_t i=r.begin(); i<r.end(); i++) + centBounds.extend(center2(calculateBounds(morton[i]))); + return centBounds; + }; + const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024), + BBox3fa(empty), calculateCentBounds, BBox3fa::merge); + + /* recalculate morton codes */ + MortonCodeMapping mapping(centBounds); + parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) { + for (size_t i=r.begin(); i<r.end(); i++) { + morton[i].code = mapping.code(calculateBounds(morton[i])); + } + }); + + /*! sort morton codes */ +#if defined(TASKING_TBB) + tbb::parallel_sort(morton+current.begin(),morton+current.end()); +#else + radixsort32(morton+current.begin(),current.size()); +#endif + } + } + + __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const + { + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + unsigned int bitpos = lzcnt(code_start^code_end); + + /* if all items mapped to same morton code, then re-create new morton codes for the items */ + if (unlikely(bitpos == 32)) + { + recreateMortonCodes(current); + const unsigned int code_start = morton[current.begin()].code; + const unsigned int code_end = morton[current.end()-1].code; + bitpos = lzcnt(code_start^code_end); + + /* if the morton code is still the same, goto fall back split */ + if (unlikely(bitpos == 32)) { + current.split(left,right); + return; + } + } + + /* split the items at the topmost different morton code bit */ + const unsigned int bitpos_diff = 31-bitpos; + const unsigned int bitmask = 1 << bitpos_diff; + + /* find location where bit differs using binary search */ + unsigned begin = current.begin(); + unsigned end = current.end(); + while (begin + 1 != end) { + const unsigned mid = (begin+end)/2; + const unsigned bit = morton[mid].code & bitmask; + if (bit == 0) begin = mid; else end = mid; + } + unsigned center = end; +#if defined(DEBUG) + for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0); + for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask); +#endif + + left = make_range(current.begin(),center); + right = make_range(center,current.end()); + } + + ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAllocator(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= singleThreadThreshold) + progressMonitor(current.size()); + + /* create leaf node */ + if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize)) + return createLargeLeaf(depth,current,alloc); + + /* fill all children by always splitting the one with the largest surface area */ + range<unsigned> children[MAX_BRANCHING_FACTOR]; + split(current,children[0],children[1]); + size_t numChildren = 2; + + while (numChildren < branchingFactor) + { + /* find best child with largest number of primitives */ + int bestChild = -1; + unsigned bestItems = 0; + for (unsigned int i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= minLeafSize) + continue; + + /* remember child with largest area */ + if (children[i].size() > bestItems) { + bestItems = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + range<unsigned> left, right; + split(children[bestChild],left,right); + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + } + + /* create leaf node if no split is possible */ + if (unlikely(numChildren == 1)) + return createLeaf(current,alloc); + + /* allocate node */ + auto node = createNode(alloc,numChildren); + + /* process top parts of tree parallel */ + ReductionTy bounds[MAX_BRANCHING_FACTOR]; + if (current.size() > singleThreadThreshold) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + bounds[i] = recurse(depth+1,children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + + /* finish tree sequentially */ + else + { + for (size_t i=0; i<numChildren; i++) + bounds[i] = recurse(depth+1,children[i],alloc,false); + } + + return setBounds(node,bounds,numChildren); + } + + /* build function */ + ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives) + { + /* sort morton codes */ + morton = src; + radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold); + + /* build BVH */ + const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + public: + CreateAllocator& createAllocator; + CreateNodeFunc& createNode; + SetNodeBoundsFunc& setBounds; + CreateLeafFunc& createLeaf; + CalculateBounds& calculateBounds; + ProgressMonitor& progressMonitor; + + public: + BuildPrim* morton; + }; + + + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetBoundsFunc, + typename CreateLeafFunc, + typename CalculateBoundsFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAllocator, + CreateNodeFunc createNode, + SetBoundsFunc setBounds, + CreateLeafFunc createLeaf, + CalculateBoundsFunc calculateBounds, + ProgressMonitor progressMonitor, + BuildPrim* src, + BuildPrim* tmp, + size_t numPrimitives, + const Settings& settings) + { + typedef BuilderT< + ReductionTy, + decltype(createAllocator()), + CreateAllocFunc, + CreateNodeFunc, + SetBoundsFunc, + CreateLeafFunc, + CalculateBoundsFunc, + ProgressMonitor> Builder; + + Builder builder(createAllocator, + createNode, + setBounds, + createLeaf, + calculateBounds, + progressMonitor, + settings); + + return builder.build(src,tmp,numPrimitives); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h new file mode 100644 index 0000000000..4c138dacdb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h @@ -0,0 +1,692 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#define MBLUR_NUM_TEMPORAL_BINS 2 +#define MBLUR_NUM_OBJECT_BINS 32 + +#include "../bvh/bvh.h" +#include "../common/primref_mb.h" +#include "heuristic_binning_array_aligned.h" +#include "heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + template<typename T> + struct SharedVector + { + __forceinline SharedVector() {} + + __forceinline SharedVector(T* ptr, size_t refCount = 1) + : prims(ptr), refCount(refCount) {} + + __forceinline void incRef() { + refCount++; + } + + __forceinline void decRef() + { + if (--refCount == 0) + delete prims; + } + + T* prims; + size_t refCount; + }; + + template<typename BuildRecord, int MAX_BRANCHING_FACTOR> + struct LocalChildListT + { + typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; + + __forceinline LocalChildListT (const BuildRecord& record) + : numChildren(1), numSharedPrimVecs(1) + { + /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */ + children[0] = record; + primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2); + } + + __forceinline ~LocalChildListT() + { + for (size_t i = 0; i < numChildren; i++) + primvecs[i]->decRef(); + } + + __forceinline BuildRecord& operator[] ( const size_t i ) { + return children[i]; + } + + __forceinline size_t size() const { + return numChildren; + } + + __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector) + { + SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild]; + if (lrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[bestChild] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims); + } + + if (rrecord.prims.prims == bsharedPrimVec->prims) { + primvecs[numChildren] = bsharedPrimVec; + bsharedPrimVec->incRef(); + } + else { + primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims); + } + bsharedPrimVec->decRef(); + new_vector.release(); + + children[bestChild] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + public: + array_t<BuildRecord,MAX_BRANCHING_FACTOR> children; + array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs; + size_t numChildren; + + array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs; + size_t numSharedPrimVecs; + }; + + template<typename Mesh> + struct RecalculatePrimRef + { + Scene* scene; + + __forceinline RecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get<Mesh>(geomID); + const LBBox3fa lbounds = mesh->linearBounds(primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Mesh* mesh = scene->get<Mesh>(geomID); + const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range); + } + + // __noinline is workaround for ICC16 bug under MacOSX + __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range); + } + }; + + struct VirtualRecalculatePrimRef + { + Scene* scene; + + __forceinline VirtualRecalculatePrimRef (Scene* scene) + : scene(scene) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const + { + const unsigned geomID = prim.geomID(); + const unsigned primID = prim.primID(); + const Geometry* mesh = scene->get(geomID); + const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { + return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range); + } + }; + + struct BVHBuilderMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8), + travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false), + singleThreadThreshold(1024) {} + + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + bool singleLeafTimeSegment; //!< split time to single time range + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) { + return a.prims.size() < b.prims.size(); + } + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< Depth of the root of this subtree. + SetMB prims; //!< The list of primitives. + }; + + struct BuildRecordSplit : public BuildRecord + { + __forceinline BuildRecordSplit () {} + + __forceinline BuildRecordSplit (size_t depth) + : BuildRecord(depth) {} + + __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split) + : BuildRecord(record), split(split) {} + + BinSplit<MBLUR_NUM_OBJECT_BINS> split; + }; + + template< + typename NodeRef, + typename RecalculatePrimRef, + typename Allocator, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + class BuilderT + { + ALIGNED_CLASS_(16); + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split; + typedef mvector<PrimRefMB>* PrimRefVector; + typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector; + typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList; + typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit; + + public: + + BuilderT (MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitor progressMonitor, + const Settings& settings) + : cfg(settings), + heuristicObjectSplit(), + heuristicTemporalSplit(device, recalculatePrimRef), + recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf), + progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + /*! finds the best split */ + const Split find(const SetMB& set) + { + /* first try standard object split */ + const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize); + const float object_split_sah = object_split.splitSAH(); + + /* test temporal splits only when object split was bad */ + const float leaf_sah = set.leafSAH(cfg.logBlockSize); + if (object_split_sah < 0.50f*leaf_sah) + return object_split; + + /* do temporal splits only if the the time range is big enough */ + if (set.time_range.size() > 1.01f/float(set.max_num_time_segments)) + { + const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize); + const float temporal_split_sah = temporal_split.splitSAH(); + + /* take temporal split if it improved SAH */ + if (temporal_split_sah < object_split_sah) + return temporal_split; + } + + return object_split; + } + + /*! array partitioning */ + __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + /* perform object split */ + if (likely(split.data == Split::SPLIT_OBJECT)) { + heuristicObjectSplit.split(split,set,lset,rset); + } + /* perform temporal split */ + else if (likely(split.data == Split::SPLIT_TEMPORAL)) { + return heuristicTemporalSplit.split(split,set,lset,rset); + } + /* perform fallback split */ + else if (unlikely(split.data == Split::SPLIT_FALLBACK)) { + set.deterministic_order(); + splitFallback(set,lset,rset); + } + /* split by geometry */ + else if (unlikely(split.data == Split::SPLIT_GEOMID)) { + set.deterministic_order(); + splitByGeometry(set,lset,rset); + } + else + assert(false); + + return std::unique_ptr<mvector<PrimRefMB>>(); + } + + /*! finds the best fallback split */ + __noinline Split findFallback(const SetMB& set) + { + /* split if primitives are not from same geometry */ + if (!sameGeometry(set)) + return Split(0.0f,Split::SPLIT_GEOMID); + + /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */ + if (cfg.singleLeafTimeSegment) + { + /* test if one primitive has more than one time segment in time range, if so split time */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const PrimRefMB& prim = (*set.prims)[i]; + const range<int> itime_range = prim.timeSegmentRange(set.time_range); + const int localTimeSegments = itime_range.size(); + assert(localTimeSegments > 0); + if (localTimeSegments > 1) { + const int icenter = (itime_range.begin() + itime_range.end())/2; + const float splitTime = prim.timeStep(icenter); + return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime); + } + } + } + + /* otherwise return fallback split */ + return Split(0.0f,Split::SPLIT_FALLBACK); + } + + /*! performs fallback split */ + void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) + { + mvector<PrimRefMB>& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i<center; i++) + linfo.add_primref(prims[i]); + + PrimInfoMB rinfo = empty; + for (size_t i=center; i<end; i++) + rinfo.add_primref(prims[i]); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + if (set.size() == 0) return true; + mvector<PrimRefMB>& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + unsigned int firstGeomID = prims[begin].geomID(); + for (size_t i=begin+1; i<end; i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /* split by geometry ID */ + void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(set.size() > 1); + + mvector<PrimRefMB>& prims = *set.prims; + const size_t begin = set.begin(); + const size_t end = set.end(); + + PrimInfoMB left(empty); + PrimInfoMB right(empty); + unsigned int geomID = prims[begin].geomID(); + size_t center = serial_partitioning(prims.data(),begin,end,left,right, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); }); + + new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + + const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (in.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* replace already found split by fallback split */ + const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims)); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i<current.prims.end(); i++) { + mvector<PrimRefMB>& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split) + return createLeaf(current,alloc); + + /* fill all children by always splitting the largest one */ + bool hasTimeSplits = false; + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildListSplit children(current); + + do { + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split) + continue; + + force_split = false; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecordSplit& brecord = children[bestChild]; + BuildRecordSplit lrecord(current.depth+1); + BuildRecordSplit rrecord(current.depth+1); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + + /* find new splits */ + lrecord.split = findFallback(lrecord.prims); + rrecord.split = findFallback(rrecord.prims); + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = in.prims.time_range; + hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; + } + + /* create node */ + auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits); + + /* recurse into each child and perform reduction */ + LBBox3fa gbounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = createLargeLeaf(children[i],alloc); + gbounds.extend(values[i].lbounds); + } + + setNode(current,children.children.data(),node,values,children.numChildren); + + /* calculate geometry bounds of this node */ + if (hasTimeSplits) + return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); + else + return NodeRecordMB4D(node,gbounds,current.prims.time_range); + } + + const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= cfg.singleThreadThreshold) + progressMonitor(current.size()); + + /*! find best split */ + const Split csplit = find(current.prims); + + /*! compute leaf and split cost */ + const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize); + const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH(); + assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + SetMB lprims,rprims; + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims); + bool hasTimeSplits = new_vector != nullptr; + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + { + BuildRecord lrecord(lprims,current.depth+1); + BuildRecord rrecord(rprims,current.depth+1); + children.split(0,lrecord,rrecord,std::move(new_vector)); + } + + /*! split until node is full or SAH tells us to stop */ + while (children.size() < cfg.branchingFactor) + { + /*! find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i<children.size(); i++) + { + if (children[i].size() <= cfg.minLeafSize) continue; + if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) { + bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + Split csplit = find(brecord.prims); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims); + hasTimeSplits |= new_vector != nullptr; + children.split(bestChild,lrecord,rrecord,std::move(new_vector)); + } + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + hasTimeSplits |= c.lower > p.lower || c.upper < p.upper; + } + + /* sort buildrecords for simpler shadow ray traversal */ + //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (need to verified) !! + + /*! create an inner node */ + auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits); + LBBox3fa gbounds = empty; + + /* spawn tasks */ + if (unlikely(current.size() > cfg.singleThreadThreshold)) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + /*! merge bounding boxes */ + for (size_t i=0; i<children.size(); i++) + gbounds.extend(values[i].lbounds); + } + /* recurse into each child */ + else + { + //for (size_t i=0; i<children.size(); i++) + for (ssize_t i=children.size()-1; i>=0; i--) { + values[i] = recurse(children[i],alloc,false); + gbounds.extend(values[i].lbounds); + } + } + + setNode(current,children.children.data(),node,values,children.numChildren); + + /* calculate geometry bounds of this node */ + if (unlikely(hasTimeSplits)) + return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range); + else + return NodeRecordMB4D(node,gbounds,current.prims.time_range); + } + + /*! builder entry function */ + __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) + { + const SetMB set(pinfo,&prims); + auto ret = recurse(BuildRecord(set,1),nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return ret; + } + + private: + Settings cfg; + HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit; + HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit; + const RecalculatePrimRef recalculatePrimRef; + const CreateAllocFunc createAlloc; + const CreateNodeFunc createNode; + const SetNodeFunc setNode; + const CreateLeafFunc createLeaf; + const ProgressMonitor progressMonitor; + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename SetNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitorFunc> + + static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims, + const PrimInfoMB& pinfo, + MemoryMonitorInterface* device, + const RecalculatePrimRef recalculatePrimRef, + const CreateAllocFunc createAlloc, + const CreateNodeFunc createNode, + const SetNodeFunc setNode, + const CreateLeafFunc createLeaf, + const ProgressMonitorFunc progressMonitor, + const Settings& settings) + { + typedef BuilderT< + NodeRef, + RecalculatePrimRef, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + SetNodeFunc, + CreateLeafFunc, + ProgressMonitorFunc> Builder; + + Builder builder(device, + recalculatePrimRef, + createAlloc, + createNode, + setNode, + createLeaf, + progressMonitor, + settings); + + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h new file mode 100644 index 0000000000..e477c313a3 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h @@ -0,0 +1,526 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../geometry/primitive.h" +#include "../builders/bvh_builder_msmblur.h" +#include "../builders/heuristic_binning_array_aligned.h" +#include "../builders/heuristic_binning_array_unaligned.h" +#include "../builders/heuristic_timesplit_array.h" + +namespace embree +{ + namespace isa + { + struct BVHBuilderHairMSMBlur + { + /*! settings for msmblur builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {} + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + }; + + struct BuildRecord + { + public: + __forceinline BuildRecord () {} + + __forceinline BuildRecord (size_t depth) + : depth(depth) {} + + __forceinline BuildRecord (const SetMB& prims, size_t depth) + : depth(depth), prims(prims) {} + + __forceinline size_t size() const { + return prims.size(); + } + + public: + size_t depth; //!< depth of the root of this subtree + SetMB prims; //!< the list of primitives + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateAABBNodeMBFunc, + typename SetAABBNodeMBFunc, + typename CreateOBBNodeMBFunc, + typename SetOBBNodeMBFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + class BuilderT + { + ALIGNED_CLASS_(16); + + static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree if we are that many levels before the maximum tree depth + static const size_t SINGLE_THREADED_THRESHOLD = 4096; //!< threshold to switch to single threaded build + + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + typedef FastAllocator::CachedAllocator Allocator; + typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList; + + typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal; + typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning; + typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning; + + public: + + BuilderT (Scene* scene, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + + : cfg(settings), + scene(scene), + recalculatePrimRef(recalculatePrimRef), + createAlloc(createAlloc), + createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB), + createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB), + createLeaf(createLeaf), + progressMonitor(progressMonitor), + unalignedHeuristic(scene), + temporalSplitHeuristic(scene->device,recalculatePrimRef) {} + + private: + + /*! checks if all primitives are from the same geometry */ + __forceinline bool sameGeometry(const SetMB& set) + { + mvector<PrimRefMB>& prims = *set.prims; + unsigned int firstGeomID = prims[set.begin()].geomID(); + for (size_t i=set.begin()+1; i<set.end(); i++) { + if (prims[i].geomID() != firstGeomID){ + return false; + } + } + return true; + } + + /*! performs some split if SAH approaches fail */ + void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset) + { + mvector<PrimRefMB>& prims = *set.prims; + + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfoMB linfo = empty; + for (size_t i=begin; i<center; i++) + linfo.add_primref(prims[i]); + + PrimInfoMB rinfo = empty; + for (size_t i=center; i<end; i++) + rinfo.add_primref(prims[i]); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(set.size() > 1); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB linfo(empty); + PrimInfoMB rinfo(empty); + unsigned int geomID = (*set.prims)[begin].geomID(); + size_t center = serial_partitioning(set.prims->data(),begin,end,linfo,rinfo, + [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; }, + [ ] ( PrimInfoMB& a, const PrimRefMB& ref ) { a.add_primref(ref); }); + + new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end ),set.time_range); + } + + /*! creates a large leaf that could be larger than supported by the BVH */ + NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* special case when directly creating leaf without any splits that could shrink time_range */ + bool force_split = false; + if (current.depth == 1 && current.size() > 0) + { + BBox1f c = empty; + BBox1f p = current.prims.time_range; + for (size_t i=current.prims.begin(); i<current.prims.end(); i++) { + mvector<PrimRefMB>& prims = *current.prims.prims; + c.extend(prims[i].time_range); + } + + force_split = c.lower > p.lower || c.upper < p.upper; + } + + /* create leaf for few primitives */ + if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split) + return createLeaf(current.prims,alloc); + + /* fill all children by always splitting the largest one */ + LocalChildList children(current); + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + + do { + + /* find best child with largest bounding box area */ + int bestChild = -1; + size_t bestSize = 0; + for (unsigned i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split) + continue; + + force_split = false; + + /* remember child with largest size */ + if (children[i].size() > bestSize) { + bestSize = children[i].size(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!sameGeometry(children[bestChild].prims)) { + splitByGeometry(children[bestChild].prims,left.prims,right.prims); + } else { + splitFallback(children[bestChild].prims,left.prims,right.prims); + } + children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>()); + + } while (children.size() < cfg.branchingFactor); + + + /* detect time_ranges that have shrunken */ + bool timesplit = false; + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + timesplit |= c.lower > p.lower || c.upper < p.upper; + } + + /* create node */ + NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit); + + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = createLargeLeaf(children[i],alloc); + bounds.extend(values[i].lbounds); + } + + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + + if (timesplit) + bounds = current.prims.linearBounds(recalculatePrimRef); + + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + + /*! performs split */ + std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit) + { + /* variable to track the SAH of the best splitting approach */ + float bestSAH = inf; + const float leafSAH = current.prims.leafSAH(cfg.logBlockSize); + + /* perform standard binning in aligned space */ + HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize); + float alignedObjectSAH = alignedObjectSplit.splitSAH(); + bestSAH = min(alignedObjectSAH,bestSAH); + + /* perform standard binning in unaligned space */ + UnalignedHeuristicBinning::Split unalignedObjectSplit; + LinearSpace3fa uspace; + float unalignedObjectSAH = inf; + if (alignedObjectSAH > 0.7f*leafSAH) { + uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims); + const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace); + unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace); + unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive + bestSAH = min(unalignedObjectSAH,bestSAH); + } + + /* do temporal splits only if previous approaches failed to produce good SAH and the the time range is large enough */ + float temporal_split_sah = inf; + typename HeuristicTemporal::Split temporal_split; + if (bestSAH > 0.5f*leafSAH) { + if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) { + temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize); + temporal_split_sah = temporal_split.splitSAH(); + bestSAH = min(temporal_split_sah,bestSAH); + } + } + + /* perform fallback split if SAH heuristics failed */ + if (unlikely(!std::isfinite(bestSAH))) { + current.prims.deterministic_order(); + splitFallback(current.prims,lrecord.prims,rrecord.prims); + } + /* perform aligned split if this is best */ + else if (likely(bestSAH == alignedObjectSAH)) { + alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims); + } + /* perform unaligned split if this is best */ + else if (likely(bestSAH == unalignedObjectSAH)) { + unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims); + aligned = false; + } + /* perform temporal split if this is best */ + else if (likely(bestSAH == temporal_split_sah)) { + timesplit = true; + return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims); + } + else + assert(false); + + return std::unique_ptr<mvector<PrimRefMB>>(); + } + + /*! recursive build */ + NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD) + progressMonitor(current.size()); + + /* create leaf node */ + if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) { + current.prims.deterministic_order(); + return createLargeLeaf(current,alloc); + } + + /* fill all children by always splitting the one with the largest surface area */ + NodeRecordMB4D values[MAX_BRANCHING_FACTOR]; + LocalChildList children(current); + bool aligned = true; + bool timesplit = false; + + do { + + /* find best child with largest bounding box area */ + ssize_t bestChild = -1; + float bestArea = neg_inf; + for (size_t i=0; i<children.size(); i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].size() <= cfg.minLeafSize) + continue; + + /* remember child with largest area */ + const float A = children[i].prims.halfArea(); + if (A > bestArea) { + bestArea = children[i].prims.halfArea(); + bestChild = i; + } + } + if (bestChild == -1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit); + children.split(bestChild,left,right,std::move(new_vector)); + + } while (children.size() < cfg.branchingFactor); + + /* detect time_ranges that have shrunken */ + for (size_t i=0; i<children.size(); i++) { + const BBox1f c = children[i].prims.time_range; + const BBox1f p = current.prims.time_range; + timesplit |= c.lower > p.lower || c.upper < p.upper; + } + + /* create time split node */ + if (timesplit) + { + const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequential */ + else { + for (size_t i=0; i<children.size(); i++) { + values[i] = recurse(children[i],alloc,false); + } + } + + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + + const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + + /* create aligned node */ + else if (aligned) + { + const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + LBBox3fa cbounds[MAX_BRANCHING_FACTOR]; + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + cbounds[i] = values[i].lbounds; + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) + bounds.extend(cbounds[i]); + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + /* ... continue sequentially */ + else + { + LBBox3fa bounds = empty; + for (size_t i=0; i<children.size(); i++) { + values[i] = recurse(children[i],alloc,false); + bounds.extend(values[i].lbounds); + } + setAABBNodeMB(current,children.children.data(),node,values,children.numChildren); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + } + + /* create unaligned node */ + else + { + const NodeRef node = createOBBNodeMB(alloc); + + /* spawn tasks or ... */ + if (current.size() > SINGLE_THREADED_THRESHOLD) + { + parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims); + const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space); + const auto child = recurse(children[i],nullptr,true); + setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + } + /* ... continue sequentially */ + else + { + for (size_t i=0; i<children.size(); i++) { + const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims); + const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space); + const auto child = recurse(children[i],alloc,false); + setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range); + } + } + + const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef); + return NodeRecordMB4D(node,bounds,current.prims.time_range); + } + } + + public: + + /*! entry point into builder */ + NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo) + { + BuildRecord record(SetMB(pinfo,&prims),1); + auto root = recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + private: + Settings cfg; + Scene* scene; + const RecalculatePrimRef& recalculatePrimRef; + const CreateAllocFunc& createAlloc; + const CreateAABBNodeMBFunc& createAABBNodeMB; + const SetAABBNodeMBFunc& setAABBNodeMB; + const CreateOBBNodeMBFunc& createOBBNodeMB; + const SetOBBNodeMBFunc& setOBBNodeMB; + const CreateLeafFunc& createLeaf; + const ProgressMonitor& progressMonitor; + + private: + HeuristicBinning alignedHeuristic; + UnalignedHeuristicBinning unalignedHeuristic; + HeuristicTemporal temporalSplitHeuristic; + }; + + template<typename NodeRef, + typename RecalculatePrimRef, + typename CreateAllocFunc, + typename CreateAABBNodeMBFunc, + typename SetAABBNodeMBFunc, + typename CreateOBBNodeMBFunc, + typename SetOBBNodeMBFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo, + const RecalculatePrimRef& recalculatePrimRef, + const CreateAllocFunc& createAlloc, + const CreateAABBNodeMBFunc& createAABBNodeMB, + const SetAABBNodeMBFunc& setAABBNodeMB, + const CreateOBBNodeMBFunc& createOBBNodeMB, + const SetOBBNodeMBFunc& setOBBNodeMB, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings settings) + { + typedef BuilderT<NodeRef,RecalculatePrimRef,CreateAllocFunc, + CreateAABBNodeMBFunc,SetAABBNodeMBFunc, + CreateOBBNodeMBFunc,SetOBBNodeMBFunc, + CreateLeafFunc,ProgressMonitor> Builder; + + Builder builder(scene,recalculatePrimRef,createAlloc, + createAABBNodeMB,setAABBNodeMB, + createOBBNodeMB,setOBBNodeMB, + createLeaf,progressMonitor,settings); + + return builder(prims,pinfo); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h new file mode 100644 index 0000000000..3f7e678a10 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h @@ -0,0 +1,669 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning_array_aligned.h" +#include "heuristic_spatial_array.h" +#include "heuristic_openmerge_array.h" + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL +# define NUM_OBJECT_BINS 16 +# define NUM_SPATIAL_BINS 16 +#else +# define NUM_OBJECT_BINS 32 +# define NUM_SPATIAL_BINS 16 +#endif + +namespace embree +{ + namespace isa + { + MAYBE_UNUSED static const float travCost = 1.0f; + MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024; + + struct GeneralBVHBuilder + { + static const size_t MAX_BRANCHING_FACTOR = 16; //!< maximum supported BVH branching factor + static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth + + + /*! settings for SAH builder */ + struct Settings + { + /*! default settings */ + Settings () + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {} + + /*! initialize settings from API settings */ + Settings (const RTCBuildArguments& settings) + : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), + travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) + { + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth; + if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize )) logBlockSize = bsr(static_cast<size_t>(settings.sahBlockSize)); + if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize; + if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost )) travCost = settings.traversalCost; + if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost )) intCost = settings.intersectionCost; + + minLeafSize = min(minLeafSize,maxLeafSize); + } + + Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf) + : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), + travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc) + { + minLeafSize = min(minLeafSize,maxLeafSize); + } + + public: + size_t branchingFactor; //!< branching factor of BVH to build + size_t maxDepth; //!< maximum depth of BVH to build + size_t logBlockSize; //!< log2 of blocksize for SAH heuristic + size_t minLeafSize; //!< minimum size of a leaf + size_t maxLeafSize; //!< maximum size of a leaf + float travCost; //!< estimated cost of one traversal step + float intCost; //!< estimated cost of one primitive intersection + size_t singleThreadThreshold; //!< threshold when we switch to single threaded build + size_t primrefarrayalloc; //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished + }; + + /*! recursive state of builder */ + template<typename Set, typename Split> + struct BuildRecordT + { + public: + __forceinline BuildRecordT () {} + + __forceinline BuildRecordT (size_t depth) + : depth(depth), alloc_barrier(false), prims(empty) {} + + __forceinline BuildRecordT (size_t depth, const Set& prims) + : depth(depth), alloc_barrier(false), prims(prims) {} + + __forceinline BBox3fa bounds() const { return prims.geomBounds; } + + __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); } + __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size(); } + + __forceinline size_t size() const { return prims.size(); } + + public: + size_t depth; //!< Depth of the root of this subtree. + bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes + Set prims; //!< The list of primitives. + }; + + template<typename PrimRef, typename Set> + struct DefaultCanCreateLeafFunc + { + __forceinline bool operator()(const PrimRef*, const Set&) const { return true; } + }; + + template<typename PrimRef, typename Set> + struct DefaultCanCreateLeafSplitFunc + { + __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { } + }; + + template<typename BuildRecord, + typename Heuristic, + typename Set, + typename PrimRef, + typename ReductionTy, + typename Allocator, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + class BuilderT + { + friend struct GeneralBVHBuilder; + + BuilderT (PrimRef* prims, + Heuristic& heuristic, + const CreateAllocFunc& createAlloc, + const CreateNodeFunc& createNode, + const UpdateNodeFunc& updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) : + cfg(settings), + prims(prims), + heuristic(heuristic), + createAlloc(createAlloc), + createNode(createNode), + updateNode(updateNode), + createLeaf(createLeaf), + canCreateLeaf(canCreateLeaf), + canCreateLeafSplit(canCreateLeafSplit), + progressMonitor(progressMonitor) + { + if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) + throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large"); + } + + const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc) + { + /* this should never occur but is a fatal error */ + if (current.depth > cfg.maxDepth) + throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached"); + + /* create leaf for few primitives */ + if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims)) + return createLeaf(prims,current.prims,alloc); + + /* fill all children by always splitting the largest one */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + size_t numChildren = 1; + children[0] = current; + do { + + /* find best child with largest bounding box area */ + size_t bestChild = -1; + size_t bestSize = 0; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims)) + continue; + + /* remember child with largest size */ + if (children[i].prims.size() > bestSize) { + bestSize = children[i].prims.size(); + bestChild = i; + } + } + if (bestChild == (size_t)-1) break; + + /*! split best child into left and right child */ + BuildRecord left(current.depth+1); + BuildRecord right(current.depth+1); + if (!canCreateLeaf(prims,children[bestChild].prims)) { + canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims); + } else { + heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims); + } + + /* add new children left and right */ + children[bestChild] = children[numChildren-1]; + children[numChildren-1] = left; + children[numChildren+0] = right; + numChildren++; + + } while (numChildren < cfg.branchingFactor); + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i<numChildren; i++) + children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc; + + /* create node */ + auto node = createNode(children,numChildren,alloc); + + /* recurse into each child and perform reduction */ + for (size_t i=0; i<numChildren; i++) + values[i] = createLargeLeaf(children[i],alloc); + + /* perform reduction */ + return updateNode(current,children,node,values,numChildren); + } + + const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel) + { + /* get thread local allocator */ + if (!alloc) + alloc = createAlloc(); + + /* call memory monitor function to signal progress */ + if (toplevel && current.size() <= cfg.singleThreadThreshold) + progressMonitor(current.size()); + + /*! find best split */ + auto split = heuristic.find(current.prims,cfg.logBlockSize); + + /*! compute leaf and split cost */ + const float leafSAH = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize); + const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH(); + assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0))); + + /*! create a leaf node when threshold reached or SAH tells us to stop */ + if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) { + heuristic.deterministic_order(current.prims); + return createLargeLeaf(current,alloc); + } + + /*! perform initial split */ + Set lprims,rprims; + heuristic.split(split,current.prims,lprims,rprims); + + /*! initialize child list with initial split */ + ReductionTy values[MAX_BRANCHING_FACTOR]; + BuildRecord children[MAX_BRANCHING_FACTOR]; + children[0] = BuildRecord(current.depth+1,lprims); + children[1] = BuildRecord(current.depth+1,rprims); + size_t numChildren = 2; + + /*! split until node is full or SAH tells us to stop */ + while (numChildren < cfg.branchingFactor) + { + /*! find best child to split */ + float bestArea = neg_inf; + ssize_t bestChild = -1; + for (size_t i=0; i<numChildren; i++) + { + /* ignore leaves as they cannot get split */ + if (children[i].prims.size() <= cfg.minLeafSize) continue; + + /* find child with largest surface area */ + if (halfArea(children[i].prims.geomBounds) > bestArea) { + bestChild = i; + bestArea = halfArea(children[i].prims.geomBounds); + } + } + if (bestChild == -1) break; + + /* perform best found split */ + BuildRecord& brecord = children[bestChild]; + BuildRecord lrecord(current.depth+1); + BuildRecord rrecord(current.depth+1); + auto split = heuristic.find(brecord.prims,cfg.logBlockSize); + heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims); + children[bestChild ] = lrecord; + children[numChildren] = rrecord; + numChildren++; + } + + /* set barrier for primrefarrayalloc */ + if (unlikely(current.size() > cfg.primrefarrayalloc)) + for (size_t i=0; i<numChildren; i++) + children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc; + + /* sort buildrecords for faster shadow ray traversal */ + std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>()); + + /*! create an inner node */ + auto node = createNode(children,numChildren,alloc); + + /* spawn tasks */ + if (current.size() > cfg.singleThreadThreshold) + { + /*! parallel_for is faster than spawing sub-tasks */ + parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here + for (size_t i=r.begin(); i<r.end(); i++) { + values[i] = recurse(children[i],nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + } + }); + + return updateNode(current,children,node,values,numChildren); + } + /* recurse into each child */ + else + { + for (size_t i=0; i<numChildren; i++) + values[i] = recurse(children[i],alloc,false); + + return updateNode(current,children,node,values,numChildren); + } + } + + private: + Settings cfg; + PrimRef* prims; + Heuristic& heuristic; + const CreateAllocFunc& createAlloc; + const CreateNodeFunc& createNode; + const UpdateNodeFunc& updateNode; + const CreateLeafFunc& createLeaf; + const CanCreateLeafFunc& canCreateLeaf; + const CanCreateLeafSplitFunc& canCreateLeafSplit; + const ProgressMonitor& progressMonitor; + }; + + template< + typename ReductionTy, + typename Heuristic, + typename Set, + typename PrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + DefaultCanCreateLeafFunc<PrimRef, Set>, + DefaultCanCreateLeafSplitFunc<PrimRef, Set>, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + DefaultCanCreateLeafFunc<PrimRef, Set>(), + DefaultCanCreateLeafSplitFunc<PrimRef, Set>(), + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = builder.recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + + template< + typename ReductionTy, + typename Heuristic, + typename Set, + typename PrimRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + __noinline static ReductionTy build(Heuristic& heuristic, + PrimRef* prims, + const Set& set, + CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + const Settings& settings) + { + typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + + typedef BuilderT< + BuildRecord, + Heuristic, + Set, + PrimRef, + ReductionTy, + decltype(createAlloc()), + CreateAllocFunc, + CreateNodeFunc, + UpdateNodeFunc, + CreateLeafFunc, + CanCreateLeafFunc, + CanCreateLeafSplitFunc, + ProgressMonitor> Builder; + + /* instantiate builder */ + Builder builder(prims, + heuristic, + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + + /* build hierarchy */ + BuildRecord record(1,set); + const ReductionTy root = builder.recurse(record,nullptr,true); + _mm_mfence(); // to allow non-temporal stores during build + return root; + } + }; + + /* SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedSAH + { + typedef PrimInfoRange Set; + typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic; + typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename CanCreateLeafFunc, + typename CanCreateLeafSplitFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + const CanCreateLeafFunc& canCreateLeaf, + const CanCreateLeafSplitFunc& canCreateLeafSplit, + const ProgressMonitor& progressMonitor, + PrimRef* prims, const PrimInfo& pinfo, + const Settings& settings) + { + Heuristic heuristic(prims); + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoRange(0,pinfo.size(),pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + canCreateLeaf, + canCreateLeafSplit, + progressMonitor, + settings); + } + }; + + /* Spatial SAH builder that operates on an double-buffered array of BuildRecords */ + struct BVHBuilderBinnedFastSpatialSAH + { + typedef PrimInfoExtRange Set; + typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split; + typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + static const unsigned int GEOMID_MASK = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + template<typename ReductionTy, typename UserCreateLeaf> + struct CreateLeafExt + { + __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf) + : userCreateLeaf(userCreateLeaf) {} + + // __noinline is workaround for ICC2016 compiler bug + template<typename Allocator> + __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const + { + for (size_t i=range.begin(); i<range.end(); i++) + prims[i].lower.u &= GEOMID_MASK; + + return userCreateLeaf(prims,range,alloc); + } + + const UserCreateLeaf userCreateLeaf; + }; + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename SplitPrimitiveFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + SplitPrimitiveFunc splitPrimitive, + ProgressMonitor progressMonitor, + PrimRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic; + Heuristic heuristic(splitPrimitive,prims,pinfo); + + /* calculate total surface area */ // FIXME: this sum is not deterministic + const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double { + + double A = 0.0f; + for (size_t i=r.begin(); i<r.end(); i++) + { + PrimRef& prim = prims[i]; + A += area(prim.bounds()); + } + return A; + },std::plus<double>()); + + + /* calculate maximum number of spatial splits per primitive */ + const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1; + const float f = 10.0f; + + const float invA = 1.0f / A; + parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) { + + for (size_t i=r.begin(); i<r.end(); i++) + { + PrimRef& prim = prims[i]; + assert((prim.geomID() & SPLITS_MASK) == 0); + // FIXME: is there a better general heuristic ? + const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA); + unsigned int n = 4+min((int)maxSplits-4, max(1, (int)(nf))); + prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + } + }); + + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf), + progressMonitor, + settings); + } + }; + + /* Open/Merge SAH builder that operates on an array of BuildRecords */ + struct BVHBuilderBinnedOpenMergeSAH + { + static const size_t NUM_OBJECT_BINS_HQ = 32; + typedef PrimInfoExtRange Set; + typedef BinSplit<NUM_OBJECT_BINS_HQ> Split; + typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord; + typedef GeneralBVHBuilder::Settings Settings; + + /*! special builder that propagates reduction over the tree */ + template< + typename ReductionTy, + typename BuildRef, + typename CreateAllocFunc, + typename CreateNodeFunc, + typename UpdateNodeFunc, + typename CreateLeafFunc, + typename NodeOpenerFunc, + typename ProgressMonitor> + + static ReductionTy build(CreateAllocFunc createAlloc, + CreateNodeFunc createNode, + UpdateNodeFunc updateNode, + const CreateLeafFunc& createLeaf, + NodeOpenerFunc nodeOpenerFunc, + ProgressMonitor progressMonitor, + BuildRef* prims, + const size_t extSize, + const PrimInfo& pinfo, + const Settings& settings) + { + typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic; + Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor); + + return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>( + heuristic, + prims, + PrimInfoExtRange(0,pinfo.size(),extSize,pinfo), + createAlloc, + createNode, + updateNode, + createLeaf, + progressMonitor, + settings); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h new file mode 100644 index 0000000000..a4d3b68e46 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h @@ -0,0 +1,972 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! mapping into bins */ + template<size_t BINS> + struct BinMapping + { + public: + __forceinline BinMapping() {} + + /*! calculates the mapping */ + __forceinline BinMapping(size_t N, const BBox3fa& centBounds) + { + num = min(BINS,size_t(4.0f + 0.05f*N)); + assert(num >= 1); + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! calculates the mapping */ + __forceinline BinMapping(const BBox3fa& centBounds) + { + num = BINS; + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps, (vfloat4) centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) centBounds.lower; + } + + /*! calculates the mapping */ + template<typename PrimInfo> + __forceinline BinMapping(const PrimInfo& pinfo) + { + const vfloat4 eps = 1E-34f; + num = min(BINS,size_t(4.0f + 0.05f*pinfo.size())); + const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) pinfo.centBounds.lower; + } + + /*! returns number of bins */ + __forceinline size_t size() const { return num; } + + /*! slower but safe binning */ + __forceinline Vec3ia bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); +#if 1 + assert(i[0] >= 0 && (size_t)i[0] < num); + assert(i[1] >= 0 && (size_t)i[1] < num); + assert(i[2] >= 0 && (size_t)i[2] < num); + return Vec3ia(i); +#else + return Vec3ia(clamp(i,vint4(0),vint4(num-1))); +#endif + } + + /*! faster but unsafe binning */ + __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const { + return Vec3ia(floori((vfloat4(p)-ofs)*scale)); + } + + /*! faster but unsafe binning */ + template<typename PrimRef> + __forceinline Vec3ia bin_unsafe(const PrimRef& p) const { + return bin_unsafe(p.binCenter()); + } + + /*! faster but unsafe binning */ + template<typename PrimRef, typename BinBoundsAndCenter> + __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const { + return bin_unsafe(binBoundsAndCenter.binCenter(p)); + } + + template<typename PrimRef> + __forceinline bool bin_unsafe(const PrimRef& ref, + const vint4& vSplitPos, + const vbool4& splitDimMask) const // FIXME: rename to isLeft + { + return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask); + } + /*! calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),1.0f / scale[dim],ofs[dim]); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) { + return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}"; + } + + public: + size_t num; + vfloat4 ofs,scale; //!< linear function that maps to bin ID + }; + + /*! stores all information to perform some split */ + template<size_t BINS> + struct BinSplit + { + enum + { + SPLIT_OBJECT = 0, + SPLIT_FALLBACK = 1, + SPLIT_ENFORCE = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already + SPLIT_TEMPORAL = 2, + SPLIT_GEOMID = 3, + }; + + /*! construct an invalid split by default */ + __forceinline BinSplit() + : sah(inf), dim(-1), pos(0), data(0) {} + + __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0) + : sah(sah), dim(dim), fpos(fpos), data(data) {} + + /*! constructs specified split */ + __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) { + return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + union { int pos; float fpos; }; //!< bin index for splitting + unsigned int data; //!< extra optional split data + BinMapping<BINS> mapping; //!< mapping into bins + }; + + /*! stores extended information about the split */ + template<typename BBox> + struct SplitInfoT + { + + __forceinline SplitInfoT () {} + + __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds) + : leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {} + + public: + size_t leftCount,rightCount; + BBox leftBounds,rightBounds; + }; + + typedef SplitInfoT<BBox3fa> SplitInfo; + typedef SplitInfoT<LBBox3fa> SplitInfo2; + + /*! stores all binning information */ + template<size_t BINS, typename PrimRef, typename BBox> + struct __aligned(64) BinInfoT + { + typedef BinSplit<BINS> Split; + typedef vbool4 vbool; + typedef vint4 vint; + typedef vfloat4 vfloat; + + __forceinline BinInfoT() { + } + + __forceinline BinInfoT(EmptyTy) { + clear(); + } + + /*! bin access function */ + __forceinline BBox &bounds(const size_t binID, const size_t dimID) { return _bounds[binID][dimID]; } + __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; } + + __forceinline unsigned int &counts(const size_t binID, const size_t dimID) { return _counts[binID][dimID]; } + __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; } + + __forceinline vuint4 &counts(const size_t binID) { return _counts[binID]; } + __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; } + + /*! clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i<BINS; i++) { + bounds(i,0) = bounds(i,1) = bounds(i,2) = empty; + counts(i) = vuint4(zero); + } + } + + /*! bins an array of primitives */ + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping) + { + if (unlikely(N == 0)) return; + size_t i; + for (i=0; i<N-1; i+=2) + { + /*! map even and odd primitive to bin */ + BBox prim0; Vec3fa center0; + prims[i+0].binBoundsAndCenter(prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + BBox prim1; Vec3fa center1; + prims[i+1].binBoundsAndCenter(prim1,center1); + const vint4 bin1 = (vint4)mapping.bin(center1); + + /*! increase bounds for bins for even primitive */ + const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0); + const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); + const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); + const unsigned int s0 = (unsigned int)prims[i+0].size(); + counts(b00,0)+=s0; + counts(b01,1)+=s0; + counts(b02,2)+=s0; + + /*! increase bounds of bins for odd primitive */ + const unsigned int b10 = extract<0>(bin1); bounds(b10,0).extend(prim1); + const unsigned int b11 = extract<1>(bin1); bounds(b11,1).extend(prim1); + const unsigned int b12 = extract<2>(bin1); bounds(b12,2).extend(prim1); + const unsigned int s1 = (unsigned int)prims[i+1].size(); + counts(b10,0)+=s1; + counts(b11,1)+=s1; + counts(b12,2)+=s1; + } + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; + prims[i].binBoundsAndCenter(prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = (unsigned int)prims[i].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + /*! bins an array of primitives */ + template<typename BinBoundsAndCenter> + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (N == 0) return; + + size_t i; + for (i=0; i<N-1; i+=2) + { + /*! map even and odd primitive to bin */ + BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1); + const vint4 bin1 = (vint4)mapping.bin(center1); + + /*! increase bounds for bins for even primitive */ + const unsigned int s0 = prims[i+0].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + + /*! increase bounds of bins for odd primitive */ + const unsigned int s1 = prims[i+1].size(); + const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1); + const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1); + const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1); + } + + /*! for uneven number of primitives */ + if (i < N) + { + /*! map primitive to bin */ + BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); + const vint4 bin0 = (vint4)mapping.bin(center0); + + /*! increase bounds of bins */ + const unsigned int s0 = prims[i+0].size(); + const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0); + const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0); + const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0); + } + } + + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) { + bin(prims+begin,end-begin,mapping); + } + + template<typename BinBoundsAndCenter> + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) { + bin<BinBoundsAndCenter>(prims+begin,end-begin,mapping,binBoundsAndCenter); + } + + /*! merges in other binning information */ + __forceinline void merge (const BinInfoT& other, size_t numBins) + { + + for (size_t i=0; i<numBins; i++) + { + counts(i) += other.counts(i); + bounds(i,0).extend(other.bounds(i,0)); + bounds(i,1).extend(other.bounds(i,1)); + bounds(i,2).extend(other.bounds(i,2)); + } + } + + /*! reduces binning information */ + static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS) + { + BinInfoT c; + for (size_t i=0; i<numBins; i++) + { + c.counts(i) = a.counts(i)+b.counts(i); + c.bounds(i,0) = embree::merge(a.bounds(i,0),b.bounds(i,0)); + c.bounds(i,1) = embree::merge(a.bounds(i,1),b.bounds(i,1)); + c.bounds(i,2) = embree::merge(a.bounds(i,2),b.bounds(i,2)); + } + return c; + } + + /*! finds the best split by scanning binning information */ + __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty; + for (size_t i=mapping.size()-1; i>0; i--) + { + count += counts(i); + rCounts[i] = count; + bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx); + by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by); + bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz); + rAreas[i][3] = 0.0f; + } + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i<mapping.size(); i++, ii+=1) + { + count += counts(i-1); + bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx); + by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by); + bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz); + const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az); + const vfloat4 rArea = rAreas[i]; + const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions. + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + + vbestPos = select(sah < vbestSAH,ii ,vbestPos); + vbestSAH = select(sah < vbestSAH,sah,vbestSAH); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; + bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + } + } + return Split(bestSAH,bestDim,bestPos,mapping); + } + + /*! calculates extended split information */ + __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const + { + if (split.dim == -1) { + new (&info) SplitInfoT<BBox>(0,empty,0,empty); + return; + } + + size_t leftCount = 0; + BBox leftBounds = empty; + for (size_t i=0; i<(size_t)split.pos; i++) { + leftCount += counts(i,split.dim); + leftBounds.extend(bounds(i,split.dim)); + } + size_t rightCount = 0; + BBox rightBounds = empty; + for (size_t i=split.pos; i<mapping.size(); i++) { + rightCount += counts(i,split.dim); + rightBounds.extend(bounds(i,split.dim)); + } + new (&info) SplitInfoT<BBox>(leftCount,leftBounds,rightCount,rightBounds); + } + + /*! gets the number of primitives left of the split */ + __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t leftCount = 0; + for (size_t i = 0; i < (size_t)split.pos; i++) { + leftCount += counts(i, split.dim); + } + return leftCount; + } + + /*! gets the number of primitives right of the split */ + __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t rightCount = 0; + for (size_t i = (size_t)split.pos; i<mapping.size(); i++) { + rightCount += counts(i, split.dim); + } + return rightCount; + } + + private: + BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension + vuint4 _counts[BINS]; //!< counts number of primitives that map into the bins + }; + +#if defined(__AVX512ER__) // KNL + + /*! mapping into bins */ + template<> + struct BinMapping<16> + { + public: + __forceinline BinMapping() {} + + /*! calculates the mapping */ + template<typename PrimInfo> + __forceinline BinMapping(const PrimInfo& pinfo) + { + num = 16; + const vfloat4 eps = 1E-34f; + const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size()); + scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); + ofs = (vfloat4) pinfo.centBounds.lower; + scale16 = scale; + ofs16 = ofs; + } + + /*! returns number of bins */ + __forceinline size_t size() const { return num; } + + __forceinline vint16 bin16(const Vec3fa& p) const { + return vint16(vint4(floori((vfloat4(p)-ofs)*scale))); + } + + __forceinline vint16 bin16(const vfloat16& p) const { + return floori((p-ofs16)*scale16); + } + + __forceinline int bin_unsafe(const PrimRef& ref, + const vint16& vSplitPos, + const vbool16& splitDimMask) const // FIXME: rename to isLeft + { + const vfloat16 lower(*(vfloat4*)&ref.lower); + const vfloat16 upper(*(vfloat4*)&ref.upper); + const vfloat16 p = lower + upper; + const vint16 i = floori((p-ofs16)*scale16); + return lt(splitDimMask,i,vSplitPos); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + public: + size_t num; + vfloat4 ofs,scale; //!< linear function that maps to bin ID + vfloat16 ofs16,scale16; //!< linear function that maps to bin ID + }; + + /* 16 bins in-register binner */ + template<typename PrimRef> + struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa> + { + typedef BinSplit<16> Split; + typedef vbool16 vbool; + typedef vint16 vint; + typedef vfloat16 vfloat; + + __forceinline BinInfoT() { + } + + __forceinline BinInfoT(EmptyTy) { + clear(); + } + + /*! clears the bin info */ + __forceinline void clear() + { + lower[0] = lower[1] = lower[2] = pos_inf; + upper[0] = upper[1] = upper[2] = neg_inf; + count[0] = count[1] = count[2] = 0; + } + + + static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x, + const vfloat16 min_y, + const vfloat16 min_z, + const vfloat16 max_x, + const vfloat16 max_y, + const vfloat16 max_z) + { + const vfloat16 r_min_x = reverse_prefix_min(min_x); + const vfloat16 r_min_y = reverse_prefix_min(min_y); + const vfloat16 r_min_z = reverse_prefix_min(min_z); + const vfloat16 r_max_x = reverse_prefix_max(max_x); + const vfloat16 r_max_y = reverse_prefix_max(max_y); + const vfloat16 r_max_z = reverse_prefix_max(max_z); + const vfloat16 dx = r_max_x - r_min_x; + const vfloat16 dy = r_max_y - r_min_y; + const vfloat16 dz = r_max_z - r_min_z; + const vfloat16 area_rl = madd(dx,dy,madd(dx,dz,dy*dz)); + return area_rl; + } + + static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x, + const vfloat16 min_y, + const vfloat16 min_z, + const vfloat16 max_x, + const vfloat16 max_y, + const vfloat16 max_z) + { + const vfloat16 r_min_x = prefix_min(min_x); + const vfloat16 r_min_y = prefix_min(min_y); + const vfloat16 r_min_z = prefix_min(min_z); + const vfloat16 r_max_x = prefix_max(max_x); + const vfloat16 r_max_y = prefix_max(max_y); + const vfloat16 r_max_z = prefix_max(max_z); + const vfloat16 dx = r_max_x - r_min_x; + const vfloat16 dy = r_max_y - r_min_y; + const vfloat16 dz = r_max_z - r_min_z; + const vfloat16 area_lr = madd(dx,dy,madd(dx,dz,dy*dz)); + return area_lr; + } + + + /*! bins an array of primitives */ + __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping) + { + if (unlikely(N == 0)) return; + + const vfloat16 init_min(pos_inf); + const vfloat16 init_max(neg_inf); + + vfloat16 min_x0,min_x1,min_x2; + vfloat16 min_y0,min_y1,min_y2; + vfloat16 min_z0,min_z1,min_z2; + vfloat16 max_x0,max_x1,max_x2; + vfloat16 max_y0,max_y1,max_y2; + vfloat16 max_z0,max_z1,max_z2; + vuint16 count0,count1,count2; + + min_x0 = init_min; + min_x1 = init_min; + min_x2 = init_min; + min_y0 = init_min; + min_y1 = init_min; + min_y2 = init_min; + min_z0 = init_min; + min_z1 = init_min; + min_z2 = init_min; + + max_x0 = init_max; + max_x1 = init_max; + max_x2 = init_max; + max_y0 = init_max; + max_y1 = init_max; + max_y2 = init_max; + max_z0 = init_max; + max_z1 = init_max; + max_z2 = init_max; + + count0 = zero; + count1 = zero; + count2 = zero; + + const vint16 step16(step); + size_t i; + for (i=0; i<N-1; i+=2) + { + /*! map even and odd primitive to bin */ + const BBox3fa primA = prims[i+0].bounds(); + const vfloat16 centerA = vfloat16((vfloat4)primA.lower) + vfloat16((vfloat4)primA.upper); + const vint16 binA = mapping.bin16(centerA); + + const BBox3fa primB = prims[i+1].bounds(); + const vfloat16 centerB = vfloat16((vfloat4)primB.lower) + vfloat16((vfloat4)primB.upper); + const vint16 binB = mapping.bin16(centerB); + + /* A */ + { + const vfloat16 b_min_x = prims[i+0].lower.x; + const vfloat16 b_min_y = prims[i+0].lower.y; + const vfloat16 b_min_z = prims[i+0].lower.z; + const vfloat16 b_max_x = prims[i+0].upper.x; + const vfloat16 b_max_y = prims[i+0].upper.y; + const vfloat16 b_max_z = prims[i+0].upper.z; + + const vint16 bin0 = shuffle<0>(binA); + const vint16 bin1 = shuffle<1>(binA); + const vint16 bin2 = shuffle<2>(binA); + + const vbool16 m_update_x = step16 == bin0; + const vbool16 m_update_y = step16 == bin1; + const vbool16 m_update_z = step16 == bin2; + + assert(popcnt((size_t)m_update_x) == 1); + assert(popcnt((size_t)m_update_y) == 1); + assert(popcnt((size_t)m_update_z) == 1); + + min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); + min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); + min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); + // ------------------------------------------------------------------------ + max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); + max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); + max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); + // ------------------------------------------------------------------------ + min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); + min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); + min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); + // ------------------------------------------------------------------------ + max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); + max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); + max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); + // ------------------------------------------------------------------------ + min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); + min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); + min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); + // ------------------------------------------------------------------------ + max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); + max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); + max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); + // ------------------------------------------------------------------------ + count0 = mask_add(m_update_x,count0,count0,vuint16(1)); + count1 = mask_add(m_update_y,count1,count1,vuint16(1)); + count2 = mask_add(m_update_z,count2,count2,vuint16(1)); + } + + + /* B */ + { + const vfloat16 b_min_x = prims[i+1].lower.x; + const vfloat16 b_min_y = prims[i+1].lower.y; + const vfloat16 b_min_z = prims[i+1].lower.z; + const vfloat16 b_max_x = prims[i+1].upper.x; + const vfloat16 b_max_y = prims[i+1].upper.y; + const vfloat16 b_max_z = prims[i+1].upper.z; + + const vint16 bin0 = shuffle<0>(binB); + const vint16 bin1 = shuffle<1>(binB); + const vint16 bin2 = shuffle<2>(binB); + + const vbool16 m_update_x = step16 == bin0; + const vbool16 m_update_y = step16 == bin1; + const vbool16 m_update_z = step16 == bin2; + + assert(popcnt((size_t)m_update_x) == 1); + assert(popcnt((size_t)m_update_y) == 1); + assert(popcnt((size_t)m_update_z) == 1); + + min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); + min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); + min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); + // ------------------------------------------------------------------------ + max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); + max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); + max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); + // ------------------------------------------------------------------------ + min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); + min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); + min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); + // ------------------------------------------------------------------------ + max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); + max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); + max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); + // ------------------------------------------------------------------------ + min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); + min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); + min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); + // ------------------------------------------------------------------------ + max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); + max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); + max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); + // ------------------------------------------------------------------------ + count0 = mask_add(m_update_x,count0,count0,vuint16(1)); + count1 = mask_add(m_update_y,count1,count1,vuint16(1)); + count2 = mask_add(m_update_z,count2,count2,vuint16(1)); + } + + } + + if (i < N) + { + const BBox3fa prim0 = prims[i].bounds(); + const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper); + const vint16 bin = mapping.bin16(center0); + + const vfloat16 b_min_x = prims[i].lower.x; + const vfloat16 b_min_y = prims[i].lower.y; + const vfloat16 b_min_z = prims[i].lower.z; + const vfloat16 b_max_x = prims[i].upper.x; + const vfloat16 b_max_y = prims[i].upper.y; + const vfloat16 b_max_z = prims[i].upper.z; + + const vint16 bin0 = shuffle<0>(bin); + const vint16 bin1 = shuffle<1>(bin); + const vint16 bin2 = shuffle<2>(bin); + + const vbool16 m_update_x = step16 == bin0; + const vbool16 m_update_y = step16 == bin1; + const vbool16 m_update_z = step16 == bin2; + + assert(popcnt((size_t)m_update_x) == 1); + assert(popcnt((size_t)m_update_y) == 1); + assert(popcnt((size_t)m_update_z) == 1); + + min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x); + min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y); + min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z); + // ------------------------------------------------------------------------ + max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x); + max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y); + max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z); + // ------------------------------------------------------------------------ + min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x); + min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y); + min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z); + // ------------------------------------------------------------------------ + max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x); + max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y); + max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z); + // ------------------------------------------------------------------------ + min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x); + min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y); + min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z); + // ------------------------------------------------------------------------ + max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x); + max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y); + max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z); + // ------------------------------------------------------------------------ + count0 = mask_add(m_update_x,count0,count0,vuint16(1)); + count1 = mask_add(m_update_y,count1,count1,vuint16(1)); + count2 = mask_add(m_update_z,count2,count2,vuint16(1)); + } + + lower[0] = Vec3vf16( min_x0, min_y0, min_z0 ); + lower[1] = Vec3vf16( min_x1, min_y1, min_z1 ); + lower[2] = Vec3vf16( min_x2, min_y2, min_z2 ); + + upper[0] = Vec3vf16( max_x0, max_y0, max_z0 ); + upper[1] = Vec3vf16( max_x1, max_y1, max_z1 ); + upper[2] = Vec3vf16( max_x2, max_y2, max_z2 ); + + count[0] = count0; + count[1] = count1; + count[2] = count2; + } + + __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) { + bin(prims+begin,end-begin,mapping); + } + + /*! merges in other binning information */ + __forceinline void merge (const BinInfoT& other, size_t numBins) + { + for (size_t i=0; i<3; i++) + { + lower[i] = min(lower[i],other.lower[i]); + upper[i] = max(upper[i],other.upper[i]); + count[i] += other.count[i]; + } + } + + /*! reducesr binning information */ + static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b) + { + BinInfoT c; + for (size_t i=0; i<3; i++) + { + c.counts[i] = a.counts[i] + b.counts[i]; + c.lower[i] = min(a.lower[i],b.lower[i]); + c.upper[i] = max(a.upper[i],b.upper[i]); + } + return c; + } + + /*! finds the best split by scanning binning information */ + __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const + { + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + const vuint16 blocks_add = (1 << blocks_shift)-1; + const vfloat16 inf(pos_inf); + for (size_t dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z); + const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z); + const vuint16 lCount16 = prefix_sum(count[dim]); + const vuint16 rCount16 = reverse_prefix_sum(count[dim]); + + /* compute best split in this dimension */ + const vfloat16 leftArea = lArea16; + const vfloat16 rightArea = align_shift_right<1>(zero,rArea16); + const vuint16 lC = lCount16; + const vuint16 rC = align_shift_right<1>(zero,rCount16); + const vuint16 leftCount = ( lC + blocks_add) >> blocks_shift; + const vuint16 rightCount = ( rC + blocks_add) >> blocks_shift; + const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries + const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf)); + /* test if this is a better dimension */ + if (any(sah < vfloat16(bestSAH))) + { + const size_t index = select_min(sah); + assert(index < 15); + assert(sah[index] < bestSAH); + bestDim = dim; + bestPos = index+1; + bestSAH = sah[index]; + } + } + + return Split(bestSAH,bestDim,bestPos,mapping); + + } + + /*! calculates extended split information */ + __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const + { + if (split.dim == -1) { + new (&info) SplitInfo(0,empty,0,empty); + return; + } + // FIXME: horizontal reduction! + + size_t leftCount = 0; + BBox3fa leftBounds = empty; + for (size_t i=0; i<(size_t)split.pos; i++) { + leftCount += count[split.dim][i]; + Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]); + Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]); + leftBounds.extend(BBox3fa(bounds_lower,bounds_upper)); + } + size_t rightCount = 0; + BBox3fa rightBounds = empty; + for (size_t i=split.pos; i<mapping.size(); i++) { + rightCount += count[split.dim][i]; + Vec3fa bounds_lower(lower[split.dim].x[i],lower[split.dim].y[i],lower[split.dim].z[i]); + Vec3fa bounds_upper(upper[split.dim].x[i],upper[split.dim].y[i],upper[split.dim].z[i]); + rightBounds.extend(BBox3fa(bounds_lower,bounds_upper)); + } + new (&info) SplitInfo(leftCount,leftBounds,rightCount,rightBounds); + } + + /*! gets the number of primitives left of the split */ + __forceinline size_t getLeftCount(const BinMapping<16>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t leftCount = 0; + for (size_t i = 0; i < (size_t)split.pos; i++) { + leftCount += count[split.dim][i]; + } + return leftCount; + } + + /*! gets the number of primitives right of the split */ + __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const + { + if (unlikely(split.dim == -1)) return -1; + + size_t rightCount = 0; + for (size_t i = (size_t)split.pos; i<mapping.size(); i++) { + rightCount += count[split.dim][i]; + } + return rightCount; + } + + private: + Vec3vf16 lower[3]; + Vec3vf16 upper[3]; + vuint16 count[3]; + }; +#endif + } + + template<typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (likely(end-begin < parallelThreshold)) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } + + template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef> + __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter) + { + if (!parallel) { + binner.bin(prims,begin,end,mapping,binBoundsAndCenter); + } else { + binner = parallel_reduce(begin,end,blockSize,binner, + [&](const range<size_t>& r) -> BinInfoT { BinInfoT binner(empty); binner.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return binner; }, + [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT r = b0; r.merge(b1, mapping.size()); return r; }); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h new file mode 100644 index 0000000000..a4c272f015 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h @@ -0,0 +1,205 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t> + { + __forceinline PrimInfoRange () { + } + + __forceinline PrimInfoRange(const PrimInfo& pinfo) + : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {} + + __forceinline PrimInfoRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {} + + __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + } + }; + + /*! Performs standard object binning */ + template<typename PrimRef, size_t BINS> + struct HeuristicArrayBinningSAH + { + typedef BinSplit<BINS> Split; + typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner; + typedef range<size_t> Set; + +#if defined(__AVX512ER__) // KNL + static const size_t PARALLEL_THRESHOLD = 4*768; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 768; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 768; +#else + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; +#endif + __forceinline HeuristicArrayBinningSAH () + : prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayBinningSAH (PrimRef* prims) + : prims(prims) {} + + /*! finds the best split */ + __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + return find_template<false>(pinfo,logBlockSize); + else + return find_template<true>(pinfo,logBlockSize); + } + + template<bool parallel> + __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<BINS> mapping(pinfo); + bin_serial_or_parallel<parallel>(binner,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,mapping); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + if (likely(pinfo.size() < PARALLEL_THRESHOLD)) + split_template<false>(split,pinfo,linfo,rinfo); + else + split_template<true>(split,pinfo,linfo,rinfo); + } + + template<bool parallel> + __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename Binner::vint vSplitPos(splitPos); + const typename Binner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + size_t center = 0; + if (!parallel) + center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning( + prims,begin,end,EmptyTy(),local_left,local_right,isLeft, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const PrimInfoRange& pinfo) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[pinfo.begin()],&prims[pinfo.end()]); + } + + void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + const size_t begin = pinfo.begin(); + const size_t end = pinfo.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend_center2(prims[i]); + new (&linfo) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend_center2(prims[i]); + new (&rinfo) PrimInfoRange(center,end,right); + } + + void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo) + { + assert(range.size() > 1); + CentGeomBBox3fa left(empty); + CentGeomBBox3fa right(empty); + unsigned int geomID = prims[range.begin()].geomID(); + size_t center = serial_partitioning(prims,range.begin(),range.end(),left,right, + [&] ( const PrimRef& prim ) { return prim.geomID() == geomID; }, + [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); }); + + new (&linfo) PrimInfoRange(range.begin(),center,left); + new (&rinfo) PrimInfoRange(center,range.end(),right); + } + + private: + PrimRef* const prims; + }; + + /*! Performs standard object binning */ + template<typename PrimRefMB, size_t BINS> + struct HeuristicArrayBinningMB + { + typedef BinSplit<BINS> Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner; + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + ObjectBinner binner(empty); + const BinMapping<BINS> mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h new file mode 100644 index 0000000000..1370244586 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h @@ -0,0 +1,302 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename PrimRef, size_t BINS> + struct UnalignedHeuristicArrayBinningSAH + { + typedef BinSplit<BINS> Split; + typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner; + typedef range<size_t> Set; + + __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! remember prim array */ + __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + const LinearSpace3fa computeAlignedSpace(const range<size_t>& set) + { + Vec3fa axis(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const unsigned int geomID = prims[i].geomID(); + const unsigned int primID = prims[i].primID(); + const uint64_t geomprimID = prims[i].ID64(); + if (geomprimID >= bestGeomPrimID) continue; + const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID); + if (sqr_length(axis1) > 1E-18f) { + axis = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + return frame(axis).transposed(); + } + + const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space) + { + auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa + { + CentGeomBBox3fa bounds(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + Geometry* mesh = scene->get(prims[i].geomID()); + bounds.extend(mesh->vbounds(space,prims[i].primID())); + } + return bounds; + }; + + const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), + CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2); + + return PrimInfo(set.begin(),set.end(),bounds); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space) + : scene(scene), space(space) {} + + /*! returns center for binning */ + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + return embree::center2(bounds); + } + + /*! returns bounds and centroid used for binning */ + __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const + { + Geometry* mesh = (Geometry*) scene->get(ref.geomID()); + BBox3fa bounds = mesh->vbounds(space,ref.primID()); + bounds_o = bounds; + center_o = embree::center2(bounds); + } + + private: + Scene* scene; + const LinearSpace3fa space; + }; + + /*! finds the best split */ + __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space) + { + if (likely(pinfo.size() < 10000)) + return find_template<false>(pinfo,logBlockSize,space); + else + return find_template<true>(pinfo,logBlockSize,space); + } + + /*! finds the best split */ + template<bool parallel> + const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + Binner binner(empty); + const BinMapping<BINS> mapping(set); + BinBoundsAndCenter binBoundsAndCenter(scene,space); + bin_serial_or_parallel<parallel>(binner,prims,set.begin(),set.end(),size_t(4096),mapping,binBoundsAndCenter); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (likely(set.size() < 10000)) + split_template<false>(split,space,set,lset,rset); + else + split_template<true>(split,space,set,lset,rset); + } + + /*! array partitioning */ + template<bool parallel> + __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + const int splitPos = split.pos; + const int splitDim = split.dim; + BinBoundsAndCenter binBoundsAndCenter(scene,space); + + size_t center = 0; + if (likely(set.size() < 10000)) + center = serial_partitioning(prims,begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }); + else + center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; }, + [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); }, + [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); }, + 128); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const range<size_t>& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend_center2(prims[i]); + new (&lset) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend_center2(prims[i]); + new (&rset) PrimInfoRange(center,end,right); + } + + private: + Scene* const scene; + PrimRef* const prims; + }; + + /*! Performs standard object binning */ + template<typename PrimRefMB, size_t BINS> + struct UnalignedHeuristicArrayBinningMB + { + typedef BinSplit<BINS> Split; + typedef typename PrimRefMB::BBox BBox; + typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + UnalignedHeuristicArrayBinningMB(Scene* scene) + : scene(scene) {} + + const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /*! find curve with minimum ID that defines valid direction */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const PrimRefMB& prim = (*set.prims)[i]; + const unsigned int geomID = prim.geomID(); + const unsigned int primID = prim.primID(); + const uint64_t geomprimID = prim.ID64(); + if (geomprimID >= bestGeomPrimID) continue; + + const Geometry* mesh = scene->get(geomID); + const range<int> tbounds = mesh->timeSegmentRange(set.time_range); + if (tbounds.size() == 0) continue; + + const size_t t = (tbounds.begin()+tbounds.end())/2; + const Vec3fa axis1 = mesh->computeDirection(primID,t); + if (sqr_length(axis1) > 1E-18f) { + axis0 = normalize(axis1); + bestGeomPrimID = geomprimID; + } + } + + return frame(axis0).transposed(); + } + + struct BinBoundsAndCenter + { + __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space) + : scene(scene), time_range(time_range), space(space) {} + + /*! returns center for binning */ + template<typename PrimRef> + __forceinline Vec3fa binCenter(const PrimRef& ref) const + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + return center2(lbounds.interpolate(0.5f)); + } + + /*! returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds.interpolate(0.5f); + center_o = center2(bounds_o); + } + + /*! returns bounds and centroid used for binning */ + __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX + { + Geometry* mesh = scene->get(ref.geomID()); + LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range); + bounds_o = lbounds; + center_o = center2(lbounds.interpolate(0.5f)); + } + + private: + Scene* scene; + BBox1f time_range; + const LinearSpace3fa space; + }; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + ObjectBinner binner(empty); + const BinMapping<BINS> mapping(set.size(),set.centBounds); + bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter); + Split osplit = binner.best(mapping,logBlockSize); + osplit.sah *= set.time_range.size(); + if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split + return osplit; + } + + /*! array partitioning */ + __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset) + { + BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space); + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfoMB left = empty; + PrimInfoMB right = empty; + const vint4 vSplitPos(split.pos); + const vbool4 vSplitMask(1 << split.dim); + auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); }; + auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); }; + auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); }; + size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD); + new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range); + new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range); + } + + private: + Scene* scene; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h new file mode 100644 index 0000000000..21f18c0208 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h @@ -0,0 +1,443 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +// TODO: +// - adjust parallel build thresholds +// - openNodesBasedOnExtend should consider max extended size + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +/* stop opening of all bref.geomIDs are the same */ +#define EQUAL_GEOMID_STOP_CRITERIA 1 + +/* 10% spatial extend threshold */ +#define MAX_EXTEND_THRESHOLD 0.1f + +/* maximum is 8 children */ +#define MAX_OPENED_CHILD_NODES 8 + +/* open until all build refs are below threshold size in one step */ +#define USE_LOOP_OPENING 0 + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS> + struct HeuristicArrayOpenMergeSAH + { + typedef BinSplit<OBJECT_BINS> Split; + typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner; + + static const size_t PARALLEL_THRESHOLD = 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 512; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 128; + + __forceinline HeuristicArrayOpenMergeSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size) + : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) + { + assert(max_open_size <= MAX_OPENED_CHILD_NODES); + } + + struct OpenHeuristic + { + __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo ) + { + const Vec3fa diag = pinfo.geomBounds.size(); + dim = maxDim(diag); + assert(diag[dim] > 0.0f); + inv_max_extend = 1.0f / diag[dim]; + } + + __forceinline bool operator () ( PrimRef& prim ) const { + return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD; + } + + private: + size_t dim; + float inv_max_extend; + }; + + /*! compute extended ranges */ + __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! move ranges */ + __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+right_size] = prims0[i]; + }); + } + else + { + /* no overlap, move entire right range to new location, can be made fully parallel */ + parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+left_ext_range_size] = prims0[i]; + }); + } + /* update right range */ + assert(rset.ext_end() + left_ext_range_size == set.ext_end()); + rset.move_right(left_ext_range_size); + } + } + + /* estimates the extra space required when opening, and checks if all primitives are from same geometry */ + __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const unsigned int geomID = prims0[set.begin()].geomID(); + + auto body = [&] (const range<size_t>& r) -> std::pair<size_t,bool> { + bool commonGeomID = true; + size_t opens = 0; + for (size_t i=r.begin(); i<r.end(); i++) { + commonGeomID &= prims0[i].geomID() == geomID; + if (heuristic(prims0[i])) + opens += prims0[i].node.getN()-1; // coarse approximation + } + return std::pair<size_t,bool>(opens,commonGeomID); + }; + auto reduction = [&] (const std::pair<size_t,bool>& b0, const std::pair<size_t,bool>& b1) -> std::pair<size_t,bool> { + return std::pair<size_t,bool>(b0.first+b1.first,b0.second && b1.second); + }; + return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair<size_t,bool>(0,true),body,reduction); + } + + // FIXME: should consider maximum available extended size + __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set) + { + const OpenHeuristic heuristic(set); + const size_t ext_range_start = set.end(); + + if (false && set.size() < PARALLEL_THRESHOLD) + { + size_t extra_elements = 0; + for (size_t i=set.begin(); i<set.end(); i++) + { + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + assert(extra_elements + n-1 <= set.ext_range_size()); + for (size_t j=0; j<n; j++) + set.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1; j<n; j++) + prims0[ext_range_start+extra_elements+j-1] = tmp[j]; + extra_elements += n-1; + } + } + set._end += extra_elements; + } + else + { + std::atomic<size_t> ext_elements; + ext_elements.store(0); + PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo { + PrimInfo info(empty); + for (size_t i=r.begin(); i<r.end(); i++) + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + const size_t ID = ext_elements.fetch_add(n-1); + assert(ID + n-1 <= set.ext_range_size()); + + for (size_t j=0; j<n; j++) + info.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1; j<n; j++) + prims0[ext_range_start+ID+j-1] = tmp[j]; + } + return info; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); + set.centBounds.extend(info.centBounds); + assert(ext_elements.load() <= set.ext_range_size()); + set._end += ext_elements.load(); + } + } + + __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements) + { + const OpenHeuristic heuristic(set); + size_t next_iteration_extra_elements = est_new_elements; + + while (next_iteration_extra_elements <= set.ext_range_size()) + { + next_iteration_extra_elements = 0; + size_t extra_elements = 0; + const size_t ext_range_start = set.end(); + + for (size_t i=set.begin(); i<set.end(); i++) + { + if (heuristic(prims0[i])) + { + PrimRef tmp[MAX_OPENED_CHILD_NODES]; + const size_t n = nodeOpenerFunc(prims0[i],tmp); + assert(extra_elements + n-1 <= set.ext_range_size()); + for (size_t j=0;j<n;j++) + set.extend_center2(tmp[j]); + + prims0[i] = tmp[0]; + for (size_t j=1;j<n;j++) + prims0[ext_range_start+extra_elements+j-1] = tmp[j]; + extra_elements += n-1; + + for (size_t j=0; j<n; j++) + if (heuristic(tmp[j])) + next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation + + } + } + assert( extra_elements <= set.ext_range_size()); + set._end += extra_elements; + + for (size_t i=set.begin();i<set.end();i++) + assert(prims0[i].numPrimitives() > 0); + + if (unlikely(next_iteration_extra_elements == 0)) break; + } + } + + __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize) + { + /* single element */ + if (set.size() <= 1) + return Split(); + + /* disable opening if there is no overlap */ + const size_t D = 4; + if (unlikely(set.has_ext_range() && set.size() <= D)) + { + bool disjoint = true; + for (size_t j=set.begin(); j<set.end()-1; j++) { + for (size_t i=set.begin()+1; i<set.end(); i++) { + if (conjoint(prims0[j].bounds(),prims0[i].bounds())) { + disjoint = false; break; + } + } + } + if (disjoint) set.set_ext_range(set.end()); /* disables opening */ + } + + std::pair<size_t,bool> p(0,false); + + /* disable opening when all primitives are from same geometry */ + if (unlikely(set.has_ext_range())) + { + p = getProperties(set); +#if EQUAL_GEOMID_STOP_CRITERIA == 1 + if (p.second) set.set_ext_range(set.end()); /* disable opening */ +#endif + } + + /* open nodes when we have sufficient space available */ + if (unlikely(set.has_ext_range())) + { +#if USE_LOOP_OPENING == 1 + openNodesBasedOnExtendLoop(set,p.first); +#else + if (p.first <= set.ext_range_size()) + openNodesBasedOnExtend(set); +#endif + + /* disable opening when unsufficient space for opening a node available */ + if (set.ext_range_size() < max_open_size-1) + set.set_ext_range(set.end()); /* disable opening */ + } + + /* find best split */ + return object_find(set,logBlockSize); + } + + + /*! finds the best object split */ + __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize); + else return parallel_object_find (set,logBlockSize); + } + + /*! finds the best object split */ + __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set.centBounds); + binner.bin(prims0,set.begin(),set.end(),mapping); + return binner.best(mapping,logBlockSize); + } + + /*! finds the best split */ + __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + Binner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set.centBounds); + const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + auto body = [&] (const range<size_t>& r) -> Binner { + Binner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; + }; + auto reduction = [&] (const Binner& b0, const Binner& b1) -> Binner { + Binner r = b0; r.merge(b1,_mapping.size()); return r; + }; + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner,body,reduction); + return binner.best(mapping,logBlockSize); + } + + /*! array partitioning */ + __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + splitFallback(set,lset,rset); + return; + } + + std::pair<size_t,size_t> ext_weights(0,0); + + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split,set,lset,rset); + else + ext_weights = parallel_object_split(split,set,lset,rset); + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! array partitioning */ + std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }); + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(local_left.size(),local_right.size()); + } + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); }, + [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + + return std::pair<size_t,size_t>(left.size(),right.size()); + } + + void deterministic_order(const extended_range<size_t>& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i<center; i++) + left.add_center2(prims0[i]); + + const size_t lweight = left.end; + + PrimInfo right(empty); + for (size_t i=center; i<end; i++) + right.add_center2(prims0[i]); + + const size_t rweight = right.end; + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + /* if we have an extended range */ + if (set.has_ext_range()) + { + setExtentedRanges(set,lset,rset,lweight,rweight); + moveExtentedRange(set,lset,rset); + } + } + + private: + PrimRef* const prims0; + const NodeOpenerFunc& nodeOpenerFunc; + size_t max_open_size; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h new file mode 100644 index 0000000000..d8ca6cb92c --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h @@ -0,0 +1,414 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "priminfo.h" + +namespace embree +{ + static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5; + + namespace isa + { + + /*! mapping into bins */ + template<size_t BINS> + struct SpatialBinMapping + { + public: + __forceinline SpatialBinMapping() {} + + /*! calculates the mapping */ + __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo) + { + const vfloat4 lower = (vfloat4) pinfo.geomBounds.lower; + const vfloat4 upper = (vfloat4) pinfo.geomBounds.upper; + const vfloat4 eps = 128.0f*vfloat4(ulp)*max(abs(lower),abs(upper)); + const vfloat4 diag = max(eps,(vfloat4) pinfo.geomBounds.size()); + scale = select(upper-lower <= eps,vfloat4(0.0f),vfloat4(BINS)/diag); + ofs = (vfloat4) pinfo.geomBounds.lower; + inv_scale = 1.0f / scale; + } + + /*! slower but safe binning */ + __forceinline vint4 bin(const Vec3fa& p) const + { + const vint4 i = floori((vfloat4(p)-ofs)*scale); + return clamp(i,vint4(0),vint4(BINS-1)); + } + + __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const + { +#if defined(__AVX__) + const vfloat8 ofs8(ofs); + const vfloat8 scale8(scale); + const vint8 lu = floori((vfloat8::loadu(&b)-ofs8)*scale8); + const vint8 c_lu = clamp(lu,vint8(zero),vint8(BINS-1)); + return std::pair<vint4,vint4>(extract4<0>(c_lu),extract4<1>(c_lu)); +#else + const vint4 lower = floori((vfloat4(b.lower)-ofs)*scale); + const vint4 upper = floori((vfloat4(b.upper)-ofs)*scale); + const vint4 c_lower = clamp(lower,vint4(0),vint4(BINS-1)); + const vint4 c_upper = clamp(upper,vint4(0),vint4(BINS-1)); + return std::pair<vint4,vint4>(c_lower,c_upper); +#endif + } + + + /*! calculates left spatial position of bin */ + __forceinline float pos(const size_t bin, const size_t dim) const { + return madd(float(bin),inv_scale[dim],ofs[dim]); + } + + /*! calculates left spatial position of bin */ + template<size_t N> + __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const { + return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim])); + } + + /*! returns true if the mapping is invalid in some dimension */ + __forceinline bool invalid(const size_t dim) const { + return scale[dim] == 0.0f; + } + + public: + vfloat4 ofs,scale,inv_scale; //!< linear function that maps to bin ID + }; + + /*! stores all information required to perform some split */ + template<size_t BINS> + struct SpatialBinSplit + { + /*! construct an invalid split by default */ + __forceinline SpatialBinSplit() + : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), left(-1), right(-1), factor(1.0f), mapping(mapping) {} + + /*! constructs specified split */ + __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping) + : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {} + + /*! tests if this split is valid */ + __forceinline bool valid() const { return dim != -1; } + + /*! calculates surface area heuristic for performing the split */ + __forceinline float splitSAH() const { return sah; } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) { + return cout << "SpatialBinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << ", left = " << split.left << ", right = " << split.right << ", factor = " << split.factor << "}"; + } + + public: + float sah; //!< SAH cost of the split + int dim; //!< split dimension + int pos; //!< split position + int left; //!< number of elements on the left side + int right; //!< number of elements on the right side + float factor; //!< factor splitting the extended range + SpatialBinMapping<BINS> mapping; //!< mapping into bins + }; + + /*! stores all binning information */ + template<size_t BINS, typename PrimRef> + struct __aligned(64) SpatialBinInfo + { + SpatialBinInfo() { + } + + __forceinline SpatialBinInfo(EmptyTy) { + clear(); + } + + /*! clears the bin info */ + __forceinline void clear() + { + for (size_t i=0; i<BINS; i++) { + bounds[i][0] = bounds[i][1] = bounds[i][2] = empty; + numBegin[i] = numEnd[i] = 0; + } + } + + /*! adds binning data */ + __forceinline void add(const size_t dim, + const size_t beginID, + const size_t endID, + const size_t binID, + const BBox3fa &b, + const size_t n = 1) + { + assert(beginID < BINS); + assert(endID < BINS); + assert(binID < BINS); + + numBegin[beginID][dim]+=(unsigned int)n; + numEnd [endID][dim]+=(unsigned int)n; + bounds [binID][dim].extend(b); + } + + /*! extends binning bounds */ + __forceinline void extend(const size_t dim, + const size_t binID, + const BBox3fa &b) + { + assert(binID < BINS); + bounds [binID][dim].extend(b); + } + + /*! bins an array of triangles */ + template<typename SplitPrimitive> + __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=0; i<N; i++) + { + const PrimRef prim = prims[i]; + unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (unlikely(splits == 1)) + { + const vint4 bin = mapping.bin(center(prim.bounds())); + for (size_t dim=0; dim<3; dim++) + { + assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS); + numBegin[bin[dim]][dim]++; + numEnd [bin[dim]][dim]++; + bounds [bin[dim]][dim].extend(prim.bounds()); + } + } + else + { + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + size_t bin; + PrimRef rest = prim; + size_t l = bin0[dim]; + size_t r = bin1[dim]; + + // same bin optimization + if (likely(l == r)) + { + numBegin[l][dim]++; + numEnd [l][dim]++; + bounds [l][dim].extend(prim.bounds()); + continue; + } + + for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) + { + const float pos = mapping.pos(bin+1,dim); + + PrimRef left,right; + splitPrimitive(rest,(int)dim,pos,left,right); + if (unlikely(left.bounds().empty())) l++; + bounds[bin][dim].extend(left.bounds()); + rest = right; + } + if (unlikely(rest.bounds().empty())) r--; + numBegin[l][dim]++; + numEnd [r][dim]++; + bounds [bin][dim].extend(rest.bounds()); + } + } + } + } + + /*! bins a range of primitives inside an array */ + template<typename SplitPrimitive> + void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) { + bin(splitPrimitive,prims+begin,end-begin,mapping); + } + + /*! bins an array of primitives */ + template<typename PrimitiveSplitterFactory> + __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=begin; i<end; i++) + { + const PrimRef &prim = source[i]; + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + if (unlikely(mapping.invalid(dim))) + continue; + + size_t bin; + size_t l = bin0[dim]; + size_t r = bin1[dim]; + + // same bin optimization + if (likely(l == r)) + { + add(dim,l,l,l,prim.bounds()); + continue; + } + const size_t bin_start = bin0[dim]; + const size_t bin_end = bin1[dim]; + BBox3fa rest = prim.bounds(); + const auto splitter = splitterFactory(prim); + for (bin=bin_start; bin<bin_end; bin++) + { + const float pos = mapping.pos(bin+1,dim); + BBox3fa left,right; + splitter(rest,dim,pos,left,right); + if (unlikely(left.empty())) l++; + extend(dim,bin,left); + rest = right; + } + if (unlikely(rest.empty())) r--; + add(dim,l,r,bin,rest); + } + } + } + + + + /*! bins an array of primitives */ + __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) + { + for (size_t i=begin; i<end; i++) + { + const PrimRef &prim = source[i]; + const vint4 bin0 = mapping.bin(prim.bounds().lower); + const vint4 bin1 = mapping.bin(prim.bounds().upper); + + for (size_t dim=0; dim<3; dim++) + { + if (unlikely(mapping.invalid(dim))) + continue; + + const size_t l = bin0[dim]; + const size_t r = bin1[dim]; + + const unsigned int n = prim.primID(); + + // same bin optimization + if (likely(l == r)) + { + add(dim,l,l,l,prim.bounds(),n); + continue; + } + const size_t bin_start = bin0[dim]; + const size_t bin_end = bin1[dim]; + for (size_t bin=bin_start; bin<bin_end; bin++) + add(dim,l,r,bin,prim.bounds(),n); + } + } + } + + /*! merges in other binning information */ + void merge (const SpatialBinInfo& other) + { + for (size_t i=0; i<BINS; i++) + { + numBegin[i] += other.numBegin[i]; + numEnd [i] += other.numEnd [i]; + bounds[i][0].extend(other.bounds[i][0]); + bounds[i][1].extend(other.bounds[i][1]); + bounds[i][2].extend(other.bounds[i][2]); + } + } + + /*! merges in other binning information */ + static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b) + { + SpatialBinInfo c(empty); + for (size_t i=0; i<BINS; i++) + { + c.numBegin[i] += a.numBegin[i]+b.numBegin[i]; + c.numEnd [i] += a.numEnd [i]+b.numEnd [i]; + c.bounds[i][0] = embree::merge(a.bounds[i][0],b.bounds[i][0]); + c.bounds[i][1] = embree::merge(a.bounds[i][1],b.bounds[i][1]); + c.bounds[i][2] = embree::merge(a.bounds[i][2],b.bounds[i][2]); + } + return c; + } + + /*! finds the best split by scanning binning information */ + SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const + { + /* sweep from right to left and compute parallel prefix of merged bounds */ + vfloat4 rAreas[BINS]; + vuint4 rCounts[BINS]; + vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty; + for (size_t i=BINS-1; i>0; i--) + { + count += numEnd[i]; + rCounts[i] = count; + bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx); + by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by); + bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz); + rAreas[i][3] = 0.0f; + } + + /* sweep from left to right and compute SAH */ + vuint4 blocks_add = (1 << blocks_shift)-1; + vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0; + count = 0; bx = empty; by = empty; bz = empty; + for (size_t i=1; i<BINS; i++, ii+=1) + { + count += numBegin[i-1]; + bx.extend(bounds[i-1][0]); float Ax = halfArea(bx); + by.extend(bounds[i-1][1]); float Ay = halfArea(by); + bz.extend(bounds[i-1][2]); float Az = halfArea(bz); + const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az); + const vfloat4 rArea = rAreas[i]; + const vuint4 lCount = (count +blocks_add) >> (unsigned int)(blocks_shift); + const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift); + const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); + // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount))); + const vbool4 mask = sah < vbestSAH; + vbestPos = select(mask,ii ,vbestPos); + vbestSAH = select(mask,sah,vbestSAH); + vbestlCount = select(mask,count,vbestlCount); + vbestrCount = select(mask,rCounts[i],vbestrCount); + } + + /* find best dimension */ + float bestSAH = inf; + int bestDim = -1; + int bestPos = 0; + unsigned int bestlCount = 0; + unsigned int bestrCount = 0; + for (int dim=0; dim<3; dim++) + { + /* ignore zero sized dimensions */ + if (unlikely(mapping.invalid(dim))) + continue; + + /* test if this is a better dimension */ + if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) { + bestDim = dim; + bestPos = vbestPos[dim]; + bestSAH = vbestSAH[dim]; + bestlCount = vbestlCount[dim]; + bestrCount = vbestrCount[dim]; + } + } + assert(bestSAH >= 0.0f); + + /* return invalid split if no split found */ + if (bestDim == -1) + return SpatialBinSplit<BINS>(inf,-1,0,mapping); + + /* return best found split */ + return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping); + } + + private: + BBox3fa bounds[BINS][3]; //!< geometry bounds for each bin in each dimension + vuint4 numBegin[BINS]; //!< number of primitives starting in bin + vuint4 numEnd[BINS]; //!< number of primitives ending in bin + }; + } +} + diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h new file mode 100644 index 0000000000..911dcf950c --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h @@ -0,0 +1,552 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "heuristic_binning.h" +#include "heuristic_spatial.h" + +namespace embree +{ + namespace isa + { +#if 0 +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f +#else +#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f +#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f +#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f +#endif + + struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t> + { + __forceinline PrimInfoExtRange() { + } + + __forceinline PrimInfoExtRange(EmptyTy) + : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {} + + __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds) + : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {} + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + } + }; + + template<typename ObjectSplit, typename SpatialSplit> + struct Split2 + { + __forceinline Split2 () {} + + __forceinline Split2 (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + } + + __forceinline Split2& operator= (const Split2& other) + { + spatial = other.spatial; + sah = other.sah; + if (spatial) spatialSplit() = other.spatialSplit(); + else objectSplit() = other.objectSplit(); + return *this; + } + + __forceinline ObjectSplit& objectSplit() { return *( ObjectSplit*)data; } + __forceinline const ObjectSplit& objectSplit() const { return *(const ObjectSplit*)data; } + + __forceinline SpatialSplit& spatialSplit() { return *( SpatialSplit*)data; } + __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; } + + __forceinline Split2 (const ObjectSplit& objectSplit, float sah) + : spatial(false), sah(sah) + { + new (data) ObjectSplit(objectSplit); + } + + __forceinline Split2 (const SpatialSplit& spatialSplit, float sah) + : spatial(true), sah(sah) + { + new (data) SpatialSplit(spatialSplit); + } + + __forceinline float splitSAH() const { + return sah; + } + + __forceinline bool valid() const { + return sah < float(inf); + } + + public: + __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)]; + bool spatial; + float sah; + }; + + /*! Performs standard object binning */ + template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS> + struct HeuristicArraySpatialSAH + { + typedef BinSplit<OBJECT_BINS> ObjectSplit; + typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner; + + typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit; + typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner; + + //typedef extended_range<size_t> Set; + typedef Split2<ObjectSplit,SpatialSplit> Split; + +#if defined(__AVX512ER__) // KNL + static const size_t PARALLEL_THRESHOLD = 3*1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 768; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; +#else + static const size_t PARALLEL_THRESHOLD = 3*1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; +#endif + + static const size_t MOVE_STEP_SIZE = 64; + static const size_t CREATE_SPLITS_STEP_SIZE = 64; + + __forceinline HeuristicArraySpatialSAH () + : prims0(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info) + : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {} + + + /*! compute extended ranges */ + __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight) + { + assert(set.ext_range_size() > 0); + const float left_factor = (float)lweight / (lweight + rweight); + const size_t ext_range_size = set.ext_range_size(); + const size_t left_ext_range_size = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); + const size_t right_ext_range_size = ext_range_size - left_ext_range_size; + lset.set_ext_range(lset.end() + left_ext_range_size); + rset.set_ext_range(rset.end() + right_ext_range_size); + } + + /*! move ranges */ + __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t left_ext_range_size = lset.ext_range_size(); + const size_t right_size = rset.size(); + + /* has the left child an extended range? */ + if (left_ext_range_size > 0) + { + /* left extended range smaller than right range ? */ + if (left_ext_range_size < right_size) + { + /* only move a small part of the beginning of the right range to the end */ + parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+right_size] = prims0[i]; + }); + } + else + { + /* no overlap, move entire right range to new location, can be made fully parallel */ + parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) + prims0[i+left_ext_range_size] = prims0[i]; + }); + } + /* update right range */ + assert(rset.ext_end() + left_ext_range_size == set.ext_end()); + rset.move_right(left_ext_range_size); + } + } + + /*! finds the best split */ + const Split find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SplitInfo oinfo; + const ObjectSplit object_split = object_find(set,logBlockSize,oinfo); + const float object_split_sah = object_split.splitSAH(); + + if (unlikely(set.has_ext_range())) + { + const BBox3fa overlap = intersect(oinfo.leftBounds, oinfo.rightBounds); + + /* do only spatial splits if the child bounds overlap */ + if (safeArea(overlap) >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) && + safeArea(overlap) >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds)) + { + const SpatialSplit spatial_split = spatial_find(set, logBlockSize); + const float spatial_split_sah = spatial_split.splitSAH(); + + /* valid spatial split, better SAH and number of splits do not exceed extended range */ + if (spatial_split_sah < SPATIAL_ASPLIT_SAH_THRESHOLD*object_split_sah && + spatial_split.left + spatial_split.right - set.size() <= set.ext_range_size()) + { + return Split(spatial_split,spatial_split_sah); + } + } + } + + return Split(object_split,object_split_sah); + } + + /*! finds the best object split */ + __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_object_find(set,logBlockSize,info); + else return parallel_object_find (set,logBlockSize,info); + } + + /*! finds the best object split */ + __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set); + binner.bin(prims0,set.begin(),set.end(),mapping); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best split */ + __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info) + { + ObjectBinner binner(empty); + const BinMapping<OBJECT_BINS> mapping(set); + const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner binner(empty); binner.bin(prims0+r.begin(),r.size(),_mapping); return binner; }, + [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner r = b0; r.merge(b1,_mapping.size()); return r; }); + ObjectSplit s = binner.best(mapping,logBlockSize); + binner.getSplitInfo(mapping, s, info); + return s; + } + + /*! finds the best spatial split */ + __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + if (set.size() < PARALLEL_THRESHOLD) return sequential_spatial_find(set, logBlockSize); + else return parallel_spatial_find (set, logBlockSize); + } + + /*! finds the best spatial split */ + __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping<SPATIAL_BINS> mapping(set); + binner.bin2(splitterFactory,prims0,set.begin(),set.end(),mapping); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize) + { + SpatialBinner binner(empty); + const SpatialBinMapping<SPATIAL_BINS> mapping(set); + const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround + binner = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,binner, + [&] (const range<size_t>& r) -> SpatialBinner { + SpatialBinner binner(empty); + binner.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); + return binner; }, + [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); }); + /* todo: best spatial split not exeeding the extended range does not provide any benefit ?*/ + return binner.best(mapping,logBlockSize); //,set.ext_size()); + } + + + /*! subdivides primitives based on a spatial split */ + __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping) + { + assert(set.has_ext_range()); + const size_t max_ext_range_size = set.ext_range_size(); + const size_t ext_range_start = set.end(); + + /* atomic counter for number of primref splits */ + std::atomic<size_t> ext_elements; + ext_elements.store(0); + + const float fpos = split.mapping.pos(split.pos,split.dim); + + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + + parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) { + for (size_t i=r.begin();i<r.end();i++) + { + const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); + + if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */ + + //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim]; + //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim]; + //if (unlikely(bin0 < split.pos && bin1 >= split.pos)) + if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos)) + { + assert(splits > 1); + + PrimRef left,right; + const auto splitter = splitterFactory(prims0[i]); + splitter(prims0[i],split.dim,fpos,left,right); + + // no empty splits + if (unlikely(left.bounds().empty() || right.bounds().empty())) continue; + + left.lower.u = (left.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + + const size_t ID = ext_elements.fetch_add(1); + + /* break if the number of subdivided elements are greater than the maximum allowed size */ + if (unlikely(ID >= max_ext_range_size)) + break; + + /* only write within the correct bounds */ + assert(ID < max_ext_range_size); + prims0[i] = left; + prims0[ext_range_start+ID] = right; + } + } + }); + + const size_t numExtElements = min(max_ext_range_size,ext_elements.load()); + assert(set.end()+numExtElements<=set.ext_end()); + set._end += numExtElements; + } + + /*! array partitioning */ + void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + PrimInfoExtRange set = set_i; + + /* valid split */ + if (unlikely(!split.valid())) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + std::pair<size_t,size_t> ext_weights(0,0); + + if (unlikely(split.spatial)) + { + create_spatial_splits(set,split.spatialSplit(), split.spatialSplit().mapping); + + /* spatial split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_spatial_split(split.spatialSplit(),set,lset,rset); + else + ext_weights = parallel_spatial_split(split.spatialSplit(),set,lset,rset); + } + else + { + /* object split */ + if (likely(set.size() < PARALLEL_THRESHOLD)) + ext_weights = sequential_object_split(split.objectSplit(),set,lset,rset); + else + ext_weights = parallel_object_split(split.objectSplit(),set,lset,rset); + } + + /* if we have an extended range, set extended child ranges and move right split range */ + if (unlikely(set.has_ext_range())) + { + setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second); + moveExtentedRange(set,lset,rset); + } + } + + /*! array partitioning */ + std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo local_left(empty); + PrimInfo local_right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + size_t center = serial_partitioning(prims0, + begin,end,local_left,local_right, + [&] (const PrimRef& ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); + }, + [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }); + + const size_t left_weight = local_left.end; + const size_t right_weight = local_right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,local_left); + new (&rset) PrimInfoExtRange(center,end,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + const typename ObjectBinner::vint vSplitPos(splitPos); + const typename ObjectBinner::vbool vSplitMask(splitDimMask); + auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + /*! array partitioning */ + __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + PrimInfo left(empty); + PrimInfo right(empty); + const unsigned int splitPos = split.pos; + const unsigned int splitDim = split.dim; + const unsigned int splitDimMask = (unsigned int)1 << splitDim; + + /* init spatial mapping */ + const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping; + const vint4 vSplitPos(splitPos); + const vbool4 vSplitMask( (int)splitDimMask ); + + auto isLeft = [&] (const PrimRef &ref) { + const Vec3fa c = ref.bounds().center(); + return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); }; + + const size_t center = parallel_partitioning( + prims0,begin,end,EmptyTy(),left,right,isLeft, + [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); }, + [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); }, + PARALLEL_PARTITION_BLOCK_SIZE); + + const size_t left_weight = left.end; + const size_t right_weight = right.end; + + left.begin = begin; left.end = center; + right.begin = center; right.end = end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + assert(area(left.geomBounds) >= 0.0f); + assert(area(right.geomBounds) >= 0.0f); + return std::pair<size_t,size_t>(left_weight,right_weight); + } + + void deterministic_order(const PrimInfoExtRange& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims0[set.begin()],&prims0[set.end()]); + } + + void splitFallback(const PrimInfoExtRange& set, + PrimInfoExtRange& lset, + PrimInfoExtRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + PrimInfo left(empty); + for (size_t i=begin; i<center; i++) { + left.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t lweight = left.end; + + PrimInfo right(empty); + for (size_t i=center; i<end; i++) { + right.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); + } + const size_t rweight = right.end; + + new (&lset) PrimInfoExtRange(begin,center,center,left); + new (&rset) PrimInfoExtRange(center,end,end,right); + + /* if we have an extended range */ + if (set.has_ext_range()) { + setExtentedRanges(set,lset,rset,lweight,rweight); + moveExtentedRange(set,lset,rset); + } + } + + private: + PrimRef* const prims0; + const PrimitiveSplitterFactory& splitterFactory; + const CentGeomBBox3fa& root_info; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h new file mode 100644 index 0000000000..ede0d04c78 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h @@ -0,0 +1,188 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "priminfo.h" +#include "../../common/algorithms/parallel_reduce.h" +#include "../../common/algorithms/parallel_partition.h" + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + struct HeuristicStrandSplit + { + typedef range<size_t> Set; + + static const size_t PARALLEL_THRESHOLD = 10000; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64; + + /*! stores all information to perform some split */ + struct Split + { + /*! construct an invalid split by default */ + __forceinline Split() + : sah(inf), axis0(zero), axis1(zero) {} + + /*! constructs specified split */ + __forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1) + : sah(sah), axis0(axis0), axis1(axis1) {} + + /*! calculates standard surface area heuristic for the split */ + __forceinline float splitSAH() const { return sah; } + + /*! test if this split is valid */ + __forceinline bool valid() const { return sah != float(inf); } + + public: + float sah; //!< SAH cost of the split + Vec3fa axis0, axis1; //!< axis the two strands are aligned into + }; + + __forceinline HeuristicStrandSplit () // FIXME: required? + : scene(nullptr), prims(nullptr) {} + + /*! remember prim array */ + __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims) + : scene(scene), prims(prims) {} + + __forceinline const Vec3fa direction(const PrimRef& prim) { + return scene->get(prim.geomID())->computeDirection(prim.primID()); + } + + __forceinline const BBox3fa bounds(const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(prim.primID()); + } + + __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim) { + return scene->get(prim.geomID())->vbounds(space,prim.primID()); + } + + /*! finds the best split */ + const Split find(const range<size_t>& set, size_t logBlockSize) + { + Vec3fa axis0(0,0,1); + uint64_t bestGeomPrimID = -1; + + /* curve with minimum ID determines first axis */ + for (size_t i=set.begin(); i<set.end(); i++) + { + const uint64_t geomprimID = prims[i].ID64(); + if (geomprimID >= bestGeomPrimID) continue; + const Vec3fa axis = direction(prims[i]); + if (sqr_length(axis) > 1E-18f) { + axis0 = normalize(axis); + bestGeomPrimID = geomprimID; + } + } + + /* find 2nd axis that is most misaligned with first axis and has minimum ID */ + float bestCos = 1.0f; + Vec3fa axis1 = axis0; + bestGeomPrimID = -1; + for (size_t i=set.begin(); i<set.end(); i++) + { + const uint64_t geomprimID = prims[i].ID64(); + Vec3fa axisi = direction(prims[i]); + float leni = length(axisi); + if (leni == 0.0f) continue; + axisi /= leni; + float cos = abs(dot(axisi,axis0)); + if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) { + bestCos = cos; axis1 = axisi; + bestGeomPrimID = geomprimID; + } + } + + /* partition the two strands */ + size_t lnum = 0, rnum = 0; + BBox3fa lbounds = empty, rbounds = empty; + const LinearSpace3fa space0 = frame(axis0).transposed(); + const LinearSpace3fa space1 = frame(axis1).transposed(); + + for (size_t i=set.begin(); i<set.end(); i++) + { + PrimRef& prim = prims[i]; + const Vec3fa axisi = normalize(direction(prim)); + const float cos0 = abs(dot(axisi,axis0)); + const float cos1 = abs(dot(axisi,axis1)); + + if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); } + else { rnum++; rbounds.extend(bounds(space1,prim)); } + } + + /*! return an invalid split if we do not partition */ + if (lnum == 0 || rnum == 0) + return Split(inf,axis0,axis1); + + /*! calculate sah for the split */ + const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize; + const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize; + const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds)); + return Split(sah,axis0,axis1); + } + + /*! array partitioning */ + void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + if (!split.valid()) { + deterministic_order(set); + return splitFallback(set,lset,rset); + } + + const size_t begin = set.begin(); + const size_t end = set.end(); + CentGeomBBox3fa local_left(empty); + CentGeomBBox3fa local_right(empty); + + auto primOnLeftSide = [&] (const PrimRef& prim) -> bool { + const Vec3fa axisi = normalize(direction(prim)); + const float cos0 = abs(dot(axisi,split.axis0)); + const float cos1 = abs(dot(axisi,split.axis1)); + return cos0 > cos1; + }; + + auto mergePrimBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { + pinfo.extend(bounds(ref)); + }; + + size_t center = serial_partitioning(prims,begin,end,local_left,local_right,primOnLeftSide,mergePrimBounds); + + new (&lset) PrimInfoRange(begin,center,local_left); + new (&rset) PrimInfoRange(center,end,local_right); + assert(area(lset.geomBounds) >= 0.0f); + assert(area(rset.geomBounds) >= 0.0f); + } + + void deterministic_order(const Set& set) + { + /* required as parallel partition destroys original primitive order */ + std::sort(&prims[set.begin()],&prims[set.end()]); + } + + void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset) + { + const size_t begin = set.begin(); + const size_t end = set.end(); + const size_t center = (begin + end)/2; + + CentGeomBBox3fa left(empty); + for (size_t i=begin; i<center; i++) + left.extend(bounds(prims[i])); + new (&lset) PrimInfoRange(begin,center,left); + + CentGeomBBox3fa right(empty); + for (size_t i=center; i<end; i++) + right.extend(bounds(prims[i])); + new (&rset) PrimInfoRange(center,end,right); + } + + private: + Scene* const scene; + PrimRef* const prims; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h new file mode 100644 index 0000000000..c999941a11 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h @@ -0,0 +1,237 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/primref_mb.h" +#include "../../common/algorithms/parallel_filter.h" + +#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f + +namespace embree +{ + namespace isa + { + /*! Performs standard object binning */ + template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS> + struct HeuristicMBlurTemporalSplit + { + typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split; + typedef mvector<PrimRefMB>* PrimRefVector; + typedef typename PrimRefMB::BBox BBox; + + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef) + : device(device), recalculatePrimRef(recalculatePrimRef) {} + + struct TemporalBinInfo + { + __forceinline TemporalBinInfo () { + } + + __forceinline TemporalBinInfo (EmptyTy) + { + for (size_t i=0; i<BINS-1; i++) + { + count0[i] = count1[i] = 0; + bounds0[i] = bounds1[i] = empty; + } + } + + void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) + { + for (int b=0; b<BINS-1; b++) + { + const float t = float(b+1)/float(BINS); + const float ct = lerp(time_range.lower,time_range.upper,t); + const float center_time = set.align_time(ct); + if (center_time <= time_range.lower) continue; + if (center_time >= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* find linear bounds for both time segments */ + for (size_t i=begin; i<end; i++) + { + if (prims[i].time_range_overlap(dt0)) + { + const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0); +#if MBLUR_BIN_LBBOX + bounds0[b].extend(bn0); +#else + bounds0[b].extend(bn0.interpolate(0.5f)); +#endif + count0[b] += prims[i].timeSegmentRange(dt0).size(); + } + + if (prims[i].time_range_overlap(dt1)) + { + const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1); +#if MBLUR_BIN_LBBOX + bounds1[b].extend(bn1); +#else + bounds1[b].extend(bn1.interpolate(0.5f)); +#endif + count1[b] += prims[i].timeSegmentRange(dt1).size(); + } + } + } + } + + __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) + { + if (likely(end-begin < parallelThreshold)) { + bin(prims,begin,end,time_range,set,recalculatePrimRef); + } + else + { + auto bin = [&](const range<size_t>& r) -> TemporalBinInfo { + TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; + }; + *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2); + } + } + + /*! merges in other binning information */ + __forceinline void merge (const TemporalBinInfo& other) + { + for (size_t i=0; i<BINS-1; i++) + { + count0[i] += other.count0[i]; + count1[i] += other.count1[i]; + bounds0[i].extend(other.bounds0[i]); + bounds1[i].extend(other.bounds1[i]); + } + } + + static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) { + TemporalBinInfo r = a; r.merge(b); return r; + } + + Split best(int logBlockSize, BBox1f time_range, const SetMB& set) + { + float bestSAH = inf; + float bestPos = 0.0f; + for (int b=0; b<BINS-1; b++) + { + float t = float(b+1)/float(BINS); + float ct = lerp(time_range.lower,time_range.upper,t); + const float center_time = set.align_time(ct); + if (center_time <= time_range.lower) continue; + if (center_time >= time_range.upper) continue; + const BBox1f dt0(time_range.lower,center_time); + const BBox1f dt1(center_time,time_range.upper); + + /* calculate sah */ + const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize); + float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size(); + float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size(); + if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time + if (unlikely(rCount == 0)) sah1 = 0.0f; + const float sah = sah0+sah1; + if (sah < bestSAH) { + bestSAH = sah; + bestPos = center_time; + } + } + return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos); + } + + public: + size_t count0[BINS-1]; + size_t count1[BINS-1]; + BBox bounds0[BINS-1]; + BBox bounds1[BINS-1]; + }; + + /*! finds the best split */ + const Split find(const SetMB& set, const size_t logBlockSize) + { + assert(set.size() > 0); + TemporalBinInfo binner(empty); + binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef); + Split tsplit = binner.best((int)logBlockSize,set.time_range,set); + if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split + return tsplit; + } + + __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset) + { + assert(tsplit.sah != float(inf)); + assert(tsplit.fpos > set.time_range.lower); + assert(tsplit.fpos < set.time_range.upper); + + float center_time = tsplit.fpos; + const BBox1f time_range0(set.time_range.lower,center_time); + const BBox1f time_range1(center_time,set.time_range.upper); + mvector<PrimRefMB>& prims = *set.prims; + + /* calculate primrefs for first time range */ + std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size())); + PrimRefVector lprims = new_vector.get(); + + auto reduction_func0 = [&] (const range<size_t>& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); i<r.end(); i++) + { + if (likely(prims[i].time_range_overlap(time_range0))) + { + const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0); + (*lprims)[i-set.begin()] = prim; + pinfo.add_primref(prim); + } + else + { + (*lprims)[i-set.begin()] = prims[i]; + } + } + return pinfo; + }; + PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2); + + /* primrefs for first time range are in lprims[0 .. set.size()) */ + /* some primitives may need to be filtered out */ + if (linfo.size() != set.size()) + linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); }); + + lset = SetMB(linfo,lprims,time_range0); + + /* calculate primrefs for second time range */ + auto reduction_func1 = [&] (const range<size_t>& r) { + PrimInfoMB pinfo = empty; + for (size_t i=r.begin(); i<r.end(); i++) + { + if (likely(prims[i].time_range_overlap(time_range1))) + { + const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1); + prims[i] = prim; + pinfo.add_primref(prim); + } + } + return pinfo; + }; + PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2); + rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size()); + + /* primrefs for second time range are in prims[set.begin() .. set.end()) */ + /* some primitives may need to be filtered out */ + if (rinfo.size() != set.size()) + rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024), + [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); }); + + rset = SetMB(rinfo,&prims,time_range1); + + return new_vector; + } + + private: + MemoryMonitorInterface* device; // device to report memory usage to + const RecalculatePrimRef recalculatePrimRef; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/priminfo.h b/thirdparty/embree-aarch64/kernels/builders/priminfo.h new file mode 100644 index 0000000000..06c1388742 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/priminfo.h @@ -0,0 +1,362 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" + +namespace embree +{ + // FIXME: maybe there's a better place for this util fct + __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2) + { + const Vec3fa e0 = v1-v0; + const Vec3fa e1 = v2-v0; + const Vec3fa d = cross(e0,e1); + return fabs(d.x) + fabs(d.y) + fabs(d.z); + } + + //namespace isa + //{ + template<typename BBox> + class CentGeom + { + public: + __forceinline CentGeom () {} + + __forceinline CentGeom (EmptyTy) + : geomBounds(empty), centBounds(empty) {} + + __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) + : geomBounds(geomBounds), centBounds(centBounds) {} + + template<typename PrimRef> + __forceinline void extend_primref(const PrimRef& prim) + { + BBox bounds; Vec3fa center; + prim.binBoundsAndCenter(bounds,center); + geomBounds.extend(bounds); + centBounds.extend(center); + } + + template<typename PrimRef> + __forceinline void extend_center2(const PrimRef& prim) + { + BBox3fa bounds = prim.bounds(); + geomBounds.extend(bounds); + centBounds.extend(bounds.center2()); + } + + __forceinline void extend(const BBox& geomBounds_) { + geomBounds.extend(geomBounds_); + centBounds.extend(center2(geomBounds_)); + } + + __forceinline void merge(const CentGeom& other) + { + geomBounds.extend(other.geomBounds); + centBounds.extend(other.centBounds); + } + + static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) { + CentGeom r = a; r.merge(b); return r; + } + + public: + BBox geomBounds; //!< geometry bounds of primitives + BBox3fa centBounds; //!< centroid bounds of primitives + }; + + typedef CentGeom<BBox3fa> CentGeomBBox3fa; + + /*! stores bounding information for a set of primitives */ + template<typename BBox> + class PrimInfoT : public CentGeom<BBox> + { + public: + using CentGeom<BBox>::geomBounds; + using CentGeom<BBox>::centBounds; + + __forceinline PrimInfoT () {} + + __forceinline PrimInfoT (EmptyTy) + : CentGeom<BBox>(empty), begin(0), end(0) {} + + __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) + : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {} + + template<typename PrimRef> + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom<BBox>::extend_primref(prim); + end++; + } + + template<typename PrimRef> + __forceinline void add_center2(const PrimRef& prim) { + CentGeom<BBox>::extend_center2(prim); + end++; + } + + template<typename PrimRef> + __forceinline void add_center2(const PrimRef& prim, const size_t i) { + CentGeom<BBox>::extend_center2(prim); + end+=i; + } + + /*__forceinline void add(const BBox& geomBounds_) { + CentGeom<BBox>::extend(geomBounds_); + end++; + } + + __forceinline void add(const BBox& geomBounds_, const size_t i) { + CentGeom<BBox>::extend(geomBounds_); + end+=i; + }*/ + + __forceinline void merge(const PrimInfoT& other) + { + CentGeom<BBox>::merge(other); + begin += other.begin; + end += other.end; + } + + static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) { + PrimInfoT r = a; r.merge(b); return r; + } + + /*! returns the number of primitives */ + __forceinline size_t size() const { + return end-begin; + } + + __forceinline float halfArea() { + return expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return expectedApproxHalfArea(geomBounds)*float(size()); + //return halfArea(geomBounds)*blocks(num); + } + + __forceinline float leafSAH(size_t block_shift) const { + return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift); + //return halfArea(geomBounds)*float((num+3) >> 2); + //return halfArea(geomBounds)*blocks(num); + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) { + return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}"; + } + + public: + size_t begin,end; //!< number of primitives + }; + + typedef PrimInfoT<BBox3fa> PrimInfo; + //typedef PrimInfoT<LBBox3fa> PrimInfoMB; + + /*! stores bounding information for a set of primitives */ + template<typename BBox> + class PrimInfoMBT : public CentGeom<BBox> + { + public: + using CentGeom<BBox>::geomBounds; + using CentGeom<BBox>::centBounds; + + __forceinline PrimInfoMBT () { + } + + __forceinline PrimInfoMBT (EmptyTy) + : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + __forceinline PrimInfoMBT (size_t begin, size_t end) + : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {} + + template<typename PrimRef> + __forceinline void add_primref(const PrimRef& prim) + { + CentGeom<BBox>::extend_primref(prim); + time_range.extend(prim.time_range); + object_range._end++; + num_time_segments += prim.size(); + if (max_num_time_segments < prim.totalTimeSegments()) { + max_num_time_segments = prim.totalTimeSegments(); + max_time_range = prim.time_range; + } + } + + __forceinline void merge(const PrimInfoMBT& other) + { + CentGeom<BBox>::merge(other); + time_range.extend(other.time_range); + object_range._begin += other.object_range.begin(); + object_range._end += other.object_range.end(); + num_time_segments += other.num_time_segments; + if (max_num_time_segments < other.max_num_time_segments) { + max_num_time_segments = other.max_num_time_segments; + max_time_range = other.max_time_range; + } + } + + static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) { + PrimInfoMBT r = a; r.merge(b); return r; + } + + __forceinline size_t begin() const { + return object_range.begin(); + } + + __forceinline size_t end() const { + return object_range.end(); + } + + /*! returns the number of primitives */ + __forceinline size_t size() const { + return object_range.size(); + } + + __forceinline float halfArea() const { + return time_range.size()*expectedApproxHalfArea(geomBounds); + } + + __forceinline float leafSAH() const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); + } + + __forceinline float leafSAH(size_t block_shift) const { + return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift); + } + + __forceinline float align_time(float ct) const + { + //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments); + float t0 = (ct-max_time_range.lower)/max_time_range.size(); + float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments); + return t1*max_time_range.size()+max_time_range.lower; + } + + /*! stream output */ + friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) + { + return cout << "PrimInfo { " << + "object_range = " << pinfo.object_range << + ", time_range = " << pinfo.time_range << + ", time_segments = " << pinfo.num_time_segments << + ", geomBounds = " << pinfo.geomBounds << + ", centBounds = " << pinfo.centBounds << + "}"; + } + + public: + range<size_t> object_range; //!< primitive range + size_t num_time_segments; //!< total number of time segments of all added primrefs + size_t max_num_time_segments; //!< maximum number of time segments of a primitive + BBox1f max_time_range; //!< time range of primitive with max_num_time_segments + BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB + }; + + typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB; + + struct SetMB : public PrimInfoMB + { + static const size_t PARALLEL_THRESHOLD = 3 * 1024; + static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; + static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; + + typedef mvector<PrimRefMB>* PrimRefVector; + + __forceinline SetMB() {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims) + : PrimInfoMB(pinfo_i), prims(prims) {} + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + object_range = object_range_in; + time_range = intersect(time_range,time_range_in); + } + + __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in) + : PrimInfoMB(pinfo_i), prims(prims) + { + time_range = intersect(time_range,time_range_in); + } + + void deterministic_order() const + { + /* required as parallel partition destroys original primitive order */ + PrimRefMB* prim = prims->data(); + std::sort(&prim[object_range.begin()],&prim[object_range.end()]); + } + + template<typename RecalculatePrimRef> + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const + { + auto reduce = [&](const range<size_t>& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template<typename RecalculatePrimRef> + __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto reduce = [&](const range<size_t>& r) -> LBBox3fa + { + LBBox3fa cbounds(empty); + for (size_t j = r.begin(); j < r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space); + cbounds.extend(bn); + }; + return cbounds; + }; + + return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty), + reduce, + [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); }); + } + + template<typename RecalculatePrimRef> + const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const + { + auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB + { + PrimInfoMB pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + PrimRefMB& ref = (*prims)[j]; + PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space); + pinfo.add_primref(ref1); + }; + return pinfo; + }; + + const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, + PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2); + + return SetMB(pinfo,prims,object_range,time_range); + } + + public: + PrimRefVector prims; + }; +//} +} diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp new file mode 100644 index 0000000000..e23de3df28 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp @@ -0,0 +1,244 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "primrefgen.h" +#include "primrefgen_presplit.h" + +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +namespace embree +{ + namespace isa + { + PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState<PrimInfo> pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,true); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1) + { + ParallelForForPrefixSumState<PrimInfoMB> pstate; + Scene::Iterator2 iter(scene,types,true); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID); + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + } + + /* the BVH starts with that time range, even though primitives might have smaller/larger time range */ + pinfo.time_range = t0t1; + return pinfo; + } + + template<typename Mesh> + size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor) + { + size_t numPrimitives = morton.size(); + + /* compute scene bounds */ + std::pair<size_t,BBox3fa> cb_empty(0,empty); + auto cb = parallel_reduce + ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa> + { + size_t num = 0; + BBox3fa bounds = empty; + + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa prim_bounds = empty; + if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue; + bounds.extend(center2(prim_bounds)); + num++; + } + return std::make_pair(num,bounds); + }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) { + return std::make_pair(a.first + b.first,merge(a.second,b.second)); + }); + + + size_t numPrimitivesGen = cb.first; + const BBox3fa centBounds = cb.second; + + /* compute morton codes */ + if (likely(numPrimitivesGen == numPrimitives)) + { + /* fast path if all primitives were valid */ + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void { + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t j=r.begin(); j<r.end(); j++) + generator(mesh->bounds(j),unsigned(j)); + }); + } + else + { + /* slow path, fallback in case some primitives were invalid */ + ParallelPrefixSumState<size_t> pstate; + BVHBuilderMorton::MortonCodeMapping mapping(centBounds); + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (unlikely(!mesh->buildBounds(j,&bounds))) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus<size_t>()); + + parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t { + size_t num = 0; + BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(j,&bounds)) continue; + generator(bounds,unsigned(j)); + num++; + } + return num; + }, std::plus<size_t>()); + } + return numPrimitivesGen; + } + + // ==================================================================================================== + // ==================================================================================================== + // ==================================================================================================== + + // template for grid meshes + +#if 0 + template<> + PrimInfo createPrimRefArray<GridMesh,false>(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + PING; + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator<GridMesh,false> iter(scene); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k) -> PrimInfo + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,mesh->geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != prims.size()) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, const PrimInfo& base) -> PrimInfo + { + k = base.size(); + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(j,&bounds)) continue; + const PrimRef prim(bounds,mesh->geomID,unsigned(j)); + pinfo.add_center2(prim); + prims[k++] = prim; + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } +#endif + + // ==================================================================================================== + // ==================================================================================================== + // ==================================================================================================== + + IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor)); + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h new file mode 100644 index 0000000000..9919c945c3 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h @@ -0,0 +1,28 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/primref.h" +#include "../common/primref_mb.h" +#include "priminfo.h" +#include "bvh_builder_morton.h" + +namespace embree +{ + namespace isa + { + PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor); + + PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0); + + PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)); + + template<typename Mesh> + size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor); + } +} + diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h new file mode 100644 index 0000000000..8bdb38b955 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h @@ -0,0 +1,371 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../builders/primrefgen.h" +#include "../builders/heuristic_spatial.h" +#include "../builders/splitter.h" + +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define DBG_PRESPLIT(x) +#define CHECK_PRESPLIT(x) + +#define GRID_SIZE 1024 +#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5 +#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG) +#define PRIORITY_CUTOFF_THRESHOLD 1.0f +#define PRIORITY_SPLIT_POS_WEIGHT 1.5f + +namespace embree +{ + namespace isa + { + + struct PresplitItem + { + union { + float priority; + unsigned int data; + }; + unsigned int index; + + __forceinline operator unsigned() const + { + return reinterpret_cast<const unsigned&>(priority); + } + __forceinline bool operator < (const PresplitItem& item) const + { + return (priority < item.priority); + } + + template<typename Mesh> + __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc) + { + const unsigned int geomID = ref.geomID(); + const unsigned int primID = ref.primID(); + const float area_aabb = area(ref.bounds()); + const float area_prim = ((Mesh*)scene->get(geomID))->projectedPrimitiveArea(primID); + const unsigned int diff = 31 - lzcnt(mc.x^mc.y); + assert(area_prim <= area_aabb); + //const float priority = powf((area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff),1.0f/4.0f); + const float priority = sqrtf(sqrtf( (area_aabb - area_prim) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)diff) )); + assert(priority >= 0.0f && priority < FLT_LARGE); + return priority; + } + + + }; + + inline std::ostream &operator<<(std::ostream &cout, const PresplitItem& item) { + return cout << "index " << item.index << " priority " << item.priority; + }; + + template<typename SplitterFactory> + void splitPrimitive(SplitterFactory &Splitter, + const PrimRef &prim, + const unsigned int geomID, + const unsigned int primID, + const unsigned int split_level, + const Vec3fa &grid_base, + const float grid_scale, + const float grid_extend, + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE], + unsigned int& numSubPrims) + { + assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); + if (split_level == 0) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + } + else + { + const Vec3fa lower = prim.lower; + const Vec3fa upper = prim.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))); + + /* compute a morton code for the lower and upper grid coordinates. */ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + + /* if all bits are equal then we cannot split */ + if(unlikely(lower_code == upper_code)) + { + assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE); + subPrims[numSubPrims++] = prim; + return; + } + + /* compute octree level and dimension to perform the split in */ + const unsigned int diff = 31 - lzcnt(lower_code^upper_code); + const unsigned int level = diff / 3; + const unsigned int dim = diff % 3; + + /* now we compute the grid position of the split */ + const unsigned int isplit = iupper[dim] & ~((1<<level)-1); + + /* compute world space position of split */ + const float inv_grid_size = 1.0f / GRID_SIZE; + const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend; + + assert(prim.lower[dim] <= fsplit && + prim.upper[dim] >= fsplit); + + /* split primitive */ + const auto splitter = Splitter(prim); + BBox3fa left,right; + splitter(prim.bounds(),dim,fsplit,left,right); + assert(!left.empty()); + assert(!right.empty()); + + + splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + } + } + + + template<typename Mesh, typename SplitterFactory> + PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + ParallelPrefixSumState<PrimInfo> pstate; + + /* first try */ + progressMonitor(0); + PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,r.begin(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { + return geometry->createPrimRefArray(prims,r,base.size(),geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + return pinfo; + } + + __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref) + { + const Vec3fa lower = ref.lower; + const Vec3fa upper = ref.upper; + const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); + const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f); + Vec3ia ilower(floor(glower)); + Vec3ia iupper(floor(gupper)); + + /* this ignores dimensions that are empty */ + iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); + + /* compute a morton code for the lower and upper grid coordinates. */ + const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); + const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z); + return Vec2i(lower_code,upper_code); + } + + template<typename Mesh, typename SplitterFactory> + PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor) + { + static const size_t MIN_STEP_SIZE = 128; + + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator2 iter(scene,types,mblur); + + /* first try */ + progressMonitor(0); + pstate.init(iter,size_t(1024)); + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + /* if we need to filter out geometry, run again */ + if (pinfo.size() != numPrimRefs) + { + progressMonitor(0); + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID); + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + } + + /* use correct number of primitives */ + size_t numPrimitives = pinfo.size(); + const size_t alloc_numPrimitives = prims.size(); + const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; + + /* set up primitive splitter */ + SplitterFactory Splitter(scene); + + + DBG_PRESPLIT( + const size_t org_numPrimitives = pinfo.size(); + PRINT(numPrimitives); + PRINT(alloc_numPrimitives); + PRINT(numSplitPrimitivesBudget); + ); + + /* allocate double buffer presplit items */ + const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives; + PresplitItem *presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); + PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64); + + /* compute grid */ + const Vec3fa grid_base = pinfo.geomBounds.lower; + const Vec3fa grid_diag = pinfo.geomBounds.size(); + const float grid_extend = max(grid_diag.x,max(grid_diag.y,grid_diag.z)); + const float grid_scale = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend; + + /* init presplit items and get total sum */ + const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float { + float sum = 0.0f; + for (size_t i=r.begin(); i<r.end(); i++) + { + presplitItem[i].index = (unsigned int)i; + const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]); + /* if all bits are equal then we cannot split */ + presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f; + /* FIXME: sum undeterministic */ + sum += presplitItem[i].priority; + } + return sum; + },[](const float& a, const float& b) -> float { return a+b; }); + + /* compute number of splits per primitive */ + const float inv_psum = 1.0f / psum; + parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { + for (size_t i=r.begin(); i<r.end(); i++) + { + if (presplitItem[i].priority > 0.0f) + { + const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum; + if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims + { + presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f); + //presplitItem[i].priority = min(floorf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG); + } + else + presplitItem[i].priority = 0.0f; + } + } + }); + + auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; }; + size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024); + + /* anything to split ? */ + if (center < numPrimitives) + { + const size_t numPrimitivesToSplit = numPrimitives - center; + assert(presplitItem[center].priority >= 1.0f); + + /* sort presplit items in ascending order */ + radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024); + + CHECK_PRESPLIT( + parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void { + for (size_t i=r.begin(); i<r.end(); i++) + assert(presplitItem[i-1].priority <= presplitItem[i].priority); + }); + ); + + unsigned int *const primOffset0 = (unsigned int*)tmp_presplitItem; + unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; + + /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */ + const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t { + size_t sum = 0; + for (size_t i=t.begin(); i<t.end(); i++) + { + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; + assert(presplitItem[i].priority >= 1.0f); + const unsigned int primrefID = presplitItem[i].index; + const float prio = presplitItem[i].priority; + const unsigned int geomID = prims[primrefID].geomID(); + const unsigned int primID = prims[primrefID].primID(); + const unsigned int split_levels = (unsigned int)prio; + unsigned int numSubPrims = 0; + splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + assert(numSubPrims); + numSubPrims--; // can reuse slot + sum+=numSubPrims; + presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels; + primOffset0[i-center] = numSubPrims; + } + return sum; + },[](const size_t& a, const size_t& b) -> size_t { return a+b; }); + + /* if we are over budget, need to shrink the range */ + if (totalNumSubPrims > numSplitPrimitivesBudget) + { + size_t new_center = numPrimitives-1; + size_t sum = 0; + for (;new_center>=center;new_center--) + { + const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG; + if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break; + sum += numSubPrims; + } + new_center++; + center = new_center; + } + + /* parallel prefix sum to compute offsets for storing sub-primitives */ + const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>()); + + /* iterate over range, and split primitives into sub primitives and append them to prims array */ + parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void { + for (size_t j=rn.begin(); j<rn.end(); j++) + { + PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE]; + const unsigned int primrefID = presplitItem[j].index; + const unsigned int geomID = prims[primrefID].geomID(); + const unsigned int primID = prims[primrefID].primID(); + const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1); + + assert(split_levels); + assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG); + unsigned int numSubPrims = 0; + splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims); + const size_t newID = numPrimitives + primOffset1[j-center]; + assert(newID+numSubPrims <= alloc_numPrimitives); + prims[primrefID] = subPrims[0]; + for (size_t i=1;i<numSubPrims;i++) + prims[newID+i-1] = subPrims[i]; + } + }); + + numPrimitives += offset; + DBG_PRESPLIT( + PRINT(pinfo.size()); + PRINT(numPrimitives); + PRINT((float)numPrimitives/org_numPrimitives)); + } + + /* recompute centroid bounding boxes */ + pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo { + PrimInfo p(empty); + for (size_t j=r.begin(); j<r.end(); j++) + p.add_center2(prims[j]); + return p; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + + /* free double buffer presplit items */ + alignedFree(tmp_presplitItem); + alignedFree(presplitItem); + return pinfo; + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/builders/splitter.h b/thirdparty/embree-aarch64/kernels/builders/splitter.h new file mode 100644 index 0000000000..dbd6cf07c7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/builders/splitter.h @@ -0,0 +1,169 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/scene.h" +#include "../common/primref.h" + +namespace embree +{ + namespace isa + { + template<size_t N> + __forceinline void splitPolygon(const BBox3fa& bounds, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + const Vec3fa (&inv_length)[N], + BBox3fa& left_o, + BBox3fa& right_o) + { + BBox3fa left = empty, right = empty; + /* clip triangle to left and right box by processing all edges */ + for (size_t i=0; i<N; i++) + { + const Vec3fa &v0 = v[i]; + const Vec3fa &v1 = v[i+1]; + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + if (v0d <= pos) left. extend(v0); // this point is on left side + if (v0d >= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length[i][dim]),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + left_o = intersect(left,bounds); + right_o = intersect(right,bounds); + } + + template<size_t N> + __forceinline void splitPolygon(const PrimRef& prim, + const size_t dim, + const float pos, + const Vec3fa (&v)[N+1], + PrimRef& left_o, + PrimRef& right_o) + { + BBox3fa left = empty, right = empty; + for (size_t i=0; i<N; i++) + { + const Vec3fa &v0 = v[i]; + const Vec3fa &v1 = v[i+1]; + const float v0d = v0[dim]; + const float v1d = v1[dim]; + + if (v0d <= pos) left. extend(v0); // this point is on left side + if (v0d >= pos) right.extend(v0); // this point is on right side + + if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d)) // the edge crosses the splitting location + { + assert((v1d-v0d) != 0.0f); + const float inv_length = 1.0f/(v1d-v0d); + const Vec3fa c = madd(Vec3fa((pos-v0d)*inv_length),v1-v0,v0); + left.extend(c); + right.extend(c); + } + } + + /* clip against current bounds */ + new (&left_o ) PrimRef(intersect(left ,prim.bounds()),prim.geomID(), prim.primID()); + new (&right_o) PrimRef(intersect(right,prim.bounds()),prim.geomID(), prim.primID()); + } + + struct TriangleSplitter + { + __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const TriangleMesh* mesh = (const TriangleMesh*) scene->get(prim.geomID() & mask ); + TriangleMesh::Triangle tri = mesh->triangle(prim.primID()); + v[0] = mesh->vertex(tri.v[0]); + v[1] = mesh->vertex(tri.v[1]); + v[2] = mesh->vertex(tri.v[2]); + v[3] = mesh->vertex(tri.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[0]-v[2]); + } + + __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<3>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[4]; + Vec3fa inv_length[3]; + }; + + struct TriangleSplitterFactory + { + __forceinline TriangleSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline TriangleSplitter operator() (const PrimRef& prim) const { + return TriangleSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + + struct QuadSplitter + { + __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim) + { + const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; + const QuadMesh* mesh = (const QuadMesh*) scene->get(prim.geomID() & mask ); + QuadMesh::Quad quad = mesh->quad(prim.primID()); + v[0] = mesh->vertex(quad.v[0]); + v[1] = mesh->vertex(quad.v[1]); + v[2] = mesh->vertex(quad.v[2]); + v[3] = mesh->vertex(quad.v[3]); + v[4] = mesh->vertex(quad.v[0]); + inv_length[0] = Vec3fa(1.0f) / (v[1]-v[0]); + inv_length[1] = Vec3fa(1.0f) / (v[2]-v[1]); + inv_length[2] = Vec3fa(1.0f) / (v[3]-v[2]); + inv_length[3] = Vec3fa(1.0f) / (v[0]-v[3]); + } + + __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const { + splitPolygon<4>(prim,dim,pos,v,left_o,right_o); + } + + __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const { + splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o); + } + + private: + Vec3fa v[5]; + Vec3fa inv_length[4]; + }; + + struct QuadSplitterFactory + { + __forceinline QuadSplitterFactory(const Scene* scene) + : scene(scene) {} + + __forceinline QuadSplitter operator() (const PrimRef& prim) const { + return QuadSplitter(scene,prim); + } + + private: + const Scene* scene; + }; + } +} + |