summaryrefslogtreecommitdiff
path: root/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
diff options
context:
space:
mode:
authorjfons <joan.fonssanchez@gmail.com>2021-04-20 18:38:09 +0200
committerjfons <joan.fonssanchez@gmail.com>2021-04-23 15:57:28 +0200
commit34b3e8f9e2ae076990ecf3b2827eff759ba2abf9 (patch)
tree854a526a5ba2d6128e44d995d1bc138cf84ee722 /thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
parenteeccab26c5641409092547e02ad11e6253ac1b87 (diff)
Add Embree-aarch64 thirdparty library
Diffstat (limited to 'thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h')
-rw-r--r--thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h501
1 files changed, 501 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
new file mode 100644
index 0000000000..92be2f7e65
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
@@ -0,0 +1,501 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/builder.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ struct BVHBuilderMorton
+ {
+ static const size_t MAX_BRANCHING_FACTOR = 8; //!< maximum supported BVH branching factor
+ static const size_t MIN_LARGE_LEAF_LEVELS = 8; //!< create balanced tree of we are that many levels before the maximum tree depth
+
+ /*! settings for morton builder */
+ struct Settings
+ {
+ /*! default settings */
+ Settings ()
+ : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
+
+ /*! initialize settings from API settings */
+ Settings (const RTCBuildArguments& settings)
+ : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
+ {
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth )) maxDepth = settings.maxDepth;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize )) minLeafSize = settings.minLeafSize;
+ if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize )) maxLeafSize = settings.maxLeafSize;
+
+ minLeafSize = min(minLeafSize,maxLeafSize);
+ }
+
+ Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)
+ : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
+ {
+ minLeafSize = min(minLeafSize,maxLeafSize);
+ }
+
+ public:
+ size_t branchingFactor; //!< branching factor of BVH to build
+ size_t maxDepth; //!< maximum depth of BVH to build
+ size_t minLeafSize; //!< minimum size of a leaf
+ size_t maxLeafSize; //!< maximum size of a leaf
+ size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+ };
+
+ /*! Build primitive consisting of morton code and primitive ID. */
+ struct __aligned(8) BuildPrim
+ {
+ union {
+ struct {
+ unsigned int code; //!< morton code
+ unsigned int index; //!< i'th primitive
+ };
+ uint64_t t;
+ };
+
+ /*! interface for radix sort */
+ __forceinline operator unsigned() const { return code; }
+
+ /*! interface for standard sort */
+ __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
+ };
+
+ /*! maps bounding box to morton code */
+ struct MortonCodeMapping
+ {
+ static const size_t LATTICE_BITS_PER_DIM = 10;
+ static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;
+
+ vfloat4 base;
+ vfloat4 scale;
+
+ __forceinline MortonCodeMapping(const BBox3fa& bounds)
+ {
+ base = (vfloat4)bounds.lower;
+ const vfloat4 diag = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
+ scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f));
+ }
+
+ __forceinline const vint4 bin (const BBox3fa& box) const
+ {
+ const vfloat4 lower = (vfloat4)box.lower;
+ const vfloat4 upper = (vfloat4)box.upper;
+ const vfloat4 centroid = lower+upper;
+ return vint4((centroid-base)*scale);
+ }
+
+ __forceinline unsigned int code (const BBox3fa& box) const
+ {
+ const vint4 binID = bin(box);
+ const unsigned int x = extract<0>(binID);
+ const unsigned int y = extract<1>(binID);
+ const unsigned int z = extract<2>(binID);
+ const unsigned int xyz = bitInterleave(x,y,z);
+ return xyz;
+ }
+ };
+
+#if defined (__AVX2__)
+
+ /*! for AVX2 there is a fast scalar bitInterleave */
+ struct MortonCodeGenerator
+ {
+ __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+ : mapping(mapping), dest(dest) {}
+
+ __forceinline void operator() (const BBox3fa& b, const unsigned index)
+ {
+ dest->index = index;
+ dest->code = mapping.code(b);
+ dest++;
+ }
+
+ public:
+ const MortonCodeMapping mapping;
+ BuildPrim* dest;
+ size_t currentID;
+ };
+
+#else
+
+ /*! before AVX2 is it better to use the SSE version of bitInterleave */
+ struct MortonCodeGenerator
+ {
+ __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+ : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
+
+ __forceinline ~MortonCodeGenerator()
+ {
+ if (slots != 0)
+ {
+ const vint4 code = bitInterleave(ax,ay,az);
+ for (size_t i=0; i<slots; i++) {
+ dest[currentID-slots+i].index = ai[i];
+ dest[currentID-slots+i].code = code[i];
+ }
+ }
+ }
+
+ __forceinline void operator() (const BBox3fa& b, const unsigned index)
+ {
+ const vint4 binID = mapping.bin(b);
+ ax[slots] = extract<0>(binID);
+ ay[slots] = extract<1>(binID);
+ az[slots] = extract<2>(binID);
+ ai[slots] = index;
+ slots++;
+ currentID++;
+
+ if (slots == 4)
+ {
+ const vint4 code = bitInterleave(ax,ay,az);
+ vint4::storeu(&dest[currentID-4],unpacklo(code,ai));
+ vint4::storeu(&dest[currentID-2],unpackhi(code,ai));
+ slots = 0;
+ }
+ }
+
+ public:
+ const MortonCodeMapping mapping;
+ BuildPrim* dest;
+ size_t currentID;
+ size_t slots;
+ vint4 ax, ay, az, ai;
+ };
+
+#endif
+
+ template<
+ typename ReductionTy,
+ typename Allocator,
+ typename CreateAllocator,
+ typename CreateNodeFunc,
+ typename SetNodeBoundsFunc,
+ typename CreateLeafFunc,
+ typename CalculateBounds,
+ typename ProgressMonitor>
+
+ class BuilderT : private Settings
+ {
+ ALIGNED_CLASS_(16);
+
+ public:
+
+ BuilderT (CreateAllocator& createAllocator,
+ CreateNodeFunc& createNode,
+ SetNodeBoundsFunc& setBounds,
+ CreateLeafFunc& createLeaf,
+ CalculateBounds& calculateBounds,
+ ProgressMonitor& progressMonitor,
+ const Settings& settings)
+
+ : Settings(settings),
+ createAllocator(createAllocator),
+ createNode(createNode),
+ setBounds(setBounds),
+ createLeaf(createLeaf),
+ calculateBounds(calculateBounds),
+ progressMonitor(progressMonitor),
+ morton(nullptr) {}
+
+ ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)
+ {
+ /* this should never occur but is a fatal error */
+ if (depth > maxDepth)
+ throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+ /* create leaf for few primitives */
+ if (current.size() <= maxLeafSize)
+ return createLeaf(current,alloc);
+
+ /* fill all children by always splitting the largest one */
+ range<unsigned> children[MAX_BRANCHING_FACTOR];
+ size_t numChildren = 1;
+ children[0] = current;
+
+ do {
+
+ /* find best child with largest number of primitives */
+ size_t bestChild = -1;
+ size_t bestSize = 0;
+ for (size_t i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= maxLeafSize)
+ continue;
+
+ /* remember child with largest size */
+ if (children[i].size() > bestSize) {
+ bestSize = children[i].size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == size_t(-1)) break;
+
+ /*! split best child into left and right child */
+ auto split = children[bestChild].split();
+
+ /* add new children left and right */
+ children[bestChild] = children[numChildren-1];
+ children[numChildren-1] = split.first;
+ children[numChildren+0] = split.second;
+ numChildren++;
+
+ } while (numChildren < branchingFactor);
+
+ /* create node */
+ auto node = createNode(alloc,numChildren);
+
+ /* recurse into each child */
+ ReductionTy bounds[MAX_BRANCHING_FACTOR];
+ for (size_t i=0; i<numChildren; i++)
+ bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
+
+ return setBounds(node,bounds,numChildren);
+ }
+
+ /*! recreates morton codes when reaching a region where all codes are identical */
+ __noinline void recreateMortonCodes(const range<unsigned>& current) const
+ {
+ /* fast path for small ranges */
+ if (likely(current.size() < 1024))
+ {
+ /*! recalculate centroid bounds */
+ BBox3fa centBounds(empty);
+ for (size_t i=current.begin(); i<current.end(); i++)
+ centBounds.extend(center2(calculateBounds(morton[i])));
+
+ /* recalculate morton codes */
+ MortonCodeMapping mapping(centBounds);
+ for (size_t i=current.begin(); i<current.end(); i++)
+ morton[i].code = mapping.code(calculateBounds(morton[i]));
+
+ /* sort morton codes */
+ std::sort(morton+current.begin(),morton+current.end());
+ }
+ else
+ {
+ /*! recalculate centroid bounds */
+ auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
+ BBox3fa centBounds = empty;
+ for (size_t i=r.begin(); i<r.end(); i++)
+ centBounds.extend(center2(calculateBounds(morton[i])));
+ return centBounds;
+ };
+ const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
+ BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
+
+ /* recalculate morton codes */
+ MortonCodeMapping mapping(centBounds);
+ parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ morton[i].code = mapping.code(calculateBounds(morton[i]));
+ }
+ });
+
+ /*! sort morton codes */
+#if defined(TASKING_TBB)
+ tbb::parallel_sort(morton+current.begin(),morton+current.end());
+#else
+ radixsort32(morton+current.begin(),current.size());
+#endif
+ }
+ }
+
+ __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const
+ {
+ const unsigned int code_start = morton[current.begin()].code;
+ const unsigned int code_end = morton[current.end()-1].code;
+ unsigned int bitpos = lzcnt(code_start^code_end);
+
+ /* if all items mapped to same morton code, then re-create new morton codes for the items */
+ if (unlikely(bitpos == 32))
+ {
+ recreateMortonCodes(current);
+ const unsigned int code_start = morton[current.begin()].code;
+ const unsigned int code_end = morton[current.end()-1].code;
+ bitpos = lzcnt(code_start^code_end);
+
+ /* if the morton code is still the same, goto fall back split */
+ if (unlikely(bitpos == 32)) {
+ current.split(left,right);
+ return;
+ }
+ }
+
+ /* split the items at the topmost different morton code bit */
+ const unsigned int bitpos_diff = 31-bitpos;
+ const unsigned int bitmask = 1 << bitpos_diff;
+
+ /* find location where bit differs using binary search */
+ unsigned begin = current.begin();
+ unsigned end = current.end();
+ while (begin + 1 != end) {
+ const unsigned mid = (begin+end)/2;
+ const unsigned bit = morton[mid].code & bitmask;
+ if (bit == 0) begin = mid; else end = mid;
+ }
+ unsigned center = end;
+#if defined(DEBUG)
+ for (unsigned int i=begin; i<center; i++) assert((morton[i].code & bitmask) == 0);
+ for (unsigned int i=center; i<end; i++) assert((morton[i].code & bitmask) == bitmask);
+#endif
+
+ left = make_range(current.begin(),center);
+ right = make_range(center,current.end());
+ }
+
+ ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel)
+ {
+ /* get thread local allocator */
+ if (!alloc)
+ alloc = createAllocator();
+
+ /* call memory monitor function to signal progress */
+ if (toplevel && current.size() <= singleThreadThreshold)
+ progressMonitor(current.size());
+
+ /* create leaf node */
+ if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
+ return createLargeLeaf(depth,current,alloc);
+
+ /* fill all children by always splitting the one with the largest surface area */
+ range<unsigned> children[MAX_BRANCHING_FACTOR];
+ split(current,children[0],children[1]);
+ size_t numChildren = 2;
+
+ while (numChildren < branchingFactor)
+ {
+ /* find best child with largest number of primitives */
+ int bestChild = -1;
+ unsigned bestItems = 0;
+ for (unsigned int i=0; i<numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (children[i].size() <= minLeafSize)
+ continue;
+
+ /* remember child with largest area */
+ if (children[i].size() > bestItems) {
+ bestItems = children[i].size();
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1) break;
+
+ /*! split best child into left and right child */
+ range<unsigned> left, right;
+ split(children[bestChild],left,right);
+
+ /* add new children left and right */
+ children[bestChild] = children[numChildren-1];
+ children[numChildren-1] = left;
+ children[numChildren+0] = right;
+ numChildren++;
+ }
+
+ /* create leaf node if no split is possible */
+ if (unlikely(numChildren == 1))
+ return createLeaf(current,alloc);
+
+ /* allocate node */
+ auto node = createNode(alloc,numChildren);
+
+ /* process top parts of tree parallel */
+ ReductionTy bounds[MAX_BRANCHING_FACTOR];
+ if (current.size() > singleThreadThreshold)
+ {
+ /*! parallel_for is faster than spawing sub-tasks */
+ parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ bounds[i] = recurse(depth+1,children[i],nullptr,true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ }
+ });
+ }
+
+ /* finish tree sequentially */
+ else
+ {
+ for (size_t i=0; i<numChildren; i++)
+ bounds[i] = recurse(depth+1,children[i],alloc,false);
+ }
+
+ return setBounds(node,bounds,numChildren);
+ }
+
+ /* build function */
+ ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
+ {
+ /* sort morton codes */
+ morton = src;
+ radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
+
+ /* build BVH */
+ const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
+ _mm_mfence(); // to allow non-temporal stores during build
+ return root;
+ }
+
+ public:
+ CreateAllocator& createAllocator;
+ CreateNodeFunc& createNode;
+ SetNodeBoundsFunc& setBounds;
+ CreateLeafFunc& createLeaf;
+ CalculateBounds& calculateBounds;
+ ProgressMonitor& progressMonitor;
+
+ public:
+ BuildPrim* morton;
+ };
+
+
+ template<
+ typename ReductionTy,
+ typename CreateAllocFunc,
+ typename CreateNodeFunc,
+ typename SetBoundsFunc,
+ typename CreateLeafFunc,
+ typename CalculateBoundsFunc,
+ typename ProgressMonitor>
+
+ static ReductionTy build(CreateAllocFunc createAllocator,
+ CreateNodeFunc createNode,
+ SetBoundsFunc setBounds,
+ CreateLeafFunc createLeaf,
+ CalculateBoundsFunc calculateBounds,
+ ProgressMonitor progressMonitor,
+ BuildPrim* src,
+ BuildPrim* tmp,
+ size_t numPrimitives,
+ const Settings& settings)
+ {
+ typedef BuilderT<
+ ReductionTy,
+ decltype(createAllocator()),
+ CreateAllocFunc,
+ CreateNodeFunc,
+ SetBoundsFunc,
+ CreateLeafFunc,
+ CalculateBoundsFunc,
+ ProgressMonitor> Builder;
+
+ Builder builder(createAllocator,
+ createNode,
+ setBounds,
+ createLeaf,
+ calculateBounds,
+ progressMonitor,
+ settings);
+
+ return builder.build(src,tmp,numPrimitives);
+ }
+ };
+ }
+}