diff options
Diffstat (limited to 'thirdparty/embree-aarch64/kernels/bvh')
45 files changed, 14202 insertions, 0 deletions
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp new file mode 100644 index 0000000000..bd102bd6ef --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp @@ -0,0 +1,190 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" + +namespace embree +{ + template<int N> + BVHN<N>::BVHN (const PrimitiveType& primTy, Scene* scene) + : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN), + primTy(&primTy), device(scene->device), scene(scene), + root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0) + { + } + + template<int N> + BVHN<N>::~BVHN () + { + for (size_t i=0; i<objects.size(); i++) + delete objects[i]; + } + + template<int N> + void BVHN<N>::clear() + { + set(BVHN::emptyNode,empty,0); + alloc.clear(); + } + + template<int N> + void BVHN<N>::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives) + { + this->root = root; + this->bounds = bounds; + this->numPrimitives = numPrimitives; + } + + template<int N> + void BVHN<N>::clearBarrier(NodeRef& node) + { + if (node.isBarrier()) + node.clearBarrier(); + else if (!node.isLeaf()) { + BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH + for (size_t c=0; c<N; c++) + clearBarrier(n->child(c)); + } + } + + template<int N> + void BVHN<N>::layoutLargeNodes(size_t num) + { +#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues + struct NodeArea + { + __forceinline NodeArea() {} + + __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds) + : node(&node), A(node.isLeaf() ? float(neg_inf) : area(bounds)) {} + + __forceinline bool operator< (const NodeArea& other) const { + return this->A < other.A; + } + + NodeRef* node; + float A; + }; + std::vector<NodeArea> lst; + lst.reserve(num); + lst.push_back(NodeArea(root,empty)); + + while (lst.size() < num) + { + std::pop_heap(lst.begin(), lst.end()); + NodeArea n = lst.back(); lst.pop_back(); + if (!n.node->isAABBNode()) break; + AABBNode* node = n.node->getAABBNode(); + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVHN::emptyNode) continue; + lst.push_back(NodeArea(node->child(i),node->bounds(i))); + std::push_heap(lst.begin(), lst.end()); + } + } + + for (size_t i=0; i<lst.size(); i++) + lst[i].node->setBarrier(); + + root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator()); +#endif + } + + template<int N> + typename BVHN<N>::NodeRef BVHN<N>::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator) + { + if (node.isBarrier()) { + node.clearBarrier(); + return node; + } + else if (node.isAABBNode()) + { + AABBNode* oldnode = node.getAABBNode(); + AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment); + *newnode = *oldnode; + for (size_t c=0; c<N; c++) + newnode->child(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator); + return encodeNode(newnode); + } + else return node; + } + + template<int N> + double BVHN<N>::preBuild(const std::string& builderName) + { + if (builderName == "") + return inf; + + if (device->verbosity(2)) + { + Lock<MutexSys> lock(g_printMutex); + std::cout << "building BVH" << N << (builderName.find("MBlur") != std::string::npos ? "MB" : "") << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush; + } + + double t0 = 0.0; + if (device->benchmark || device->verbosity(2)) t0 = getSeconds(); + return t0; + } + + template<int N> + void BVHN<N>::postBuild(double t0) + { + if (t0 == double(inf)) + return; + + double dt = 0.0; + if (device->benchmark || device->verbosity(2)) + dt = getSeconds()-t0; + + std::unique_ptr<BVHNStatistics<N>> stat; + + /* print statistics */ + if (device->verbosity(2)) + { + if (!stat) stat.reset(new BVHNStatistics<N>(this)); + const size_t usedBytes = alloc.getUsedBytes(); + Lock<MutexSys> lock(g_printMutex); + std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl; + + if (device->verbosity(2)) + std::cout << stat->str(); + + if (device->verbosity(2)) + { + FastAllocator::AllStatistics stat(&alloc); + for (size_t i=0; i<objects.size(); i++) + if (objects[i]) + stat = stat + FastAllocator::AllStatistics(&objects[i]->alloc); + + stat.print(numPrimitives); + } + + if (device->verbosity(3)) + { + alloc.print_blocks(); + for (size_t i=0; i<objects.size(); i++) + if (objects[i]) + objects[i]->alloc.print_blocks(); + } + + std::cout << std::flush; + } + + /* benchmark mode */ + if (device->benchmark) + { + if (!stat) stat.reset(new BVHNStatistics<N>(this)); + Lock<MutexSys> lock(g_printMutex); + std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush; + } + } + +#if defined(__AVX__) + template class BVHN<8>; +#endif + +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) + template class BVHN<4>; +#endif +} + diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.h b/thirdparty/embree-aarch64/kernels/bvh/bvh.h new file mode 100644 index 0000000000..8fdf912e52 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.h @@ -0,0 +1,235 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +/* include all node types */ +#include "bvh_node_aabb.h" +#include "bvh_node_aabb_mb.h" +#include "bvh_node_aabb_mb4d.h" +#include "bvh_node_obb.h" +#include "bvh_node_obb_mb.h" +#include "bvh_node_qaabb.h" + +namespace embree +{ + /*! flags used to enable specific node types in intersectors */ + enum BVHNodeFlags + { + BVH_FLAG_ALIGNED_NODE = 0x00001, + BVH_FLAG_ALIGNED_NODE_MB = 0x00010, + BVH_FLAG_UNALIGNED_NODE = 0x00100, + BVH_FLAG_UNALIGNED_NODE_MB = 0x01000, + BVH_FLAG_QUANTIZED_NODE = 0x100000, + BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000, + + /* short versions */ + BVH_AN1 = BVH_FLAG_ALIGNED_NODE, + BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB, + BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_UN1 = BVH_FLAG_UNALIGNED_NODE, + BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB, + BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D, + BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE, + BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB, + BVH_QN1 = BVH_FLAG_QUANTIZED_NODE + }; + + /*! Multi BVH with N children. Each node stores the bounding box of + * it's N children as well as N child references. */ + template<int N> + class BVHN : public AccelData + { + ALIGNED_CLASS_(16); + public: + + /*! forward declaration of node ref type */ + typedef NodeRefPtr<N> NodeRef; + typedef BaseNode_t<NodeRef,N> BaseNode; + typedef AABBNode_t<NodeRef,N> AABBNode; + typedef AABBNodeMB_t<NodeRef,N> AABBNodeMB; + typedef AABBNodeMB4D_t<NodeRef,N> AABBNodeMB4D; + typedef OBBNode_t<NodeRef,N> OBBNode; + typedef OBBNodeMB_t<NodeRef,N> OBBNodeMB; + typedef QuantizedBaseNode_t<N> QuantizedBaseNode; + typedef QuantizedBaseNodeMB_t<N> QuantizedBaseNodeMB; + typedef QuantizedNode_t<NodeRef,N> QuantizedNode; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! Empty node */ + static const size_t emptyNode = NodeRef::emptyNode; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = NodeRef::invalidNode; + static const size_t popRay = NodeRef::popRay; + + /*! Maximum depth of the BVH. */ + static const size_t maxBuildDepth = 32; + static const size_t maxBuildDepthLeaf = maxBuildDepth+8; + static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks; + + public: + + /*! Builder interface to create allocator */ + struct CreateAlloc : public FastAllocator::Create { + __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {} + }; + + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + public: + + /*! BVHN default constructor. */ + BVHN (const PrimitiveType& primTy, Scene* scene); + + /*! BVHN destruction */ + ~BVHN (); + + /*! clears the acceleration structure */ + void clear(); + + /*! sets BVH members after build */ + void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives); + + /*! Clears the barrier bits of a subtree. */ + void clearBarrier(NodeRef& node); + + /*! lays out num large nodes of the BVH */ + void layoutLargeNodes(size_t num); + NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator); + + /*! called by all builders before build starts */ + double preBuild(const std::string& builderName); + + /*! called by all builders after build ended */ + void postBuild(double t0); + + /*! allocator class */ + struct Allocator { + BVHN* bvh; + Allocator (BVHN* bvh) : bvh(bvh) {} + __forceinline void* operator() (size_t bytes) const { + return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); + } + }; + + /*! post build cleanup */ + void cleanup() { + alloc.cleanup(); + } + + public: + + /*! Encodes a node */ + static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); } + static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); } + static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); } + + public: + + /*! Prefetches the node this reference points to */ + __forceinline static void prefetch(const NodeRef ref, int types=0) + { +#if defined(__AVX512PF__) // MIC + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL2(((char*)ref.ptr)+2*64); + prefetchL2(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* KNL still needs L2 prefetches for large nodes */ + prefetchL2(((char*)ref.ptr)+4*64); + prefetchL2(((char*)ref.ptr)+5*64); + prefetchL2(((char*)ref.ptr)+6*64); + prefetchL2(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL2(((char*)ref.ptr)+0*64); + prefetchL2(((char*)ref.ptr)+1*64); + prefetchL2(((char*)ref.ptr)+2*64); + } +#else + if (types != BVH_FLAG_QUANTIZED_NODE) { + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + prefetchL1(((char*)ref.ptr)+2*64); + prefetchL1(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + /* deactivate for large nodes on Xeon, as it introduces regressions */ + //prefetchL1(((char*)ref.ptr)+4*64); + //prefetchL1(((char*)ref.ptr)+5*64); + //prefetchL1(((char*)ref.ptr)+6*64); + //prefetchL1(((char*)ref.ptr)+7*64); + } + } + else + { + /* todo: reduce if 32bit offsets are enabled */ + prefetchL1(((char*)ref.ptr)+0*64); + prefetchL1(((char*)ref.ptr)+1*64); + prefetchL1(((char*)ref.ptr)+2*64); + } +#endif + } + + __forceinline static void prefetchW(const NodeRef ref, int types=0) + { + embree::prefetchEX(((char*)ref.ptr)+0*64); + embree::prefetchEX(((char*)ref.ptr)+1*64); + if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+2*64); + embree::prefetchEX(((char*)ref.ptr)+3*64); + } + if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) { + embree::prefetchEX(((char*)ref.ptr)+4*64); + embree::prefetchEX(((char*)ref.ptr)+5*64); + embree::prefetchEX(((char*)ref.ptr)+6*64); + embree::prefetchEX(((char*)ref.ptr)+7*64); + } + } + + /*! bvh type information */ + public: + const PrimitiveType* primTy; //!< primitive type stored in the BVH + + /*! bvh data */ + public: + Device* device; //!< device pointer + Scene* scene; //!< scene pointer + NodeRef root; //!< root node + FastAllocator alloc; //!< allocator used to allocate nodes + + /*! statistics data */ + public: + size_t numPrimitives; //!< number of primitives the BVH is build over + size_t numVertices; //!< number of vertices the BVH references + + /*! data arrays for special builders */ + public: + std::vector<BVHN*> objects; + vector_t<char,aligned_allocator<char,32>> subdiv_patches; + }; + + typedef BVHN<4> BVH4; + typedef BVHN<8> BVH8; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp new file mode 100644 index 0000000000..23f4f63d45 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp @@ -0,0 +1,1325 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh4_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + + BVH4Factory::BVH4Factory(int bfeatures, int ifeatures) + { + SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom); + + selectBuilders(bfeatures); + selectIntersectors(ifeatures); + } + + void BVH4Factory::selectBuilders(int features) + { + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4MeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4iMeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4vMeshSAH)); + IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelQuadMeshSAH)); + IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelVirtualSAH)); + IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelInstanceSAH)); + + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4SceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4vSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4vSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4iSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4VirtualSceneBuilderSAH)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4InstanceSceneBuilderSAH)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1BuilderSAH)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1MBBuilderSAH)); + } + + void BVH4Factory::selectIntersectors(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8i)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4iMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB)); + + /* select intersectors1 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1MB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1MB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4vIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Moeller)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Triangle4iIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Quad4iIntersector1Pluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector1)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector1)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector1)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector1)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector1)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector1)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Moeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector1Moeller)) + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Pluecker)); + +#if defined (EMBREE_RAY_PACKETS) + + /* select intersectors4 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector4)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector4)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector4Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector4Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector4Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector4Chunk)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridPluecker)); + + /* select intersectors8 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector8)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector8)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector8Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector8Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector8Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector8Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridPluecker)); + + /* select intersectors16 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridPluecker)); + + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1Intersector16)); + IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1MBIntersector16)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersector16Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualMBIntersector16Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersector16Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceMBIntersector16Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridMBIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridPluecker)); + + /* select stream intersectors */ + SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4IntersectorStreamPacketFallback); + + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersectorStreamPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersectorStream)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersectorStream)); + +#endif + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4Hybrid(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8Hybrid(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4HybridMB(); + intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8HybridMB(); + intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::FAST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4Intersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Triangle4Intersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Triangle4Intersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH4Triangle4Intersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Triangle4Intersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Triangle4Intersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Triangle4IntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Triangle4IntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::ROBUST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Triangle4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Triangle4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4vMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Triangle4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH4Quad4vIntersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH4Quad4vIntersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH4Quad4vIntersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH4Quad4vIntersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH4Quad4vIntersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH4Quad4vIntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH4Quad4vIntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4iIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16= BVH4Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH4Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH4VirtualIntersectorStream(); +#endif + intersectors.collider = BVH4ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH4VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH4InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH4InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1Intersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4SubdivPatch1Intersector4(); + intersectors.intersector8 = BVH4SubdivPatch1Intersector8(); + intersectors.intersector16 = BVH4SubdivPatch1Intersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4SubdivPatch1MBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4SubdivPatch1MBIntersector4(); + intersectors.intersector8 = BVH4SubdivPatch1MBIntersector8(); + intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve8i::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8i>"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4v::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve4iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + +#if defined(EMBREE_TARGET_SIMD8) + Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant); + + Builder* builder = nullptr; + if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } +#endif + + Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4Intersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4Intersectors(accel,IntersectVariant::FAST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4v::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4iIntersectors(accel,ivariant); + else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4i>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default" ) { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4iMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4iMB>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Triangle4vMB::type,scene); + + Accel::Intersectors intersectors; + if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4vMBIntersectors(accel,ivariant); + else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::FAST); + else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::ROBUST); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4vMB>"); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4vMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "sah" ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); + else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->quad_builder == "dynamic" ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else if (scene->device->quad_builder == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->quad_builder_mb == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH4<Quad4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene) + { + BVH4* accel = new BVH4(Quad4i::type,scene); + Builder* builder = BVH4QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Quad4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene) + { + BVH4* accel = new BVH4(Triangle4i::type,scene); + Builder* builder = BVH4QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + Accel::Intersectors intersectors = QBVH4Triangle4iIntersectors(accel); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1Intersectors(accel); + Builder* builder = BVH4SubdivPatch1BuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene) + { + BVH4* accel = new BVH4(SubdivPatch1::type,scene); + Accel::Intersectors intersectors = BVH4SubdivPatch1MBIntersectors(accel); + Builder* builder = BVH4SubdivPatch1MBBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH4* accel = new BVH4(Object::type,scene); + Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene) + { + BVH4* accel = new BVH4(Object::type,scene); + Accel::Intersectors intersectors = BVH4UserGeometryMBIntersectors(accel); + Builder* builder = BVH4VirtualMBSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + // Builder* builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); break; + case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive) + { + BVH4* accel = new BVH4(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH4InstanceMBIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP; + Builder* builder = BVH4InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH4GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH4GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH4GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH4GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH4GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH4GridMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH4GridMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH4IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(SubGridQBVH4::type,scene); + Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4<GridMesh>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH4* accel = new BVH4(SubGridQBVH4::type,scene); + Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + builder = BVH4GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH4MB<GridMesh>"); + return new AccelInstance(accel,builder,intersectors); + } + +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h new file mode 100644 index 0000000000..a68227b41f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h @@ -0,0 +1,316 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! BVH4 instantiations */ + class BVH4Factory : public BVHFactory + { + public: + BVH4Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant); + Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH4Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST); + Accel* BVH4Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH4Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH4QuantizedTriangle4i(Scene* scene); + Accel* BVH4QuantizedQuad4i(Scene* scene); + + Accel* BVH4SubdivPatch1(Scene* scene); + Accel* BVH4SubdivPatch1MB(Scene* scene); + + Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4UserGeometryMB(Scene* scene); + + Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH4InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh); + Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh); + Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh); + Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh); + Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh); + + Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant); + + private: + + DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker); + + // ============== + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // spatial scene builder + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp new file mode 100644 index 0000000000..9fe057c392 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp @@ -0,0 +1,1165 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8 + +#if defined (EMBREE_TARGET_SIMD8) + +#include "bvh8_factory.h" +#include "../bvh/bvh.h" + +#include "../geometry/curveNv.h" +#include "../geometry/curveNi.h" +#include "../geometry/curveNi_mb.h" +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/subdivpatch1.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" +#include "../common/accelinstance.h" + +namespace embree +{ + DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void); + DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); + + DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); + + DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); + + DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); + DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); + + DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + + BVH8Factory::BVH8Factory(int bfeatures, int ifeatures) + { + SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom); + + selectBuilders(bfeatures); + selectIntersectors(ifeatures); + } + + void BVH8Factory::selectBuilders(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vMBSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iMBSceneBuilderSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderFastSpatialSAH)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderFastSpatialSAH)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderFastSpatialSAH)); + + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4MeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4vMeshSAH)); + IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4iMeshSAH)); + IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelQuadMeshSAH)); + IF_ENABLED_USER (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelVirtualSAH)); + IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelInstanceSAH)); + } + + void BVH8Factory::selectIntersectors(int features) + { + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB)); + + /* select intersectors1 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1MB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1MB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Woop)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Moeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Pluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Pluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Pluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4iIntersector1Pluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4Intersector1Moeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Quad4iIntersector1Pluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector1)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector1)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector1)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector1)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Moeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridMBIntersector1Moeller)) + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Pluecker)); + +#if defined (EMBREE_RAY_PACKETS) + + /* select intersectors4 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector4Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector4Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector4Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector4Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridPluecker)); + + /* select intersectors8 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector8Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector8Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector8Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector8Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridPluecker)); + + /* select intersectors16 */ + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16HybridMB)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid)); + IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridPluecker)); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector16Chunk)); + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector16Chunk)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector16Chunk)); + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector16Chunk)); + + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridMoeller)); + IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridPluecker)); + + /* select stream intersectors */ + + SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8IntersectorStreamPacketFallback); + + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoellerNoFilter)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamMoeller)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersectorStreamPluecker)); + IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamPluecker)); + + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoellerNoFilter)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamMoeller)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamPluecker)); + IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamPluecker)); + + IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersectorStream)); + + IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersectorStream)); + +#endif + } + + Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4Hybrid(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8Hybrid(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4Hybrid(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8Hybrid(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4HybridMB(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8HybridMB(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.leafIntersector = leafIntersector; + intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1MB(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4HybridMB(); + intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8HybridMB(); + intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + default: assert(false); + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant) + { + assert(ivariant == IntersectVariant::FAST); + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4Intersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH8Triangle4Intersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH8Triangle4Intersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH8Triangle4Intersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH8Triangle4Intersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH8Triangle4Intersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH8Triangle4IntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH8Triangle4IntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; +#define ENABLE_WOOP_TEST 0 +#if ENABLE_WOOP_TEST == 0 + //assert(ivariant == IntersectVariant::ROBUST); + intersectors.intersector1 = BVH8Triangle4vIntersector1Pluecker(); +#else + intersectors.intersector1 = BVH8Triangle4vIntersector1Woop(); +#endif + +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Triangle4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8Triangle4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Triangle4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4vMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4vMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Triangle4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4vIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4_filter = BVH8Quad4vIntersector4HybridMoeller(); + intersectors.intersector4_nofilter = BVH8Quad4vIntersector4HybridMoellerNoFilter(); + intersectors.intersector8_filter = BVH8Quad4vIntersector8HybridMoeller(); + intersectors.intersector8_nofilter = BVH8Quad4vIntersector8HybridMoellerNoFilter(); + intersectors.intersector16_filter = BVH8Quad4vIntersector16HybridMoeller(); + intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter(); + intersectors.intersectorN_filter = BVH8Quad4vIntersectorStreamMoeller(); + intersectors.intersectorN_nofilter = BVH8Quad4vIntersectorStreamMoellerNoFilter(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4vIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4vIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4vIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Quad4vIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Quad4iIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8Quad4iIntersectorStreamMoeller(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4iIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8Quad4iIntersectorStreamPluecker(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + switch (ivariant) { + case IntersectVariant::FAST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + case IntersectVariant::ROBUST: + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8Quad4iMBIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + } + return Accel::Intersectors(); + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Triangle4Intersector1Moeller(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = QBVH8Quad4iIntersector1Pluecker(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualIntersector16Chunk(); + intersectors.intersectorN = BVH8VirtualIntersectorStream(); +#endif + intersectors.collider = BVH8ColliderUserGeom(); + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8VirtualMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8VirtualMBIntersector4Chunk(); + intersectors.intersector8 = BVH8VirtualMBIntersector8Chunk(); + intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceIntersector16Chunk(); + intersectors.intersectorN = BVH8InstanceIntersectorStream(); +#endif + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8InstanceMBIntersector1(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8InstanceMBIntersector4Chunk(); + intersectors.intersector8 = BVH8InstanceMBIntersector8Chunk(); + intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8v::type,scene); + Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8v(),ivariant); + Builder* builder = BVH8Curve8vBuilder_OBB_New(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Curve8iMB::type,scene); + Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(),ivariant); + Builder* builder = BVH8OBBCurve8iMBBuilder_OBB(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4Intersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah" ) builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else if (scene->device->tri_builder == "sah_presplit") builder = BVH8Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY); + else if (scene->device->tri_builder == "dynamic" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); + else if (scene->device->tri_builder == "morton" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,true); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4v::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4v>"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = BVH8Triangle4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { // FIXME: implement + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4iMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Triangle4vMB::type,scene); + Accel::Intersectors intersectors= BVH8Triangle4vMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->tri_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4vMB>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4i::type,scene); + Accel::Intersectors intersectors = QBVH8Triangle4iIntersectors(accel); + Builder* builder = BVH8QuantizedTriangle4iSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene) + { + BVH8* accel = new BVH8(Triangle4::type,scene); + Accel::Intersectors intersectors = QBVH8Triangle4Intersectors(accel); + Builder* builder = BVH8QuantizedTriangle4SceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4v::type,scene); + Accel::Intersectors intersectors = BVH8Quad4vIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4vSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break; + } + } + else if (scene->device->quad_builder == "dynamic" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); + else if (scene->device->quad_builder == "morton" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,true); + else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4v>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4iSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant); + + Builder* builder = nullptr; + if (scene->device->quad_builder_mb == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8<Quad4i>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene) + { + BVH8* accel = new BVH8(Quad4i::type,scene); + Accel::Intersectors intersectors = QBVH8Quad4iIntersectors(accel); + Builder* builder = nullptr; + if (scene->device->quad_builder == "default" ) builder = BVH8QuantizedQuad4iSceneBuilderSAH(accel,scene,0); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for QBVH8<Quad4i>"); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryIntersectors(accel); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene) + { + BVH8* accel = new BVH8(Object::type,scene); + Accel::Intersectors intersectors = BVH8UserGeometryMBIntersectors(accel); + Builder* builder = BVH8VirtualMBSceneBuilderSAH(accel,scene,0); + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + // Builder* builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + + Builder* builder = nullptr; + if (scene->device->object_builder == "default") { + switch (bvariant) { + case BuildVariant::STATIC : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);; break; + case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break; + case BuildVariant::HIGH_QUALITY: assert(false); break; + } + } + else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); + else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive) + { + BVH8* accel = new BVH8(InstancePrimitive::type,scene); + Accel::Intersectors intersectors = BVH8InstanceMBIntersectors(accel); + auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE; + Builder* builder = BVH8InstanceMBSceneBuilderSAH(accel,scene,gtype); + return new AccelInstance(accel,builder,intersectors); + } + + Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + if (ivariant == IntersectVariant::FAST) + { + intersectors.intersector1 = BVH8GridIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridMoeller(); + intersectors.intersector8 = BVH8GridIntersector8HybridMoeller(); + intersectors.intersector16 = BVH8GridIntersector16HybridMoeller(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + else /* if (ivariant == IntersectVariant::ROBUST) */ + { + intersectors.intersector1 = BVH8GridIntersector1Pluecker(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = BVH8GridIntersector4HybridPluecker(); + intersectors.intersector8 = BVH8GridIntersector8HybridPluecker(); + intersectors.intersector16 = BVH8GridIntersector16HybridPluecker(); + intersectors.intersectorN = BVH8IntersectorStreamPacketFallback(); +#endif + } + return intersectors; + } + + Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant) + { + Accel::Intersectors intersectors; + intersectors.ptr = bvh; + intersectors.intersector1 = BVH8GridMBIntersector1Moeller(); +#if defined (EMBREE_RAY_PACKETS) + intersectors.intersector4 = nullptr; + intersectors.intersector8 = nullptr; + intersectors.intersector16 = nullptr; + intersectors.intersectorN = nullptr; +#endif + return intersectors; + } + + Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder == "default") { + builder = BVH8GridSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<GridMesh>"); + + return new AccelInstance(accel,builder,intersectors); + } + + Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant) + { + BVH8* accel = new BVH8(SubGridQBVH8::type,scene); + Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant); + Builder* builder = nullptr; + if (scene->device->grid_builder_mb == "default") { + builder = BVH8GridMBSceneBuilderSAH(accel,scene,0); + } + else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8MB<GridMesh>"); + return new AccelInstance(accel,builder,intersectors); + } +} + +#endif diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h new file mode 100644 index 0000000000..b92188e7d3 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h @@ -0,0 +1,280 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_factory.h" + +namespace embree +{ + /*! BVH8 instantiations */ + class BVH8Factory : public BVHFactory + { + public: + BVH8Factory(int bfeatures, int ifeatures); + + public: + Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant); + Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v); + DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB); + + Accel* BVH8Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + Accel* BVH8QuantizedTriangle4i(Scene* scene); + Accel* BVH8QuantizedTriangle4(Scene* scene); + Accel* BVH8QuantizedQuad4i(Scene* scene); + + Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8UserGeometryMB(Scene* scene); + + Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC); + Accel* BVH8InstanceMB(Scene* scene, bool isExpensive); + + Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST); + + private: + void selectBuilders(int features); + void selectIntersectors(int features); + + private: + Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant); + + Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh); + Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh); + Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh); + Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh); + Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh); + + Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant); + Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant); + + private: + DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1); + + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller); + DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk); + + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk); + + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk); + + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller); + DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker); + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream); + + DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream); + + // SAH scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask); + + DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t); + + // SAH spatial scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t); + + // twolevel scene builders + private: + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool); + DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool); + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp new file mode 100644 index 0000000000..e832537ec5 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp @@ -0,0 +1,60 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder.h" + +namespace embree +{ + namespace isa + { + template<int N> + typename BVHN<N>::NodeRef BVHNBuilderVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRef> + (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + + template<int N> + typename BVHN<N>::NodeRef BVHNBuilderQuantizedVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRef> + (FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template<int N> + typename BVHN<N>::NodeRecordMB BVHNBuilderMblurVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) + { + auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRecordMB { + return createLeaf(prims,set,alloc); + }; + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + return BVHBuilderBinnedSAH::build<NodeRecordMB> + (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),createLeafFunc,progressFunc,prims,pinfo,settings); + } + + template struct BVHNBuilderVirtual<4>; + template struct BVHNBuilderQuantizedVirtual<4>; + template struct BVHNBuilderMblurVirtual<4>; + +#if defined(__AVX__) + template struct BVHNBuilderVirtual<8>; + template struct BVHNBuilderQuantizedVirtual<8>; + template struct BVHNBuilderMblurVirtual<8>; +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h new file mode 100644 index 0000000000..1b86bb45ad --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h @@ -0,0 +1,114 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "../builders/bvh_builder_sah.h" + +namespace embree +{ + namespace isa + { + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N> + struct BVHNBuilderVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); + virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template<int N> + struct BVHNBuilderQuantizedVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings); + virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings); + } + }; + + template<int N> + struct BVHNBuilderMblurVirtual + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + typedef FastAllocator::CachedAllocator Allocator; + + struct BVHNBuilderV { + NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange); + virtual NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0; + }; + + template<typename CreateLeafFunc> + struct BVHNBuilderT : public BVHNBuilderV + { + BVHNBuilderT (CreateLeafFunc createLeafFunc) + : createLeafFunc(createLeafFunc) {} + + NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) { + return createLeafFunc(prims,set,alloc); + } + + private: + CreateLeafFunc createLeafFunc; + }; + + template<typename CreateLeafFunc> + static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) { + return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange); + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp new file mode 100644 index 0000000000..64759c1294 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp @@ -0,0 +1,531 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_statistics.h" +#include "bvh_rotate.h" +#include "../common/profile.h" +#include "../../common/algorithms/parallel_prefix_sum.h" + +#include "../builders/primrefgen.h" +#include "../builders/bvh_builder_morton.h" + +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +#if defined(__X86_64__) || defined(__aarch64__) +# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform +#else +# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues +#endif + +namespace embree +{ + namespace isa + { + template<int N> + struct SetBVHNBounds + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + typedef typename BVH::AABBNode AABBNode; + + BVH* bvh; + __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num) + { + AABBNode* node = ref.getAABBNode(); + + BBox3fa res = empty; + for (size_t i=0; i<num; i++) { + const BBox3fa b = children[i].bounds; + res.extend(b); + node->setRef(i,children[i].ref); + node->setBounds(i,b); + } + + BBox3fx result = (BBox3fx&)res; +#if ROTATE_TREE + if (N == 4) + { + size_t n = 0; + for (size_t i=0; i<num; i++) + n += children[i].bounds.lower.a; + + if (n >= 4096) { + for (size_t i=0; i<num; i++) { + if (children[i].bounds.lower.a < 4096) { + for (int j=0; j<ROTATE_TREE; j++) + BVHNRotate<N>::rotate(node->child(i)); + node->child(i).setBarrier(); + } + } + } + result.lower.a = unsigned(n); + } +#endif + + return NodeRecord(ref,result); + } + }; + + template<int N, typename Primitive> + struct CreateMortonLeaf; + + template<int N> + struct CreateMortonLeaf<N,Triangle4> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + + Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = unsigned(current.size()); +#endif + return NodeRecord(ref,box_o); + } + + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Triangle4v> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero; + const TriangleMesh* __restrict__ mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + } + Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Triangle4i> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 v0 = zero, v1 = zero, v2 = zero; + vuint4 vgeomID = -1, vprimID = -1; + const TriangleMesh* __restrict__ const mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const TriangleMesh::Triangle& tri = mesh->triangle(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2); + vgeomID[i] = geomID_; + vprimID[i] = primID; + unsigned int int_stride = mesh->vertices0.getStride()/4; + v0[i] = tri.v[0] * int_stride; + v1[i] = tri.v[1] * int_stride; + v2[i] = tri.v[2] * int_stride; + } + + for (size_t i=items; i<4; i++) + { + vgeomID[i] = vgeomID[0]; + vprimID[i] = -1; + v0[i] = 0; + v1[i] = 0; + v2[i] = 0; + } + Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + TriangleMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Quad4v> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items<=4); + + /* allocate leaf node */ + Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,1); + + vuint4 vgeomID = -1, vprimID = -1; + Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero; + const QuadMesh* __restrict__ mesh = this->mesh; + + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + const QuadMesh::Quad& tri = mesh->quad(primID); + const Vec3fa& p0 = mesh->vertex(tri.v[0]); + const Vec3fa& p1 = mesh->vertex(tri.v[1]); + const Vec3fa& p2 = mesh->vertex(tri.v[2]); + const Vec3fa& p3 = mesh->vertex(tri.v[3]); + lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3); + vgeomID [i] = geomID_; + vprimID [i] = primID; + v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z; + v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z; + v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; + v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; + } + Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID)); + BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + QuadMesh* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,Object> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + + /* allocate leaf node */ + Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const UserGeometry* mesh = this->mesh; + + BBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + { + const unsigned int index = morton[start+i].index; + const unsigned int primID = index; + bounds.extend(mesh->bounds(primID)); + new (&accel[i]) Object(geomID_,primID); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + UserGeometry* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<int N> + struct CreateMortonLeaf<N,InstancePrimitive> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton) + : mesh(mesh), morton(morton), geomID_(geomID) {} + + __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) + { + vfloat4 lower(pos_inf); + vfloat4 upper(neg_inf); + size_t items = current.size(); + size_t start = current.begin(); + assert(items <= 1); + + /* allocate leaf node */ + InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment); + NodeRef ref = BVH::encodeLeaf((char*)accel,items); + const Instance* instance = this->mesh; + + BBox3fa bounds = empty; + for (size_t i=0; i<items; i++) + { + const unsigned int primID = morton[start+i].index; + bounds.extend(instance->bounds(primID)); + new (&accel[i]) InstancePrimitive(instance, geomID_); + } + + BBox3fx box_o = (BBox3fx&)bounds; +#if ROTATE_TREE + if (N == 4) + box_o.lower.a = current.size(); +#endif + return NodeRecord(ref,box_o); + } + private: + Instance* mesh; + BVHBuilderMorton::BuildPrim* morton; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + }; + + template<typename Mesh> + struct CalculateMeshBounds + { + __forceinline CalculateMeshBounds (Mesh* mesh) + : mesh(mesh) {} + + __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) { + return mesh->bounds(morton.index); + } + + private: + Mesh* mesh; + }; + + template<int N, typename Mesh, typename Primitive> + class BVHNMeshBuilderMorton : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecord NodeRecord; + + public: + + BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD) + : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {} + + /* build function */ + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + morton.clear(); + } + size_t numPrimitives = mesh->size(); + numPreviousPrimitives = numPrimitives; + + /* skip build for empty scene */ + if (numPrimitives == 0) { + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* preallocate arrays */ + morton.resize(numPrimitives); + size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim); + bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes + bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated); + + /* create morton code array */ + BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes); + size_t numPrimitivesGen = createMortonCodeArray<Mesh>(mesh,morton,bvh->scene->progressInterface); + + /* create BVH */ + SetBVHNBounds<N> setBounds(bvh); + CreateMortonLeaf<N,Primitive> createLeaf(mesh,geomID_,morton.data()); + CalculateMeshBounds<Mesh> calculateBounds(mesh); + auto root = BVHBuilderMorton::build<NodeRecord>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create(), + setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface, + morton.data(),dest,numPrimitivesGen,settings); + + bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives); + +#if ROTATE_TREE + if (N == 4) + { + for (int i=0; i<ROTATE_TREE; i++) + BVHNRotate<N>::rotate(bvh->root); + bvh->clearBarrier(bvh->root); + } +#endif + + /* clear temporary data for static geometry */ + if (bvh->scene->isStaticAccel()) { + morton.clear(); + } + bvh->cleanup(); + } + + void clear() { + morton.clear(); + } + + private: + BVH* bvh; + Mesh* mesh; + mvector<BVHBuilderMorton::BuildPrim> morton; + BVHBuilderMorton::Settings settings; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + }; + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); } + Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); } + Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); } +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,gtype,geomID,1,BVH4::maxLeafBlocks); } +#endif +#endif + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp new file mode 100644 index 0000000000..cf5b2eb47f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp @@ -0,0 +1,640 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + +#define PROFILE 0 +#define PROFILE_RUNS 20 + +namespace embree +{ + namespace isa + { + template<int N, typename Primitive> + struct CreateLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + + template<int N, typename Primitive> + struct CreateLeafQuantized + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N, typename Primitive> + struct BVHNBuilderSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector<PrimRef> prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max (); + bool primrefarrayalloc; + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, + const Geometry::GTypeMask gtype, bool primrefarrayalloc = false) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), + settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {} + + BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + /* create primref array */ + if (primrefarrayalloc) { + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + } + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + prims.resize(numPrimitives); + + PrimInfo pinfo = mesh ? + createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeaf<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + +#if PROFILE + }); +#endif + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + template<int N, typename Primitive> + struct BVHNBuilderSAHQuantized : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + Geometry* mesh; + mvector<PrimRef> prims; + GeneralBVHBuilder::Settings settings; + Geometry::GTypeMask gtype_; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {} + + BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false); + numPreviousPrimitives = numPrimitives; + if (numPrimitives == 0) { + prims.clear(); + bvh->clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + /* create primref array */ + prims.resize(numPrimitives); + PrimInfo pinfo = mesh ? + createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) : + createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* call BVH builder */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + NodeRef root = BVHNBuilderQuantizedVirtual<N>::build(&bvh->alloc,CreateLeafQuantized<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!! +#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + + template<int N, typename Primitive> + struct CreateLeafGrid + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); //Primitive::blocks(n); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridQBVHN<N>* accel = (SubGridQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,num_geomIDs); + + for (size_t g=0;g<num_geomIDs;g++) + { + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData& sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + bounds[pos] = prims[start+i].bounds(); + pos++; + } + assert(pos <= N); + new (&accel[g]) SubGridQBVHN<N>(x,y,primID,bounds,geomIDs[g],pos); + } + + return node; + } + + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + + + template<int N> + struct BVHNBuilderSAHGrid : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + + BVH* bvh; + Scene* scene; + GridMesh* mesh; + mvector<PrimRef> prims; + mvector<SubGridBuildData> sgrids; + GeneralBVHBuilder::Settings settings; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {} + + BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {} + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* if we use the primrefarray for allocations we have to take it back from the BVH */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.unshare(prims); + + const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false); + numPreviousPrimitives = numGridPrimitives; + + PrimInfo pinfo(empty); + size_t numPrimitives = 0; + + if (!mesh) + { + /* first run to get #primitives */ + + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator<GridMesh,false> iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j); + if (!mesh->valid(j)) continue; + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + assert(pinfo.size() == numPrimitives); + } + else + { + ParallelPrefixSumState<PrimInfo> pstate; + /* iterate over all grids in a single mesh */ + pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo + { + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,geomID_,unsigned(j)); + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + numPrimitives = pinfo.size(); + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo + { + + size_t p_index = base.size(); + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j)) continue; + const GridMesh::Grid &g = mesh->grid(j); + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,geomID_,unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + } + + /* no primitives */ + if (numPrimitives == 0) { + bvh->clear(); + prims.clear(); + sgrids.clear(); + return; + } + + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); + + /* create primref array */ + settings.primrefarrayalloc = numPrimitives/1000; + if (settings.primrefarrayalloc < 1000) + settings.primrefarrayalloc = inf; + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + /* initialize allocator */ + const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes); + + /* pinfo might has zero size due to invalid geometry */ + if (unlikely(pinfo.size() == 0)) + { + bvh->clear(); + sgrids.clear(); + prims.clear(); + return; + } + + /* call BVH builder */ + NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafGrid<N,SubGridQBVHN<N>>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings); + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary array */ + sgrids.clear(); + + /* if we allocated using the primrefarray we have to keep it alive */ + if (settings.primrefarrayalloc != size_t(inf)) + bvh->alloc.share(prims); + + /* for static geometries we can do some cleanups */ + else if (scene && scene->isStaticAccel()) { + prims.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH4Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + + + Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); } + + Builder* BVH8Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); } + Builder* BVH8QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + Builder* BVH8QuantizedTriangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); } + +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH4QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH4QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); } + Builder* BVH8QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); } + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); } + +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + + Builder* BVH4VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type); + } +#if defined(__AVX__) + + Builder* BVH8VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_min_leaf_size; + int maxLeafSize = scene->device->object_accel_max_leaf_size; + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type); + } + + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype); + } +#if defined(__AVX__) + Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { + return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); } + Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct + +#if defined(__AVX__) + Builder* BVH8GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); } + Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct +#endif +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp new file mode 100644 index 0000000000..9c01553ec6 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp @@ -0,0 +1,705 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" +#include "../builders/bvh_builder_msmblur.h" + +#include "../builders/primrefgen.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH +#include "../../common/algorithms/parallel_for_for.h" +#include "../../common/algorithms/parallel_for_for_prefix_sum.h" + + +namespace embree +{ + namespace isa + { + +#if 0 + template<int N, typename Primitive> + struct CreateMBlurLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(set.size()); + size_t start = set.begin(); + for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + NodeRef node = bvh->encodeLeaf((char*)accel,items); + + LBBox3fa allBounds = empty; + for (size_t i=0; i<items; i++) + allBounds.extend(accel[i].fillMB(prims, start, set.end(), bvh->scene, time)); + + return NodeRecordMB(node,allBounds); + } + + BVH* bvh; + PrimRef* prims; + size_t time; + }; +#endif + + template<int N, typename Mesh, typename Primitive> + struct CreateMSMBlurLeaf + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + size_t items = Primitive::blocks(current.prims.size()); + size_t start = current.prims.begin(); + size_t end = current.prims.end(); + for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteNodeAlignment); + NodeRef node = bvh->encodeLeaf((char*)accel,items); + LBBox3fa allBounds = empty; + for (size_t i=0; i<items; i++) + allBounds.extend(accel[i].fillMB(current.prims.prims->data(), start, current.prims.end(), bvh->scene, current.prims.time_range)); + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + BVH* bvh; + }; + + /* Motion blur BVH with 4D nodes and internal time splits */ + template<int N, typename Mesh, typename Primitive> + struct BVHNBuilderMBlurSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + typedef typename BVHN<N>::NodeRecordMB NodeRecordMB; + typedef typename BVHN<N>::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + const Geometry::GTypeMask gtype_; + + BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {} + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype_,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH"); + +#if PROFILE + profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) { +#endif + + //const size_t numTimeSteps = scene->getNumTimeSteps<typename Mesh::type_t,true>(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + + /*if (numTimeSegments == 1) + buildSingleSegment(numPrimitives); + else*/ + buildMultiSegment(numPrimitives); + +#if PROFILE + }); +#endif + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 // No longer compatible when time_ranges are present for geometries. Would have to create temporal nodes sometimes, and put only a single geometry into leaf. + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRef> prims(scene->device,numPrimitives); + const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build<NodeRecordMB> + (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(), + CreateMBlurLeaf<N,Primitive>(bvh,prims.data(),0),bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRefMB> prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = Primitive::singleTimeSegment; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device, + RecalculatePrimRef<Mesh>(scene), + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeaf<N,Mesh,Primitive>(bvh), + bvh->scene->progressInterface, + settings); + + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + struct GridRecalculatePrimRef + { + Scene* scene; + const SubGridBuildData * const sgrids; + + __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids) + : scene(scene), sgrids(sgrids) {} + + __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const + { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get<GridMesh>(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + const LBBox3fa lbounds = mesh->linearBounds(mesh->grid(primID),x,y,time_range); + const unsigned num_time_segments = mesh->numTimeSegments(); + const range<int> tbounds = mesh->timeSegmentRange(time_range); + return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, num_time_segments, geomID, buildID); + } + + __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { + const unsigned int geomID = prim.geomID(); + const GridMesh* mesh = scene->get<GridMesh>(geomID); + const unsigned int buildID = prim.primID(); + const SubGridBuildData &subgrid = sgrids[buildID]; + const unsigned int primID = subgrid.primID; + const size_t x = subgrid.x(); + const size_t y = subgrid.y(); + return mesh->linearBounds(mesh->grid(primID),x,y,time_range); + } + + }; + + template<int N> + struct CreateMSMBlurLeafGrid + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB4D NodeRecordMB4D; + + __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = current.prims.size(); + const size_t start = current.prims.begin(); + + const PrimRefMB* prims = current.prims.prims->data(); + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;g<num_geomIDs;g++) + { + const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]); + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + const size_t x = sgrid_bd.x(); + const size_t y = sgrid_bd.y(); + LBBox3fa newBounds = mesh->linearBounds(mesh->grid(sgrid_bd.primID),x,y,current.prims.time_range); + allBounds.extend(newBounds); + bounds0[pos] = newBounds.bounds0; + bounds1[pos] = newBounds.bounds1; + pos++; + } + assert(pos <= N); + new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos); + } + return NodeRecordMB4D(node,allBounds,current.prims.time_range); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; + +#if 0 + template<int N> + struct CreateLeafGridMB + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::NodeRecordMB NodeRecordMB; + + __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) + : scene(scene), bvh(bvh), sgrids(sgrids) {} + + __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + const size_t items = set.size(); + const size_t start = set.begin(); + + /* collect all subsets with unique geomIDs */ + assert(items <= N); + unsigned int geomIDs[N]; + unsigned int num_geomIDs = 1; + geomIDs[0] = prims[start].geomID(); + + for (size_t i=1;i<items;i++) + { + bool found = false; + const unsigned int new_geomID = prims[start+i].geomID(); + for (size_t j=0;j<num_geomIDs;j++) + if (new_geomID == geomIDs[j]) + { found = true; break; } + if (!found) + geomIDs[num_geomIDs++] = new_geomID; + } + + /* allocate all leaf memory in one single block */ + SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment); + typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs); + + LBBox3fa allBounds = empty; + + for (size_t g=0;g<num_geomIDs;g++) + { + const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]); + + unsigned int x[N]; + unsigned int y[N]; + unsigned int primID[N]; + BBox3fa bounds0[N]; + BBox3fa bounds1[N]; + unsigned int pos = 0; + for (size_t i=0;i<items;i++) + { + if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue; + + const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()]; + x[pos] = sgrid_bd.sx; + y[pos] = sgrid_bd.sy; + primID[pos] = sgrid_bd.primID; + const size_t x = sgrid_bd.x(); + const size_t y = sgrid_bd.y(); + bool MAYBE_UNUSED valid0 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]); + bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]); + assert(valid0); + assert(valid1); + allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos])); + pos++; + } + new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos); + } + return NodeRecordMB(node,allBounds); + } + + Scene *scene; + BVH* bvh; + const SubGridBuildData * const sgrids; + }; +#endif + + + /* Motion blur BVH with 4D nodes and internal time splits */ + template<int N> + struct BVHNBuilderMBlurSAHGrid : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVHN<N>::NodeRef NodeRef; + typedef typename BVHN<N>::NodeRecordMB NodeRecordMB; + typedef typename BVHN<N>::AABBNodeMB AABBNodeMB; + + BVH* bvh; + Scene* scene; + const size_t sahBlockSize; + const float intCost; + const size_t minLeafSize; + const size_t maxLeafSize; + mvector<SubGridBuildData> sgrids; + + + BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize) + : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {} + + + PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState<PrimInfo> pstate; + Scene::Iterator<GridMesh,true> iter(scene); + + pstate.init(iter,size_t(1024)); + + /* iterate over all meshes in the scene */ + PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j,range<size_t>(0,1))) continue; + BBox3fa bounds = empty; + const PrimRef prim(bounds,unsigned(geomID),unsigned(j)); + pinfo.add_center2(prim,mesh->getNumSubGrids(j)); + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + size_t numPrimitives = pinfo.size(); + if (numPrimitives == 0) return pinfo; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo { + + k = base.size(); + size_t p_index = k; + PrimInfo pinfo(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + const GridMesh::Grid &g = mesh->grid(j); + if (!mesh->valid(j,range<size_t>(0,1))) continue; + + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + BBox3fa bounds = empty; + if (!mesh->buildBounds(g,x,y,itime,bounds)) continue; // get bounds of subgrid + const PrimRef prim(bounds,unsigned(geomID),unsigned(p_index)); + pinfo.add_center2(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfo; + }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); }); + + assert(pinfo.size() == numPrimitives); + return pinfo; + } + + PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f)) + { + /* first run to get #primitives */ + ParallelForForPrefixSumState<PrimInfoMB> pstate; + Scene::Iterator<GridMesh,true> iter(scene); + + pstate.init(iter,size_t(1024)); + /* iterate over all meshes in the scene */ + PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t /*geomID*/) -> PrimInfoMB { + + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue; + LBBox3fa bounds(empty); + PrimInfoMB gridMB(0,mesh->getNumSubGrids(j)); + pinfoMB.merge(gridMB); + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + size_t numPrimitives = pinfoMB.size(); + if (numPrimitives == 0) return pinfoMB; + + /* resize arrays */ + sgrids.resize(numPrimitives); + prims.resize(numPrimitives); + /* second run to fill primrefs and SubGridBuildData arrays */ + pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB { + + k = base.size(); + size_t p_index = k; + PrimInfoMB pinfoMB(empty); + for (size_t j=r.begin(); j<r.end(); j++) + { + if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue; + const GridMesh::Grid &g = mesh->grid(j); + + for (unsigned int y=0; y<g.resY-1u; y+=2) + for (unsigned int x=0; x<g.resX-1u; x+=2) + { + const PrimRefMB prim(mesh->linearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(p_index)); + pinfoMB.add_primref(prim); + sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j)); + prims[p_index++] = prim; + } + } + return pinfoMB; + }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); }); + + assert(pinfoMB.size() == numPrimitives); + pinfoMB.time_range = t0t1; + return pinfoMB; + } + + void build() + { + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true); + if (numPrimitives == 0) { bvh->clear(); return; } + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid"); + + //const size_t numTimeSteps = scene->getNumTimeSteps<GridMesh,true>(); + //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1); + //if (numTimeSegments == 1) + // buildSingleSegment(numPrimitives); + //else + buildMultiSegment(numPrimitives); + + /* clear temporary data for static geometry */ + bvh->cleanup(); + bvh->postBuild(t0); + } + +#if 0 + void buildSingleSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRef> prims(scene->device,numPrimitives); + const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0); + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N); + //TODO: check leaf_bytes + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = BVHBuilderBinnedSAH::build<NodeRecordMB> + (typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB::Create(), + typename BVH::AABBNodeMB::Set(), + CreateLeafGridMB<N>(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + prims.data(),pinfo,settings); + + bvh->set(root.ref,root.lbounds,pinfo.size()); + } +#endif + + void buildMultiSegment(size_t numPrimitives) + { + /* create primref array */ + mvector<PrimRefMB> prims(scene->device,numPrimitives); + PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface); + + /* early out if no valid primitives */ + if (pinfo.size() == 0) { bvh->clear(); return; } + + + + GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data()); + + /* estimate acceleration structure size */ + const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N); + //FIXME: check leaf_bytes + //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN<N>)); + const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>)); + + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + /* settings for BVH build */ + BVHBuilderMSMBlur::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxDepth; + settings.logBlockSize = bsr(sahBlockSize); + settings.minLeafSize = min(minLeafSize,maxLeafSize); + settings.maxLeafSize = maxLeafSize; + settings.travCost = travCost; + settings.intCost = intCost; + settings.singleLeafTimeSegment = false; + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + /* build hierarchy */ + auto root = + BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device, + recalculatePrimRef, + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNodeMB4D::Create(), + typename BVH::AABBNodeMB4D::Set(), + CreateMSMBlurLeafGrid<N>(scene,bvh,sgrids.data()), + bvh->scene->progressInterface, + settings); + bvh->set(root.ref,root.lbounds,pinfo.num_time_segments); + } + + void clear() { + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#if defined(__AVX__) + Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } + Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#if defined(__AVX__) + Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#if defined(__AVX__) + Builder* BVH8VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { + int minLeafSize = scene->device->object_accel_mb_min_leaf_size; + int maxLeafSize = scene->device->object_accel_mb_max_leaf_size; + return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY); + } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); } +#if defined(__AVX__) + Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_GRID) + Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); } +#if defined(__AVX__) + Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); } +#endif +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp new file mode 100644 index 0000000000..285b38c39d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp @@ -0,0 +1,201 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh.h" +#include "bvh_builder.h" + +#include "../builders/primrefgen.h" +#include "../builders/primrefgen_presplit.h" +#include "../builders/splitter.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglev_mb.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" +#include "../geometry/subgrid.h" + +#include "../common/state.h" + +namespace embree +{ + namespace isa + { + template<int N, typename Primitive> + struct CreateLeafSpatial + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + + __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {} + + __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const + { + size_t n = set.size(); + size_t items = Primitive::blocks(n); + size_t start = set.begin(); + Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items); + for (size_t i=0; i<items; i++) { + accel[i].fill(prims,start,set.end(),bvh->scene); + } + return node; + } + + BVH* bvh; + }; + + template<int N, typename Mesh, typename Primitive, typename Splitter> + struct BVHNBuilderFastSpatialSAH : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + BVH* bvh; + Scene* scene; + Mesh* mesh; + mvector<PrimRef> prims0; + GeneralBVHBuilder::Settings settings; + const float splitFactor; + unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); + unsigned int numPreviousPrimitives = 0; + + BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications) {} + + BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode) + : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), + splitFactor(scene->device->max_spatial_split_replications), geomID_(geomID) {} + + // FIXME: shrink bvh->alloc in destructor here and in other builders too + + void build() + { + /* we reset the allocator when the mesh size changed */ + if (mesh && mesh->numPrimitives != numPreviousPrimitives) { + bvh->alloc.clear(); + } + + /* skip build for empty scene */ + const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false); + numPreviousPrimitives = numOriginalPrimitives; + if (numOriginalPrimitives == 0) { + prims0.clear(); + bvh->clear(); + return; + } + + const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID<Mesh,false>(); + const bool usePreSplits = scene->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))); + double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH")); + + /* create primref array */ + const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives)); + prims0.resize(numSplitPrimitives); + + /* enable os_malloc for two level build */ + if (mesh) + bvh->alloc.setOSallocation(true); + + NodeRef root(0); + PrimInfo pinfo; + + + if (likely(usePreSplits)) + { + /* spatial presplit SAH BVH builder */ + pinfo = mesh ? + createPrimRefArray_presplit<Mesh,Splitter>(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) : + createPrimRefArray_presplit<Mesh,Splitter>(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafSpatial<N,Primitive>(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings); + } + else + { + /* standard spatial split SAH BVH builder */ + pinfo = mesh ? + createPrimRefArray(mesh,geomID_,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface) : + createPrimRefArray(scene,Mesh::geom_type,false,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface); + + Splitter splitter(scene); + + const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N); + const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes); + + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + + /* call BVH builder */ + root = BVHBuilderBinnedFastSpatialSAH::build<NodeRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + CreateLeafSpatial<N,Primitive>(bvh), + splitter, + bvh->scene->progressInterface, + prims0.data(), + numSplitPrimitives, + pinfo,settings); + + /* ==================== */ + } + + bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size()); + bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f)); + + /* clear temporary data for static geometry */ + if (scene && scene->isStaticAccel()) { + prims0.clear(); + } + bvh->cleanup(); + bvh->postBuild(t0); + } + + void clear() { + prims0.clear(); + } + }; + + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + /************************************************************************************/ + + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + + Builder* BVH4Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } + Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); } +#endif + +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp new file mode 100644 index 0000000000..1a78f347ac --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp @@ -0,0 +1,377 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_builder_twolevel.h" +#include "bvh_statistics.h" +#include "../builders/bvh_builder_sah.h" +#include "../common/scene_line_segments.h" +#include "../common/scene_triangle_mesh.h" +#include "../common/scene_quad_mesh.h" + +#define PROFILE 0 + +namespace embree +{ + namespace isa + { + template<int N, typename Mesh, typename Primitive> + BVHNBuilderTwoLevel<N,Mesh,Primitive>::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold) + : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {} + + template<int N, typename Mesh, typename Primitive> + BVHNBuilderTwoLevel<N,Mesh,Primitive>::~BVHNBuilderTwoLevel () { + } + + // =========================================================================== + // =========================================================================== + // =========================================================================== + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::build() + { + /* delete some objects */ + size_t num = scene->size(); + if (num < bvh->objects.size()) { + parallel_for(num, bvh->objects.size(), [&] (const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + builders[i].reset(); + delete bvh->objects[i]; bvh->objects[i] = nullptr; + } + }); + } + +#if PROFILE + while(1) +#endif + { + /* reset memory allocator */ + bvh->alloc.reset(); + + /* skip build for empty scene */ + const size_t numPrimitives = scene->getNumPrimitives(gtype,false); + + if (numPrimitives == 0) { + prims.resize(0); + bvh->set(BVH::emptyNode,empty,0); + return; + } + + /* calculate the size of the entire BVH */ + const size_t numLeafBlocks = Primitive::blocks(numPrimitives); + const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N; + const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive)); + bvh->alloc.init_estimate(node_bytes+leaf_bytes); + + double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel"); + + /* resize object array if scene got larger */ + if (bvh->objects.size() < num) bvh->objects.resize(num); + if (builders.size() < num) builders.resize(num); + resizeRefsList (); + nextRef.store(0); + + /* create acceleration structures */ + parallel_for(size_t(0), num, [&] (const range<size_t>& r) + { + for (size_t objectID=r.begin(); objectID<r.end(); objectID++) + { + Mesh* mesh = scene->getSafe<Mesh>(objectID); + + /* ignore meshes we do not support */ + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + + if (isSmallGeometry(mesh)) { + setupSmallBuildRefBuilder (objectID, mesh); + } else { + setupLargeBuildRefBuilder (objectID, mesh); + } + } + }); + + /* parallel build of acceleration structures */ + parallel_for(size_t(0), num, [&] (const range<size_t>& r) + { + for (size_t objectID=r.begin(); objectID<r.end(); objectID++) + { + /* ignore if no triangle mesh or not enabled */ + Mesh* mesh = scene->getSafe<Mesh>(objectID); + if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) + continue; + + builders[objectID]->attachBuildRefs (this); + } + }); + + +#if PROFILE + double d0 = getSeconds(); +#endif + /* fast path for single geometry scenes */ + if (nextRef == 1) { + bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives); + } + + else + { + /* open all large nodes */ + refs.resize(nextRef); + + /* this probably needs some more tuning */ + const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR)); + +#if !ENABLE_DIRECT_SAH_MERGE_BUILDER + +#if ENABLE_OPEN_SEQUENTIAL + open_sequential(extSize); +#endif + /* compute PrimRefs */ + prims.resize(refs.size()); +#endif + +#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL + tbb::task_arena limited(min(32,(int)TaskScheduler::threadCount())); + limited.execute([&] +#endif + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + pinfo.add_center2(refs[i]); + } + return pinfo; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); + +#else + const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo { + + PrimInfo pinfo(empty); + for (size_t i=r.begin(); i<r.end(); i++) { + pinfo.add_center2(refs[i]); + prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node); + } + return pinfo; + }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); }); +#endif + + /* skip if all objects where empty */ + if (pinfo.size() == 0) + bvh->set(BVH::emptyNode,empty,0); + + /* otherwise build toplevel hierarchy */ + else + { + /* settings for BVH build */ + GeneralBVHBuilder::Settings settings; + settings.branchingFactor = N; + settings.maxDepth = BVH::maxBuildDepthLeaf; + settings.logBlockSize = bsr(N); + settings.minLeafSize = 1; + settings.maxLeafSize = 1; + settings.travCost = 1.0f; + settings.intCost = 1.0f; + settings.singleThreadThreshold = singleThreadThreshold; + +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + + refs.resize(extSize); + + NodeRef root = BVHBuilderBinnedOpenMergeSAH::build<NodeRef,BuildRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const BuildRef* refs, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) refs[range.begin()].node; + }, + [&] (BuildRef &bref, BuildRef *refs) -> size_t { + return openBuildRef(bref,refs); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + refs.data(),extSize,pinfo,settings); +#else + NodeRef root = BVHBuilderBinnedSAH::build<NodeRef>( + typename BVH::CreateAlloc(bvh), + typename BVH::AABBNode::Create2(), + typename BVH::AABBNode::Set2(), + + [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef { + assert(range.size() == 1); + return (NodeRef) prims[range.begin()].ID(); + }, + [&] (size_t dn) { bvh->scene->progressMonitor(0); }, + prims.data(),pinfo,settings); +#endif + + + bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives); + } + } +#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL + ); +#endif + + } + + bvh->alloc.cleanup(); + bvh->postBuild(t0); +#if PROFILE + double d1 = getSeconds(); + std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl; +#endif + } + + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::deleteGeometry(size_t geomID) + { + if (geomID >= bvh->objects.size()) return; + if (builders[geomID]) builders[geomID].reset(); + delete bvh->objects [geomID]; bvh->objects [geomID] = nullptr; + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::clear() + { + for (size_t i=0; i<bvh->objects.size(); i++) + if (bvh->objects[i]) bvh->objects[i]->clear(); + + for (size_t i=0; i<builders.size(); i++) + if (builders[i]) builders[i].reset(); + + refs.clear(); + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::open_sequential(const size_t extSize) + { + if (refs.size() == 0) + return; + + refs.reserve(extSize); + +#if 1 + for (size_t i=0;i<refs.size();i++) + { + NodeRef ref = refs[i].node; + if (ref.isAABBNode()) + BVH::prefetch(ref); + } +#endif + + std::make_heap(refs.begin(),refs.end()); + while (refs.size()+N-1 <= extSize) + { + std::pop_heap (refs.begin(),refs.end()); + NodeRef ref = refs.back().node; + if (ref.isLeaf()) break; + refs.pop_back(); + + AABBNode* node = ref.getAABBNode(); + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVH::emptyNode) continue; + refs.push_back(BuildRef(node->bounds(i),node->child(i))); + +#if 1 + NodeRef ref_pre = node->child(i); + if (ref_pre.isAABBNode()) + ref_pre.prefetch(); +#endif + std::push_heap (refs.begin(),refs.end()); + } + } + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/) + { + if (builders[objectID] == nullptr || // new mesh + dynamic_cast<RefBuilderSmall*>(builders[objectID].get()) == nullptr) // size change resulted in large->small change + { + builders[objectID].reset (new RefBuilderSmall(objectID)); + } + } + + template<int N, typename Mesh, typename Primitive> + void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh) + { + if (bvh->objects[objectID] == nullptr || // new mesh + builders[objectID]->meshQualityChanged (mesh->quality) || // changed build quality + dynamic_cast<RefBuilderLarge*>(builders[objectID].get()) == nullptr) // size change resulted in small->large change + { + Builder* builder = nullptr; + delete bvh->objects[objectID]; + createMeshAccel(objectID, builder); + builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality)); + } + } + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#if defined(__AVX__) +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } + Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder); + } +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) { + return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder); + } +#endif + +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h new file mode 100644 index 0000000000..8f57c3b406 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h @@ -0,0 +1,263 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <type_traits> + +#include "bvh_builder_twolevel_internal.h" +#include "bvh.h" +#include "../common/primref.h" +#include "../builders/priminfo.h" +#include "../builders/primrefgen.h" + +/* new open/merge builder */ +#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1 +#define ENABLE_OPEN_SEQUENTIAL 0 +#define SPLIT_MEMORY_RESERVE_FACTOR 1000 +#define SPLIT_MEMORY_RESERVE_SCALE 2 +#define SPLIT_MIN_EXT_SPACE 1000 + +namespace embree +{ + namespace isa + { + template<int N, typename Mesh, typename Primitive> + class BVHNBuilderTwoLevel : public Builder + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + __forceinline static bool isSmallGeometry(Mesh* mesh) { + return mesh->size() <= 4; + } + + public: + + typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder); + + struct BuildRef : public PrimRef + { + public: + __forceinline BuildRef () {} + + __forceinline BuildRef (const BBox3fa& bounds, NodeRef node) + : PrimRef(bounds,(size_t)node), node(node) + { + if (node.isLeaf()) + bounds_area = 0.0f; + else + bounds_area = area(this->bounds()); + } + + /* used by the open/merge bvh builder */ + __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives) + : PrimRef(bounds,geomID,numPrimitives), node(node) + { + /* important for relative buildref ordering */ + if (node.isLeaf()) + bounds_area = 0.0f; + else + bounds_area = area(this->bounds()); + } + + __forceinline size_t size() const { + return primID(); + } + + friend bool operator< (const BuildRef& a, const BuildRef& b) { + return a.bounds_area < b.bounds_area; + } + + friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) { + return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }"; + } + + __forceinline unsigned int numPrimitives() const { return primID(); } + + public: + NodeRef node; + float bounds_area; + }; + + + __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) { + if (bref.node.isLeaf()) + { + refs[0] = bref; + return 1; + } + NodeRef ref = bref.node; + unsigned int geomID = bref.geomID(); + unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1); + AABBNode* node = ref.getAABBNode(); + size_t n = 0; + for (size_t i=0; i<N; i++) { + if (node->child(i) == BVH::emptyNode) continue; + refs[i] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims); + n++; + } + assert(n > 1); + return n; + } + + /*! Constructor. */ + BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD); + + /*! Destructor */ + ~BVHNBuilderTwoLevel (); + + /*! builder entry point */ + void build(); + void deleteGeometry(size_t geomID); + void clear(); + + void open_sequential(const size_t extSize); + + private: + + class RefBuilderBase { + public: + virtual ~RefBuilderBase () {} + virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0; + virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0; + }; + + class RefBuilderSmall : public RefBuilderBase { + public: + + RefBuilderSmall (size_t objectID) + : objectID_ (objectID) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) { + + Mesh* mesh = topBuilder->scene->template getSafe<Mesh>(objectID_); + size_t meshSize = mesh->size(); + assert(isSmallGeometry(mesh)); + + mvector<PrimRef> prefs(topBuilder->scene->device, meshSize); + auto pinfo = createPrimRefArray(mesh,objectID_,prefs,topBuilder->bvh->scene->progressInterface); + + size_t begin=0; + while (begin < pinfo.size()) + { + Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment); + typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1); + accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene); + + /* create build primitive */ +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node); +#endif + } + assert(begin == pinfo.size()); + } + + bool meshQualityChanged (RTCBuildQuality /*currQuality*/) { + return false; + } + + size_t objectID_; + }; + + class RefBuilderLarge : public RefBuilderBase { + public: + + RefBuilderLarge (size_t objectID, const Ref<Builder>& builder, RTCBuildQuality quality) + : objectID_ (objectID), builder_ (builder), quality_ (quality) {} + + void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) + { + BVH* object = topBuilder->getBVH(objectID_); assert(object); + + /* build object if it got modified */ + if (topBuilder->isGeometryModified(objectID_)) + builder_->build(); + + /* create build primitive */ + if (!object->getBounds().empty()) + { +#if ENABLE_DIRECT_SAH_MERGE_BUILDER + Mesh* mesh = topBuilder->getMesh(objectID_); + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size()); +#else + topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root); +#endif + } + } + + bool meshQualityChanged (RTCBuildQuality currQuality) { + return currQuality != quality_; + } + + private: + size_t objectID_; + Ref<Builder> builder_; + RTCBuildQuality quality_; + }; + + void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh); + void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh); + + BVH* getBVH (size_t objectID) { + return this->bvh->objects[objectID]; + } + Mesh* getMesh (size_t objectID) { + return this->scene->template getSafe<Mesh>(objectID); + } + bool isGeometryModified (size_t objectID) { + return this->scene->isGeometryModified(objectID); + } + + void resizeRefsList () + { + size_t num = parallel_reduce (size_t(0), scene->size(), size_t(0), + [this](const range<size_t>& r)->size_t { + size_t c = 0; + for (auto i=r.begin(); i<r.end(); ++i) { + Mesh* mesh = scene->getSafe<Mesh>(i); + if (mesh == nullptr || mesh->numTimeSteps != 1) + continue; + size_t meshSize = mesh->size(); + c += isSmallGeometry(mesh) ? Primitive::blocks(meshSize) : 1; + } + return c; + }, + std::plus<size_t>() + ); + + if (refs.size() < num) { + refs.resize(num); + } + } + + void createMeshAccel (size_t geomID, Builder*& builder) + { + bvh->objects[geomID] = new BVH(Primitive::type,scene); + BVH* accel = bvh->objects[geomID]; + auto mesh = scene->getSafe<Mesh>(geomID); + if (nullptr == mesh) { + throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type"); + return; + } + + __internal_two_level_builder__::MeshBuilder<N,Mesh,Primitive>()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder); + } + + using BuilderList = std::vector<std::unique_ptr<RefBuilderBase>>; + + BuilderList builders; + BVH* bvh; + Scene* scene; + mvector<BuildRef> refs; + mvector<PrimRef> prims; + std::atomic<int> nextRef; + const size_t singleThreadThreshold; + Geometry::GTypeMask gtype; + bool useMortonBuilder_ = false; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h new file mode 100644 index 0000000000..1c1ae8d6a7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h @@ -0,0 +1,267 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/quadi.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); + DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) + + namespace isa + { + + namespace __internal_two_level_builder__ { + + template<int N, typename Mesh, typename Primitive> + struct MortonBuilder {}; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<4,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,TriangleMesh,Triangle4i> { + MortonBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,QuadMesh,Quad4v> { + MortonBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,UserGeometry,Object> { + MortonBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);} + }; + template<> + struct MortonBuilder<8,Instance,InstancePrimitive> { + MortonBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct SAHBuilder {}; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<4,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,TriangleMesh,Triangle4i> { + SAHBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,QuadMesh,Quad4v> { + SAHBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,UserGeometry,Object> { + SAHBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);} + }; + template<> + struct SAHBuilder<8,Instance,InstancePrimitive> { + SAHBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct RefitBuilder {}; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<4,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,TriangleMesh,Triangle4i> { + RefitBuilder () {} + Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,QuadMesh,Quad4v> { + RefitBuilder () {} + Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,UserGeometry,Object> { + RefitBuilder () {} + Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);} + }; + template<> + struct RefitBuilder<8,Instance,InstancePrimitive> { + RefitBuilder () {} + Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);} + }; + + template<int N, typename Mesh, typename Primitive> + struct MeshBuilder { + MeshBuilder () {} + void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) { + if(useMortonBuilder) { + builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); + return; + } + switch (mesh->quality) { + case RTC_BUILD_QUALITY_LOW: builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_MEDIUM: + case RTC_BUILD_QUALITY_HIGH: builder = SAHBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + case RTC_BUILD_QUALITY_REFIT: builder = RefitBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break; + default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality"); + } + } + }; + } + } +}
\ No newline at end of file diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp new file mode 100644 index 0000000000..a27be8bae8 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp @@ -0,0 +1,375 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_collider.h" +#include "../geometry/triangle_triangle_intersector.h" + +namespace embree +{ + namespace isa + { +#define CSTAT(x) + + size_t parallel_depth_threshold = 3; + CSTAT(std::atomic<size_t> bvh_collide_traversal_steps(0)); + CSTAT(std::atomic<size_t> bvh_collide_leaf_pairs(0)); + CSTAT(std::atomic<size_t> bvh_collide_leaf_iterations(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections1(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections2(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections3(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections4(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections5(0)); + CSTAT(std::atomic<size_t> bvh_collide_prim_intersections(0)); + + struct Collision + { + __forceinline Collision() {} + + __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1) + : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {} + + unsigned geomID0; + unsigned primID0; + unsigned geomID1; + unsigned primID1; + }; + + template<int N> + __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN<N>::AABBNode& node1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),node1.lower_x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),node1.lower_y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),node1.lower_z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),node1.upper_x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),node1.upper_y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),node1.upper_z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template<int N> + __forceinline size_t overlap(const BBox3fa& box0, const BBox<Vec3<vfloat<N>>>& box1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),box1.lower.x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),box1.lower.y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),box1.lower.z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),box1.upper.x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),box1.upper.y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + template<int N> + __forceinline size_t overlap(const BBox<Vec3<vfloat<N>>>& box0, size_t i, const BBox<Vec3<vfloat<N>>>& box1) + { + const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x[i]),box1.lower.x); + const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y[i]),box1.lower.y); + const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z[i]),box1.lower.z); + const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x[i]),box1.upper.x); + const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y[i]),box1.upper.y); + const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z[i]),box1.upper.z); + return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z)); + } + + bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1) + { + CSTAT(bvh_collide_prim_intersections1++); + const TriangleMesh* mesh0 = scene0->get<TriangleMesh>(geomID0); + const TriangleMesh* mesh1 = scene1->get<TriangleMesh>(geomID1); + const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0); + const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1); + + /* special culling for scene intersection with itself */ + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore self intersections */ + if (primID0 == primID1) + return false; + } + CSTAT(bvh_collide_prim_intersections2++); + + if (scene0 == scene1 && geomID0 == geomID1) + { + /* ignore intersection with topological neighbors */ + const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]); + if (any(vint4(tri1.v[0]) == t0)) return false; + if (any(vint4(tri1.v[1]) == t0)) return false; + if (any(vint4(tri1.v[2]) == t0)) return false; + } + CSTAT(bvh_collide_prim_intersections3++); + + const Vec3fa a0 = mesh0->vertex(tri0.v[0]); + const Vec3fa a1 = mesh0->vertex(tri0.v[1]); + const Vec3fa a2 = mesh0->vertex(tri0.v[2]); + const Vec3fa b0 = mesh1->vertex(tri1.v[0]); + const Vec3fa b1 = mesh1->vertex(tri1.v[1]); + const Vec3fa b2 = mesh1->vertex(tri1.v[2]); + + return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2); + } + + template<int N> + __forceinline void BVHNColliderUserGeom<N>::processLeaf(NodeRef node0, NodeRef node1) + { + Collision collisions[16]; + size_t num_collisions = 0; + + size_t N0; Object* leaf0 = (Object*) node0.leaf(N0); + size_t N1; Object* leaf1 = (Object*) node1.leaf(N1); + for (size_t i=0; i<N0; i++) { + for (size_t j=0; j<N1; j++) { + const unsigned geomID0 = leaf0[i].geomID(); + const unsigned primID0 = leaf0[i].primID(); + const unsigned geomID1 = leaf1[j].geomID(); + const unsigned primID1 = leaf1[j].primID(); + if (this->scene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue; + collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1); + if (num_collisions == 16) { + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + num_collisions = 0; + } + } + } + if (num_collisions) + this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions); + } + + template<int N> + void BVHNCollider<N>::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1) + { + CSTAT(bvh_collide_traversal_steps++); + if (unlikely(ref0.isLeaf())) { + if (unlikely(ref1.isLeaf())) { + CSTAT(bvh_collide_leaf_pairs++); + processLeaf(ref0,ref1); + return; + } else goto recurse_node1; + + } else { + if (unlikely(ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(bounds0) > area(bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + AABBNode* node0 = ref0.getAABBNode(); + size_t mask = overlap<N>(bounds1,*node0); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i<N; i++) { +#if 0 + if (depth0 < parallel_depth_threshold) + { + parallel_for(size_t(N), [&] ( size_t i ) { + if (mask & ( 1 << i)) { + BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); + } + } + return; + } + + { + recurse_node1: + AABBNode* node1 = ref1.getAABBNode(); + size_t mask = overlap<N>(bounds0,*node1); + //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + //for (size_t i=0; i<N; i++) { +#if 0 + if (depth1 < parallel_depth_threshold) + { + parallel_for(size_t(N), [&] ( size_t i ) { + if (mask & ( 1 << i)) { + BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + }); + } + else +#endif + { + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE); + collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); + } + } + return; + } + } + + template<int N> + void BVHNCollider<N>::split(const CollideJob& job, jobvector& jobs) + { + if (unlikely(job.ref0.isLeaf())) { + if (unlikely(job.ref1.isLeaf())) { + jobs.push_back(job); + return; + } else goto recurse_node1; + } else { + if (unlikely(job.ref1.isLeaf())) { + goto recurse_node0; + } else { + if (area(job.bounds0) > area(job.bounds1)) { + goto recurse_node0; + } + else { + goto recurse_node1; + } + } + } + + { + recurse_node0: + const AABBNode* node0 = job.ref0.getAABBNode(); + size_t mask = overlap<N>(job.bounds1,*node0); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1)); + } + return; + } + + { + recurse_node1: + const AABBNode* node1 = job.ref1.getAABBNode(); + size_t mask = overlap<N>(job.bounds0,*node1); + for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { + jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1)); + } + return; + } + } + + template<int N> + void BVHNCollider<N>::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1) + { + CSTAT(bvh_collide_traversal_steps = 0); + CSTAT(bvh_collide_leaf_pairs = 0); + CSTAT(bvh_collide_leaf_iterations = 0); + CSTAT(bvh_collide_prim_intersections1 = 0); + CSTAT(bvh_collide_prim_intersections2 = 0); + CSTAT(bvh_collide_prim_intersections3 = 0); + CSTAT(bvh_collide_prim_intersections4 = 0); + CSTAT(bvh_collide_prim_intersections5 = 0); + CSTAT(bvh_collide_prim_intersections = 0); +#if 0 + collide_recurse(ref0,bounds0,ref1,bounds1,0,0); +#else + const int M = 2048; + jobvector jobs[2]; + jobs[0].reserve(M); + jobs[1].reserve(M); + jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0)); + int source = 0; + int target = 1; + + /* try to split job until job list is full */ + while (jobs[source].size()+8 <= M) + { + for (size_t i=0; i<jobs[source].size(); i++) + { + const CollideJob& job = jobs[source][i]; + size_t remaining = jobs[source].size()-i; + if (jobs[target].size()+remaining+8 > M) { + jobs[target].push_back(job); + } else { + split(job,jobs[target]); + } + } + + /* stop splitting jobs if we reached only leaves and cannot make progress anymore */ + if (jobs[target].size() == jobs[source].size()) + break; + + jobs[source].resize(0); + std::swap(source,target); + } + + /* parallel processing of all jobs */ + parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) { + CollideJob& j = jobs[source][i]; + collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1); + }); + + +#endif + CSTAT(PRINT(bvh_collide_traversal_steps)); + CSTAT(PRINT(bvh_collide_leaf_pairs)); + CSTAT(PRINT(bvh_collide_leaf_iterations)); + CSTAT(PRINT(bvh_collide_prim_intersections1)); + CSTAT(PRINT(bvh_collide_prim_intersections2)); + CSTAT(PRINT(bvh_collide_prim_intersections3)); + CSTAT(PRINT(bvh_collide_prim_intersections4)); + CSTAT(PRINT(bvh_collide_prim_intersections5)); + CSTAT(PRINT(bvh_collide_prim_intersections)); + } + + template<int N> + void BVHNColliderUserGeom<N>::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr) + { + BVHNColliderUserGeom<N>(bvh0->scene,bvh1->scene,callback,userPtr). + collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds()); + } + +#if defined (EMBREE_LOWEST_ISA) + struct collision_regression_test : public RegressionTest + { + collision_regression_test(const char* name) : RegressionTest(name) { + registerRegressionTest(this); + } + + bool run () + { + bool passed = true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f), + Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false; + passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), + Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false; + return passed; + } + }; + + collision_regression_test collision_regression("collision_regression_test"); +#endif + + //////////////////////////////////////////////////////////////////////////////// + /// Collider Definitions + //////////////////////////////////////////////////////////////////////////////// + + DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>); + +#if defined(__AVX__) + DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>); +#endif + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h new file mode 100644 index 0000000000..ac4f99c96a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h @@ -0,0 +1,72 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../geometry/trianglev.h" +#include "../geometry/object.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNCollider + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + struct CollideJob + { + CollideJob () {} + + CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0, + NodeRef ref1, const BBox3fa& bounds1, size_t depth1) + : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {} + + NodeRef ref0; + BBox3fa bounds0; + size_t depth0; + NodeRef ref1; + BBox3fa bounds1; + size_t depth1; + }; + + typedef vector_t<CollideJob, aligned_allocator<CollideJob,16>> jobvector; + + void split(const CollideJob& job, jobvector& jobs); + + public: + __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {} + + public: + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0; + void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1); + void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1); + + protected: + Scene* scene0; + Scene* scene1; + RTCCollideFunc callback; + void* userPtr; + }; + + template<int N> + class BVHNColliderUserGeom : public BVHNCollider<N> + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + + __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr) + : BVHNCollider<N>(scene0,scene1,callback,userPtr) {} + + virtual void processLeaf(NodeRef leaf0, NodeRef leaf1); + public: + static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h new file mode 100644 index 0000000000..54021ca6eb --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h @@ -0,0 +1,21 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" +#include "../common/isa.h" +#include "../common/accel.h" +#include "../common/scene.h" +#include "../geometry/curve_intersector_virtual.h" + +namespace embree +{ + /*! BVH instantiations */ + class BVHFactory + { + public: + enum class BuildVariant { STATIC, DYNAMIC, HIGH_QUALITY }; + enum class IntersectVariant { FAST, ROBUST }; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp new file mode 100644 index 0000000000..ea6adc2717 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp @@ -0,0 +1,330 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.h" +#include "node_intersector1.h" +#include "bvh_traverser1.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +namespace embree +{ + namespace isa + { + template<int N, int types, bool robust, typename PrimitiveIntersector1> + void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This, + RayHit& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + StackItemT<NodeRef> stack[stackSize]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer + StackItemT<NodeRef>* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + if (bvh->root == BVH::emptyNode) + return; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ +#if defined(__AVX512ER__) + /* much faster on KNL */ + if (unlikely(any(vfloat<Nx>(*(float*)&stackPtr->dist) > tray.tfar))) + continue; +#else + if (unlikely(*(float*)&stackPtr->dist > ray.tfar)) + continue; +#endif + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<Nx> tNear; + STAT3(normal.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(normal.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node); + tray.tfar = ray.tfar; + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + } + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This, + Ray& __restrict__ ray, + IntersectContext* __restrict__ context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return; + + /* early out for already occluded rays */ + if (unlikely(ray.tfar < 0.0f)) + return; + + /* perform per ray precalculations required by the primitive intersector */ + Precalculations pre(ray, bvh); + + /* stack state */ + NodeRef stack[stackSize]; // stack of nodes that still need to get traversed + NodeRef* stackPtr = stack+1; // current stack pointer + NodeRef* stackEnd = stack+stackSize; + stack[0] = bvh->root; + + /* filter out invalid rays */ +#if defined(EMBREE_IGNORE_INVALID_RAYS) + if (!ray.valid()) return; +#endif + + /* verify correct input */ + assert(ray.valid()); + assert(ray.tnear() >= 0.0f); + assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f)); + + /* load the ray into SIMD registers */ + TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser; + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = (NodeRef)*stackPtr; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<Nx> tNear; + STAT3(shadow.trav_nodes,1,1,1); + bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask); + if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(shadow.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) { + ray.tfar = neg_inf; + break; + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + *stackPtr = (NodeRef)lazy_node; + stackPtr++; + } + } + } + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + struct PointQueryDispatch + { + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + + /* right now AVX512KNL SIMD extension only for standard node types */ + static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N; + + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + const BVH* __restrict__ bvh = (const BVH*)This->ptr; + + /* we may traverse an empty BVH in case all geometry was invalid */ + if (bvh->root == BVH::emptyNode) + return false; + + /* stack state */ + StackItemT<NodeRef> stack[stackSize]; // stack of nodes + StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer + StackItemT<NodeRef>* stackEnd = stack+stackSize; + stack[0].ptr = bvh->root; + stack[0].dist = neg_inf; + + /* verify correct input */ + assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f)); + + /* load the point query into SIMD registers */ + TravPointQuery<N> tquery(query->p, context->query_radius); + + /* initialize the node traverser */ + BVHNNodeTraverser1Hit<N, N, types> nodeTraverser; + + bool changed = false; + float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? query->radius * query->radius + : dot(context->query_radius, context->query_radius); + + /* pop loop */ + while (true) pop: + { + /* pop next node */ + if (unlikely(stackPtr == stack)) break; + stackPtr--; + NodeRef cur = NodeRef(stackPtr->ptr); + + /* if popped node is too far, pop next one */ + if (unlikely(*(float*)&stackPtr->dist > cull_radius)) + continue; + + /* downtraversal loop */ + while (true) + { + /* intersect node */ + size_t mask; vfloat<N> tNear; + STAT3(point_query.trav_nodes,1,1,1); + bool nodeIntersected; + if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { + nodeIntersected = BVHNNodePointQuerySphere1<N, types>::pointQuery(cur, tquery, query->time, tNear, mask); + } else { + nodeIntersected = BVHNNodePointQueryAABB1 <N, types>::pointQuery(cur, tquery, query->time, tNear, mask); + } + if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; } + + /* if no child is hit, pop next node */ + if (unlikely(mask == 0)) + goto pop; + + /* select next child and push other children */ + nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); + } + + /* this is a leaf node */ + assert(cur != BVH::emptyNode); + STAT3(point_query.trav_leaves,1,1,1); + size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + size_t lazy_node = 0; + if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node)) + { + changed = true; + tquery.rad = context->query_radius; + cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE + ? query->radius * query->radius + : dot(context->query_radius, context->query_radius); + } + + /* push lazy node onto stack */ + if (unlikely(lazy_node)) { + stackPtr->ptr = lazy_node; + stackPtr->dist = neg_inf; + stackPtr++; + } + } + return changed; + } + }; + + /* disable point queries for not yet supported geometry types */ + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, VirtualCurveIntersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, SubdivPatch1Intersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust> + struct PointQueryDispatch<N, types, robust, SubdivPatch1MBIntersector1> { + static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; } + }; + + template<int N, int types, bool robust, typename PrimitiveIntersector1> + bool BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::pointQuery( + const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) + { + return PointQueryDispatch<N, types, robust, PrimitiveIntersector1>::pointQuery(This, query, context); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h new file mode 100644 index 0000000000..1a269c319a --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h @@ -0,0 +1,37 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/point_query.h" + +namespace embree +{ + namespace isa + { + /*! BVH single ray intersector. */ + template<int N, int types, bool robust, typename PrimitiveIntersector1> + class BVHNIntersector1 + { + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersector1::Precalculations Precalculations; + typedef typename PrimitiveIntersector1::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + + static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + + /* right now AVX512KNL SIMD extension only for standard node types */ + static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N; + + public: + static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context); + static void occluded (const Accel::Intersectors* This, Ray& ray, IntersectContext* context); + static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp new file mode 100644 index 0000000000..989f7354fd --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp @@ -0,0 +1,61 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector1.cpp" + +namespace embree +{ + namespace isa + { + int getISA() { + return VerifyMultiTargetLinking::getISA(); + } + + //////////////////////////////////////////////////////////////////////////////// + /// BVH4Intersector1 Definitions + //////////////////////////////////////////////////////////////////////////////// + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >)); + IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMIntersector1Moeller <SIMD_MODE(4) COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Moeller <SIMD_MODE(4) COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMvIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMvMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMiMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<TriangleMvMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >)); + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<TriangleMiMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMvIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMvIntersector1Pluecker<4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<QuadMiMBIntersector1Moeller <4 COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<QuadMiMBIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>)); + IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>)); + + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<ObjectIntersector1<false>> >)); + IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<ObjectIntersector1<true>> >)); + + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >)); + IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >)); + + IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >)); + IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >)); + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >)); + //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h new file mode 100644 index 0000000000..d764cc928d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h @@ -0,0 +1,61 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" +#include "node_intersector_frustum.h" + +namespace embree +{ + namespace isa + { + template<int K, bool robust> + struct TravRayK; + + /*! BVH hybrid packet intersector. Switches between packet and single ray traversal (optional). */ + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single = true> + class BVHNIntersectorKHybrid + { + /* right now AVX512KNL SIMD extension only for standard node types */ + static const size_t Nx = types == BVH_AN1 ? vextend<N>::size : N; + + /* shortcuts for frequently used types */ + typedef typename PrimitiveIntersectorK::Precalculations Precalculations; + typedef typename PrimitiveIntersectorK::Primitive Primitive; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store + static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth; + + static const size_t switchThresholdIncoherent = \ + (K==4) ? 3 : + (K==8) ? ((N==4) ? 5 : 7) : + (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal + 0; + + private: + static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); + static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre, + RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); + + public: + static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); + static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context); + + static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); + static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context); + + }; + + /*! BVH packet intersector. */ + template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK> + class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, false> {}; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h new file mode 100644 index 0000000000..83d1fb4d3d --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h @@ -0,0 +1,295 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector_packet_stream.h" +#include "node_intersector_frustum.h" +#include "bvh_traverser_stream.h" + +namespace embree +{ + namespace isa + { + /*! BVH ray stream intersector. */ + template<int N, int Nx, int types, bool robust, typename PrimitiveIntersector> + class BVHNIntersectorStream + { + static const int Nxd = (Nx == N) ? N : Nx/2; + + /* shortcuts for frequently used types */ + template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>; + template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK; + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + + template<int K> + __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays, + TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant) + { + const size_t numPackets = (numOctantRays+K-1)/K; + + Vec3vf<K> tmp_min_rdir(pos_inf); + Vec3vf<K> tmp_max_rdir(neg_inf); + Vec3vf<K> tmp_min_org(pos_inf); + Vec3vf<K> tmp_max_org(neg_inf); + vfloat<K> tmp_min_dist(pos_inf); + vfloat<K> tmp_max_dist(neg_inf); + + size_t m_active = 0; + for (size_t i = 0; i < numPackets; i++) + { + const vfloat<K> tnear = inputPackets[i]->tnear(); + const vfloat<K> tfar = inputPackets[i]->tfar; + vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); + +#if defined(EMBREE_IGNORE_INVALID_RAYS) + m_valid &= inputPackets[i]->valid(); +#endif + + m_active |= (size_t)movemask(m_valid) << (i*K); + + vfloat<K> packet_min_dist = max(tnear, 0.0f); + vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf); + tmp_min_dist = min(tmp_min_dist, packet_min_dist); + tmp_max_dist = max(tmp_max_dist, packet_max_dist); + + const Vec3vf<K>& org = inputPackets[i]->org; + const Vec3vf<K>& dir = inputPackets[i]->dir; + + new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist); + + tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf))); + tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf))); + tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf))); + tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf))); + } + + m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1); + + + const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x), + reduce_min(tmp_min_rdir.y), + reduce_min(tmp_min_rdir.z)); + + const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x), + reduce_max(tmp_max_rdir.y), + reduce_max(tmp_max_rdir.z)); + + const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x), + reduce_min(tmp_min_org.y), + reduce_min(tmp_min_org.z)); + + const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x), + reduce_max(tmp_max_org.y), + reduce_max(tmp_max_org.z)); + + commonOctant = + (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) && + (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) && + (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f); + + const float frustum_min_dist = reduce_min(tmp_min_dist); + const float frustum_max_dist = reduce_max(tmp_max_dist); + + frustum.init(reduced_min_origin, reduced_max_origin, + reduced_min_rdir, reduced_max_rdir, + frustum_min_dist, frustum_max_dist, + N); + + return m_active; + } + + template<int K> + __forceinline static size_t intersectAABBNodePacket(size_t m_active, + const TravRayKStream<K,robust>* packets, + const AABBNode* __restrict__ node, + size_t boxID, + const NearFarPrecalculations& nf) + { + assert(m_active); + const size_t startPacketID = bsf(m_active) / K; + const size_t endPacketID = bsr(m_active) / K; + size_t m_trav_active = 0; + for (size_t i = startPacketID; i <= endPacketID; i++) + { + const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf); + m_trav_active |= m_hit << (i*K); + } + return m_trav_active; + } + + template<int K> + __forceinline static size_t traverseCoherentStream(size_t m_active, + TravRayKStream<K, robust>* packets, + const AABBNode* __restrict__ node, + const Frustum<robust>& frustum, + size_t* maskK, + vfloat<Nx>& dist) + { + size_t m_node_hit = intersectNodeFrustum<N,Nx>(node, frustum, dist); + const size_t first_index = bsf(m_active); + const size_t first_packetID = first_index / K; + const size_t first_rayID = first_index % K; + size_t m_first_hit = intersectNode1<N,Nx>(node, packets[first_packetID], first_rayID, frustum.nf); + + /* this make traversal independent of the ordering of rays */ + size_t m_node = m_node_hit ^ m_first_hit; + while (unlikely(m_node)) + { + const size_t boxID = bscf(m_node); + const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf); + m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID); + maskK[boxID] = m_current; + } + return m_node_hit; + } + + // TODO: explicit 16-wide path for KNL + template<int K> + __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active, + TravRayKStreamFast<K>* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint<Nx> vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream<K,robust> &p = packets[rayID / K]; + const size_t i = rayID % K; + const vint<Nx> bitmask(shiftTable[rayID]); + +#if defined (__aarch64__) + const vfloat<Nx> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat<Nx> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat<Nx> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]); + const vfloat<Nx> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]); + const vfloat<Nx> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]); + const vfloat<Nx> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); +#else + const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]); + const vfloat<Nx> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]); + const vfloat<Nx> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]); + const vfloat<Nx> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); +#endif + + const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i])); + const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i])); + +#if defined(__AVX512ER__) + const vboolx m_node((1 << N)-1); + const vbool<Nx> hit_mask = le(m_node, tNear, tFar); + vmask = mask_or(hit_mask, vmask, vmask, bitmask); +#else + const vbool<Nx> hit_mask = tNear <= tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint<Nx>(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif +#endif + } while(m_active); + return vmask; + } + + template<int K> + __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active, + TravRayKStreamRobust<K>* __restrict__ packets, + const AABBNode* __restrict__ node, + const NearFarPrecalculations& nf, + const int shiftTable[32]) + { + const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + assert(m_active); + vint<Nx> vmask(zero); + do + { + STAT3(shadow.trav_nodes,1,1,1); + const size_t rayID = bscf(m_active); + assert(rayID < MAX_INTERNAL_STREAM_SIZE); + TravRayKStream<K,robust> &p = packets[rayID / K]; + const size_t i = rayID % K; + const vint<Nx> bitmask(shiftTable[rayID]); + const vfloat<Nx> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i]; + const vfloat<Nx> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i]; + const vfloat<Nx> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i]; + const vfloat<Nx> tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i]; + const vfloat<Nx> tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i]; + const vfloat<Nx> tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i]; + const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i])); + const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i])); + const float round_down = 1.0f-2.0f*float(ulp); + const float round_up = 1.0f+2.0f*float(ulp); +#if defined(__AVX512ER__) + const vboolx m_node((1 << N)-1); + const vbool<Nx> hit_mask = le(m_node, round_down*tNear, round_up*tFar); + vmask = mask_or(hit_mask, vmask, vmask, bitmask); +#else + const vbool<Nx> hit_mask = round_down*tNear <= round_up*tFar; +#if defined(__AVX2__) + vmask = vmask | (bitmask & vint<Nx>(hit_mask)); +#else + vmask = select(hit_mask, vmask | bitmask, vmask); +#endif +#endif + } while(m_active); + return vmask; + } + + + static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth; + + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template<int K> + static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + }; + + + /*! BVH ray stream intersector with direct fallback to packets. */ + template<int N, int Nx> + class BVHNIntersectorStreamPacketFallback + { + public: + static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context); + static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context); + + private: + template<int K> + static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context); + + template<int K> + static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h new file mode 100644 index 0000000000..cdeb923637 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h @@ -0,0 +1,41 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/ray.h" +#include "../common/scene.h" + +namespace embree +{ + namespace isa + { + class RayStreamFilter + { + public: + static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context); + static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context); + static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context); + + static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context); + static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context); + static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context); + + private: + template<int K, bool intersect> + static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context); + + template<int K, bool intersect> + static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context); + + template<int K, bool intersect> + static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context); + + template<int K, bool intersect> + static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context); + }; + } +}; diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h new file mode 100644 index 0000000000..baa4a8d805 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h @@ -0,0 +1,213 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN AABBNode */ + template<typename NodeRef, int N> + struct AABBNode_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const { + node.getAABBNode()->setRef(i,child); + node.getAABBNode()->setBounds(i,bounds); + } + }; + + struct Create2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear(); + for (size_t i=0; i<num; i++) node->setBounds(i,children[i].bounds()); + return NodeRef::encodeNode(node); + } + }; + + struct Set2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + return ref; + } + }; + + struct Set3 + { + Set3 (FastAllocator* allocator, PrimRef* prims) + : allocator(allocator), prims(prims) {} + + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + AABBNode_t* node = ref.getAABBNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + + if (unlikely(precord.alloc_barrier)) + { + PrimRef* begin = &prims[precord.prims.begin()]; + PrimRef* end = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!! + size_t bytes = (size_t)end - (size_t)begin; + allocator->addBlock(begin,bytes); + } + + return ref; + } + + FastAllocator* const allocator; + PrimRef* const prims; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = pos_inf; + upper_x = upper_y = upper_z = neg_inf; + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const BBox3fa& bounds) + { + assert(i < N); + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) { + setBounds(i,bounds); + children[i] = ref; + } + + /*! Returns bounds of node. */ + __forceinline BBox3fa bounds() const { + const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z)); + const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z)); + return BBox3fa(lower,upper); + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]); + const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]); + return BBox3fa(lower,upper); + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extend(size_t i) const { + return bounds(i).size(); + } + + /*! Returns bounds of all children (implemented later as specializations) */ + __forceinline void bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const; + + /*! swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(i<N && j<N); + std::swap(children[i],children[j]); + std::swap(lower_x[i],lower_x[j]); + std::swap(lower_y[i],lower_y[j]); + std::swap(lower_z[i],lower_z[j]); + std::swap(upper_x[i],upper_x[j]); + std::swap(upper_y[i],upper_y[j]); + std::swap(upper_z[i],upper_z[j]); + } + + /*! swap the children of two nodes */ + __forceinline static void swap(AABBNode_t* a, size_t i, AABBNode_t* b, size_t j) + { + assert(i<N && j<N); + std::swap(a->children[i],b->children[j]); + std::swap(a->lower_x[i],b->lower_x[j]); + std::swap(a->lower_y[i],b->lower_y[j]); + std::swap(a->lower_z[i],b->lower_z[j]); + std::swap(a->upper_x[i],b->upper_x[j]); + std::swap(a->upper_y[i],b->upper_y[j]); + std::swap(a->upper_z[i],b->upper_z[j]); + } + + /*! compacts a node (moves empty children to the end) */ + __forceinline static void compact(AABBNode_t* a) + { + /* find right most filled node */ + ssize_t j=N; + for (j=j-1; j>=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; i<j; i++) { + if (a->child(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! output operator */ + friend embree_ostream operator<<(embree_ostream o, const AABBNode_t& n) + { + o << "AABBNode { " << embree_endl; + o << " lower_x " << n.lower_x << embree_endl; + o << " upper_x " << n.upper_x << embree_endl; + o << " lower_y " << n.lower_y << embree_endl; + o << " upper_y " << n.upper_y << embree_endl; + o << " lower_z " << n.lower_z << embree_endl; + o << " upper_z " << n.upper_z << embree_endl; + o << " children = "; + for (size_t i=0; i<N; i++) o << n.children[i] << " "; + o << embree_endl; + o << "}" << embree_endl; + return o; + } + + public: + vfloat<N> lower_x; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_x; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children. + }; + + template<> + __forceinline void AABBNode_t<NodeRefPtr<4>,4>::bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const { + transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower); + transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper); + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h new file mode 100644 index 0000000000..501f4bce5b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h @@ -0,0 +1,247 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! Motion Blur AABBNode */ + template<typename NodeRef, int N> + struct AABBNodeMB_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + struct Create + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const + { + AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + template<typename BuildRecord> + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; i<num; i++) { + node->setRef(i,children[i].ref); + node->setBounds(i,children[i].lbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + }; + + struct SetTimeRange + { + __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {} + + template<typename BuildRecord> + __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const + { + AABBNodeMB_t* node = ref.getAABBNodeMB(); + + LBBox3fa bounds = empty; + for (size_t i=0; i<num; i++) { + node->setRef(i, children[i].ref); + node->setBounds(i, children[i].lbounds, tbounds); + bounds.extend(children[i].lbounds); + } + return NodeRecordMB(ref,bounds); + } + + BBox1f tbounds; + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_x = lower_y = lower_z = vfloat<N>(pos_inf); + upper_x = upper_y = upper_z = vfloat<N>(neg_inf); + lower_dx = lower_dy = lower_dz = vfloat<N>(0.0f); + upper_dx = upper_dy = upper_dz = vfloat<N>(0.0f); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, NodeRef ref) { + children[i] = ref; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i) + { + /*! for empty bounds we have to avoid inf-inf=nan */ + BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX))); + BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX))); + bounds0 = bounds0.enlarge_by(4.0f*float(ulp)); + bounds1 = bounds1.enlarge_by(4.0f*float(ulp)); + Vec3fa dlower = bounds1.lower-bounds0.lower; + Vec3fa dupper = bounds1.upper-bounds0.upper; + + lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z; + upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z; + + lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z; + upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z; + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds) { + setBounds(i, bounds.bounds0, bounds.bounds1); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) { + setBounds(i, bounds.global(tbounds)); + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) { + lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z; + upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z; + children[i] = ref; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) + { + setRef(i, child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Return bounding box for time 0 */ + __forceinline BBox3fa bounds0(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]), + Vec3fa(upper_x[i],upper_y[i],upper_z[i])); + } + + /*! Return bounding box for time 1 */ + __forceinline BBox3fa bounds1(size_t i) const { + return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]), + Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i])); + } + + /*! Returns bounds of node. */ + __forceinline BBox3fa bounds() const { + return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)), + reduce_min(min(lower_y,lower_y+lower_dy)), + reduce_min(min(lower_z,lower_z+lower_dz))), + Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)), + reduce_max(max(upper_y,upper_y+upper_dy)), + reduce_max(max(upper_z,upper_z+upper_dz)))); + } + + /*! Return bounding box of child i */ + __forceinline BBox3fa bounds(size_t i) const { + return merge(bounds0(i),bounds1(i)); + } + + /*! Return linear bounding box of child i */ + __forceinline LBBox3fa lbounds(size_t i) const { + return LBBox3fa(bounds0(i),bounds1(i)); + } + + /*! Return bounding box of child i at specified time */ + __forceinline BBox3fa bounds(size_t i, float time) const { + return lerp(bounds0(i),bounds1(i),time); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return lbounds(i).expectedHalfArea(); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const { + return lbounds(i).expectedHalfArea(t0t1); + } + + /*! swap two children of the node */ + __forceinline void swap(size_t i, size_t j) + { + assert(i<N && j<N); + std::swap(children[i],children[j]); + + std::swap(lower_x[i],lower_x[j]); + std::swap(upper_x[i],upper_x[j]); + std::swap(lower_y[i],lower_y[j]); + std::swap(upper_y[i],upper_y[j]); + std::swap(lower_z[i],lower_z[j]); + std::swap(upper_z[i],upper_z[j]); + + std::swap(lower_dx[i],lower_dx[j]); + std::swap(upper_dx[i],upper_dx[j]); + std::swap(lower_dy[i],lower_dy[j]); + std::swap(upper_dy[i],upper_dy[j]); + std::swap(lower_dz[i],lower_dz[j]); + std::swap(upper_dz[i],upper_dz[j]); + } + + /*! compacts a node (moves empty children to the end) */ + __forceinline static void compact(AABBNodeMB_t* a) + { + /* find right most filled node */ + ssize_t j=N; + for (j=j-1; j>=0; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + + /* replace empty nodes with filled nodes */ + for (ssize_t i=0; i<j; i++) { + if (a->child(i) == NodeRef::emptyNode) { + a->swap(i,j); + for (j=j-1; j>i; j--) + if (a->child(j) != NodeRef::emptyNode) + break; + } + } + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! stream output operator */ + friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB_t& n) + { + cout << "AABBNodeMB {" << embree_endl; + for (size_t i=0; i<N; i++) + { + const BBox3fa b0 = n.bounds0(i); + const BBox3fa b1 = n.bounds1(i); + cout << " child" << i << " { " << embree_endl; + cout << " bounds0 = " << b0 << ", " << embree_endl; + cout << " bounds1 = " << b1 << ", " << embree_endl; + cout << " }"; + } + cout << "}"; + return cout; + } + + public: + vfloat<N> lower_x; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_x; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children. + + vfloat<N> lower_dx; //!< X dimension of lower bounds of all N children. + vfloat<N> upper_dx; //!< X dimension of upper bounds of all N children. + vfloat<N> lower_dy; //!< Y dimension of lower bounds of all N children. + vfloat<N> upper_dy; //!< Y dimension of upper bounds of all N children. + vfloat<N> lower_dz; //!< Z dimension of lower bounds of all N children. + vfloat<N> upper_dz; //!< Z dimension of upper bounds of all N children. + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h new file mode 100644 index 0000000000..e968bbbc39 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h @@ -0,0 +1,107 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_aabb_mb.h" + +namespace embree +{ + /*! Aligned 4D Motion Blur Node */ + template<typename NodeRef, int N> + struct AABBNodeMB4D_t : public AABBNodeMB_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + using AABBNodeMB_t<NodeRef,N>::set; + + typedef BVHNodeRecord<NodeRef> NodeRecord; + typedef BVHNodeRecordMB<NodeRef> NodeRecordMB; + typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D; + + struct Create + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const + { + if (hasTimeSplits) + { + AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + else + { + AABBNodeMB_t<NodeRef,N>* node = (AABBNodeMB_t<NodeRef,N>*) alloc.malloc0(sizeof(AABBNodeMB_t<NodeRef,N>),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + } + }; + + struct Set + { + template<typename BuildRecord> + __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const + { + if (likely(ref.isAABBNodeMB())) { + for (size_t i=0; i<num; i++) + ref.getAABBNodeMB()->set(i, children[i]); + } else { + for (size_t i=0; i<num; i++) + ref.getAABBNodeMB4D()->set(i, children[i]); + } + } + }; + + /*! Clears the node. */ + __forceinline void clear() { + lower_t = vfloat<N>(pos_inf); + upper_t = vfloat<N>(neg_inf); + AABBNodeMB_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box of child. */ + __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) + { + AABBNodeMB_t<NodeRef,N>::setBounds(i, bounds.global(tbounds)); + lower_t[i] = tbounds.lower; + upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper; + } + + /*! Sets bounding box and ID of child. */ + __forceinline void set(size_t i, const NodeRecordMB4D& child) { + AABBNodeMB_t<NodeRef,N>::setRef(i,child.ref); + setBounds(i, child.lbounds, child.dt); + } + + /*! Returns the expected surface area when randomly sampling the time. */ + __forceinline float expectedHalfArea(size_t i) const { + return AABBNodeMB_t<NodeRef,N>::lbounds(i).expectedHalfArea(timeRange(i)); + } + + /*! returns time range for specified child */ + __forceinline BBox1f timeRange(size_t i) const { + return BBox1f(lower_t[i],upper_t[i]); + } + + /*! stream output operator */ + friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) + { + cout << "AABBNodeMB4D {" << embree_endl; + for (size_t i=0; i<N; i++) + { + const BBox3fa b0 = n.bounds0(i); + const BBox3fa b1 = n.bounds1(i); + cout << " child" << i << " { " << embree_endl; + cout << " bounds0 = " << lerp(b0,b1,n.lower_t[i]) << ", " << embree_endl; + cout << " bounds1 = " << lerp(b0,b1,n.upper_t[i]) << ", " << embree_endl; + cout << " time_bounds = " << n.lower_t[i] << ", " << n.upper_t[i] << embree_endl; + cout << " }"; + } + cout << "}"; + return cout; + } + + public: + vfloat<N> lower_t; //!< time dimension of lower bounds of all N children + vfloat<N> upper_t; //!< time dimension of upper bounds of all N children + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h new file mode 100644 index 0000000000..8268f3b932 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h @@ -0,0 +1,43 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_ref.h" + +namespace embree +{ + + /*! BVHN Base Node */ + template<typename NodeRef, int N> + struct BaseNode_t + { + /*! Clears the node. */ + __forceinline void clear() + { + for (size_t i=0; i<N; i++) + children[i] = NodeRef::emptyNode; + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! verifies the node */ + __forceinline bool verify() const + { + for (size_t i=0; i<N; i++) { + if (child(i) == NodeRef::emptyNode) { + for (; i<N; i++) { + if (child(i) != NodeRef::emptyNode) + return false; + } + break; + } + } + return true; + } + + NodeRef children[N]; //!< Pointer to the N children (can be a node or leaf) + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h new file mode 100644 index 0000000000..fa7cc08211 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h @@ -0,0 +1,98 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! Node with unaligned bounds */ + template<typename NodeRef, int N> + struct OBBNode_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const { + node.ungetAABBNode()->setRef(i,child); + node.ungetAABBNode()->setBounds(i,bounds); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + naabb.l.vx = Vec3fa(nan); + naabb.l.vy = Vec3fa(nan); + naabb.l.vz = Vec3fa(nan); + naabb.p = Vec3fa(nan); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets bounding box. */ + __forceinline void setBounds(size_t i, const OBBox3fa& b) + { + assert(i < N); + + AffineSpace3fa space = b.space; + space.p -= b.bounds.lower; + space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space; + + naabb.l.vx.x[i] = space.l.vx.x; + naabb.l.vx.y[i] = space.l.vx.y; + naabb.l.vx.z[i] = space.l.vx.z; + + naabb.l.vy.x[i] = space.l.vy.x; + naabb.l.vy.y[i] = space.l.vy.y; + naabb.l.vy.z[i] = space.l.vy.z; + + naabb.l.vz.x[i] = space.l.vz.x; + naabb.l.vz.y[i] = space.l.vz.y; + naabb.l.vz.z[i] = space.l.vz.z; + + naabb.p.x[i] = space.p.x; + naabb.p.y[i] = space.p.y; + naabb.p.z[i] = space.p.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent(size_t i) const { + assert(i<N); + const Vec3fa vx(naabb.l.vx.x[i],naabb.l.vx.y[i],naabb.l.vx.z[i]); + const Vec3fa vy(naabb.l.vy.x[i],naabb.l.vy.y[i],naabb.l.vy.z[i]); + const Vec3fa vz(naabb.l.vz.x[i],naabb.l.vz.y[i],naabb.l.vz.z[i]); + return rsqrt(vx*vx + vy*vy + vz*vz); + } + + /*! Returns reference to specified child */ + __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; } + __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; } + + /*! output operator */ + friend embree_ostream operator<<(embree_ostream o, const OBBNode_t& n) + { + o << "UnAABBNode { " << n.naabb << " } " << embree_endl; + return o; + } + + public: + AffineSpace3vf<N> naabb; //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space) + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h new file mode 100644 index 0000000000..834cf5ec28 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h @@ -0,0 +1,90 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + template<typename NodeRef, int N> + struct OBBNodeMB_t : public BaseNode_t<NodeRef, N> + { + using BaseNode_t<NodeRef,N>::children; + + struct Create + { + __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const + { + OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear(); + return NodeRef::encodeNode(node); + } + }; + + struct Set + { + __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const { + node.ungetAABBNodeMB()->setRef(i,child); + node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt)); + } + }; + + /*! Clears the node. */ + __forceinline void clear() + { + space0 = one; + //b0.lower = b0.upper = Vec3fa(nan); + b1.lower = b1.upper = Vec3fa(nan); + BaseNode_t<NodeRef,N>::clear(); + } + + /*! Sets space and bounding boxes. */ + __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) { + setBounds(i,space,lbounds.bounds0,lbounds.bounds1); + } + + /*! Sets space and bounding boxes. */ + __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c) + { + assert(i < N); + + AffineSpace3fa space = s0; + space.p -= a.lower; + Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower); + space = AffineSpace3fa::scale(scale)*space; + BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale); + BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale); + + space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z; + space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z; + space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z; + space0.p .x[i] = space.p .x; space0.p .y[i] = space.p .y; space0.p .z[i] = space.p .z; + + /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z; + b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/ + + b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z; + b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z; + } + + /*! Sets ID of child. */ + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + /*! Returns the extent of the bounds of the ith child */ + __forceinline Vec3fa extent0(size_t i) const { + assert(i < N); + const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]); + const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]); + const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]); + return rsqrt(vx*vx + vy*vy + vz*vz); + } + + public: + AffineSpace3vf<N> space0; + //BBox3vf<N> b0; // these are the unit bounds + BBox3vf<N> b1; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h new file mode 100644 index 0000000000..5212821f3f --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h @@ -0,0 +1,265 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh_node_base.h" + +namespace embree +{ + /*! BVHN Quantized Node */ + template<int N> + struct __aligned(8) QuantizedBaseNode_t + { + typedef unsigned char T; + static const T MIN_QUAN = 0; + static const T MAX_QUAN = 255; + + /*! Clears the node. */ + __forceinline void clear() { + for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN; + for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN; + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x), + madd(scale.y,(float)lower_y[i],start.y), + madd(scale.z,(float)lower_z[i],start.z)); + const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x), + madd(scale.y,(float)upper_y[i],start.y), + madd(scale.z,(float)upper_z[i],start.z)); + return BBox3fa(lower,upper); + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extent(size_t i) const { + return bounds(i).size(); + } + + static __forceinline void init_dim(const vfloat<N> &lower, + const vfloat<N> &upper, + T lower_quant[N], + T upper_quant[N], + float &start, + float &scale) + { + /* quantize bounds */ + const vbool<N> m_valid = lower != vfloat<N>(pos_inf); + const float minF = reduce_min(lower); + const float maxF = reduce_max(upper); + float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); + float decode_scale = diff / float(MAX_QUAN); + if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero + assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); + const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f; + vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN); + vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN); + + /* lower/upper correction */ + vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower; + vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper; + ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); + iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); + + /* disable invalid lanes */ + ilower = select(m_valid,ilower,MAX_QUAN); + iupper = select(m_valid,iupper,MIN_QUAN); + + /* store as uchar to memory */ + vint<N>::store(lower_quant,ilower); + vint<N>::store(upper_quant,iupper); + start = minF; + scale = decode_scale; + +#if defined(DEBUG) + vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); + vfloat<N> extract_upper( vint<N>::loadu(upper_quant) ); + vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF); + vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF); + assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); + assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid)); +#endif + } + + __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) + { + init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x); + init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y); + init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z); + } + + __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } + +#if defined(__AVX512F__) // KNL + __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } +#endif + __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); } + + __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); } + + __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); } + + __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); } + + __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); } + + __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); } + + template <int M> + __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } + +#if defined(__AVX512F__) + __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } + __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); } + __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); } +#endif + + union { + struct { + T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children + T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children + T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children + T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children + T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children + T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children + }; + T all_planes[6*N]; + }; + + Vec3f start; + Vec3f scale; + + friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) + { + o << "QuantizedBaseNode { " << embree_endl; + o << " start " << n.start << embree_endl; + o << " scale " << n.scale << embree_endl; + o << " lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl; + o << " upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl; + o << " lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl; + o << " upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl; + o << " lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl; + o << " upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl; + o << "}" << embree_endl; + return o; + } + + }; + + template<typename NodeRef, int N> + struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N> + { + using BaseNode_t<NodeRef,N>::children; + using QuantizedBaseNode_t<N>::lower_x; + using QuantizedBaseNode_t<N>::upper_x; + using QuantizedBaseNode_t<N>::lower_y; + using QuantizedBaseNode_t<N>::upper_y; + using QuantizedBaseNode_t<N>::lower_z; + using QuantizedBaseNode_t<N>::upper_z; + using QuantizedBaseNode_t<N>::start; + using QuantizedBaseNode_t<N>::scale; + using QuantizedBaseNode_t<N>::init_dim; + + __forceinline void setRef(size_t i, const NodeRef& ref) { + assert(i < N); + children[i] = ref; + } + + struct Create2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const + { + __aligned(64) AABBNode_t<NodeRef,N> node; + node.clear(); + for (size_t i=0; i<n; i++) { + node.setBounds(i,children[i].bounds()); + } + QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment); + qnode->init(node); + + return (size_t)qnode | NodeRef::tyQuantizedNode; + } + }; + + struct Set2 + { + template<typename BuildRecord> + __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const + { + QuantizedNode_t* node = ref.quantizedNode(); + for (size_t i=0; i<num; i++) node->setRef(i,children[i]); + return ref; + } + }; + + __forceinline void init(AABBNode_t<NodeRef,N>& node) + { + for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode; + init_dim(node); + } + + }; + + /*! BVHN Quantized Node */ + template<int N> + struct __aligned(8) QuantizedBaseNodeMB_t + { + QuantizedBaseNode_t<N> node0; + QuantizedBaseNode_t<N> node1; + + /*! Clears the node. */ + __forceinline void clear() { + node0.clear(); + node1.clear(); + } + + /*! Returns bounds of specified child. */ + __forceinline BBox3fa bounds(size_t i) const + { + assert(i < N); + BBox3fa bounds0 = node0.bounds(i); + BBox3fa bounds1 = node1.bounds(i); + bounds0.extend(bounds1); + return bounds0; + } + + /*! Returns extent of bounds of specified child. */ + __forceinline Vec3fa extent(size_t i) const { + return bounds(i).size(); + } + + __forceinline vbool<N> validMask() const { return node0.validMask(); } + + template<typename T> + __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); } + template<typename T> + __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); } + + + template<int M> + __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); } + template<int M> + __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); } + + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h new file mode 100644 index 0000000000..0f6d4dac7e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h @@ -0,0 +1,242 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../common/default.h" +#include "../common/alloc.h" +#include "../common/accel.h" +#include "../common/device.h" +#include "../common/scene.h" +#include "../geometry/primitive.h" +#include "../common/ray.h" + +namespace embree +{ + /* BVH node reference with bounds */ + template<typename NodeRef> + struct BVHNodeRecord + { + __forceinline BVHNodeRecord() {} + __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {} + __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {} + + NodeRef ref; + BBox3fx bounds; + }; + + template<typename NodeRef> + struct BVHNodeRecordMB + { + __forceinline BVHNodeRecordMB() {} + __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {} + + NodeRef ref; + LBBox3fa lbounds; + }; + + template<typename NodeRef> + struct BVHNodeRecordMB4D + { + __forceinline BVHNodeRecordMB4D() {} + __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {} + + NodeRef ref; + LBBox3fa lbounds; + BBox1f dt; + }; + + template<typename NodeRef, int N> struct BaseNode_t; + template<typename NodeRef, int N> struct AABBNode_t; + template<typename NodeRef, int N> struct AABBNodeMB_t; + template<typename NodeRef, int N> struct AABBNodeMB4D_t; + template<typename NodeRef, int N> struct OBBNode_t; + template<typename NodeRef, int N> struct OBBNodeMB_t; + template<typename NodeRef, int N> struct QuantizedNode_t; + template<typename NodeRef, int N> struct QuantizedNodeMB_t; + + /*! Pointer that points to a node or a list of primitives */ + template<int N> + struct NodeRefPtr + { + //template<int NN> friend class BVHN; + + /*! Number of bytes the nodes and primitives are minimally aligned to.*/ + static const size_t byteAlignment = 16; + static const size_t byteNodeAlignment = 4*N; + + /*! highest address bit is used as barrier for some algorithms */ + static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1)); + + /*! Masks the bits that store the number of items per leaf. */ + static const size_t align_mask = byteAlignment-1; + static const size_t items_mask = byteAlignment-1; + + /*! different supported node types */ + static const size_t tyAABBNode = 0; + static const size_t tyAABBNodeMB = 1; + static const size_t tyAABBNodeMB4D = 6; + static const size_t tyOBBNode = 2; + static const size_t tyOBBNodeMB = 3; + static const size_t tyQuantizedNode = 5; + static const size_t tyLeaf = 8; + + /*! Empty node */ + static const size_t emptyNode = tyLeaf; + + /*! Invalid node, used as marker in traversal */ + static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0); + static const size_t popRay = (((size_t)-1) & (~items_mask)) | (tyLeaf+1); + + /*! Maximum number of primitive blocks in a leaf. */ + static const size_t maxLeafBlocks = items_mask-tyLeaf; + + /*! Default constructor */ + __forceinline NodeRefPtr () {} + + /*! Construction from integer */ + __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {} + + /*! Cast to size_t */ + __forceinline operator size_t() const { return ptr; } + + /*! Sets the barrier bit. */ + __forceinline void setBarrier() { +#if defined(__X86_64__) || defined(__aarch64__) + assert(!isBarrier()); + ptr |= barrier_mask; +#else + assert(false); +#endif + } + + /*! Clears the barrier bit. */ + __forceinline void clearBarrier() { +#if defined(__X86_64__) || defined(__aarch64__) + ptr &= ~barrier_mask; +#else + assert(false); +#endif + } + + /*! Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */ + __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; } + + /*! checks if this is a leaf */ + __forceinline size_t isLeaf() const { return ptr & tyLeaf; } + + /*! returns node type */ + __forceinline int type() const { return ptr & (size_t)align_mask; } + + /*! checks if this is a node */ + __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; } + + /*! checks if this is a motion blur node */ + __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; } + + /*! checks if this is a 4D motion blur node */ + __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; } + + /*! checks if this is a node with unaligned bounding boxes */ + __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; } + + /*! checks if this is a motion blur node with unaligned bounding boxes */ + __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; } + + /*! checks if this is a quantized node */ + __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; } + + /*! Encodes a node */ + static __forceinline NodeRefPtr encodeNode(AABBNode_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB); + } + + static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t<NodeRefPtr,N>* node) { + assert(!((size_t)node & align_mask)); + return NodeRefPtr((size_t) node | tyAABBNodeMB4D); + } + + /*! Encodes an unaligned node */ + static __forceinline NodeRefPtr encodeNode(OBBNode_t<NodeRefPtr,N>* node) { + return NodeRefPtr((size_t) node | tyOBBNode); + } + + /*! Encodes an unaligned motion blur node */ + static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t<NodeRefPtr,N>* node) { + return NodeRefPtr((size_t) node | tyOBBNodeMB); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) { + assert(!((size_t)tri & align_mask)); + assert(num <= maxLeafBlocks); + return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks))); + } + + /*! Encodes a leaf */ + static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) { + assert(!((size_t)ptr & align_mask)); + return NodeRefPtr((size_t)ptr | (tyLeaf+ty)); + } + + /*! returns base node pointer */ + __forceinline BaseNode_t<NodeRefPtr,N>* baseNode() + { + assert(!isLeaf()); + return (BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); + } + __forceinline const BaseNode_t<NodeRefPtr,N>* baseNode() const + { + assert(!isLeaf()); + return (const BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); + } + + /*! returns node pointer */ + __forceinline AABBNode_t<NodeRefPtr,N>* getAABBNode() { assert(isAABBNode()); return ( AABBNode_t<NodeRefPtr,N>*)ptr; } + __forceinline const AABBNode_t<NodeRefPtr,N>* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t<NodeRefPtr,N>*)ptr; } + + /*! returns motion blur node pointer */ + __forceinline AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() { assert(isAABBNodeMB() || isAABBNodeMB4D()); return ( AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns 4D motion blur node pointer */ + __forceinline AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() { assert(isAABBNodeMB4D()); return ( AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns unaligned node pointer */ + __forceinline OBBNode_t<NodeRefPtr,N>* ungetAABBNode() { assert(isOBBNode()); return ( OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNode_t<NodeRefPtr,N>* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns unaligned motion blur node pointer */ + __forceinline OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() { assert(isOBBNodeMB()); return ( OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + __forceinline const OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); } + + /*! returns quantized node pointer */ + __forceinline QuantizedNode_t<NodeRefPtr,N>* quantizedNode() { assert(isQuantizedNode()); return ( QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); } + __forceinline const QuantizedNode_t<NodeRefPtr,N>* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); } + + /*! returns leaf pointer */ + __forceinline char* leaf(size_t& num) const { + assert(isLeaf()); + num = (ptr & (size_t)items_mask)-tyLeaf; + return (char*)(ptr & ~(size_t)align_mask); + } + + /*! clear all bit flags */ + __forceinline void clearFlags() { + ptr &= ~(size_t)align_mask; + } + + /*! returns the wideness */ + __forceinline size_t getN() const { return N; } + + public: + size_t ptr; + }; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp new file mode 100644 index 0000000000..a273c21e8b --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp @@ -0,0 +1,247 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_refit.h" +#include "bvh_statistics.h" + +#include "../geometry/linei.h" +#include "../geometry/triangle.h" +#include "../geometry/trianglev.h" +#include "../geometry/trianglei.h" +#include "../geometry/quadv.h" +#include "../geometry/object.h" +#include "../geometry/instance.h" + +namespace embree +{ + namespace isa + { + static const size_t SINGLE_THREAD_THRESHOLD = 4*1024; + + template<int N> + __forceinline bool compare(const typename BVHN<N>::NodeRef* a, const typename BVHN<N>::NodeRef* b) + { + size_t sa = *(size_t*)&a->node()->lower_x; + size_t sb = *(size_t*)&b->node()->lower_x; + return sa < sb; + } + + template<int N> + BVHNRefitter<N>::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds) + : bvh(bvh), leafBounds(leafBounds), numSubTrees(0) + { + } + + template<int N> + void BVHNRefitter<N>::refit() + { + if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) { + bvh->bounds = LBBox3fa(recurse_bottom(bvh->root)); + } + else + { + BBox3fa subTreeBounds[MAX_NUM_SUB_TREES]; + numSubTrees = 0; + gather_subtree_refs(bvh->root,numSubTrees,0); + if (numSubTrees) + parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range<size_t>& r) { + for (size_t i=r.begin(); i<r.end(); i++) { + NodeRef& ref = subTrees[i]; + subTreeBounds[i] = recurse_bottom(ref); + } + }); + + numSubTrees = 0; + bvh->bounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0)); + } + } + + template<int N> + void BVHNRefitter<N>::gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + subTrees[subtrees++] = ref; + return; + } + + if (ref.isAABBNode()) + { + AABBNode* node = ref.getAABBNode(); + for (size_t i=0; i<N; i++) { + NodeRef& child = node->child(i); + if (unlikely(child == BVH::emptyNode)) continue; + gather_subtree_refs(child,subtrees,depth+1); + } + } + } + + template<int N> + BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth) + { + if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH) + { + assert(subtrees < MAX_NUM_SUB_TREES); + assert(subTrees[subtrees] == ref); + return subTreeBounds[subtrees++]; + } + + if (ref.isAABBNode()) + { + AABBNode* node = ref.getAABBNode(); + BBox3fa bounds[N]; + + for (size_t i=0; i<N; i++) + { + NodeRef& child = node->child(i); + + if (unlikely(child == BVH::emptyNode)) + bounds[i] = BBox3fa(empty); + else + bounds[i] = refit_toplevel(child,subtrees,subTreeBounds,depth+1); + } + + BBox3vf<N> boundsT = transpose<N>(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge<N>(bounds); + } + else + return leafBounds.leafBounds(ref); + } + + // ========================================================= + // ========================================================= + // ========================================================= + + + template<int N> + BBox3fa BVHNRefitter<N>::recurse_bottom(NodeRef& ref) + { + /* this is a leaf node */ + if (unlikely(ref.isLeaf())) + return leafBounds.leafBounds(ref); + + /* recurse if this is an internal node */ + AABBNode* node = ref.getAABBNode(); + + /* enable exclusive prefetch for >= AVX platforms */ +#if defined(__AVX__) + BVH::prefetchW(ref); +#endif + BBox3fa bounds[N]; + + for (size_t i=0; i<N; i++) + if (unlikely(node->child(i) == BVH::emptyNode)) + { + bounds[i] = BBox3fa(empty); + } + else + bounds[i] = recurse_bottom(node->child(i)); + + /* AOS to SOA transform */ + BBox3vf<N> boundsT = transpose<N>(bounds); + + /* set new bounds */ + node->lower_x = boundsT.lower.x; + node->lower_y = boundsT.lower.y; + node->lower_z = boundsT.lower.z; + node->upper_x = boundsT.upper.x; + node->upper_y = boundsT.upper.y; + node->upper_z = boundsT.upper.z; + + return merge<N>(bounds); + } + + template<int N, typename Mesh, typename Primitive> + BVHNRefitT<N,Mesh,Primitive>::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode) + : bvh(bvh), builder(builder), refitter(new BVHNRefitter<N>(bvh,*(typename BVHNRefitter<N>::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {} + + template<int N, typename Mesh, typename Primitive> + void BVHNRefitT<N,Mesh,Primitive>::clear() + { + if (builder) + builder->clear(); + } + + template<int N, typename Mesh, typename Primitive> + void BVHNRefitT<N,Mesh,Primitive>::build() + { + if (mesh->topologyChanged(topologyVersion)) { + topologyVersion = mesh->getTopologyVersion(); + builder->build(); + } + else + refitter->refit(); + } + + template class BVHNRefitter<4>; +#if defined(__AVX__) + template class BVHNRefitter<8>; +#endif + +#if defined(EMBREE_GEOMETRY_TRIANGLE) + Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH4Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#if defined(__AVX__) + Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode); + + Builder* BVH8Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_QUAD) + Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode); + Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif + +#endif + +#if defined(EMBREE_GEOMETRY_USER) + Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode); + Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); } +#endif +#endif + +#if defined(EMBREE_GEOMETRY_INSTANCE) + Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } + +#if defined(__AVX__) + Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode); + Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); } +#endif +#endif + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h new file mode 100644 index 0000000000..4aa9bdd7cc --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h @@ -0,0 +1,95 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "../bvh/bvh.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNRefitter + { + public: + + /*! Type shortcuts */ + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + struct LeafBoundsInterface { + virtual const BBox3fa leafBounds(NodeRef& ref) const = 0; + }; + + public: + + /*! Constructor. */ + BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds); + + /*! refits the BVH */ + void refit(); + + private: + /* single-threaded subtree extraction based on BVH depth */ + void gather_subtree_refs(NodeRef& ref, + size_t &subtrees, + const size_t depth = 0); + + /* single-threaded top-level refit */ + BBox3fa refit_toplevel(NodeRef& ref, + size_t &subtrees, + const BBox3fa *const subTreeBounds, + const size_t depth = 0); + + /* single-threaded subtree refit */ + BBox3fa recurse_bottom(NodeRef& ref); + + public: + BVH* bvh; //!< BVH to refit + const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves + + static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4 : (N==8) ? 3 : 3; + static const size_t MAX_NUM_SUB_TREES = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH + size_t numSubTrees; + NodeRef subTrees[MAX_NUM_SUB_TREES]; + }; + + template<int N, typename Mesh, typename Primitive> + class BVHNRefitT : public Builder, public BVHNRefitter<N>::LeafBoundsInterface + { + public: + + /*! Type shortcuts */ + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::NodeRef NodeRef; + + public: + BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode); + + virtual void build(); + + virtual void clear(); + + virtual const BBox3fa leafBounds (NodeRef& ref) const + { + size_t num; char* prim = ref.leaf(num); + if (unlikely(ref == BVH::emptyNode)) return empty; + + BBox3fa bounds = empty; + for (size_t i=0; i<num; i++) + bounds.extend(((Primitive*)prim)[i].update(mesh)); + return bounds; + } + + private: + BVH* bvh; + std::unique_ptr<Builder> builder; + std::unique_ptr<BVHNRefitter<N>> refitter; + Mesh* mesh; + unsigned int topologyVersion; + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp new file mode 100644 index 0000000000..2bb431bf0e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp @@ -0,0 +1,127 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_rotate.h" + +namespace embree +{ + namespace isa + { + /*! Computes half surface area of box. */ + __forceinline float halfArea3f(const BBox<vfloat4>& box) { + const vfloat4 d = box.size(); + const vfloat4 a = d*shuffle<1,2,0,3>(d); + return a[0]+a[1]+a[2]; + } + + size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth) + { + /*! nothing to rotate if we reached a leaf node. */ + if (parentRef.isBarrier()) return 0; + if (parentRef.isLeaf()) return 0; + AABBNode* parent = parentRef.getAABBNode(); + + /*! rotate all children first */ + vint4 cdepth; + for (size_t c=0; c<4; c++) + cdepth[c] = (int)rotate(parent->child(c),depth+1); + + /* compute current areas of all children */ + vfloat4 sizeX = parent->upper_x-parent->lower_x; + vfloat4 sizeY = parent->upper_y-parent->lower_y; + vfloat4 sizeZ = parent->upper_z-parent->lower_z; + vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ); + + /*! get node bounds */ + BBox<vfloat4> child1_0,child1_1,child1_2,child1_3; + parent->bounds(child1_0,child1_1,child1_2,child1_3); + + /*! Find best rotation. We pick a first child (child1) and a sub-child + (child2child) of a different second child (child2), and swap child1 + and child2child. We perform the best such swap. */ + float bestArea = 0; + size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; + for (size_t c2=0; c2<4; c2++) + { + /*! ignore leaf nodes as we cannot descent into them */ + if (parent->child(c2).isBarrier()) continue; + if (parent->child(c2).isLeaf()) continue; + AABBNode* child2 = parent->child(c2).getAABBNode(); + + /*! transpose child bounds */ + BBox<vfloat4> child2c0,child2c1,child2c2,child2c3; + child2->bounds(child2c0,child2c1,child2c2,child2c3); + + /*! put child1_0 at each child2 position */ + float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3)); + float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3)); + float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3)); + float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0)); + vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03); + vfloat4 min0 = vreduce_min(cost0); + int pos0 = (int)bsf(movemask(min0 == cost0)); + + /*! put child1_1 at each child2 position */ + float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3)); + float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3)); + float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3)); + float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1)); + vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13); + vfloat4 min1 = vreduce_min(cost1); + int pos1 = (int)bsf(movemask(min1 == cost1)); + + /*! put child1_2 at each child2 position */ + float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3)); + float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3)); + float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3)); + float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2)); + vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23); + vfloat4 min2 = vreduce_min(cost2); + int pos2 = (int)bsf(movemask(min2 == cost2)); + + /*! put child1_3 at each child2 position */ + float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3)); + float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3)); + float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3)); + float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3)); + vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33); + vfloat4 min3 = vreduce_min(cost3); + int pos3 = (int)bsf(movemask(min3 == cost3)); + + /*! find best other child */ + vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]); + int pos[4] = { pos0,pos1,pos2,pos3 }; + const size_t mbd = BVH4::maxBuildDepth; + vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints + valid &= vint4(int(c2)) != vint4(step); + if (none(valid)) continue; + size_t c1 = select_min(valid,area0123); + float area = area0123[c1]; + if (c1 == c2) continue; // can happen if bounds are NANs + + /*! accept a swap when it reduces cost and is not swapping a node with itself */ + if (area < bestArea) { + bestArea = area; + bestChild1 = c1; + bestChild2 = c2; + bestChild2Child = pos[c1]; + } + } + + /*! if we did not find a swap that improves the SAH then do nothing */ + if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth); + + /*! perform the best found tree rotation */ + AABBNode* child2 = parent->child(bestChild2).getAABBNode(); + AABBNode::swap(parent,bestChild1,child2,bestChild2Child); + parent->setBounds(bestChild2,child2->bounds()); + AABBNode::compact(parent); + AABBNode::compact(child2); + + /*! This returned depth is conservative as the child that was + * pulled up in the tree could have been on the critical path. */ + cdepth[bestChild1]++; // bestChild1 was pushed down one level + return 1+reduce_max(cdepth); + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h new file mode 100644 index 0000000000..009bef339e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h @@ -0,0 +1,37 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + template<int N> + class BVHNRotate + { + typedef typename BVHN<N>::NodeRef NodeRef; + + public: + static const bool enabled = false; + + static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; } + static __forceinline void restructure(NodeRef ref, size_t depth = 1) {} + }; + + /* BVH4 tree rotations */ + template<> + class BVHNRotate<4> + { + typedef BVH4::AABBNode AABBNode; + typedef BVH4::NodeRef NodeRef; + + public: + static const bool enabled = true; + + static size_t rotate(NodeRef parentRef, size_t depth = 1); + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp new file mode 100644 index 0000000000..aa56035026 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp @@ -0,0 +1,168 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_statistics.h" +#include "../../common/algorithms/parallel_reduce.h" + +namespace embree +{ + template<int N> + BVHNStatistics<N>::BVHNStatistics (BVH* bvh) : bvh(bvh) + { + double A = max(0.0f,bvh->getLinearBounds().expectedHalfArea()); + stat = statistics(bvh->root,A,BBox1f(0.0f,1.0f)); + } + + template<int N> + std::string BVHNStatistics<N>::str() + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << " primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl; + size_t totalBytes = stat.bytes(bvh); + double totalSAH = stat.sah(bvh); + stream << " total : sah = " << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), "; + stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl; + if (stat.statAABBNodes.numNodes ) stream << " getAABBNodes : " << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodes.numNodes ) stream << " ungetAABBNodes : " << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB.numNodes ) stream << " getAABBNodesMB : " << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statAABBNodesMB4D.numNodes) stream << " getAABBNodesMB4D : " << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statOBBNodesMB.numNodes) stream << " ungetAABBNodesMB : " << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; + if (stat.statQuantizedNodes.numNodes ) stream << " quantizedNodes : " << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " leaves : " << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl; + if (true) stream << " histogram : " << stat.statLeaf.histToString() << std::endl; + return stream.str(); + } + + template<int N> + typename BVHNStatistics<N>::Statistics BVHNStatistics<N>::statistics(NodeRef node, const double A, const BBox1f t0t1) + { + Statistics s; + assert(t0t1.size() > 0.0f); + double dt = max(0.0f,t0t1.size()); + if (node.isAABBNode()) + { + AABBNode* n = node.getAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extend(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodes.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodes.numNodes++; + s.statAABBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNode()) + { + OBBNode* n = node.ungetAABBNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodes.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodes.numNodes++; + s.statOBBNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB()) + { + AABBNodeMB* n = node.getAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1)); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statAABBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB.numNodes++; + s.statAABBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isAABBNodeMB4D()) + { + AABBNodeMB4D* n = node.getAABBNodeMB4D(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const BBox1f t0t1i = intersect(t0t1,n->timeRange(i)); + assert(!t0t1i.empty()); + const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i); + Statistics s = statistics(n->child(i),Ai,t0t1i); + s.statAABBNodesMB4D.numChildren++; + return s; + }, Statistics::add); + s.statAABBNodesMB4D.numNodes++; + s.statAABBNodesMB4D.nodeSAH += dt*A; + s.depth++; + } + else if (node.isOBBNodeMB()) + { + OBBNodeMB* n = node.ungetAABBNodeMB(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent0(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statOBBNodesMB.numChildren++; + return s; + }, Statistics::add); + s.statOBBNodesMB.numNodes++; + s.statOBBNodesMB.nodeSAH += dt*A; + s.depth++; + } + else if (node.isQuantizedNode()) + { + QuantizedNode* n = node.quantizedNode(); + s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { + if (n->child(i) == BVH::emptyNode) return Statistics(); + const double Ai = max(0.0f,halfArea(n->extent(i))); + Statistics s = statistics(n->child(i),Ai,t0t1); + s.statQuantizedNodes.numChildren++; + return s; + }, Statistics::add); + s.statQuantizedNodes.numNodes++; + s.statQuantizedNodes.nodeSAH += dt*A; + s.depth++; + } + else if (node.isLeaf()) + { + size_t num; const char* tri = node.leaf(num); + if (num) + { + for (size_t i=0; i<num; i++) + { + const size_t bytes = bvh->primTy->getBytes(tri); + s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri); + s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri); + s.statLeaf.numBytes += bytes; + tri+=bytes; + } + s.statLeaf.numLeaves++; + s.statLeaf.numPrimBlocks += num; + s.statLeaf.leafSAH += dt*A*num; + if (num-1 < Statistics::LeafStat::NHIST) { + s.statLeaf.numPrimBlocksHistogram[num-1]++; + } + } + } + else { + // -- GODOT start -- + // throw std::runtime_error("not supported node type in bvh_statistics"); + abort(); + // -- GODOT end -- + } + return s; + } + +#if defined(__AVX__) + template class BVHNStatistics<8>; +#endif + +#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) + template class BVHNStatistics<4>; +#endif +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h new file mode 100644 index 0000000000..73dfc6fbcc --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h @@ -0,0 +1,285 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include <sstream> + +namespace embree +{ + template<int N> + class BVHNStatistics + { + typedef BVHN<N> BVH; + typedef typename BVH::AABBNode AABBNode; + typedef typename BVH::OBBNode OBBNode; + typedef typename BVH::AABBNodeMB AABBNodeMB; + typedef typename BVH::AABBNodeMB4D AABBNodeMB4D; + typedef typename BVH::OBBNodeMB OBBNodeMB; + typedef typename BVH::QuantizedNode QuantizedNode; + + typedef typename BVH::NodeRef NodeRef; + + struct Statistics + { + template<typename Node> + struct NodeStat + { + NodeStat ( double nodeSAH = 0, + size_t numNodes = 0, + size_t numChildren = 0) + : nodeSAH(nodeSAH), + numNodes(numNodes), + numChildren(numChildren) {} + + double sah(BVH* bvh) const { + return nodeSAH/bvh->getLinearBounds().expectedHalfArea(); + } + + size_t bytes() const { + return numNodes*sizeof(Node); + } + + size_t size() const { + return numNodes; + } + + double fillRateNom () const { return double(numChildren); } + double fillRateDen () const { return double(numNodes*N); } + double fillRate () const { return fillRateNom()/fillRateDen(); } + + __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b) + { + return NodeStat(a.nodeSAH + b.nodeSAH, + a.numNodes+b.numNodes, + a.numChildren+b.numChildren); + } + + std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); + stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << bytes()/1E6 << " MB "; + stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), "; + stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives); + return stream.str(); + } + + public: + double nodeSAH; + size_t numNodes; + size_t numChildren; + }; + + struct LeafStat + { + static const int NHIST = 8; + + LeafStat ( double leafSAH = 0.0f, + size_t numLeaves = 0, + size_t numPrimsActive = 0, + size_t numPrimsTotal = 0, + size_t numPrimBlocks = 0, + size_t numBytes = 0) + : leafSAH(leafSAH), + numLeaves(numLeaves), + numPrimsActive(numPrimsActive), + numPrimsTotal(numPrimsTotal), + numPrimBlocks(numPrimBlocks), + numBytes(numBytes) + { + for (size_t i=0; i<NHIST; i++) + numPrimBlocksHistogram[i] = 0; + } + + double sah(BVH* bvh) const { + return leafSAH/bvh->getLinearBounds().expectedHalfArea(); + } + + size_t bytes(BVH* bvh) const { + return numBytes; + } + + size_t size() const { + return numLeaves; + } + + double fillRateNom (BVH* bvh) const { return double(numPrimsActive); } + double fillRateDen (BVH* bvh) const { return double(numPrimsTotal); } + double fillRate (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); } + + __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b) + { + LeafStat stat(a.leafSAH + b.leafSAH, + a.numLeaves+b.numLeaves, + a.numPrimsActive+b.numPrimsActive, + a.numPrimsTotal+b.numPrimsTotal, + a.numPrimBlocks+b.numPrimBlocks, + a.numBytes+b.numBytes); + for (size_t i=0; i<NHIST; i++) { + stat.numPrimBlocksHistogram[i] += a.numPrimBlocksHistogram[i]; + stat.numPrimBlocksHistogram[i] += b.numPrimBlocksHistogram[i]; + } + return stat; + } + + std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh); + stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), "; + stream << "#bytes = " << std::setw(7) << std::setprecision(2) << double(bytes(bvh))/1E6 << " MB "; + stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes(bvh))/double(bytesTotal) << "%), "; + stream << "#nodes = " << std::setw(7) << numLeaves << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate(bvh) << "% filled), "; + stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes(bvh))/double(bvh->numPrimitives); + return stream.str(); + } + + std::string histToString() const + { + std::ostringstream stream; + stream.setf(std::ios::fixed, std::ios::floatfield); + for (size_t i=0; i<NHIST; i++) + stream << std::setw(6) << std::setprecision(2) << 100.0f*float(numPrimBlocksHistogram[i])/float(numLeaves) << "% "; + return stream.str(); + } + + public: + double leafSAH; //!< SAH of the leaves only + size_t numLeaves; //!< Number of leaf nodes. + size_t numPrimsActive; //!< Number of active primitives ( + size_t numPrimsTotal; //!< Number of active and inactive primitives + size_t numPrimBlocks; //!< Number of primitive blocks. + size_t numBytes; //!< Number of bytes of leaves. + size_t numPrimBlocksHistogram[8]; + }; + + public: + Statistics (size_t depth = 0, + LeafStat statLeaf = LeafStat(), + NodeStat<AABBNode> statAABBNodes = NodeStat<AABBNode>(), + NodeStat<OBBNode> statOBBNodes = NodeStat<OBBNode>(), + NodeStat<AABBNodeMB> statAABBNodesMB = NodeStat<AABBNodeMB>(), + NodeStat<AABBNodeMB4D> statAABBNodesMB4D = NodeStat<AABBNodeMB4D>(), + NodeStat<OBBNodeMB> statOBBNodesMB = NodeStat<OBBNodeMB>(), + NodeStat<QuantizedNode> statQuantizedNodes = NodeStat<QuantizedNode>()) + + : depth(depth), + statLeaf(statLeaf), + statAABBNodes(statAABBNodes), + statOBBNodes(statOBBNodes), + statAABBNodesMB(statAABBNodesMB), + statAABBNodesMB4D(statAABBNodesMB4D), + statOBBNodesMB(statOBBNodesMB), + statQuantizedNodes(statQuantizedNodes) {} + + double sah(BVH* bvh) const + { + return statLeaf.sah(bvh) + + statAABBNodes.sah(bvh) + + statOBBNodes.sah(bvh) + + statAABBNodesMB.sah(bvh) + + statAABBNodesMB4D.sah(bvh) + + statOBBNodesMB.sah(bvh) + + statQuantizedNodes.sah(bvh); + } + + size_t bytes(BVH* bvh) const { + return statLeaf.bytes(bvh) + + statAABBNodes.bytes() + + statOBBNodes.bytes() + + statAABBNodesMB.bytes() + + statAABBNodesMB4D.bytes() + + statOBBNodesMB.bytes() + + statQuantizedNodes.bytes(); + } + + size_t size() const + { + return statLeaf.size() + + statAABBNodes.size() + + statOBBNodes.size() + + statAABBNodesMB.size() + + statAABBNodesMB4D.size() + + statOBBNodesMB.size() + + statQuantizedNodes.size(); + } + + double fillRate (BVH* bvh) const + { + double nom = statLeaf.fillRateNom(bvh) + + statAABBNodes.fillRateNom() + + statOBBNodes.fillRateNom() + + statAABBNodesMB.fillRateNom() + + statAABBNodesMB4D.fillRateNom() + + statOBBNodesMB.fillRateNom() + + statQuantizedNodes.fillRateNom(); + double den = statLeaf.fillRateDen(bvh) + + statAABBNodes.fillRateDen() + + statOBBNodes.fillRateDen() + + statAABBNodesMB.fillRateDen() + + statAABBNodesMB4D.fillRateDen() + + statOBBNodesMB.fillRateDen() + + statQuantizedNodes.fillRateDen(); + return nom/den; + } + + friend Statistics operator+ ( const Statistics& a, const Statistics& b ) + { + return Statistics(max(a.depth,b.depth), + a.statLeaf + b.statLeaf, + a.statAABBNodes + b.statAABBNodes, + a.statOBBNodes + b.statOBBNodes, + a.statAABBNodesMB + b.statAABBNodesMB, + a.statAABBNodesMB4D + b.statAABBNodesMB4D, + a.statOBBNodesMB + b.statOBBNodesMB, + a.statQuantizedNodes + b.statQuantizedNodes); + } + + static Statistics add ( const Statistics& a, const Statistics& b ) { + return a+b; + } + + public: + size_t depth; + LeafStat statLeaf; + NodeStat<AABBNode> statAABBNodes; + NodeStat<OBBNode> statOBBNodes; + NodeStat<AABBNodeMB> statAABBNodesMB; + NodeStat<AABBNodeMB4D> statAABBNodesMB4D; + NodeStat<OBBNodeMB> statOBBNodesMB; + NodeStat<QuantizedNode> statQuantizedNodes; + }; + + public: + + /* Constructor gathers statistics. */ + BVHNStatistics (BVH* bvh); + + /*! Convert statistics into a string */ + std::string str(); + + double sah() const { + return stat.sah(bvh); + } + + size_t bytesUsed() const { + return stat.bytes(bvh); + } + + private: + Statistics statistics(NodeRef node, const double A, const BBox1f dt); + + private: + BVH* bvh; + Statistics stat; + }; + + typedef BVHNStatistics<4> BVH4Statistics; + typedef BVHNStatistics<8> BVH8Statistics; +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h new file mode 100644 index 0000000000..7f17084b81 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h @@ -0,0 +1,676 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "node_intersector1.h" +#include "../common/stack_item.h" + +#define NEW_SORTING_CODE 1 + +namespace embree +{ + namespace isa + { + /*! BVH regular node traversal for single rays. */ + template<int N, int Nx, int types> + class BVHNNodeTraverser1Hit; + + /*! Helper functions for fast sorting using AVX512 instructions. */ +#if defined(__AVX512ER__) + + /* KNL code path */ + __forceinline void isort_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) + { + const vfloat16 dist_shift = align_shift_right<15>(dist,dist); + const vllong8 ptr_shift = align_shift_right<7>(ptr,ptr); + const vbool16 m_geq = d >= dist; + const vbool16 m_geq_shift = m_geq << 1; + dist = select(m_geq,d,dist); + ptr = select(vboold8(m_geq),p,ptr); + dist = select(m_geq_shift,dist_shift,dist); + ptr = select(vboold8(m_geq_shift),ptr_shift,ptr); + } + + __forceinline void isort_quick_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) + { + //dist = align_shift_right<15>(dist,d); + //ptr = align_shift_right<7>(ptr,p); + dist = align_shift_right<15>(dist,permute(d,vint16(zero))); + ptr = align_shift_right<7>(ptr,permute(p,vllong8(zero))); + } + + template<int N, int Nx, int types, class NodeRef, class BaseNode> + __forceinline void traverseClosestHitAVX512(NodeRef& cur, + size_t mask, + const vfloat<Nx>& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + vllong8 children( vllong<N>::loadu((void*)node->children) ); + children = vllong8::compact((int)mask,children); + vfloat16 distance = tNear; + distance = vfloat16::compact((int)mask,distance,tNear); + + cur = toScalar(children); + BVHN<N>::prefetch(cur,types); + + mask &= mask-1; + if (likely(mask == 0)) return; + + /* 2 hits: order A0 B0 */ + const vllong8 c0(children); + const vfloat16 d0(distance); + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + const vllong8 c1(children); + const vfloat16 d1(distance); + + cur = toScalar(children); + BVHN<N>::prefetch(cur,types); + + /* a '<' keeps the order for equal distances, scenes like powerplant largely benefit from it */ + const vboolf16 m_dist = d0 < d1; + const vfloat16 dist_A0 = select(m_dist, d0, d1); + const vfloat16 dist_B0 = select(m_dist, d1, d0); + const vllong8 ptr_A0 = select(vboold8(m_dist), c0, c1); + const vllong8 ptr_B0 = select(vboold8(m_dist), c1, c0); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = toScalar(ptr_A0); + stackPtr[0].ptr = toScalar(ptr_B0); + *(float*)&stackPtr[0].dist = toScalar(dist_B0); + stackPtr++; + return; + } + + /* 3 hits: order A1 B1 C1 */ + + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + + const vllong8 c2(children); + const vfloat16 d2(distance); + + cur = toScalar(children); + BVHN<N>::prefetch(cur,types); + + const vboolf16 m_dist1 = dist_A0 <= d2; + const vfloat16 dist_tmp_B1 = select(m_dist1, d2, dist_A0); + const vllong8 ptr_A1 = select(vboold8(m_dist1), ptr_A0, c2); + const vllong8 ptr_tmp_B1 = select(vboold8(m_dist1), c2, ptr_A0); + + const vboolf16 m_dist2 = dist_B0 <= dist_tmp_B1; + const vfloat16 dist_B1 = select(m_dist2, dist_B0 , dist_tmp_B1); + const vfloat16 dist_C1 = select(m_dist2, dist_tmp_B1, dist_B0); + const vllong8 ptr_B1 = select(vboold8(m_dist2), ptr_B0, ptr_tmp_B1); + const vllong8 ptr_C1 = select(vboold8(m_dist2), ptr_tmp_B1, ptr_B0); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = toScalar(ptr_A1); + stackPtr[0].ptr = toScalar(ptr_C1); + *(float*)&stackPtr[0].dist = toScalar(dist_C1); + stackPtr[1].ptr = toScalar(ptr_B1); + *(float*)&stackPtr[1].dist = toScalar(dist_B1); + stackPtr+=2; + return; + } + + /* 4 hits: order A2 B2 C2 D2 */ + + const vfloat16 dist_A1 = select(m_dist1, dist_A0, d2); + + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + + const vllong8 c3(children); + const vfloat16 d3(distance); + + cur = toScalar(children); + BVHN<N>::prefetch(cur,types); + + const vboolf16 m_dist3 = dist_A1 <= d3; + const vfloat16 dist_tmp_B2 = select(m_dist3, d3, dist_A1); + const vllong8 ptr_A2 = select(vboold8(m_dist3), ptr_A1, c3); + const vllong8 ptr_tmp_B2 = select(vboold8(m_dist3), c3, ptr_A1); + + const vboolf16 m_dist4 = dist_B1 <= dist_tmp_B2; + const vfloat16 dist_B2 = select(m_dist4, dist_B1 , dist_tmp_B2); + const vfloat16 dist_tmp_C2 = select(m_dist4, dist_tmp_B2, dist_B1); + const vllong8 ptr_B2 = select(vboold8(m_dist4), ptr_B1, ptr_tmp_B2); + const vllong8 ptr_tmp_C2 = select(vboold8(m_dist4), ptr_tmp_B2, ptr_B1); + + const vboolf16 m_dist5 = dist_C1 <= dist_tmp_C2; + const vfloat16 dist_C2 = select(m_dist5, dist_C1 , dist_tmp_C2); + const vfloat16 dist_D2 = select(m_dist5, dist_tmp_C2, dist_C1); + const vllong8 ptr_C2 = select(vboold8(m_dist5), ptr_C1, ptr_tmp_C2); + const vllong8 ptr_D2 = select(vboold8(m_dist5), ptr_tmp_C2, ptr_C1); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = toScalar(ptr_A2); + stackPtr[0].ptr = toScalar(ptr_D2); + *(float*)&stackPtr[0].dist = toScalar(dist_D2); + stackPtr[1].ptr = toScalar(ptr_C2); + *(float*)&stackPtr[1].dist = toScalar(dist_C2); + stackPtr[2].ptr = toScalar(ptr_B2); + *(float*)&stackPtr[2].dist = toScalar(dist_B2); + stackPtr+=3; + return; + } + + /* >=5 hits: reverse to descending order for writing to stack */ + + const size_t hits = 4 + popcnt(mask); + const vfloat16 dist_A2 = select(m_dist3, dist_A1, d3); + vfloat16 dist(neg_inf); + vllong8 ptr(zero); + + + isort_quick_update(dist,ptr,dist_A2,ptr_A2); + isort_quick_update(dist,ptr,dist_B2,ptr_B2); + isort_quick_update(dist,ptr,dist_C2,ptr_C2); + isort_quick_update(dist,ptr,dist_D2,ptr_D2); + + do { + + children = align_shift_right<1>(children,children); + distance = align_shift_right<1>(distance,distance); + + cur = toScalar(children); + BVHN<N>::prefetch(cur,types); + + const vfloat16 new_dist(permute(distance,vint16(zero))); + const vllong8 new_ptr(permute(children,vllong8(zero))); + + mask &= mask-1; + isort_update(dist,ptr,new_dist,new_ptr); + + } while(mask); + + const vboold8 m_stack_ptr(0x55); // 10101010 (lsb -> msb) + const vboolf16 m_stack_dist(0x4444); // 0010001000100010 (lsb -> msb) + + /* extract current noderef */ + cur = toScalar(permute(ptr,vllong8(hits-1))); + /* rearrange pointers to beginning of 16 bytes block */ + vllong8 stackElementA0; + stackElementA0 = vllong8::expand(m_stack_ptr,ptr,stackElementA0); + /* put distances in between */ + vuint16 stackElementA1((__m512i)stackElementA0); + stackElementA1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementA1); + /* write out first 4 x 16 bytes block to stack */ + vuint16::storeu(stackPtr,stackElementA1); + /* get upper half of dist and ptr */ + dist = align_shift_right<4>(dist,dist); + ptr = align_shift_right<4>(ptr,ptr); + /* assemble and write out second block */ + vllong8 stackElementB0; + stackElementB0 = vllong8::expand(m_stack_ptr,ptr,stackElementB0); + vuint16 stackElementB1((__m512i)stackElementB0); + stackElementB1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementB1); + vuint16::storeu(stackPtr + 4,stackElementB1); + /* increase stack pointer */ + stackPtr += hits-1; + } +#endif + +#if defined(__AVX512VL__) // SKX + + template<int N> + __forceinline void isort_update(vint<N> &dist, const vint<N> &d) + { + const vint<N> dist_shift = align_shift_right<N-1>(dist,dist); + const vboolf<N> m_geq = d >= dist; + const vboolf<N> m_geq_shift = m_geq << 1; + dist = select(m_geq,d,dist); + dist = select(m_geq_shift,dist_shift,dist); + } + + template<int N> + __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) { + dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero))); + } + + __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { + return toScalar(permutex2var((__m256i)index,n0,n1)); + } + + __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { + return toScalar(permute(n,index)); + } + +#endif + + /* Specialization for BVH4. */ + template<int Nx, int types> + class BVHNNodeTraverser1Hit<4, Nx, types> + { + typedef BVH4 BVH; + typedef BVH4::NodeRef NodeRef; + typedef BVH4::BaseNode BaseNode; + + + public: + /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */ + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat<Nx>& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); +#if defined(__AVX512ER__) + traverseClosestHitAVX512<4,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); +#else + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } + +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + StackItemT<NodeRef>::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; +#else + /*! Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif +#endif + } + + /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */ + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat<Nx>& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + + /* Specialization for BVH8. */ + template<int Nx, int types> + class BVHNNodeTraverser1Hit<8, Nx, types> + { + typedef BVH8 BVH; + typedef BVH8::NodeRef NodeRef; + typedef BVH8::BaseNode BaseNode; + +#if defined(__AVX512VL__) + template<class NodeRef, class BaseNode> + static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur, + size_t mask, + const vfloat8& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]); + const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]); + vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); + distance_i = vint8::compact((int)mask,distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + + mask &= mask-1; + if (likely(mask == 0)) return; + + /* 2 hits: order A0 B0 */ + const vint8 d0(distance_i); + const vint8 d1(shuffle<1>(distance_i)); + cur = permuteExtract(d1,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A0 = min(d0, d1); + const vint8 dist_B0 = max(d0, d1); + assert(dist_A0[0] < dist_B0[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A0,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); + stackPtr++; + return; + } + + /* 3 hits: order A1 B1 C1 */ + + const vint8 d2(shuffle<2>(distance_i)); + cur = permuteExtract(d2,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A1 = min(dist_A0,d2); + const vint8 dist_tmp_B1 = max(dist_A0,d2); + const vint8 dist_B1 = min(dist_B0,dist_tmp_B1); + const vint8 dist_C1 = max(dist_B0,dist_tmp_B1); + assert(dist_A1[0] < dist_B1[0]); + assert(dist_B1[0] < dist_C1[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A1,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear); + stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1); + *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear); + stackPtr+=2; + return; + } + + /* 4 hits: order A2 B2 C2 D2 */ + + const vint8 d3(shuffle<3>(distance_i)); + cur = permuteExtract(d3,n0,n1); + BVH::prefetch(cur,types); + + const vint8 dist_A2 = min(dist_A1,d3); + const vint8 dist_tmp_B2 = max(dist_A1,d3); + const vint8 dist_B2 = min(dist_B1,dist_tmp_B2); + const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2); + const vint8 dist_C2 = min(dist_C1,dist_tmp_C2); + const vint8 dist_D2 = max(dist_C1,dist_tmp_C2); + assert(dist_A2[0] < dist_B2[0]); + assert(dist_B2[0] < dist_C2[0]); + assert(dist_C2[0] < dist_D2[0]); + + mask &= mask-1; + if (likely(mask == 0)) { + cur = permuteExtract(dist_A2,n0,n1); + stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1); + *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear); + stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1); + *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear); + stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1); + *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear); + stackPtr+=3; + return; + } + + /* >=5 hits: reverse to descending order for writing to stack */ + + distance_i = align_shift_right<3>(distance_i,distance_i); + const size_t hits = 4 + popcnt(mask); + vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert + + isort_quick_update(dist,dist_A2); + isort_quick_update(dist,dist_B2); + isort_quick_update(dist,dist_C2); + isort_quick_update(dist,dist_D2); + + do { + + distance_i = align_shift_right<1>(distance_i,distance_i); + cur = permuteExtract(distance_i,n0,n1); + BVH::prefetch(cur,types); + const vint8 new_dist(permute(distance_i,vint8(zero))); + mask &= mask-1; + isort_update(dist,new_dist); + + } while(mask); + + for (size_t i=0; i<7; i++) + assert(dist[i+0]>=dist[i+1]); + + for (size_t i=0;i<hits-1;i++) + { + stackPtr->ptr = permuteExtract(dist,n0,n1); + *(float*)&stackPtr->dist = permuteExtract(dist,tNear); + dist = align_shift_right<1>(dist,dist); + stackPtr++; + } + cur = permuteExtract(dist,n0,n1); + } +#endif + + public: + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t mask, + const vfloat<Nx>& tNear, + StackItemT<NodeRef>*& stackPtr, + StackItemT<NodeRef>* stackEnd) + { + assert(mask != 0); +#if defined(__AVX512ER__) + traverseClosestHitAVX512<8,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); +#elif defined(__AVX512VL__) + traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd); +#else + + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + if (likely(mask == 0)) { + assert(cur != BVH::emptyNode); + return; + } + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + const unsigned int d0 = ((unsigned int*)&tNear)[r]; + r = bscf(mask); + NodeRef c1 = node->child(r); + BVH::prefetch(c1,types); + const unsigned int d1 = ((unsigned int*)&tNear)[r]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + assert(stackPtr < stackEnd); + if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; } + else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; } + } +#if NEW_SORTING_CODE == 1 + vint4 s0((size_t)c0,(size_t)d0); + vint4 s1((size_t)c1,(size_t)d1); + + r = bscf(mask); + NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; + vint4 s2((size_t)c2,(size_t)d2); + /* 3 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort3(s0,s1,s2); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; + cur = toSizeT(s2); + stackPtr+=2; + return; + } + r = bscf(mask); + NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; + vint4 s3((size_t)c3,(size_t)d3); + /* 4 hits */ + if (likely(mask == 0)) { + StackItemT<NodeRef>::sort4(s0,s1,s2,s3); + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; + cur = toSizeT(s3); + stackPtr+=3; + return; + } + *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3; + /*! fallback case if more than 4 children are hit */ + StackItemT<NodeRef>* stackFirst = stackPtr; + stackPtr+=4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; + const vint4 s((size_t)c,(size_t)d); + *(vint4*)stackPtr++ = s; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#else + /*! Here starts the slow path for 3 or 4 hit children. We push + * all nodes onto the stack to sort them there. */ + assert(stackPtr < stackEnd); + stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; + assert(stackPtr < stackEnd); + stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; + + /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + + /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */ + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (likely(mask == 0)) { + sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; + return; + } + /*! fallback case if more than 4 children are hit */ + StackItemT<NodeRef>* stackFirst = stackPtr-4; + while (1) + { + assert(stackPtr < stackEnd); + r = bscf(mask); + c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++; + assert(c != BVH::emptyNode); + if (unlikely(mask == 0)) break; + } + sort(stackFirst,stackPtr); + cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; +#endif +#endif + } + + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t mask, + const vfloat<Nx>& tNear, + NodeRef*& stackPtr, + NodeRef* stackEnd) + { + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVH::prefetch(cur,types); + + /* simpler in sequence traversal order */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); BVH::prefetch(cur,types); + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + assert(stackPtr < stackEnd); + *stackPtr = cur; stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h new file mode 100644 index 0000000000..9c603babf0 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h @@ -0,0 +1,154 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" +#include "../common/ray.h" +#include "../common/stack_item.h" + +namespace embree +{ + namespace isa + { + template<int N, int Nx, int types> + class BVHNNodeTraverserStreamHitCoherent + { + typedef BVHN<N> BVH; + typedef typename BVH::NodeRef NodeRef; + typedef typename BVH::BaseNode BaseNode; + + public: + template<class T> + static __forceinline void traverseClosestHit(NodeRef& cur, + size_t& m_trav_active, + const vbool<Nx>& vmask, + const vfloat<Nx>& tNear, + const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + const size_t r0 = bscf(mask); + assert(r0 < 8); + cur = node->child(r0); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r0]; + assert(cur != BVH::emptyNode); + if (unlikely(mask == 0)) return; + + const unsigned int* const tNear_i = (unsigned int*)&tNear; + + /*! two children are hit, push far child, and continue with closer child */ + NodeRef c0 = cur; + unsigned int d0 = tNear_i[r0]; + const size_t r1 = bscf(mask); + assert(r1 < 8); + NodeRef c1 = node->child(r1); + BVHN<N>::prefetch(c1,types); + unsigned int d1 = tNear_i[r1]; + + assert(c0 != BVH::emptyNode); + assert(c1 != BVH::emptyNode); + if (likely(mask == 0)) { + if (d0 < d1) { + assert(tNear[r1] >= 0.0f); + stackPtr->mask = tMask[r1]; + stackPtr->parent = parent; + stackPtr->child = c1; + stackPtr++; + cur = c0; + m_trav_active = tMask[r0]; + return; + } + else { + assert(tNear[r0] >= 0.0f); + stackPtr->mask = tMask[r0]; + stackPtr->parent = parent; + stackPtr->child = c0; + stackPtr++; + cur = c1; + m_trav_active = tMask[r1]; + return; + } + } + + /*! slow path for more than two hits */ + size_t hits = movemask(vmask); + const vint<Nx> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<Nx>(step), 0); + #if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + const vint<N> tmp = extractN<N,0>(dist_i); + const vint<Nx> dist_i_sorted = usort_descending(tmp); + #else + const vint<Nx> dist_i_sorted = usort_descending(dist_i); + #endif + const vint<Nx> sorted_index = dist_i_sorted & 7; + + size_t i = 0; + for (;;) + { + const unsigned int index = sorted_index[i]; + assert(index < 8); + cur = node->child(index); + m_trav_active = tMask[index]; + assert(m_trav_active); + BVHN<N>::prefetch(cur,types); + bscf(hits); + if (unlikely(hits==0)) break; + i++; + assert(cur != BVH::emptyNode); + assert(tNear[index] >= 0.0f); + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + + template<class T> + static __forceinline void traverseAnyHit(NodeRef& cur, + size_t& m_trav_active, + const vbool<Nx>& vmask, + const T* const tMask, + StackItemMaskCoherent*& stackPtr) + { + const NodeRef parent = cur; + size_t mask = movemask(vmask); + assert(mask != 0); + const BaseNode* node = cur.baseNode(); + + /*! one child is hit, continue with that child */ + size_t r = bscf(mask); + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r]; + + /* simple in order sequence */ + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + + for (; ;) + { + r = bscf(mask); + cur = node->child(r); + BVHN<N>::prefetch(cur,types); + m_trav_active = tMask[r]; + assert(cur != BVH::emptyNode); + if (likely(mask == 0)) return; + stackPtr->mask = m_trav_active; + stackPtr->parent = parent; + stackPtr->child = cur; + stackPtr++; + } + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h new file mode 100644 index 0000000000..a978c0c459 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h @@ -0,0 +1,31 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "bvh.h" + +namespace embree +{ + namespace isa + { + struct NearFarPrecalculations + { + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + + __forceinline NearFarPrecalculations() {} + + __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N) + { + const size_t size = sizeof(float)*N; + nearX = (dir.x < 0.0f) ? 1*size : 0*size; + nearY = (dir.y < 0.0f) ? 3*size : 2*size; + nearZ = (dir.z < 0.0f) ? 5*size : 4*size; + farX = nearX ^ size; + farY = nearY ^ size; + farZ = nearZ ^ size; + } + }; + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h new file mode 100644 index 0000000000..aa0d4ba4d7 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h @@ -0,0 +1,1788 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +#if defined(__AVX2__) +#define __FMA_X4__ +#endif + +#if defined(__aarch64__) +#define __FMA_X4__ +#endif + + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx, bool robust> + struct TravRayBase; + + /* Base (without tnear and tfar) */ + template<int N, int Nx> + struct TravRayBase<N,Nx,false> + { + __forceinline TravRayBase() {} + + __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const Vec3fa ray_rdir = rcp_safe(ray_dir); + org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); + rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z); +#if defined(__FMA_X4__) + const Vec3fa ray_org_rdir = ray_org*ray_rdir; +#if !defined(__aarch64__) + org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); +#else + //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd + //x86 will use msub + neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z); +#endif +#endif + nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); + nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); + nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>); + farX = nearX ^ sizeof(vfloat<N>); + farY = nearY ^ sizeof(vfloat<N>); + farZ = nearZ ^ sizeof(vfloat<N>); + +#if defined(__AVX512ER__) // KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + + } + + template<int K> + __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + size_t flip = sizeof(vfloat<N>)) + { + org = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir = Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); +#if defined(__FMA_X4__) +#if !defined(__aarch64__) + org_rdir = org*rdir; +#else + neg_org_rdir = -(org*rdir); +#endif +#endif + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + +#if defined(__AVX512ER__) // KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf<Nx> org, dir, rdir; +#if defined(__FMA_X4__) +#if !defined(__aarch64__) + Vec3vf<Nx> org_rdir; +#else + //aarch64 version are keeping negation of the org_rdir and use madd + //x86 uses msub + Vec3vf<Nx> neg_org_rdir; +#endif +#endif +#if defined(__AVX512ER__) // KNL+ + vint16 permX, permY, permZ; +#endif + + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Base (without tnear and tfar) */ + template<int N, int Nx> + struct TravRayBase<N,Nx,true> + { + __forceinline TravRayBase() {} + + __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir) + : org_xyz(ray_org), dir_xyz(ray_dir) + { + const float round_down = 1.0f-3.0f*float(ulp); + const float round_up = 1.0f+3.0f*float(ulp); + const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir); + const Vec3fa ray_rdir_near = round_down*ray_rdir; + const Vec3fa ray_rdir_far = round_up *ray_rdir; + org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z); + dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z); + rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z); + rdir_far = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z); + nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); + nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); + nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>); + farX = nearX ^ sizeof(vfloat<N>); + farY = nearY ^ sizeof(vfloat<N>); + farZ = nearZ ^ sizeof(vfloat<N>); + +#if defined(__AVX512ER__) // KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + } + + template<int K> + __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + size_t flip = sizeof(vfloat<N>)) + { + const vfloat<Nx> round_down = 1.0f-3.0f*float(ulp); + const vfloat<Nx> round_up = 1.0f+3.0f*float(ulp); + org = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]); + dir = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); + rdir_near = round_down*Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + rdir_far = round_up *Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); + + nearX = nearXYZ.x[k]; + nearY = nearXYZ.y[k]; + nearZ = nearXYZ.z[k]; + farX = nearX ^ flip; + farY = nearY ^ flip; + farZ = nearZ ^ flip; + +#if defined(__AVX512ER__) // KNL+ + /* optimization works only for 8-wide BVHs with 16-wide SIMD */ + const vint<16> id(step); + const vint<16> id2 = align_shift_right<16/2>(id, id); + permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2); + permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2); + permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2); +#endif + } + + Vec3fa org_xyz, dir_xyz; + Vec3vf<Nx> org, dir, rdir_near, rdir_far; +#if defined(__AVX512ER__) // KNL+ + vint16 permX, permY, permZ; +#endif + + size_t nearX, nearY, nearZ; + size_t farX, farY, farZ; + }; + + /* Full (with tnear and tfar) */ + template<int N, int Nx, bool robust> + struct TravRay : TravRayBase<N,Nx,robust> + { + __forceinline TravRay() {} + + __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar) + : TravRayBase<N,Nx,robust>(ray_org, ray_dir), + tnear(ray_tnear), tfar(ray_tfar) {} + + template<int K> + __forceinline TravRay(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, + const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ, + float ray_tnear, float ray_tfar, + size_t flip = sizeof(vfloat<N>)) + : TravRayBase<N,Nx,robust>(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip), + tnear(ray_tnear), tfar(ray_tfar) {} + + vfloat<Nx> tnear; + vfloat<Nx> tfar; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // Point Query structure used in single-ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + struct TravPointQuery + { + __forceinline TravPointQuery() {} + + __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad) + { + org = Vec3vf<N>(query_org.x, query_org.y, query_org.z); + rad = Vec3vf<N>(query_rad.x, query_rad.y, query_rad.z); + } + + __forceinline vfloat<N> const& tfar() const { + return rad.x; + } + + Vec3vf<N> org, rad; + }; + + ////////////////////////////////////////////////////////////////////////////////////// + // point query + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t pointQuerySphereDistAndMask( + const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, + vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ) + { + const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool<N> vmask = dist <= query.tfar()*query.tfar(); + const vbool<N> valid = minX <= maxX; + return movemask(vmask) & movemask(valid); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); + const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y)); + const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z)); + const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x)); + const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y)); + const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z)); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); + const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y); + const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z); + const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x); + const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y); + const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z); + const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); + const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0])); + const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0])); + const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0])); + const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0])); + const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0])); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeSphere(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> start_x(node->start.x); + const vfloat<N> scale_x(node->scale.x); + const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> start_y(node->start.y); + const vfloat<N> scale_y(node->scale.y); + const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> start_z(node->start.z); + const vfloat<N> scale_z(node->scale.z); + const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N> minX = node->dequantizeLowerX(time); + const vfloat<N> maxX = node->dequantizeUpperX(time); + const vfloat<N> minY = node->dequantizeLowerY(time); + const vfloat<N> maxY = node->dequantizeUpperY(time); + const vfloat<N> minZ = node->dequantizeLowerZ(time); + const vfloat<N> maxZ = node->dequantizeUpperZ(time); + return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask()); + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryAABBDistAndMask( + const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, + vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ) + { + const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; + const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y; + const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z; + dist = vX * vX + vY * vY + vZ * vZ; + const vbool<N> valid = minX <= maxX; + const vbool<N> vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) | + (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) | + (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z)); + return movemask(vmask) & movemask(valid); + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x)); + const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y)); + const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z)); + const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x)); + const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y)); + const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z)); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x); + const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y); + const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z); + const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x); + const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y); + const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z); + const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); + const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0])); + const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0])); + const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0])); + const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0])); + const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0])); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ); + } + + template<int N> + __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + size_t mask = pointQueryNodeAABB(node, query, time, dist); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t); + mask &= movemask(vmask); + } + + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat<N> start_x(node->start.x); + const vfloat<N> scale_x(node->scale.x); + const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x); + const vfloat<N> start_y(node->start.y); + const vfloat<N> scale_y(node->scale.y); + const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y); + const vfloat<N> start_z(node->start.z); + const vfloat<N> scale_z(node->scale.z); + const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat<N> minX = node->dequantizeLowerX(time); + const vfloat<N> maxX = node->dequantizeUpperX(time); + const vfloat<N> minY = node->dequantizeLowerY(time); + const vfloat<N> maxY = node->dequantizeUpperY(time); + const vfloat<N> minZ = node->dequantizeLowerZ(time); + const vfloat<N> maxZ = node->dequantizeUpperZ(time); + return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + template<int N> + __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + // TODO: point query - implement + const vbool<N> vmask = vbool<N>(true); + const size_t mask = movemask(vmask) & ((1<<N)-1); + dist = vfloat<N>(0.0f); + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist); + + template<> + __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) + { +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__aarch64__) + const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); + const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) + { +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); +#endif + +#else + const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + +#endif + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + + template<> + __forceinline size_t intersectNode<4,16>(const typename BVH4::AABBNode* node, const TravRay<4,16,false>& ray, vfloat16& dist) + { + const vfloat16 tNearX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z); + const vfloat16 tFarX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x); + const vfloat16 tFarY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y); + const vfloat16 tFarZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le(vbool16(0xf),tNear,tFar); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNode<8,16>(const typename BVH8::AABBNode* node, const TravRay<8,16,false>& ray, vfloat16& dist) + { + const vllong8 invalid((size_t)BVH8::emptyNode); + const vboold8 m_valid(invalid != vllong8::loadu(node->children)); + const vfloat16 bminmaxX = permute(vfloat16::load((const float*)&node->lower_x), ray.permX); + const vfloat16 bminmaxY = permute(vfloat16::load((const float*)&node->lower_y), ray.permY); + const vfloat16 bminmaxZ = permute(vfloat16::load((const float*)&node->lower_z), ray.permZ); + const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); + const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); + const vbool16 vmask = le(vboolf16(m_valid),tNear,align_shift_right<8>(tFar, tFar)); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + +#endif + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx> + __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist) + { + const vfloat<N> tNearX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tFarX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + + template<> + __forceinline size_t intersectNodeRobust<4,16>(const typename BVHN<4>::AABBNode* node, const TravRay<4,16,true>& ray, vfloat<16>& dist) + { + const vfloat16 tNearX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat16 tFarY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le((1 << 4)-1,tNear,tFar); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNodeRobust<8,16>(const typename BVHN<8>::AABBNode* node, const TravRay<8,16,true>& ray, vfloat<16>& dist) + { + const vfloat16 tNearX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x; + const vfloat16 tFarY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z; + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le((1 << 8)-1,tNear,tFar); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + +#endif + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNode(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist) + { + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW + const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<N)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool<N> vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + const vbool<N> vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist) + { + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); + const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + const size_t mask = movemask(tNear <= tFar); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeMB4D(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); +#if defined (__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z; +#endif +#if defined(__FMA_X4__) && !defined(__AVX512F__) + const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear)); + const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar )); +#else + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); +#endif + vbool<N> vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N> + __forceinline size_t intersectNodeMB4DRobust(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX); + const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY); + const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ); + const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ); + const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX); + const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY); + const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ); + const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z; + const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ); + vbool<N> vmask = tNear <= tFar; + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + vmask &= (node1->lower_t <= time) & (time < node1->upper_t); + } + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist); + + template<> + __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,false>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<4)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,true>& ray, vfloat4& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat4 start_x(node->start.x); + const vfloat4 scale_x(node->scale.x); + const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x); + const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x); + const vfloat4 start_y(node->start.y); + const vfloat4 scale_y(node->scale.y); + const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y); + const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y); + const vfloat4 start_z(node->start.z); + const vfloat4 scale_z(node->scale.z); + const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z); + const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); + + const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool4 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + dist = tNear; + return mask & mvalid; + } + + +#if defined(__AVX__) + + template<> + __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,false>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + +#if defined(__AVX2__) +#if defined(__aarch64__) + const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX2__) && !defined(__AVX512F__) // HSW + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) > asInt(tFar); + const size_t mask = movemask(vmask) ^ ((1<<8)-1); +#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = asInt(tNear) <= asInt(tFar); + const size_t mask = movemask(vmask); +#else + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); +#endif + dist = tNear; + return mask & mvalid; + } + + template<> + __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,true>& ray, vfloat8& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat8 start_x(node->start.x); + const vfloat8 scale_x(node->scale.x); + const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x); + const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x); + const vfloat8 start_y(node->start.y); + const vfloat8 scale_y(node->scale.y); + const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y); + const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y); + const vfloat8 start_z(node->start.z); + const vfloat8 scale_z(node->scale.z); + const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z); + const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); + + const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool8 vmask = tNear <= tFar; + const size_t mask = movemask(vmask); + + dist = tNear; + return mask & mvalid; + } + + +#endif + +#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL + + template<> + __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,false>& ray, vfloat16& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 start_x(node->start.x); + const vfloat16 scale_x(node->scale.x); + const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); + const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x); + const vfloat16 start_y(node->start.y); + const vfloat16 scale_y(node->scale.y); + const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y); + const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y); + const vfloat16 start_z(node->start.z); + const vfloat16 scale_z(node->scale.z); + const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z); + const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z); + + const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le(vbool16(0xf),tNear,tFar); + const size_t mask = movemask(vmask) & mvalid; + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,true>& ray, vfloat16& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 start_x(node->start.x); + const vfloat16 scale_x(node->scale.x); + const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x); + const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x); + const vfloat16 start_y(node->start.y); + const vfloat16 scale_y(node->scale.y); + const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y); + const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y); + const vfloat16 start_z(node->start.z); + const vfloat16 scale_z(node->scale.z); + const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z); + const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z); + + const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear); + const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar); + const vbool16 vmask = le(vbool16(0xf),tNear,tFar); + const size_t mask = movemask(vmask) & mvalid; + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,false>& ray, vfloat16& dist) + { + const vbool16 m_valid(node->validMask16()); + const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX); + const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY); + const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ); + const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); + const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); + const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<> + __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,true>& ray, vfloat16& dist) + { + const vbool16 m_valid(node->validMask16()); + const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX); + const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY); + const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ); + const vfloat16 tNearFarX = (bminmaxX - ray.org.x) * ray.rdir_far.x; // FIXME: this is not conservative !!!!!!!!! + const vfloat16 tNearFarY = (bminmaxY - ray.org.y) * ray.rdir_far.y; + const vfloat16 tNearFarZ = (bminmaxZ - ray.org.z) * ray.rdir_far.z; + const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear); + const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar); + const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar)); + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + +#endif + + + template<int N, int Nx> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist) + { + const vboolf<N> mvalid = node->validMask(); + const vfloat<N> lower_x = node->dequantizeLowerX(time); + const vfloat<N> upper_x = node->dequantizeUpperX(time); + const vfloat<N> lower_y = node->dequantizeLowerY(time); + const vfloat<N> upper_y = node->dequantizeUpperY(time); + const vfloat<N> lower_z = node->dequantizeLowerZ(time); + const vfloat<N> upper_z = node->dequantizeUpperZ(time); +#if defined(__FMA_X4__) +#if defined(__aarch64__) + const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<N> tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<N> tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<N> tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#endif +#else + const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat<N> tminX = mini(tNearX,tFarX); + const vfloat<N> tmaxX = maxi(tNearX,tFarX); + const vfloat<N> tminY = mini(tNearY,tFarY); + const vfloat<N> tmaxY = maxi(tNearY,tFarY); + const vfloat<N> tminZ = mini(tNearZ,tFarZ); + const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ); + const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + template<int N, int Nx> + __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist) + { + const vboolf<N> mvalid = node->validMask(); + const vfloat<N> lower_x = node->dequantizeLowerX(time); + const vfloat<N> upper_x = node->dequantizeUpperX(time); + const vfloat<N> lower_y = node->dequantizeLowerY(time); + const vfloat<N> upper_y = node->dequantizeUpperY(time); + const vfloat<N> lower_z = node->dequantizeLowerZ(time); + const vfloat<N> upper_z = node->dequantizeUpperZ(time); + const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat<N> tminX = mini(tNearX,tFarX); + const vfloat<N> tmaxX = maxi(tNearX,tFarX); + const vfloat<N> tminY = mini(tNearY,tFarY); + const vfloat<N> tmaxY = maxi(tNearY,tFarY); + const vfloat<N> tminZ = mini(tNearZ,tFarZ); + const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ); + const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear); + const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar); +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar)); +#else + const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid; +#endif + const size_t mask = movemask(vmask); + dist = tNear; + return mask; + } + + +#if defined(__AVX512ER__) + // for KNL + template<> + __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,false>& ray, const float time, vfloat<4>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 lower_x = node->dequantizeLowerX(time); + const vfloat16 upper_x = node->dequantizeUpperX(time); + const vfloat16 lower_y = node->dequantizeLowerY(time); + const vfloat16 upper_y = node->dequantizeUpperY(time); + const vfloat16 lower_z = node->dequantizeLowerZ(time); + const vfloat16 upper_z = node->dequantizeUpperZ(time); + + const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); + + const vfloat16 tminX = min(tNearX,tFarX); + const vfloat16 tmaxX = max(tNearX,tFarX); + const vfloat16 tminY = min(tNearY,tFarY); + const vfloat16 tmaxY = max(tNearY,tFarY); + const vfloat16 tminZ = min(tNearZ,tFarZ); + const vfloat16 tmaxZ = max(tNearZ,tFarZ); + const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear); + const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar ); + const vbool16 vmask = tNear <= tFar; + const size_t mask = movemask(vmask) & mvalid; + dist = extractN<4,0>(tNear); + return mask; + } + + + // for KNL + template<> + __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,true>& ray, const float time, vfloat<4>& dist) + { + const size_t mvalid = movemask(node->validMask()); + const vfloat16 lower_x = node->dequantizeLowerX(time); + const vfloat16 upper_x = node->dequantizeUpperX(time); + const vfloat16 lower_y = node->dequantizeLowerY(time); + const vfloat16 upper_y = node->dequantizeUpperY(time); + const vfloat16 lower_z = node->dequantizeLowerZ(time); + const vfloat16 upper_z = node->dequantizeUpperZ(time); + + const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x; + const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y; + const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z; + const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x; + const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y; + const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z; + + const vfloat16 tminX = min(tNearX,tFarX); + const vfloat16 tmaxX = max(tNearX,tFarX); + const vfloat16 tminY = min(tNearY,tFarY); + const vfloat16 tmaxY = max(tNearY,tFarY); + const vfloat16 tminZ = min(tNearZ,tFarZ); + const vfloat16 tmaxZ = max(tNearZ,tFarZ); + const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear); + const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar ); + const vbool16 vmask = tNear <= tFar; + const size_t mask = movemask(vmask) & mvalid; + dist = extractN<4,0>(tNear); + return mask; + } + +#endif + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::OBBNode* node, const TravRay<N,N,robust>& ray, vfloat<N>& dist) + { + const Vec3vf<N> dir = xfmVector(node->naabb,ray.dir); + //const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))/dir; + const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))*rcp_safe(dir); + const Vec3vf<N> org = xfmPoint(node->naabb,ray.org); + const Vec3vf<N> tLowerXYZ = org * nrdir; // (Vec3fa(zero) - org) * rdir; + const Vec3vf<N> tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir; + + const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp)); + } + const vbool<N> vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, bool robust> + __forceinline size_t intersectNode(const typename BVHN<N>::OBBNodeMB* node, const TravRay<N,N,robust>& ray, const float time, vfloat<N>& dist) + { + const AffineSpace3vf<N> xfm = node->space0; + const Vec3vf<N> b0_lower = zero; + const Vec3vf<N> b0_upper = one; + const Vec3vf<N> lower = lerp(b0_lower,node->b1.lower,vfloat<N>(time)); + const Vec3vf<N> upper = lerp(b0_upper,node->b1.upper,vfloat<N>(time)); + + const BBox3vf<N> bounds(lower,upper); + const Vec3vf<N> dir = xfmVector(xfm,ray.dir); + const Vec3vf<N> rdir = rcp_safe(dir); + const Vec3vf<N> org = xfmPoint(xfm,ray.org); + + const Vec3vf<N> tLowerXYZ = (bounds.lower - org) * rdir; + const Vec3vf<N> tUpperXYZ = (bounds.upper - org) * rdir; + + const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z); + const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x); + const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y); + const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z); + vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ); + vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ ); + if (robust) { + tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp)); + tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp)); + } + const vbool<N> vmask = tNear <= tFar; + dist = tNear; + return movemask(vmask); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in point query raversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Computes traversal information for N nodes with 1 point query */ + template<int N, int types> + struct BVHNNodePointQuerySphere1; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphereMB4D<N>(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN1_UN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeSphere(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeSphereMB4D(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQuerySphere1<N, BVH_QN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeSphere((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNQuantizedBaseNodePointQuerySphere1 + { + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + return pointQueryNodeSphere(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + return pointQueryNodeSphere(node,query,time,dist); + } + }; + + /*! Computes traversal information for N nodes with 1 point query */ + template<int N, int types> + struct BVHNNodePointQueryAABB1; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABBMB4D<N>(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN1_UN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = pointQueryNodeAABB(node.getAABBNode(), query, dist); + else if (unlikely(node.isOBBNode())) mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else return false; + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D_UN2> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist); + else mask = pointQueryNodeAABBMB4D(node, query, time, dist); + return true; + } + }; + + template<int N> + struct BVHNNodePointQueryAABB1<N, BVH_QN1> + { + static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = pointQueryNodeAABB((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist); + return true; + } + }; + + template<int N> + struct BVHNQuantizedBaseNodePointQueryAABB1 + { + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist) + { + return pointQueryNodeAABB(node,query,dist); + } + + static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist) + { + return pointQueryNodeAABB(node,query,time,dist); + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in ray traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Intersects N nodes with 1 ray */ + template<int N, int Nx, int types, bool robust> + struct BVHNNodeIntersector1; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNode(), ray, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4D<N>(node, ray, time, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeMB4DRobust<N>(node, ray, time, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNode(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (likely(node.isAABBNode())) mask = intersectNodeRobust(node.getAABBNode(), ray, dist); + else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); + else return false; + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (likely(node.isAABBNodeMB())) mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); + else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else return false; + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = intersectNodeMB4D(node, ray, time, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); + else mask = intersectNodeMB4DRobust(node, ray, time, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_QN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNode((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + template<int N, int Nx> + struct BVHNNodeIntersector1<N, Nx, BVH_QN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) + { + if (unlikely(node.isLeaf())) return false; + mask = intersectNodeRobust((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); + return true; + } + }; + + /*! Intersects N nodes with K rays */ + template<int N, int Nx, bool robust> + struct BVHNQuantizedBaseNodeIntersector1; + + template<int N, int Nx> + struct BVHNQuantizedBaseNodeIntersector1<N, Nx, false> + { + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,false>& ray, vfloat<Nx>& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + template<int N, int Nx> + struct BVHNQuantizedBaseNodeIntersector1<N, Nx, true> + { + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist) + { + return intersectNode(node,ray,dist); + } + + static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist) + { + return intersectNode(node,ray,time,dist); + } + + }; + + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h new file mode 100644 index 0000000000..800ac8b478 --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h @@ -0,0 +1,269 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Frustum structure used in hybrid and stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /* + Optimized frustum test. We calculate t=(p-org)/dir in ray/box + intersection. We assume the rays are split by octant, thus + dir intervals are either positive or negative in each + dimension. + + Case 1: dir.min >= 0 && dir.max >= 0: + t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + + Case 2: dir.min < 0 && dir.max < 0: + t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max + t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min + */ + + template<bool robust> + struct Frustum; + + /* Fast variant */ + template<> + struct Frustum<false> + { + __forceinline Frustum() {} + + template<int K> + __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(valid, org, rdir, ray_tnear, ray_tfar, N); + } + + template<int K> + __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + +#if defined (__aarch64__) + neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); + neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org)); +#else + min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); + max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org); +#endif + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template<int K> + __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + +#if defined (__aarch64__) + Vec3fa neg_min_org_rdir; + Vec3fa neg_max_org_rdir; +#else + Vec3fa min_org_rdir; + Vec3fa max_org_rdir; +#endif + float min_dist; + float max_dist; + }; + + typedef Frustum<false> FrustumFast; + + /* Robust variant */ + template<> + struct Frustum<true> + { + __forceinline Frustum() {} + + template<int K> + __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(valid, org, rdir, ray_tnear, ray_tfar, N); + } + + template<int K> + __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), + reduce_min(select(valid, org.y, pos_inf)), + reduce_min(select(valid, org.z, pos_inf))); + + const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), + reduce_max(select(valid, org.y, neg_inf)), + reduce_max(select(valid, org.z, neg_inf))); + + const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)), + reduce_min(select(valid, rdir.y, pos_inf)), + reduce_min(select(valid, rdir.z, pos_inf))); + + const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)), + reduce_max(select(valid, rdir.y, neg_inf)), + reduce_max(select(valid, rdir.z, neg_inf))); + + const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); + const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); + + init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N); + } + + __forceinline void init(const Vec3fa& reduced_min_org, + const Vec3fa& reduced_max_org, + const Vec3fa& reduced_min_rdir, + const Vec3fa& reduced_max_rdir, + float reduced_min_dist, + float reduced_max_dist, + int N) + { + const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); + min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); + max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); + + min_org = select(pos_rdir, reduced_max_org, reduced_min_org); + max_org = select(pos_rdir, reduced_min_org, reduced_max_org); + + min_dist = reduced_min_dist; + max_dist = reduced_max_dist; + + nf = NearFarPrecalculations(min_rdir, N); + } + + template<int K> + __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) + { + max_dist = reduce_max(ray_tfar); + } + + NearFarPrecalculations nf; + + Vec3fa min_rdir; + Vec3fa max_rdir; + + Vec3fa min_org; + Vec3fa max_org; + + float min_dist; + float max_dist; + }; + + typedef Frustum<true> FrustumRobust; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx> + __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node, + const FrustumFast& frustum, vfloat<Nx>& dist) + { + const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); + +#if defined (__aarch64__) + const vfloat<Nx> fminX = madd(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.neg_min_org_rdir.x)); + const vfloat<Nx> fminY = madd(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.neg_min_org_rdir.y)); + const vfloat<Nx> fminZ = madd(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.neg_min_org_rdir.z)); + const vfloat<Nx> fmaxX = madd(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.neg_max_org_rdir.x)); + const vfloat<Nx> fmaxY = madd(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.neg_max_org_rdir.y)); + const vfloat<Nx> fmaxZ = madd(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.neg_max_org_rdir.z)); +#else + const vfloat<Nx> fminX = msub(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.min_org_rdir.x)); + const vfloat<Nx> fminY = msub(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.min_org_rdir.y)); + const vfloat<Nx> fminZ = msub(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.min_org_rdir.z)); + const vfloat<Nx> fmaxX = msub(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.max_org_rdir.x)); + const vfloat<Nx> fmaxY = msub(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.max_org_rdir.y)); + const vfloat<Nx> fmaxZ = msub(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.max_org_rdir.z)); +#endif + const vfloat<Nx> fmin = maxi(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist)); + dist = fmin; + const vfloat<Nx> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist)); + const vbool<Nx> vmask_node_hit = fmin <= fmax; + size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); + return m_node; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx> + __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node, + const FrustumRobust& frustum, vfloat<Nx>& dist) + { + const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); + const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY); + const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ); + const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX); + const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY); + const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ); + + const vfloat<Nx> fminX = (bminX - vfloat<Nx>(frustum.min_org.x)) * vfloat<Nx>(frustum.min_rdir.x); + const vfloat<Nx> fminY = (bminY - vfloat<Nx>(frustum.min_org.y)) * vfloat<Nx>(frustum.min_rdir.y); + const vfloat<Nx> fminZ = (bminZ - vfloat<Nx>(frustum.min_org.z)) * vfloat<Nx>(frustum.min_rdir.z); + const vfloat<Nx> fmaxX = (bmaxX - vfloat<Nx>(frustum.max_org.x)) * vfloat<Nx>(frustum.max_rdir.x); + const vfloat<Nx> fmaxY = (bmaxY - vfloat<Nx>(frustum.max_org.y)) * vfloat<Nx>(frustum.max_rdir.y); + const vfloat<Nx> fmaxZ = (bmaxZ - vfloat<Nx>(frustum.max_org.z)) * vfloat<Nx>(frustum.max_rdir.z); + + const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const float round_up = 1.0f+2.0f*float(ulp); + const vfloat<Nx> fmin = max(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist)); + dist = fmin; + const vfloat<Nx> fmax = min(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist)); + const vbool<Nx> vmask_node_hit = (round_down*fmin <= round_up*fmax); + size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); + return m_node; + } + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h new file mode 100644 index 0000000000..0543e56f8e --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h @@ -0,0 +1,843 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int K, bool robust> + struct TravRayK; + + /* Fast variant */ + template<int K> + struct TravRayK<K, false> + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = rcp_safe(ray_dir); +#if defined(__aarch64__) + neg_org_rdir = -(org * rdir); +#elif defined(__AVX2__) + org_rdir = org * rdir; +#endif + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); + } + } + + Vec3vf<K> org; + Vec3vf<K> dir; + Vec3vf<K> rdir; +#if defined(__aarch64__) + Vec3vf<K> neg_org_rdir; +#elif defined(__AVX2__) + Vec3vf<K> org_rdir; +#endif + Vec3vi<K> nearXYZ; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKFast = TravRayK<K, false>; + + /* Robust variant */ + template<int K> + struct TravRayK<K, true> + { + __forceinline TravRayK() {} + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + init(ray_org, ray_dir, N); + } + + __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) + { + init(ray_org, ray_dir, N); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) + { + org = ray_org; + dir = ray_dir; + rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); + + if (N) + { + const int size = sizeof(float)*N; + nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); + nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); + nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); + } + } + + Vec3vf<K> org; + Vec3vf<K> dir; + Vec3vf<K> rdir; + Vec3vi<K> nearXYZ; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKRobust = TravRayK<K, true>; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNode* node, size_t i, + const TravRayKFast<K>& ray, vfloat<K>& dist) + + { +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + #else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNode* node, size_t i, + const TravRayKRobust<K>& ray, vfloat<K>& dist) + { + // FIXME: use per instruction rounding for AVX512 + const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z; + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat<K> lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNodeMB* node, const size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); +#else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); +#endif + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNodeMB* node, const size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + +#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + const vfloat<K> lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + else +#endif + { + const vfloat<K> lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKMB4D(const typename BVHN<N>::NodeRef ref, const size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; +#endif + + const vfloat<K> lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNodeMB4D intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectNodeKMB4DRobust(const typename BVHN<N>::NodeRef ref, const size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB(); + + const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); + const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i])); + const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i])); + const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i])); + const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i])); + const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i])); + + const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + const vfloat<K> lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); + vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + + if (unlikely(ref.isAABBNodeMB4D())) { + const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; + lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); + } + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K, bool robust> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNode* node, const size_t i, + const TravRayK<K,robust>& ray, vfloat<K>& dist) + { + const AffineSpace3vf<K> naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]), + Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]), + Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]), + Vec3f(node->naabb.p .x[i], node->naabb.p .y[i], node->naabb.p .z[i])); + + const Vec3vf<K> dir = xfmVector(naabb, ray.dir); + const Vec3vf<K> nrdir = Vec3vf<K>(vfloat<K>(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1? + const Vec3vf<K> org = xfmPoint(naabb, ray.org); + + const vfloat<K> lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir; + const vfloat<K> lclipMinY = org.y * nrdir.y; + const vfloat<K> lclipMinZ = org.z * nrdir.z; + const vfloat<K> lclipMaxX = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir; + const vfloat<K> lclipMaxY = lclipMinY - nrdir.y; + const vfloat<K> lclipMaxZ = lclipMinZ - nrdir.z; + + vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp)); + } + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast OBBNodeMB intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K, bool robust> + __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNodeMB* node, const size_t i, + const TravRayK<K,robust>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + const AffineSpace3vf<K> xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]), + Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]), + Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]), + Vec3f(node->space0.p .x[i], node->space0.p .y[i], node->space0.p .z[i])); + + const Vec3vf<K> b0_lower = zero; + const Vec3vf<K> b0_upper = one; + const Vec3vf<K> b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]); + const Vec3vf<K> b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]); + const Vec3vf<K> lower = lerp(b0_lower, b1_lower, time); + const Vec3vf<K> upper = lerp(b0_upper, b1_upper, time); + + const Vec3vf<K> dir = xfmVector(xfm, ray.dir); + const Vec3vf<K> rdir = rcp_safe(dir); + const Vec3vf<K> org = xfmPoint(xfm, ray.org); + + const vfloat<K> lclipMinX = (lower.x - org.x) * rdir.x; + const vfloat<K> lclipMinY = (lower.y - org.y) * rdir.y; + const vfloat<K> lclipMinZ = (lower.z - org.z) * rdir.z; + const vfloat<K> lclipMaxX = (upper.x - org.x) * rdir.x; + const vfloat<K> lclipMaxY = (upper.y - org.y) * rdir.y; + const vfloat<K> lclipMaxZ = (upper.z - org.z) * rdir.z; + + vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + if (robust) { + lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp)); + lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp)); + } + + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + + ////////////////////////////////////////////////////////////////////////////////////// + // QuantizedBaseNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i, + const TravRayK<K,false>& ray, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat<N> lower_x = node->dequantizeLowerX(); + const vfloat<N> upper_x = node->dequantizeUpperX(); + const vfloat<N> lower_y = node->dequantizeLowerY(); + const vfloat<N> upper_y = node->dequantizeUpperY(); + const vfloat<N> lower_z = node->dequantizeLowerZ(); + const vfloat<N> upper_z = node->dequantizeUpperZ(); + + #if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z); + #elif defined(__AVX2__) + const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z); + #else + const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + #endif + + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + if (K == 16) + { + /* use mixed float/int min/max */ + const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + dist = lnearP; + return lhit; + } + else + #endif + { + const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); + #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX + const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); + #else + const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); + #endif + dist = lnearP; + return lhit; + } + } + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i, + const TravRayK<K,true>& ray, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + const vfloat<N> lower_x = node->dequantizeLowerX(); + const vfloat<N> upper_x = node->dequantizeUpperX(); + const vfloat<N> lower_y = node->dequantizeLowerY(); + const vfloat<N> upper_y = node->dequantizeUpperY(); + const vfloat<N> lower_z = node->dequantizeLowerZ(); + const vfloat<N> upper_z = node->dequantizeUpperZ(); + + const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat<K> lower_x = node->dequantizeLowerX(i,time); + const vfloat<K> upper_x = node->dequantizeUpperX(i,time); + const vfloat<K> lower_y = node->dequantizeLowerY(i,time); + const vfloat<K> upper_y = node->dequantizeUpperY(i,time); + const vfloat<K> lower_z = node->dequantizeLowerZ(i,time); + const vfloat<K> upper_z = node->dequantizeUpperZ(i,time); + +#if defined(__aarch64__) + const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z); +#elif defined(__AVX2__) + const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z); +#else + const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + #endif + const vfloat<K> lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + template<int N, int K> + __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist) + + { + assert(movemask(node->validMask()) & ((size_t)1 << i)); + + const vfloat<K> lower_x = node->dequantizeLowerX(i,time); + const vfloat<K> upper_x = node->dequantizeUpperX(i,time); + const vfloat<K> lower_y = node->dequantizeLowerY(i,time); + const vfloat<K> upper_y = node->dequantizeUpperY(i,time); + const vfloat<K> lower_z = node->dequantizeLowerZ(i,time); + const vfloat<K> upper_z = node->dequantizeUpperZ(i,time); + + const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z; + const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x; + const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y; + const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const float round_down = 1.0f-3.0f*float(ulp); + + const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); + const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ)); + const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); + dist = lnearP; + return lhit; + } + + + ////////////////////////////////////////////////////////////////////////////////////// + // Node intersectors used in hybrid traversal + ////////////////////////////////////////////////////////////////////////////////////// + + /*! Intersects N nodes with K rays */ + template<int N, int K, int types, bool robust> + struct BVHNNodeIntersectorK; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1, false> + { + /* vmask is both an input and an output parameter! Its initial value should be the parent node + hit mask, which is used for correctly computing the current hit mask. The parent hit mask + is actually required only for motion blur node intersections (because different rays may + have different times), so for regular nodes vmask is simply overwritten. */ + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNode())) vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNode())) vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); + else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB())) vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); + else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, false> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + template<int N, int K> + struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, true> + { + static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i, + const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask) + { + if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) { + vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); + } else /*if (unlikely(node.isOBBNodeMB()))*/ { + assert(node.isOBBNodeMB()); + vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist); + } + return true; + } + }; + + + /*! Intersects N nodes with K rays */ + template<int N, int K, bool robust> + struct BVHNQuantizedBaseNodeIntersectorK; + + template<int N, int K> + struct BVHNQuantizedBaseNodeIntersectorK<N, K, false> + { + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i, + const TravRayK<K,false>& ray, vfloat<K>& dist) + { + return intersectQuantizedNodeK<N,K>(node,i,ray,dist); + } + + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); + } + + }; + + template<int N, int K> + struct BVHNQuantizedBaseNodeIntersectorK<N, K, true> + { + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i, + const TravRayK<K,true>& ray, vfloat<K>& dist) + { + return intersectQuantizedNodeK<N,K>(node,i,ray,dist); + } + + static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i, + const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist) + { + return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); + } + }; + + + } +} diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h new file mode 100644 index 0000000000..f379b57aea --- /dev/null +++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h @@ -0,0 +1,215 @@ +// Copyright 2009-2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "node_intersector.h" + +namespace embree +{ + namespace isa + { + ////////////////////////////////////////////////////////////////////////////////////// + // Ray packet structure used in stream traversal + ////////////////////////////////////////////////////////////////////////////////////// + + template<int K, bool robust> + struct TravRayKStream; + + /* Fast variant */ + template<int K> + struct TravRayKStream<K, false> + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) + { + rdir = rcp_safe(ray_dir); +#if defined(__aarch64__) + neg_org_rdir = -(ray_org * rdir); +#else + org_rdir = ray_org * rdir; +#endif + } + + Vec3vf<K> rdir; +#if defined(__aarch64__) + Vec3vf<K> neg_org_rdir; +#else + Vec3vf<K> org_rdir; +#endif + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKStreamFast = TravRayKStream<K, false>; + + /* Robust variant */ + template<int K> + struct TravRayKStream<K, true> + { + __forceinline TravRayKStream() {} + + __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar) + { + init(ray_org, ray_dir); + tnear = ray_tnear; + tfar = ray_tfar; + } + + __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir) + { + rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); + org = ray_org; + } + + Vec3vf<K> rdir; + Vec3vf<K> org; + vfloat<K> tnear; + vfloat<K> tfar; + }; + + template<int K> + using TravRayKStreamRobust = TravRayKStream<K, true>; + + ////////////////////////////////////////////////////////////////////////////////////// + // Fast AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx, int K> + __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node, + const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + +#if defined (__aarch64__) + const vfloat<Nx> rminX = madd(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k])); + const vfloat<Nx> rminY = madd(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k])); + const vfloat<Nx> rminZ = madd(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k])); + const vfloat<Nx> rmaxX = madd(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k])); + const vfloat<Nx> rmaxY = madd(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k])); + const vfloat<Nx> rmaxZ = madd(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k])); +#else + const vfloat<Nx> rminX = msub(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k])); + const vfloat<Nx> rminY = msub(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k])); + const vfloat<Nx> rminZ = msub(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k])); + const vfloat<Nx> rmaxX = msub(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k])); + const vfloat<Nx> rmaxY = msub(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k])); + const vfloat<Nx> rmaxZ = msub(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k])); +#endif + const vfloat<Nx> rmin = maxi(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k])); + const vfloat<Nx> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k])); + + const vbool<Nx> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template<int N, int K> + __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf) + { + char* ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat<K> bminX = *(const float*)(ptr + nf.nearX); + const vfloat<K> bminY = *(const float*)(ptr + nf.nearY); + const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX); + const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); + const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); + +#if defined (__aarch64__) + const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z); + const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x); + const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y); + const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z); +#else + const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z); + const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x); + const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y); + const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z); +#endif + + const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear); + const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar); + + const vbool<K> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + + ////////////////////////////////////////////////////////////////////////////////////// + // Robust AABBNode intersection + ////////////////////////////////////////////////////////////////////////////////////// + + template<int N, int Nx, int K> + __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node, + const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf) + { + const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); + const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY)); + const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ)); + const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX)); + const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY)); + const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ)); + + const vfloat<Nx> rminX = (bminX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]); + const vfloat<Nx> rminY = (bminY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]); + const vfloat<Nx> rminZ = (bminZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]); + const vfloat<Nx> rmaxX = (bmaxX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]); + const vfloat<Nx> rmaxY = (bmaxY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]); + const vfloat<Nx> rmaxZ = (bmaxZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]); + const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512 + const vfloat<Nx> rmin = max(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k])); + const vfloat<Nx> rmax = round_up *min(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k])); + + const vbool<Nx> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit) & (((size_t)1 << N)-1); + } + + template<int N, int K> + __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i, + const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf) + { + char *ptr = (char*)&node->lower_x + i*sizeof(float); + const vfloat<K> bminX = *(const float*)(ptr + nf.nearX); + const vfloat<K> bminY = *(const float*)(ptr + nf.nearY); + const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ); + const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX); + const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY); + const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ); + + const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x; + const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y; + const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z; + const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x; + const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y; + const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z; + + const float round_up = 1.0f+3.0f*float(ulp); + const vfloat<K> rmin = max(rminX, rminY, rminZ, vfloat<K>(ray.tnear)); + const vfloat<K> rmax = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar)); + + const vbool<K> vmask_first_hit = rmin <= rmax; + + return movemask(vmask_first_hit); + } + } +} |