Diffstat (limited to 'thirdparty/embree-aarch64/kernels/bvh')
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh.cpp | 190
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh.h | 235
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp | 1325
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h | 316
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp | 1165
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h | 280
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp | 60
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h | 114
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp | 531
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp | 640
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp | 705
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp | 201
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp | 377
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h | 263
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h | 267
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp | 375
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h | 72
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h | 21
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp | 330
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h | 37
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp | 61
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h | 61
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h | 295
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h | 41
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h | 213
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h | 247
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h | 107
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h | 43
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h | 98
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h | 90
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h | 265
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h | 242
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp | 247
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h | 95
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp | 127
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h | 37
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp | 168
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h | 285
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h | 676
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h | 154
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/node_intersector.h | 31
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h | 1788
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h | 269
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h | 843
-rw-r--r--  thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h | 215
45 files changed, 14202 insertions(+), 0 deletions(-)
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp
new file mode 100644
index 0000000000..bd102bd6ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp
@@ -0,0 +1,190 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+
+namespace embree
+{
+ template<int N>
+ BVHN<N>::BVHN (const PrimitiveType& primTy, Scene* scene)
+ : AccelData((N==4) ? AccelData::TY_BVH4 : (N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN),
+ primTy(&primTy), device(scene->device), scene(scene),
+ root(emptyNode), alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0)
+ {
+ }
+
+ template<int N>
+ BVHN<N>::~BVHN ()
+ {
+ for (size_t i=0; i<objects.size(); i++)
+ delete objects[i];
+ }
+
+ template<int N>
+ void BVHN<N>::clear()
+ {
+ set(BVHN::emptyNode,empty,0);
+ alloc.clear();
+ }
+
+ template<int N>
+ void BVHN<N>::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives)
+ {
+ this->root = root;
+ this->bounds = bounds;
+ this->numPrimitives = numPrimitives;
+ }
+
+ template<int N>
+ void BVHN<N>::clearBarrier(NodeRef& node)
+ {
+ if (node.isBarrier())
+ node.clearBarrier();
+ else if (!node.isLeaf()) {
+ BaseNode* n = node.baseNode(); // FIXME: flags should be stored in BVH
+ for (size_t c=0; c<N; c++)
+ clearBarrier(n->child(c));
+ }
+ }
+
+ template<int N>
+ void BVHN<N>::layoutLargeNodes(size_t num)
+ {
+#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+ struct NodeArea
+ {
+ __forceinline NodeArea() {}
+
+ __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds)
+ : node(&node), A(node.isLeaf() ? float(neg_inf) : area(bounds)) {}
+
+ __forceinline bool operator< (const NodeArea& other) const {
+ return this->A < other.A;
+ }
+
+ NodeRef* node;
+ float A;
+ };
+ std::vector<NodeArea> lst;
+ lst.reserve(num);
+ lst.push_back(NodeArea(root,empty));
+
+ while (lst.size() < num)
+ {
+ std::pop_heap(lst.begin(), lst.end());
+ NodeArea n = lst.back(); lst.pop_back();
+ if (!n.node->isAABBNode()) break;
+ AABBNode* node = n.node->getAABBNode();
+ for (size_t i=0; i<N; i++) {
+ if (node->child(i) == BVHN::emptyNode) continue;
+ lst.push_back(NodeArea(node->child(i),node->bounds(i)));
+ std::push_heap(lst.begin(), lst.end());
+ }
+ }
+
+ for (size_t i=0; i<lst.size(); i++)
+ lst[i].node->setBarrier();
+
+ root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator());
+#endif
+ }
+
+ template<int N>
+ typename BVHN<N>::NodeRef BVHN<N>::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator)
+ {
+ if (node.isBarrier()) {
+ node.clearBarrier();
+ return node;
+ }
+ else if (node.isAABBNode())
+ {
+ AABBNode* oldnode = node.getAABBNode();
+ AABBNode* newnode = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment);
+ *newnode = *oldnode;
+ for (size_t c=0; c<N; c++)
+ newnode->child(c) = layoutLargeNodesRecursion(oldnode->child(c),allocator);
+ return encodeNode(newnode);
+ }
+ else return node;
+ }
+
+ template<int N>
+ double BVHN<N>::preBuild(const std::string& builderName)
+ {
+ if (builderName == "")
+ return inf;
+
+ if (device->verbosity(2))
+ {
+ Lock<MutexSys> lock(g_printMutex);
+ std::cout << "building BVH" << N << (builderName.find("MBlur") != std::string::npos ? "MB" : "") << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush;
+ }
+
+ double t0 = 0.0;
+ if (device->benchmark || device->verbosity(2)) t0 = getSeconds();
+ return t0;
+ }
+
+ template<int N>
+ void BVHN<N>::postBuild(double t0)
+ {
+ if (t0 == double(inf))
+ return;
+
+ double dt = 0.0;
+ if (device->benchmark || device->verbosity(2))
+ dt = getSeconds()-t0;
+
+ std::unique_ptr<BVHNStatistics<N>> stat;
+
+ /* print statistics */
+ if (device->verbosity(2))
+ {
+ if (!stat) stat.reset(new BVHNStatistics<N>(this));
+ const size_t usedBytes = alloc.getUsedBytes();
+ Lock<MutexSys> lock(g_printMutex);
+ std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl;
+
+ if (device->verbosity(2))
+ std::cout << stat->str();
+
+ if (device->verbosity(2))
+ {
+ FastAllocator::AllStatistics stat(&alloc);
+ for (size_t i=0; i<objects.size(); i++)
+ if (objects[i])
+ stat = stat + FastAllocator::AllStatistics(&objects[i]->alloc);
+
+ stat.print(numPrimitives);
+ }
+
+ if (device->verbosity(3))
+ {
+ alloc.print_blocks();
+ for (size_t i=0; i<objects.size(); i++)
+ if (objects[i])
+ objects[i]->alloc.print_blocks();
+ }
+
+ std::cout << std::flush;
+ }
+
+ /* benchmark mode */
+ if (device->benchmark)
+ {
+ if (!stat) stat.reset(new BVHNStatistics<N>(this));
+ Lock<MutexSys> lock(g_printMutex);
+ std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << stat->sah() << " " << stat->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush;
+ }
+ }
+
+#if defined(__AVX__)
+ template class BVHN<8>;
+#endif
+
+#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
+ template class BVHN<4>;
+#endif
+}
+
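
The layoutLargeNodes()/layoutLargeNodesRecursion() pair above improves cache locality for the hottest part of the tree: a max-heap keyed on surface area repeatedly splits the current largest node, the frontier left on the heap is marked with barrier bits, and the recursion then copies every node above that frontier into freshly allocated, contiguous memory near the root. A minimal self-contained sketch of the frontier selection, with a hypothetical Node type standing in for BVHN's NodeRef/AABBNode:

#include <algorithm>
#include <cstddef>
#include <vector>

struct Node {                    // hypothetical stand-in for BVHN::AABBNode
  float area = 0.0f;             // surface area of the node's bounds
  std::vector<Node*> children;   // empty for leaves
};

// Always split the largest-area node until the frontier holds `num`
// entries (or only leaves remain). In bvh.cpp the entries left on the
// heap receive setBarrier(); the nodes popped above them are the ones
// relocated by layoutLargeNodesRecursion().
std::vector<Node*> computeFrontier(Node* root, std::size_t num)
{
  auto byArea = [](const Node* a, const Node* b) { return a->area < b->area; };
  std::vector<Node*> heap{root};
  while (heap.size() < num) {
    std::pop_heap(heap.begin(), heap.end(), byArea);  // move max to the back
    Node* n = heap.back(); heap.pop_back();
    if (n->children.empty()) break;                   // largest node is a leaf
    for (Node* c : n->children) {
      heap.push_back(c);
      std::push_heap(heap.begin(), heap.end(), byArea);
    }
  }
  return heap;                                        // the frontier
}

The real code seeds leaves with area -inf so they sink to the bottom of the heap; the sketch gets the same effect by stopping once the maximum is a leaf.
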
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.h b/thirdparty/embree-aarch64/kernels/bvh/bvh.h
new file mode 100644
index 0000000000..8fdf912e52
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* include all node types */
+#include "bvh_node_aabb.h"
+#include "bvh_node_aabb_mb.h"
+#include "bvh_node_aabb_mb4d.h"
+#include "bvh_node_obb.h"
+#include "bvh_node_obb_mb.h"
+#include "bvh_node_qaabb.h"
+
+namespace embree
+{
+ /*! flags used to enable specific node types in intersectors */
+ enum BVHNodeFlags
+ {
+ BVH_FLAG_ALIGNED_NODE = 0x00001,
+ BVH_FLAG_ALIGNED_NODE_MB = 0x00010,
+ BVH_FLAG_UNALIGNED_NODE = 0x00100,
+ BVH_FLAG_UNALIGNED_NODE_MB = 0x01000,
+ BVH_FLAG_QUANTIZED_NODE = 0x100000,
+ BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000,
+
+ /* short versions */
+ BVH_AN1 = BVH_FLAG_ALIGNED_NODE,
+ BVH_AN2 = BVH_FLAG_ALIGNED_NODE_MB,
+ BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D,
+ BVH_UN1 = BVH_FLAG_UNALIGNED_NODE,
+ BVH_UN2 = BVH_FLAG_UNALIGNED_NODE_MB,
+ BVH_MB = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D,
+ BVH_AN1_UN1 = BVH_FLAG_ALIGNED_NODE | BVH_FLAG_UNALIGNED_NODE,
+ BVH_AN2_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_UNALIGNED_NODE_MB,
+ BVH_AN2_AN4D_UN2 = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D | BVH_FLAG_UNALIGNED_NODE_MB,
+ BVH_QN1 = BVH_FLAG_QUANTIZED_NODE
+ };
+
+ /*! Multi BVH with N children. Each node stores the bounding box of
+ * its N children as well as N child references. */
+ template<int N>
+ class BVHN : public AccelData
+ {
+ ALIGNED_CLASS_(16);
+ public:
+
+ /*! forward declaration of node ref type */
+ typedef NodeRefPtr<N> NodeRef;
+ typedef BaseNode_t<NodeRef,N> BaseNode;
+ typedef AABBNode_t<NodeRef,N> AABBNode;
+ typedef AABBNodeMB_t<NodeRef,N> AABBNodeMB;
+ typedef AABBNodeMB4D_t<NodeRef,N> AABBNodeMB4D;
+ typedef OBBNode_t<NodeRef,N> OBBNode;
+ typedef OBBNodeMB_t<NodeRef,N> OBBNodeMB;
+ typedef QuantizedBaseNode_t<N> QuantizedBaseNode;
+ typedef QuantizedBaseNodeMB_t<N> QuantizedBaseNodeMB;
+ typedef QuantizedNode_t<NodeRef,N> QuantizedNode;
+
+ /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+ static const size_t byteAlignment = 16;
+ static const size_t byteNodeAlignment = 4*N;
+
+ /*! Empty node */
+ static const size_t emptyNode = NodeRef::emptyNode;
+
+ /*! Invalid node, used as marker in traversal */
+ static const size_t invalidNode = NodeRef::invalidNode;
+ static const size_t popRay = NodeRef::popRay;
+
+ /*! Maximum depth of the BVH. */
+ static const size_t maxBuildDepth = 32;
+ static const size_t maxBuildDepthLeaf = maxBuildDepth+8;
+ static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder
+
+ /*! Maximum number of primitive blocks in a leaf. */
+ static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks;
+
+ public:
+
+ /*! Builder interface to create allocator */
+ struct CreateAlloc : public FastAllocator::Create {
+ __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {}
+ };
+
+ typedef BVHNodeRecord<NodeRef> NodeRecord;
+ typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+ typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+ public:
+
+ /*! BVHN default constructor. */
+ BVHN (const PrimitiveType& primTy, Scene* scene);
+
+ /*! BVHN destruction */
+ ~BVHN ();
+
+ /*! clears the acceleration structure */
+ void clear();
+
+ /*! sets BVH members after build */
+ void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives);
+
+ /*! Clears the barrier bits of a subtree. */
+ void clearBarrier(NodeRef& node);
+
+ /*! lays out num large nodes of the BVH */
+ void layoutLargeNodes(size_t num);
+ NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator);
+
+ /*! called by all builders before build starts */
+ double preBuild(const std::string& builderName);
+
+ /*! called by all builders after build ended */
+ void postBuild(double t0);
+
+ /*! allocator class */
+ struct Allocator {
+ BVHN* bvh;
+ Allocator (BVHN* bvh) : bvh(bvh) {}
+ __forceinline void* operator() (size_t bytes) const {
+ return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes);
+ }
+ };
+
+ /*! post build cleanup */
+ void cleanup() {
+ alloc.cleanup();
+ }
+
+ public:
+
+ /*! Encodes a node */
+ static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); }
+ static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); }
+ static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); }
+ static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); }
+ static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); }
+ static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); }
+ static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); }
+
+ public:
+
+ /*! Prefetches the node this reference points to */
+ __forceinline static void prefetch(const NodeRef ref, int types=0)
+ {
+#if defined(__AVX512PF__) // MIC
+ if (types != BVH_FLAG_QUANTIZED_NODE) {
+ prefetchL2(((char*)ref.ptr)+0*64);
+ prefetchL2(((char*)ref.ptr)+1*64);
+ if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+ prefetchL2(((char*)ref.ptr)+2*64);
+ prefetchL2(((char*)ref.ptr)+3*64);
+ }
+ if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+ /* KNL still needs L2 prefetches for large nodes */
+ prefetchL2(((char*)ref.ptr)+4*64);
+ prefetchL2(((char*)ref.ptr)+5*64);
+ prefetchL2(((char*)ref.ptr)+6*64);
+ prefetchL2(((char*)ref.ptr)+7*64);
+ }
+ }
+ else
+ {
+ /* todo: reduce if 32bit offsets are enabled */
+ prefetchL2(((char*)ref.ptr)+0*64);
+ prefetchL2(((char*)ref.ptr)+1*64);
+ prefetchL2(((char*)ref.ptr)+2*64);
+ }
+#else
+ if (types != BVH_FLAG_QUANTIZED_NODE) {
+ prefetchL1(((char*)ref.ptr)+0*64);
+ prefetchL1(((char*)ref.ptr)+1*64);
+ if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+ prefetchL1(((char*)ref.ptr)+2*64);
+ prefetchL1(((char*)ref.ptr)+3*64);
+ }
+ if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+ /* deactivate for large nodes on Xeon, as it introduces regressions */
+ //prefetchL1(((char*)ref.ptr)+4*64);
+ //prefetchL1(((char*)ref.ptr)+5*64);
+ //prefetchL1(((char*)ref.ptr)+6*64);
+ //prefetchL1(((char*)ref.ptr)+7*64);
+ }
+ }
+ else
+ {
+ /* todo: reduce if 32bit offsets are enabled */
+ prefetchL1(((char*)ref.ptr)+0*64);
+ prefetchL1(((char*)ref.ptr)+1*64);
+ prefetchL1(((char*)ref.ptr)+2*64);
+ }
+#endif
+ }
+
+ __forceinline static void prefetchW(const NodeRef ref, int types=0)
+ {
+ embree::prefetchEX(((char*)ref.ptr)+0*64);
+ embree::prefetchEX(((char*)ref.ptr)+1*64);
+ if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+ embree::prefetchEX(((char*)ref.ptr)+2*64);
+ embree::prefetchEX(((char*)ref.ptr)+3*64);
+ }
+ if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+ embree::prefetchEX(((char*)ref.ptr)+4*64);
+ embree::prefetchEX(((char*)ref.ptr)+5*64);
+ embree::prefetchEX(((char*)ref.ptr)+6*64);
+ embree::prefetchEX(((char*)ref.ptr)+7*64);
+ }
+ }
+
+ /*! bvh type information */
+ public:
+ const PrimitiveType* primTy; //!< primitive type stored in the BVH
+
+ /*! bvh data */
+ public:
+ Device* device; //!< device pointer
+ Scene* scene; //!< scene pointer
+ NodeRef root; //!< root node
+ FastAllocator alloc; //!< allocator used to allocate nodes
+
+ /*! statistics data */
+ public:
+ size_t numPrimitives; //!< number of primitives the BVH is built over
+ size_t numVertices; //!< number of vertices the BVH references
+
+ /*! data arrays for special builders */
+ public:
+ std::vector<BVHN*> objects;
+ vector_t<char,aligned_allocator<char,32>> subdiv_patches;
+ };
+
+ typedef BVHN<4> BVH4;
+ typedef BVHN<8> BVH8;
+}
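
Because the BVHNodeFlags above are independent bits with BVH_FLAG_ALIGNED_NODE as the lowest one, the shorthands are plain bitwise unions, and the comparison types > BVH_FLAG_ALIGNED_NODE used by prefetch() is a cheap test for "anything beyond plain aligned nodes is enabled", which implies larger node layouts and hence more 64-byte cache lines to fetch. A self-contained sketch (flag values copied from the enum above; the traversal context is hypothetical):

#include <cstdio>

// Flag values copied from the BVHNodeFlags enum in bvh.h.
enum BVHNodeFlags {
  BVH_FLAG_ALIGNED_NODE      = 0x00001,
  BVH_FLAG_ALIGNED_NODE_MB   = 0x00010,
  BVH_FLAG_ALIGNED_NODE_MB4D = 0x1000000,
  BVH_AN2_AN4D = BVH_FLAG_ALIGNED_NODE_MB | BVH_FLAG_ALIGNED_NODE_MB4D,
};

int main()
{
  // A motion-blur traverser enables the MB and MB4D node types. These are
  // larger than plain aligned nodes, so more cache lines are prefetched;
  // the ternary below mirrors the N < 8, non-quantized case of prefetch().
  int types = BVH_AN2_AN4D;
  int cachelines = (types > BVH_FLAG_ALIGNED_NODE) ? 4 : 2;
  std::printf("prefetch %d cache lines\n", cachelines);
  return 0;
}
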
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp
new file mode 100644
index 0000000000..23f4f63d45
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp
@@ -0,0 +1,1325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh4_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
+ DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom);
+
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void);
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void);
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void);
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void);
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+
+ BVH4Factory::BVH4Factory(int bfeatures, int ifeatures)
+ {
+ SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom);
+
+ selectBuilders(bfeatures);
+ selectIntersectors(ifeatures);
+ }
+
+ void BVH4Factory::selectBuilders(int features)
+ {
+ IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4MeshSAH));
+ IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4iMeshSAH));
+ IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4vMeshSAH));
+ IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelQuadMeshSAH));
+ IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelVirtualSAH));
+ IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelInstanceSAH));
+
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4SceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4vSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4iSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4vSceneBuilderSAH));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4iSceneBuilderSAH));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4VirtualSceneBuilderSAH));
+ IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4InstanceSceneBuilderSAH));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH));
+
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1BuilderSAH));
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1MBBuilderSAH));
+ }
+
+ void BVH4Factory::selectIntersectors(int features)
+ {
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4i));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8i));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4v));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4iMB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB));
+
+ /* select intersectors1 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1MB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1MB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4vIntersector1Pluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Pluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Pluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Pluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Moeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Moeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Pluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Pluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Pluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Moeller));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Triangle4iIntersector1Pluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Quad4iIntersector1Pluecker));
+
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector1));
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector1));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector1));
+ IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector1));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector1));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector1));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Moeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector1Moeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+ /* select intersectors4 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4HybridMB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector4HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridPluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridPluecker));
+
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector4));
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector4));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector4Chunk));
+ IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector4Chunk));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector4Chunk));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector4Chunk));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector4HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridPluecker));
+
+ /* select intersectors8 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8HybridMB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector8HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridPluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridPluecker));
+
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector8));
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector8));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector8Chunk));
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector8Chunk));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector8Chunk));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector8Chunk));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector8HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridPluecker));
+
+ /* select intersectors16 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16HybridMB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersector16HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridPluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridPluecker));
+
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1Intersector16));
+ IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1MBIntersector16));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersector16Chunk));
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualMBIntersector16Chunk));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersector16Chunk));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceMBIntersector16Chunk));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridMBIntersector16HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridPluecker));
+
+ /* select stream intersectors */
+ SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4IntersectorStreamPacketFallback);
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersectorStreamPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamPluecker));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersectorStream));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersectorStream));
+
+#endif
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4Hybrid();
+ intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8Hybrid();
+ intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4Hybrid();
+ intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8Hybrid();
+ intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ default: assert(false);
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH4OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4OBBVirtualCurveIntersector4HybridMB();
+ intersectors.intersector8 = BVH4OBBVirtualCurveIntersector8HybridMB();
+ intersectors.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4HybridMB();
+ intersectors.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8HybridMB();
+ intersectors.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ default: assert(false);
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ assert(ivariant == IntersectVariant::FAST);
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4_filter = BVH4Triangle4Intersector4HybridMoeller();
+ intersectors.intersector4_nofilter = BVH4Triangle4Intersector4HybridMoellerNoFilter();
+ intersectors.intersector8_filter = BVH4Triangle4Intersector8HybridMoeller();
+ intersectors.intersector8_nofilter = BVH4Triangle4Intersector8HybridMoellerNoFilter();
+ intersectors.intersector16_filter = BVH4Triangle4Intersector16HybridMoeller();
+ intersectors.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter();
+ intersectors.intersectorN_filter = BVH4Triangle4IntersectorStreamMoeller();
+ intersectors.intersectorN_nofilter = BVH4Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ assert(ivariant == IntersectVariant::ROBUST);
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Triangle4vIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4Triangle4vIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4Triangle4vIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4Triangle4vIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Triangle4iIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH4Triangle4iIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH4Triangle4iIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH4Triangle4iIntersectorStreamMoeller();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Triangle4iIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4Triangle4iIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4Triangle4iIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4Triangle4iIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Triangle4vMBIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4Triangle4vMBIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Triangle4iMBIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4Triangle4iMBIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4_filter = BVH4Quad4vIntersector4HybridMoeller();
+ intersectors.intersector4_nofilter = BVH4Quad4vIntersector4HybridMoellerNoFilter();
+ intersectors.intersector8_filter = BVH4Quad4vIntersector8HybridMoeller();
+ intersectors.intersector8_nofilter = BVH4Quad4vIntersector8HybridMoellerNoFilter();
+ intersectors.intersector16_filter = BVH4Quad4vIntersector16HybridMoeller();
+ intersectors.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter();
+ intersectors.intersectorN_filter = BVH4Quad4vIntersectorStreamMoeller();
+ intersectors.intersectorN_nofilter = BVH4Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Quad4vIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4Quad4vIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4Quad4vIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4Quad4vIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Quad4iIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH4Quad4iIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH4Quad4iIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH4Quad4iIntersectorStreamMoeller();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Quad4iIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4Quad4iIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4Quad4iIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4Quad4iIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH4Quad4iMBIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4Quad4iMBIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
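+ /* The quantized-node tables register only a single-ray intersector;
+    packet and stream entries stay unset, so the QBVH4 structures are
+    reachable solely through the intersector1 path. */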
+ Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = QBVH4Triangle4iIntersector1Pluecker();
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = QBVH4Quad4iIntersector1Pluecker();
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4VirtualIntersector4Chunk();
+ intersectors.intersector8 = BVH4VirtualIntersector8Chunk();
+ intersectors.intersector16 = BVH4VirtualIntersector16Chunk();
+ intersectors.intersectorN = BVH4VirtualIntersectorStream();
+#endif
+ intersectors.collider = BVH4ColliderUserGeom();
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4VirtualMBIntersector4Chunk();
+ intersectors.intersector8 = BVH4VirtualMBIntersector8Chunk();
+ intersectors.intersector16 = BVH4VirtualMBIntersector16Chunk();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4InstanceIntersector4Chunk();
+ intersectors.intersector8 = BVH4InstanceIntersector8Chunk();
+ intersectors.intersector16 = BVH4InstanceIntersector16Chunk();
+ intersectors.intersectorN = BVH4InstanceIntersectorStream();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4InstanceMBIntersector4Chunk();
+ intersectors.intersector8 = BVH4InstanceMBIntersector8Chunk();
+ intersectors.intersector16 = BVH4InstanceMBIntersector16Chunk();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4SubdivPatch1Intersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4SubdivPatch1Intersector4();
+ intersectors.intersector8 = BVH4SubdivPatch1Intersector8();
+ intersectors.intersector16 = BVH4SubdivPatch1Intersector16();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4SubdivPatch1MBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4SubdivPatch1MBIntersector4();
+ intersectors.intersector8 = BVH4SubdivPatch1MBIntersector8();
+ intersectors.intersector16 = BVH4SubdivPatch1MBIntersector16();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+
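+ /* Accel constructors for curves: each allocates a BVH4 over one curve
+    leaf layout (Curve4i/4v, Curve8i, and their motion-blur forms), binds
+    the matching VirtualCurveIntersector leaf table, and selects an
+    oriented-bounding-box (OBB) builder. Note that the "default" and "sah"
+    hair_builder settings currently resolve to the same builder. */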
+ Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Curve4i::type,scene);
+ Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->hair_builder == "default" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0);
+ else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4i>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+#if defined(EMBREE_TARGET_SIMD8)
+ Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Curve8i::type,scene);
+ Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->hair_builder == "default" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0);
+ else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8i>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+#endif
+
+ Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Curve4v::type,scene);
+ Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->hair_builder == "default" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0);
+ else if (scene->device->hair_builder == "sah" ) builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4v>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Curve4iMB::type,scene);
+ Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0);
+ else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4iMB>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+#if defined(EMBREE_TARGET_SIMD8)
+ Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Curve8iMB::type,scene);
+ Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->hair_builder == "default" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0);
+ else if (scene->device->hair_builder == "sah" ) builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8iMB>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+#endif
+
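+ /* Triangle accel constructors: the traverser and builder are chosen from
+    the device configuration strings. Under tri_builder == "default" the
+    BuildVariant selects between a static SAH build, a dynamic two-level
+    build, and a high-quality spatial-split SAH build. */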
+ Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Triangle4::type,scene);
+
+ Accel::Intersectors intersectors;
+ if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4Intersectors(accel,ivariant);
+ else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4Intersectors(accel,IntersectVariant::FAST);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4>");
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break;
+ }
+ }
+ else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+ else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false);
+ else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,true);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
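+
+ /* Usage sketch (illustrative only; in practice the scene's accel
+    selection logic calls the factory, and the factory pointer name below
+    is hypothetical):
+
+      Accel* accel = bvh4_factory->BVH4Triangle4(scene,
+                                                 BuildVariant::STATIC,
+                                                 IntersectVariant::FAST);
+
+    With tri_builder and tri_traverser left at "default" this pairs the
+    static SAH builder with the Moeller-Trumbore intersectors above. */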
+
+ Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Triangle4v::type,scene);
+
+ Accel::Intersectors intersectors;
+ if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant);
+ else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST);
+ else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4v>");
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+ }
+ }
+ else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+ else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false);
+ else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4v>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Triangle4i::type,scene);
+
+ Accel::Intersectors intersectors;
+ if (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4iIntersectors(accel,ivariant);
+ else if (scene->device->tri_traverser == "fast" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::FAST);
+ else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4iIntersectors(accel,IntersectVariant::ROBUST);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4i>");
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder == "default" ) {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0); break;
+ }
+ }
+ else if (scene->device->tri_builder == "sah" ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+ else if (scene->device->tri_builder == "dynamic" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false);
+ else if (scene->device->tri_builder == "morton" ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,true);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4i>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Triangle4i::type,scene);
+
+ Accel::Intersectors intersectors;
+ if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4iMBIntersectors(accel,ivariant);
+ else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::FAST);
+ else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4iMBIntersectors(accel,IntersectVariant::ROBUST);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4iMB>");
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder_mb == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4iMB>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Triangle4vMB::type,scene);
+
+ Accel::Intersectors intersectors;
+ if (scene->device->tri_traverser_mb == "default") intersectors = BVH4Triangle4vMBIntersectors(accel,ivariant);
+ else if (scene->device->tri_traverser_mb == "fast" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::FAST);
+ else if (scene->device->tri_traverser_mb == "robust" ) intersectors = BVH4Triangle4vMBIntersectors(accel,IntersectVariant::ROBUST);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser_mb+" for BVH4<Triangle4vMB>");
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder_mb == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH4<Triangle4vMB>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Quad4v::type,scene);
+ Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->quad_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+ }
+ }
+ else if (scene->device->quad_builder == "sah" ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0);
+ else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0);
+ else if (scene->device->quad_builder == "dynamic" ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4v>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Quad4i::type,scene);
+ Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->quad_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+ }
+ }
+ else if (scene->device->quad_builder == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH4<Quad4i>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(Quad4i::type,scene);
+ Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->quad_builder_mb == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->quad_builder_mb == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH4<Quad4iMB>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
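+ /* The quantized accels pair the quantized SAH builders with the
+    single-ray-only QBVH4 intersector tables defined earlier. */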
+ Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene)
+ {
+ BVH4* accel = new BVH4(Quad4i::type,scene);
+ Builder* builder = BVH4QuantizedQuad4iSceneBuilderSAH(accel,scene,0);
+ Accel::Intersectors intersectors = QBVH4Quad4iIntersectors(accel);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene)
+ {
+ BVH4* accel = new BVH4(Triangle4i::type,scene);
+ Builder* builder = BVH4QuantizedTriangle4iSceneBuilderSAH(accel,scene,0);
+ Accel::Intersectors intersectors = QBVH4Triangle4iIntersectors(accel);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene)
+ {
+ BVH4* accel = new BVH4(SubdivPatch1::type,scene);
+ Accel::Intersectors intersectors = BVH4SubdivPatch1Intersectors(accel);
+ Builder* builder = BVH4SubdivPatch1BuilderSAH(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene)
+ {
+ BVH4* accel = new BVH4(SubdivPatch1::type,scene);
+ Accel::Intersectors intersectors = BVH4SubdivPatch1MBIntersectors(accel);
+ Builder* builder = BVH4SubdivPatch1MBBuilderSAH(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant)
+ {
+ BVH4* accel = new BVH4(Object::type,scene);
+ Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel);
+
+ Builder* builder = nullptr;
+ if (scene->device->object_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4VirtualSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->object_builder == "sah") builder = BVH4VirtualSceneBuilderSAH(accel,scene,0);
+ else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Object>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene)
+ {
+ BVH4* accel = new BVH4(Object::type,scene);
+ Accel::Intersectors intersectors = BVH4UserGeometryMBIntersectors(accel);
+ Builder* builder = BVH4VirtualMBSceneBuilderSAH(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+ {
+ BVH4* accel = new BVH4(InstancePrimitive::type,scene);
+ Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel);
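+ // The geometry-type mask determines which instances this accel collects;
+ // cheap and expensive instances end up in separate structures (an
+ // inference from the mask names; the split is driven by the scene code).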
+ auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+
+ Builder* builder = nullptr;
+ if (scene->device->object_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype); break;
+ case BuildVariant::DYNAMIC : builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break;
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->object_builder == "sah") builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype);
+ else if (scene->device->object_builder == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<Instance>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive)
+ {
+ BVH4* accel = new BVH4(InstancePrimitive::type,scene);
+ Accel::Intersectors intersectors = BVH4InstanceMBIntersectors(accel);
+ auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+ Builder* builder = BVH4InstanceMBSceneBuilderSAH(accel,scene,gtype);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ if (ivariant == IntersectVariant::FAST)
+ {
+ intersectors.intersector1 = BVH4GridIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4GridIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH4GridIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH4GridIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ }
+ else /* if (ivariant == IntersectVariant::ROBUST) */
+ {
+ intersectors.intersector1 = BVH4GridIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4GridIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH4GridIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH4GridIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ }
+ return intersectors;
+ }
+
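+ /* The motion-blur grid table registers Moeller kernels only; its
+    ivariant parameter is accepted for interface symmetry but ignored. */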
+ Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH4GridMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH4GridMBIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH4GridMBIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH4GridMBIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+
+ Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(SubGridQBVH4::type,scene);
+ Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->object_builder == "default") {
+ builder = BVH4GridSceneBuilderSAH(accel,scene,0);
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<GridMesh>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH4* accel = new BVH4(SubGridQBVH4::type,scene);
+ Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant);
+ Builder* builder = nullptr;
+ if (scene->device->object_builder == "default") {
+ builder = BVH4GridMBSceneBuilderSAH(accel,scene,0);
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4MB<GridMesh>");
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h
new file mode 100644
index 0000000000..a68227b41f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h
@@ -0,0 +1,316 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+ /*! BVH4 instantiations */
+ class BVH4Factory : public BVHFactory
+ {
+ public:
+ BVH4Factory(int bfeatures, int ifeatures);
+
+ public:
+ Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant);
+ Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant);
+ Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant);
+ Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant);
+ Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
+
+ Accel* BVH4Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH4Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST);
+ Accel* BVH4Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+ Accel* BVH4Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH4Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+ Accel* BVH4QuantizedTriangle4i(Scene* scene);
+ Accel* BVH4QuantizedQuad4i(Scene* scene);
+
+ Accel* BVH4SubdivPatch1(Scene* scene);
+ Accel* BVH4SubdivPatch1MB(Scene* scene);
+
+ Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+ Accel* BVH4UserGeometryMB(Scene* scene);
+
+ Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+ Accel* BVH4InstanceMB(Scene* scene, bool isExpensive);
+
+ Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+ private:
+ void selectBuilders(int features);
+ void selectIntersectors(int features);
+
+ private:
+ Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+ Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+
+ Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+ Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+ Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh);
+ Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh);
+
+ Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh);
+ Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh);
+
+ Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh);
+ Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh);
+
+ Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh);
+ Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh);
+
+ Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+ private:
+
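+ /* The DEFINE_SYMBOL2 and DEFINE_ISA_FUNCTION members below are runtime
+    dispatch slots: selectBuilders() and selectIntersectors() bind each
+    slot to an implementation matching the CPU features passed to the
+    constructor. The macro definitions live in the shared ISA headers. */
+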
+ DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+
+ // ==============
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+
+ // ==============
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+
+ // ==============
+
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker);
+ DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+
+ // SAH scene builders
+ private:
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ // spatial scene builder
+ private:
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+ // twolevel scene builders
+ private:
+ DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp
new file mode 100644
index 0000000000..9fe057c392
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp
@@ -0,0 +1,1165 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8
+
+#if defined (EMBREE_TARGET_SIMD8)
+
+#include "bvh8_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
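+ /* Per-ISA forward declarations for the BVH8 kernels. The set mirrors the
+    BVH4 declarations, plus a Woop-style single-ray triangle intersector
+    and quantized QBVH8 variants. */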
+ DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom);
+
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+ DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+ DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+ DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+ DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+
+ DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
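+
+ /* Each DECLARE_SYMBOL2 / DECLARE_ISA_FUNCTION above declares one symbol per
+    instruction set (e.g. embree::avx::BVH8Triangle4Intersector1Moeller,
+    embree::avx2::BVH8Triangle4Intersector1Moeller, ...) plus a
+    function-pointer slot in the factory; the SELECT_SYMBOL_INIT_* macros used
+    below bind that slot to the best variant the reported CPU features allow.
+    (A sketch of the mechanism; the macros come from kernels/common/isa.h.) */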
+
+ BVH8Factory::BVH8Factory(int bfeatures, int ifeatures)
+ {
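+ /* bfeatures gates builder selection and ifeatures traversal-kernel
+    selection; upstream passes the device's enabled_builder_cpu_features /
+    enabled_cpu_features here, which may differ per device configuration. */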
+ SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom);
+
+ selectBuilders(bfeatures);
+ selectIntersectors(ifeatures);
+ }
+
+ void BVH8Factory::selectBuilders(int features)
+ {
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iMBSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vMBSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderSAH));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iSceneBuilderSAH));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iMBSceneBuilderSAH));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH));
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderFastSpatialSAH));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderFastSpatialSAH));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderFastSpatialSAH));
+
+ IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4MeshSAH));
+ IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4vMeshSAH));
+ IF_ENABLED_TRIS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4iMeshSAH));
+ IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelQuadMeshSAH));
+ IF_ENABLED_USER (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelVirtualSAH));
+ IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelInstanceSAH));
+ }
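+
+ /* Every IF_ENABLED_* wrapper compiles away when the matching geometry type
+    is disabled at build time, so no symbol gets bound for it. A minimal
+    sketch of the pattern, as in upstream Embree:
+
+      #if defined(EMBREE_GEOMETRY_TRIANGLE)
+        #define IF_ENABLED_TRIS(x) x
+      #else
+        #define IF_ENABLED_TRIS(x)
+      #endif
+ */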
+
+ void BVH8Factory::selectIntersectors(int features)
+ {
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB));
+
+ /* select intersectors1 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1MB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1MB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Pluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Pluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Woop));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Moeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Pluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Pluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Moeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Moeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Pluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Pluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Moeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Pluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4iIntersector1Pluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4Intersector1Moeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Quad4iIntersector1Pluecker));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector1));
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector1));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector1));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector1));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Moeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridMBIntersector1Moeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+ /* select intersectors4 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4HybridMB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector4HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridPluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector4Chunk));
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector4Chunk));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector4Chunk));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector4Chunk));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridPluecker));
+
+ /* select intersectors8 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8HybridMB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector8HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridPluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector8Chunk));
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector8Chunk));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector8Chunk));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector8Chunk));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridPluecker));
+
+ /* select intersectors16 */
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16HybridMB));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid));
+ IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector16HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridPluecker));
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridPluecker));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector16Chunk));
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector16Chunk));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector16Chunk));
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector16Chunk));
+
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridMoeller));
+ IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridPluecker));
+
+ /* select stream intersectors */
+
+ SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8IntersectorStreamPacketFallback);
+
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoellerNoFilter));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamMoeller));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersectorStreamPluecker));
+ IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamPluecker));
+
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoellerNoFilter));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamMoeller));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamPluecker));
+ IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamPluecker));
+
+ IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersectorStream));
+
+ IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersectorStream));
+
+#endif
+ }
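+
+ /* Everything inside the EMBREE_RAY_PACKETS block is optional: with packet
+    support compiled out only the single-ray intersector1 kernels are bound,
+    and the 4/8/16-wide and stream slots stay empty. */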
+
+ Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4Hybrid();
+ intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8Hybrid();
+ intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4Hybrid();
+ intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8Hybrid();
+ intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ default: assert(false);
+ }
+ return Accel::Intersectors();
+ }
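+
+ /* FAST selects the default traversal, ROBUST a numerically conservative,
+    watertight variant (used when the scene is committed with the robust
+    flag); both share the leaf intersector supplied by the caller. */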
+
+ Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH8OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8OBBVirtualCurveIntersector4HybridMB();
+ intersectors.intersector8 = BVH8OBBVirtualCurveIntersector8HybridMB();
+ intersectors.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.leafIntersector = leafIntersector;
+ intersectors.intersector1 = BVH8OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8OBBVirtualCurveIntersectorRobust4HybridMB();
+ intersectors.intersector8 = BVH8OBBVirtualCurveIntersectorRobust8HybridMB();
+ intersectors.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ default: assert(false);
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ assert(ivariant == IntersectVariant::FAST);
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4_filter = BVH8Triangle4Intersector4HybridMoeller();
+ intersectors.intersector4_nofilter = BVH8Triangle4Intersector4HybridMoellerNoFilter();
+ intersectors.intersector8_filter = BVH8Triangle4Intersector8HybridMoeller();
+ intersectors.intersector8_nofilter = BVH8Triangle4Intersector8HybridMoellerNoFilter();
+ intersectors.intersector16_filter = BVH8Triangle4Intersector16HybridMoeller();
+ intersectors.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter();
+ intersectors.intersectorN_filter = BVH8Triangle4IntersectorStreamMoeller();
+ intersectors.intersectorN_nofilter = BVH8Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+ return intersectors;
+ }
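+
+ /* The paired *_filter / *_nofilter kernels exist so a scene without
+    registered intersection filters can take the cheaper path; which of the
+    two is used is decided later from scene state, not here. */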
+
+ Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
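+ /* local experiment switch: setting it to 1 swaps the single-ray Pluecker
+    kernel for the Woop watertight intersector; the packet kernels below stay
+    on Pluecker either way */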
+#define ENABLE_WOOP_TEST 0
+#if ENABLE_WOOP_TEST == 0
+ //assert(ivariant == IntersectVariant::ROBUST);
+ intersectors.intersector1 = BVH8Triangle4vIntersector1Pluecker();
+#else
+ intersectors.intersector1 = BVH8Triangle4vIntersector1Woop();
+#endif
+
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Triangle4vIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8Triangle4vIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8Triangle4vIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8Triangle4vIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Triangle4iIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH8Triangle4iIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH8Triangle4iIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH8Triangle4iIntersectorStreamMoeller();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Triangle4iIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8Triangle4iIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8Triangle4iIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8Triangle4iIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Triangle4vMBIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8Triangle4vMBIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Triangle4iMBIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8Triangle4iMBIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4_filter = BVH8Quad4vIntersector4HybridMoeller();
+ intersectors.intersector4_nofilter = BVH8Quad4vIntersector4HybridMoellerNoFilter();
+ intersectors.intersector8_filter = BVH8Quad4vIntersector8HybridMoeller();
+ intersectors.intersector8_nofilter = BVH8Quad4vIntersector8HybridMoellerNoFilter();
+ intersectors.intersector16_filter = BVH8Quad4vIntersector16HybridMoeller();
+ intersectors.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter();
+ intersectors.intersectorN_filter = BVH8Quad4vIntersectorStreamMoeller();
+ intersectors.intersectorN_nofilter = BVH8Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Quad4vIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8Quad4vIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8Quad4vIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8Quad4vIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Quad4iIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH8Quad4iIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH8Quad4iIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH8Quad4iIntersectorStreamMoeller();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Quad4iIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8Quad4iIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8Quad4iIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8Quad4iIntersectorStreamPluecker();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ switch (ivariant) {
+ case IntersectVariant::FAST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ case IntersectVariant::ROBUST:
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8Quad4iMBIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8Quad4iMBIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+ }
+ return Accel::Intersectors();
+ }
+
+ Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = QBVH8Triangle4iIntersector1Pluecker();
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = QBVH8Triangle4Intersector1Moeller();
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = QBVH8Quad4iIntersector1Pluecker();
+ return intersectors;
+ }
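+
+ /* The quantized (QBVH8) variants above wire up single-ray kernels only;
+    packet and stream traversal over quantized nodes is not provided here. */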
+
+ Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8VirtualIntersector4Chunk();
+ intersectors.intersector8 = BVH8VirtualIntersector8Chunk();
+ intersectors.intersector16 = BVH8VirtualIntersector16Chunk();
+ intersectors.intersectorN = BVH8VirtualIntersectorStream();
+#endif
+ intersectors.collider = BVH8ColliderUserGeom();
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8VirtualMBIntersector4Chunk();
+ intersectors.intersector8 = BVH8VirtualMBIntersector8Chunk();
+ intersectors.intersector16 = BVH8VirtualMBIntersector16Chunk();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8InstanceIntersector4Chunk();
+ intersectors.intersector8 = BVH8InstanceIntersector8Chunk();
+ intersectors.intersector16 = BVH8InstanceIntersector16Chunk();
+ intersectors.intersectorN = BVH8InstanceIntersectorStream();
+#endif
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8InstanceMBIntersector4Chunk();
+ intersectors.intersector8 = BVH8InstanceMBIntersector8Chunk();
+ intersectors.intersector16 = BVH8InstanceMBIntersector16Chunk();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ return intersectors;
+ }
+
+ Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Curve8v::type,scene);
+ Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8v(),ivariant);
+ Builder* builder = BVH8Curve8vBuilder_OBB_New(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Curve8iMB::type,scene);
+ Accel::Intersectors intersectors = BVH8OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(),ivariant);
+ Builder* builder = BVH8OBBCurve8iMBBuilder_OBB(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Triangle4::type,scene);
+ Accel::Intersectors intersectors = BVH8Triangle4Intersectors(accel,ivariant);
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0); break;
+ }
+ }
+ else if (scene->device->tri_builder == "sah" ) builder = BVH8Triangle4SceneBuilderSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+ else if (scene->device->tri_builder == "sah_presplit") builder = BVH8Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+ else if (scene->device->tri_builder == "dynamic" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,false);
+ else if (scene->device->tri_builder == "morton" ) builder = BVH8BuilderTwoLevelTriangle4MeshSAH(accel,scene,true);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
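+
+ /* The tri_builder strings come from the device configuration, so the branch
+    above can be steered per device. A usage sketch, assuming the config
+    parser accepts the tri_builder token as in upstream Embree:
+
+      RTCDevice device = rtcNewDevice("tri_builder=sah_fast_spatial");
+      // scenes created on this device now take the FastSpatialSAH branch
+ */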
+
+ Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Triangle4v::type,scene);
+ Accel::Intersectors intersectors = BVH8Triangle4vIntersectors(accel,ivariant);
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+ }
+ }
+ else if (scene->device->tri_builder == "sah_fast_spatial") builder = BVH8Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4v>");
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Triangle4i::type,scene);
+ Accel::Intersectors intersectors = BVH8Triangle4iIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Triangle4iSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+ }
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4i>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Triangle4i::type,scene);
+ Accel::Intersectors intersectors = BVH8Triangle4iMBIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder_mb == "default") { // FIXME: implement
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4iMBSceneBuilderSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4iMB>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Triangle4vMB::type,scene);
+ Accel::Intersectors intersectors = BVH8Triangle4vMBIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->tri_builder_mb == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->tri_builder_mb == "internal_time_splits") builder = BVH8Triangle4vMBSceneBuilderSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder_mb+" for BVH8<Triangle4vMB>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene)
+ {
+ BVH8* accel = new BVH8(Triangle4i::type,scene);
+ Accel::Intersectors intersectors = QBVH8Triangle4iIntersectors(accel);
+ Builder* builder = BVH8QuantizedTriangle4iSceneBuilderSAH(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene)
+ {
+ BVH8* accel = new BVH8(Triangle4::type,scene);
+ Accel::Intersectors intersectors = QBVH8Triangle4Intersectors(accel);
+ Builder* builder = BVH8QuantizedTriangle4SceneBuilderSAH(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Quad4v::type,scene);
+ Accel::Intersectors intersectors = BVH8Quad4vIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->quad_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Quad4vSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+ }
+ }
+ else if (scene->device->quad_builder == "dynamic" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,false);
+ else if (scene->device->quad_builder == "morton" ) builder = BVH8BuilderTwoLevelQuadMeshSAH(accel,scene,true);
+ else if (scene->device->quad_builder == "sah_fast_spatial" ) builder = BVH8Quad4vSceneBuilderFastSpatialSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4v>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Quad4i::type,scene);
+ Accel::Intersectors intersectors = BVH8Quad4iIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->quad_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Quad4iSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+ }
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for BVH8<Quad4i>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(Quad4i::type,scene);
+ Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant);
+
+ Builder* builder = nullptr;
+ if (scene->device->quad_builder_mb == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : assert(false); break; // FIXME: implement
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8<Quad4i>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene)
+ {
+ BVH8* accel = new BVH8(Quad4i::type,scene);
+ Accel::Intersectors intersectors = QBVH8Quad4iIntersectors(accel);
+ Builder* builder = nullptr;
+ if (scene->device->quad_builder == "default" ) builder = BVH8QuantizedQuad4iSceneBuilderSAH(accel,scene,0);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder+" for QBVH8<Quad4i>");
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant)
+ {
+ BVH8* accel = new BVH8(Object::type,scene);
+ Accel::Intersectors intersectors = BVH8UserGeometryIntersectors(accel);
+
+ Builder* builder = nullptr;
+ if (scene->device->object_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8VirtualSceneBuilderSAH(accel,scene,0); break;
+ case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false); break;
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->object_builder == "sah") builder = BVH8VirtualSceneBuilderSAH(accel,scene,0);
+ else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelVirtualSAH(accel,scene,false);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene)
+ {
+ BVH8* accel = new BVH8(Object::type,scene);
+ Accel::Intersectors intersectors = BVH8UserGeometryMBIntersectors(accel);
+ Builder* builder = BVH8VirtualMBSceneBuilderSAH(accel,scene,0);
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+ {
+ BVH8* accel = new BVH8(InstancePrimitive::type,scene);
+ Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel);
+ auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE;
+ // Builder* builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);
+
+ Builder* builder = nullptr;
+ if (scene->device->object_builder == "default") {
+ switch (bvariant) {
+ case BuildVariant::STATIC : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); break;
+ case BuildVariant::DYNAMIC : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break;
+ case BuildVariant::HIGH_QUALITY: assert(false); break;
+ }
+ }
+ else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);
+ else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false);
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive)
+ {
+ BVH8* accel = new BVH8(InstancePrimitive::type,scene);
+ Accel::Intersectors intersectors = BVH8InstanceMBIntersectors(accel);
+ auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE;
+ Builder* builder = BVH8InstanceMBSceneBuilderSAH(accel,scene,gtype);
+ return new AccelInstance(accel,builder,intersectors);
+ }
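+
+ /* The Geometry::GTypeMask passed to the instance builders controls which
+    instance geometries they collect, so instances flagged as expensive can
+    be kept in a separate acceleration structure from cheap ones. */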
+
+ Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ if (ivariant == IntersectVariant::FAST)
+ {
+ intersectors.intersector1 = BVH8GridIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8GridIntersector4HybridMoeller();
+ intersectors.intersector8 = BVH8GridIntersector8HybridMoeller();
+ intersectors.intersector16 = BVH8GridIntersector16HybridMoeller();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ }
+ else /* if (ivariant == IntersectVariant::ROBUST) */
+ {
+ intersectors.intersector1 = BVH8GridIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = BVH8GridIntersector4HybridPluecker();
+ intersectors.intersector8 = BVH8GridIntersector8HybridPluecker();
+ intersectors.intersector16 = BVH8GridIntersector16HybridPluecker();
+ intersectors.intersectorN = BVH8IntersectorStreamPacketFallback();
+#endif
+ }
+ return intersectors;
+ }
+
+ Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+ {
+ Accel::Intersectors intersectors;
+ intersectors.ptr = bvh;
+ intersectors.intersector1 = BVH8GridMBIntersector1Moeller();
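+ /* packet and stream traversal for motion-blurred grids is not implemented,
+    hence the nullptr entries below; only single-ray queries are served */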
+#if defined (EMBREE_RAY_PACKETS)
+ intersectors.intersector4 = nullptr;
+ intersectors.intersector8 = nullptr;
+ intersectors.intersector16 = nullptr;
+ intersectors.intersectorN = nullptr;
+#endif
+ return intersectors;
+ }
+
+ Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+ Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant);
+ Builder* builder = nullptr;
+ if (scene->device->grid_builder == "default") {
+ builder = BVH8GridSceneBuilderSAH(accel,scene,0);
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH4<GridMesh>");
+
+ return new AccelInstance(accel,builder,intersectors);
+ }
+
+ Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+ {
+ BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+ Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant);
+ Builder* builder = nullptr;
+ if (scene->device->grid_builder_mb == "default") {
+ builder = BVH8GridMBSceneBuilderSAH(accel,scene,0);
+ }
+ else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8MB<GridMesh>");
+ return new AccelInstance(accel,builder,intersectors);
+ }
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h
new file mode 100644
index 0000000000..b92188e7d3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h
@@ -0,0 +1,280 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+ /*! BVH8 instantiations */
+ class BVH8Factory : public BVHFactory
+ {
+ public:
+ BVH8Factory(int bfeatures, int ifeatures);
+
+ public:
+ Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant);
+ Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+ DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
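+
+ /* DEFINE_SYMBOL2 adds a typedef and a function-pointer member per kernel;
+    the matching DECLARE_SYMBOL2 lines in bvh8_factory.cpp declare the
+    per-ISA implementations those members can be bound to. */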
+
+ Accel* BVH8Triangle4 (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH8Triangle4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH8Triangle4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+ Accel* BVH8Quad4v (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH8Quad4i (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+ Accel* BVH8QuantizedTriangle4i(Scene* scene);
+ Accel* BVH8QuantizedTriangle4(Scene* scene);
+ Accel* BVH8QuantizedQuad4i(Scene* scene);
+
+ Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+ Accel* BVH8UserGeometryMB(Scene* scene);
+
+ Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+ Accel* BVH8InstanceMB(Scene* scene, bool isExpensive);
+
+ Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+ Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+ private:
+ void selectBuilders(int features);
+ void selectIntersectors(int features);
+
+ private:
+ Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+ Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+
+ Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+ Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+ Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh);
+ Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh);
+ Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh);
+
+ Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh);
+ Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh);
+
+ Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh);
+ Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh);
+
+ Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant);
+ Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+ private:
+ DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+ DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+ DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+ DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+
+ DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+
+ // SAH scene builders
+ private:
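+    // COMMA expands to a ',' so that multi-parameter signatures survive macro
+    // expansion inside DEFINE_ISA_FUNCTION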
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+ DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+ // SAH spatial scene builders
+ private:
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+ // twolevel scene builders
+ private:
+ DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+ DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp
new file mode 100644
index 0000000000..e832537ec5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp
@@ -0,0 +1,60 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder.h"
+
+namespace embree
+{
+ namespace isa
+ {
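+    /* the build() methods below instantiate the heavy binned-SAH builder once
+       per node type and route leaf creation through a virtual call, so client
+       translation units (via bvh_builder.h) can supply arbitrary leaf functors
+       without re-instantiating the whole builder template */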
+ template<int N>
+ typename BVHN<N>::NodeRef BVHNBuilderVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+ {
+ auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef {
+ return createLeaf(prims,set,alloc);
+ };
+
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+ return BVHBuilderBinnedSAH::build<NodeRef>
+ (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),createLeafFunc,progressFunc,prims,pinfo,settings);
+ }
+
+
+ template<int N>
+ typename BVHN<N>::NodeRef BVHNBuilderQuantizedVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+ {
+ auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRef {
+ return createLeaf(prims,set,alloc);
+ };
+
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+ return BVHBuilderBinnedSAH::build<NodeRef>
+ (FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),createLeafFunc,progressFunc,prims,pinfo,settings);
+ }
+
+ template<int N>
+ typename BVHN<N>::NodeRecordMB BVHNBuilderMblurVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange)
+ {
+ auto createLeafFunc = [&] (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) -> NodeRecordMB {
+ return createLeaf(prims,set,alloc);
+ };
+
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+ return BVHBuilderBinnedSAH::build<NodeRecordMB>
+ (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),createLeafFunc,progressFunc,prims,pinfo,settings);
+ }
+
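+    /* explicit instantiations: the build() definitions live in this
+       translation unit, so every BVH width used elsewhere has to be
+       instantiated here (BVH8 only when AVX is available) */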
+ template struct BVHNBuilderVirtual<4>;
+ template struct BVHNBuilderQuantizedVirtual<4>;
+ template struct BVHNBuilderMblurVirtual<4>;
+
+#if defined(__AVX__)
+ template struct BVHNBuilderVirtual<8>;
+ template struct BVHNBuilderQuantizedVirtual<8>;
+ template struct BVHNBuilderMblurVirtual<8>;
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h
new file mode 100644
index 0000000000..1b86bb45ad
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../builders/bvh_builder_sah.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
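+    /* BVHNBuilderVirtual type-erases the leaf creation functor behind a
+       virtual createLeaf() call; usage sketch, mirroring
+       BVHNBuilderSAH<N,Primitive>::build() in bvh_builder_sah.cpp:
+
+         NodeRef root = BVHNBuilderVirtual<N>::build(
+           &bvh->alloc,CreateLeaf<N,Primitive>(bvh),
+           bvh->scene->progressInterface,prims.data(),pinfo,settings);
+    */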
+ template<int N>
+ struct BVHNBuilderVirtual
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef FastAllocator::CachedAllocator Allocator;
+
+ struct BVHNBuilderV {
+ NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+ virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+ };
+
+ template<typename CreateLeafFunc>
+ struct BVHNBuilderT : public BVHNBuilderV
+ {
+ BVHNBuilderT (CreateLeafFunc createLeafFunc)
+ : createLeafFunc(createLeafFunc) {}
+
+ NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) {
+ return createLeafFunc(prims,set,alloc);
+ }
+
+ private:
+ CreateLeafFunc createLeafFunc;
+ };
+
+ template<typename CreateLeafFunc>
+ static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+ return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+ }
+ };
+
+ template<int N>
+ struct BVHNBuilderQuantizedVirtual
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef FastAllocator::CachedAllocator Allocator;
+
+ struct BVHNBuilderV {
+ NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+ virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+ };
+
+ template<typename CreateLeafFunc>
+ struct BVHNBuilderT : public BVHNBuilderV
+ {
+ BVHNBuilderT (CreateLeafFunc createLeafFunc)
+ : createLeafFunc(createLeafFunc) {}
+
+ NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) {
+ return createLeafFunc(prims,set,alloc);
+ }
+
+ private:
+ CreateLeafFunc createLeafFunc;
+ };
+
+ template<typename CreateLeafFunc>
+ static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+ return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+ }
+ };
+
+ template<int N>
+ struct BVHNBuilderMblurVirtual
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::AABBNodeMB AABBNodeMB;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecordMB NodeRecordMB;
+ typedef FastAllocator::CachedAllocator Allocator;
+
+ struct BVHNBuilderV {
+ NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange);
+ virtual NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+ };
+
+ template<typename CreateLeafFunc>
+ struct BVHNBuilderT : public BVHNBuilderV
+ {
+ BVHNBuilderT (CreateLeafFunc createLeafFunc)
+ : createLeafFunc(createLeafFunc) {}
+
+ NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) {
+ return createLeafFunc(prims,set,alloc);
+ }
+
+ private:
+ CreateLeafFunc createLeafFunc;
+ };
+
+ template<typename CreateLeafFunc>
+ static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) {
+ return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange);
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp
new file mode 100644
index 0000000000..64759c1294
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp
@@ -0,0 +1,531 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+#include "bvh_rotate.h"
+#include "../common/profile.h"
+#include "../../common/algorithms/parallel_prefix_sum.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/bvh_builder_morton.h"
+
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+#if defined(__X86_64__) || defined(__aarch64__)
+# define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform
+#else
+# define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+#endif
+
+namespace embree
+{
+ namespace isa
+ {
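+    /* invoked bottom-up by the Morton builder: writes each child's reference
+       and bounds into the freshly allocated AABB node and returns the merged
+       bounds; with ROTATE_TREE enabled the otherwise unused 'lower.a' field
+       carries the subtree primitive count that drives the rotation heuristic */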
+ template<int N>
+ struct SetBVHNBounds
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+ typedef typename BVH::AABBNode AABBNode;
+
+ BVH* bvh;
+ __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {}
+
+ __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num)
+ {
+ AABBNode* node = ref.getAABBNode();
+
+ BBox3fa res = empty;
+ for (size_t i=0; i<num; i++) {
+ const BBox3fa b = children[i].bounds;
+ res.extend(b);
+ node->setRef(i,children[i].ref);
+ node->setBounds(i,b);
+ }
+
+ BBox3fx result = (BBox3fx&)res;
+#if ROTATE_TREE
+ if (N == 4)
+ {
+ size_t n = 0;
+ for (size_t i=0; i<num; i++)
+ n += children[i].bounds.lower.a;
+
+ if (n >= 4096) {
+ for (size_t i=0; i<num; i++) {
+ if (children[i].bounds.lower.a < 4096) {
+ for (int j=0; j<ROTATE_TREE; j++)
+ BVHNRotate<N>::rotate(node->child(i));
+ node->child(i).setBarrier();
+ }
+ }
+ }
+ result.lower.a = unsigned(n);
+ }
+#endif
+
+ return NodeRecord(ref,result);
+ }
+ };
+
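+    /* leaf creation for the Morton builder, specialized per primitive type:
+       each specialization gathers the primitives of the given range (at most
+       4 for the triangle/quad layouts), packs them into the SIMD leaf layout,
+       and returns the encoded leaf reference together with its bounds */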
+ template<int N, typename Primitive>
+ struct CreateMortonLeaf;
+
+ template<int N>
+ struct CreateMortonLeaf<N,Triangle4>
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+
+ __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+ : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+ __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+ {
+ vfloat4 lower(pos_inf);
+ vfloat4 upper(neg_inf);
+ size_t items = current.size();
+ size_t start = current.begin();
+ assert(items<=4);
+
+ /* allocate leaf node */
+ Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment);
+ NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+ vuint4 vgeomID = -1, vprimID = -1;
+ Vec3vf4 v0 = zero, v1 = zero, v2 = zero;
+ const TriangleMesh* __restrict__ const mesh = this->mesh;
+
+ for (size_t i=0; i<items; i++)
+ {
+ const unsigned int primID = morton[start+i].index;
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+ lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+ upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+ vgeomID [i] = geomID_;
+ vprimID [i] = primID;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ }
+
+ Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID));
+ BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+ if (N == 4)
+ box_o.lower.a = unsigned(current.size());
+#endif
+ return NodeRecord(ref,box_o);
+ }
+
+ private:
+ TriangleMesh* mesh;
+ BVHBuilderMorton::BuildPrim* morton;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ };
+
+ template<int N>
+ struct CreateMortonLeaf<N,Triangle4v>
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+
+ __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+ : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+ __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+ {
+ vfloat4 lower(pos_inf);
+ vfloat4 upper(neg_inf);
+ size_t items = current.size();
+ size_t start = current.begin();
+ assert(items<=4);
+
+ /* allocate leaf node */
+ Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment);
+ NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+ vuint4 vgeomID = -1, vprimID = -1;
+ Vec3vf4 v0 = zero, v1 = zero, v2 = zero;
+ const TriangleMesh* __restrict__ mesh = this->mesh;
+
+ for (size_t i=0; i<items; i++)
+ {
+ const unsigned int primID = morton[start+i].index;
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+ lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+ upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+ vgeomID [i] = geomID_;
+ vprimID [i] = primID;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ }
+ Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID));
+ BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+ if (N == 4)
+        box_o.lower.a = unsigned(current.size());
+#endif
+ return NodeRecord(ref,box_o);
+ }
+ private:
+ TriangleMesh* mesh;
+ BVHBuilderMorton::BuildPrim* morton;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ };
+
+ template<int N>
+ struct CreateMortonLeaf<N,Triangle4i>
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+
+ __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+ : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+ __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+ {
+ vfloat4 lower(pos_inf);
+ vfloat4 upper(neg_inf);
+ size_t items = current.size();
+ size_t start = current.begin();
+ assert(items<=4);
+
+ /* allocate leaf node */
+ Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment);
+ NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+
+ vuint4 v0 = zero, v1 = zero, v2 = zero;
+ vuint4 vgeomID = -1, vprimID = -1;
+ const TriangleMesh* __restrict__ const mesh = this->mesh;
+
+ for (size_t i=0; i<items; i++)
+ {
+ const unsigned int primID = morton[start+i].index;
+ const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+ const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+ const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+ const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+ lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+ upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+ vgeomID[i] = geomID_;
+ vprimID[i] = primID;
+ unsigned int int_stride = mesh->vertices0.getStride()/4;
+ v0[i] = tri.v[0] * int_stride;
+ v1[i] = tri.v[1] * int_stride;
+ v2[i] = tri.v[2] * int_stride;
+ }
+
+ for (size_t i=items; i<4; i++)
+ {
+ vgeomID[i] = vgeomID[0];
+ vprimID[i] = -1;
+ v0[i] = 0;
+ v1[i] = 0;
+ v2[i] = 0;
+ }
+ Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID));
+ BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+ if (N == 4)
+        box_o.lower.a = unsigned(current.size());
+#endif
+ return NodeRecord(ref,box_o);
+ }
+ private:
+ TriangleMesh* mesh;
+ BVHBuilderMorton::BuildPrim* morton;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ };
+
+ template<int N>
+ struct CreateMortonLeaf<N,Quad4v>
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+
+ __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+ : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+ __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+ {
+ vfloat4 lower(pos_inf);
+ vfloat4 upper(neg_inf);
+ size_t items = current.size();
+ size_t start = current.begin();
+ assert(items<=4);
+
+ /* allocate leaf node */
+ Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment);
+ NodeRef ref = BVH::encodeLeaf((char*)accel,1);
+
+ vuint4 vgeomID = -1, vprimID = -1;
+ Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+ const QuadMesh* __restrict__ mesh = this->mesh;
+
+ for (size_t i=0; i<items; i++)
+ {
+ const unsigned int primID = morton[start+i].index;
+        const QuadMesh::Quad& quad = mesh->quad(primID);
+        const Vec3fa& p0 = mesh->vertex(quad.v[0]);
+        const Vec3fa& p1 = mesh->vertex(quad.v[1]);
+        const Vec3fa& p2 = mesh->vertex(quad.v[2]);
+        const Vec3fa& p3 = mesh->vertex(quad.v[3]);
+ lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3);
+ upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3);
+ vgeomID [i] = geomID_;
+ vprimID [i] = primID;
+ v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+ v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+ v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+ v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+ }
+ Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID));
+ BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+ if (N == 4)
+        box_o.lower.a = unsigned(current.size());
+#endif
+ return NodeRecord(ref,box_o);
+ }
+ private:
+ QuadMesh* mesh;
+ BVHBuilderMorton::BuildPrim* morton;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ };
+
+ template<int N>
+ struct CreateMortonLeaf<N,Object>
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+
+ __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+ : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+ __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+ {
+ vfloat4 lower(pos_inf);
+ vfloat4 upper(neg_inf);
+ size_t items = current.size();
+ size_t start = current.begin();
+
+ /* allocate leaf node */
+ Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment);
+ NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+ const UserGeometry* mesh = this->mesh;
+
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<items; i++)
+ {
+ const unsigned int index = morton[start+i].index;
+ const unsigned int primID = index;
+ bounds.extend(mesh->bounds(primID));
+ new (&accel[i]) Object(geomID_,primID);
+ }
+
+ BBox3fx box_o = (BBox3fx&)bounds;
+#if ROTATE_TREE
+ if (N == 4)
+        box_o.lower.a = unsigned(current.size());
+#endif
+ return NodeRecord(ref,box_o);
+ }
+ private:
+ UserGeometry* mesh;
+ BVHBuilderMorton::BuildPrim* morton;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ };
+
+ template<int N>
+ struct CreateMortonLeaf<N,InstancePrimitive>
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+
+ __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+ : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+ __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc)
+ {
+ vfloat4 lower(pos_inf);
+ vfloat4 upper(neg_inf);
+ size_t items = current.size();
+ size_t start = current.begin();
+ assert(items <= 1);
+
+ /* allocate leaf node */
+ InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment);
+ NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+ const Instance* instance = this->mesh;
+
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<items; i++)
+ {
+ const unsigned int primID = morton[start+i].index;
+ bounds.extend(instance->bounds(primID));
+ new (&accel[i]) InstancePrimitive(instance, geomID_);
+ }
+
+ BBox3fx box_o = (BBox3fx&)bounds;
+#if ROTATE_TREE
+ if (N == 4)
+        box_o.lower.a = unsigned(current.size());
+#endif
+ return NodeRecord(ref,box_o);
+ }
+ private:
+ Instance* mesh;
+ BVHBuilderMorton::BuildPrim* morton;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ };
+
+ template<typename Mesh>
+ struct CalculateMeshBounds
+ {
+ __forceinline CalculateMeshBounds (Mesh* mesh)
+ : mesh(mesh) {}
+
+ __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) {
+ return mesh->bounds(morton.index);
+ }
+
+ private:
+ Mesh* mesh;
+ };
+
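+    /* Morton-code mesh builder: primitive centroids are quantized onto a 3D
+       Morton grid (createMortonCodeArray), sorted, and the tree is built by
+       recursively splitting the sorted code range; on 64-bit targets a few
+       rotation rounds afterwards improve tree quality */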
+ template<int N, typename Mesh, typename Primitive>
+ class BVHNMeshBuilderMorton : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecord NodeRecord;
+
+ public:
+
+ BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD)
+ : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {}
+
+ /* build function */
+ void build()
+ {
+        /* we reset the allocator when the mesh size has changed */
+ if (mesh->numPrimitives != numPreviousPrimitives) {
+ bvh->alloc.clear();
+ morton.clear();
+ }
+ size_t numPrimitives = mesh->size();
+ numPreviousPrimitives = numPrimitives;
+
+ /* skip build for empty scene */
+ if (numPrimitives == 0) {
+ bvh->set(BVH::emptyNode,empty,0);
+ return;
+ }
+
+ /* preallocate arrays */
+ morton.resize(numPrimitives);
+ size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+ size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim);
+ bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes
+ bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated);
+
+ /* create morton code array */
+ BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes);
+ size_t numPrimitivesGen = createMortonCodeArray<Mesh>(mesh,morton,bvh->scene->progressInterface);
+
+ /* create BVH */
+ SetBVHNBounds<N> setBounds(bvh);
+ CreateMortonLeaf<N,Primitive> createLeaf(mesh,geomID_,morton.data());
+ CalculateMeshBounds<Mesh> calculateBounds(mesh);
+ auto root = BVHBuilderMorton::build<NodeRecord>(
+ typename BVH::CreateAlloc(bvh),
+ typename BVH::AABBNode::Create(),
+ setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface,
+ morton.data(),dest,numPrimitivesGen,settings);
+
+ bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives);
+
+#if ROTATE_TREE
+ if (N == 4)
+ {
+ for (int i=0; i<ROTATE_TREE; i++)
+ BVHNRotate<N>::rotate(bvh->root);
+ bvh->clearBarrier(bvh->root);
+ }
+#endif
+
+ /* clear temporary data for static geometry */
+ if (bvh->scene->isStaticAccel()) {
+ morton.clear();
+ }
+ bvh->cleanup();
+ }
+
+ void clear() {
+ morton.clear();
+ }
+
+ private:
+ BVH* bvh;
+ Mesh* mesh;
+ mvector<BVHBuilderMorton::BuildPrim> morton;
+ BVHBuilderMorton::Settings settings;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ unsigned int numPreviousPrimitives = 0;
+ };
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ Builder* BVH4Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); }
+ Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); }
+ Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); }
+#if defined(__AVX__)
+ Builder* BVH8Triangle4MeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); }
+ Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); }
+ Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); }
+#if defined(__AVX__)
+ Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+ Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__)
+ Builder* BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+  Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__)
+  Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }
+#endif
+#endif
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp
new file mode 100644
index 0000000000..cf5b2eb47f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp
@@ -0,0 +1,640 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define PROFILE 0
+#define PROFILE_RUNS 20
+
+namespace embree
+{
+ namespace isa
+ {
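+    /* leaf creation for the binned-SAH builder: the primitives of a range are
+       packed into Primitive::blocks(n) SIMD blocks allocated as one contiguous
+       chunk, and the returned NodeRef encodes the block pointer and count */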
+ template<int N, typename Primitive>
+ struct CreateLeaf
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+
+ __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {}
+
+ __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+ {
+ size_t n = set.size();
+ size_t items = Primitive::blocks(n);
+ size_t start = set.begin();
+ Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+ typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+ for (size_t i=0; i<items; i++) {
+ accel[i].fill(prims,start,set.end(),bvh->scene);
+ }
+ return node;
+ }
+
+ BVH* bvh;
+ };
+
+
+ template<int N, typename Primitive>
+ struct CreateLeafQuantized
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+
+ __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {}
+
+ __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+ {
+ size_t n = set.size();
+ size_t items = Primitive::blocks(n);
+ size_t start = set.begin();
+ Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+ typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+ for (size_t i=0; i<items; i++) {
+ accel[i].fill(prims,start,set.end(),bvh->scene);
+ }
+ return node;
+ }
+
+ BVH* bvh;
+ };
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
+ template<int N, typename Primitive>
+ struct BVHNBuilderSAH : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVHN<N>::NodeRef NodeRef;
+
+ BVH* bvh;
+ Scene* scene;
+ Geometry* mesh;
+ mvector<PrimRef> prims;
+ GeneralBVHBuilder::Settings settings;
+ Geometry::GTypeMask gtype_;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ bool primrefarrayalloc;
+ unsigned int numPreviousPrimitives = 0;
+
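+      /* sahBlockSize groups primitives into blocks for the SAH leaf cost (4
+         matches the 4-wide primitive layouts), while travCost/intCost weight
+         node traversal against primitive intersection in the cost model */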
+ BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize,
+ const Geometry::GTypeMask gtype, bool primrefarrayalloc = false)
+ : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0),
+ settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {}
+
+ BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+ : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {}
+
+ // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+ void build()
+ {
+        /* we reset the allocator when the mesh size has changed */
+ if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+ bvh->alloc.clear();
+ }
+
+ /* if we use the primrefarray for allocations we have to take it back from the BVH */
+ if (settings.primrefarrayalloc != size_t(inf))
+ bvh->alloc.unshare(prims);
+
+ /* skip build for empty scene */
+ const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+ numPreviousPrimitives = numPrimitives;
+ if (numPrimitives == 0) {
+ bvh->clear();
+ prims.clear();
+ return;
+ }
+
+ double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH");
+
+#if PROFILE
+ profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+
+ /* create primref array */
+ if (primrefarrayalloc) {
+ settings.primrefarrayalloc = numPrimitives/1000;
+ if (settings.primrefarrayalloc < 1000)
+ settings.primrefarrayalloc = inf;
+ }
+
+ /* enable os_malloc for two level build */
+ if (mesh)
+ bvh->alloc.setOSallocation(true);
+
+ /* initialize allocator */
+ const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N);
+ const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+ prims.resize(numPrimitives);
+
+ PrimInfo pinfo = mesh ?
+ createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) :
+ createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface);
+
+      /* pinfo might have zero size due to invalid geometry */
+ if (unlikely(pinfo.size() == 0))
+ {
+ bvh->clear();
+ prims.clear();
+ return;
+ }
+
+ /* call BVH builder */
+ NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeaf<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+ bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+ bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+#if PROFILE
+ });
+#endif
+
+ /* if we allocated using the primrefarray we have to keep it alive */
+ if (settings.primrefarrayalloc != size_t(inf))
+ bvh->alloc.share(prims);
+
+ /* for static geometries we can do some cleanups */
+ else if (scene && scene->isStaticAccel()) {
+ prims.clear();
+ }
+ bvh->cleanup();
+ bvh->postBuild(t0);
+ }
+
+ void clear() {
+ prims.clear();
+ }
+ };
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
+ template<int N, typename Primitive>
+ struct BVHNBuilderSAHQuantized : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVHN<N>::NodeRef NodeRef;
+
+ BVH* bvh;
+ Scene* scene;
+ Geometry* mesh;
+ mvector<PrimRef> prims;
+ GeneralBVHBuilder::Settings settings;
+ Geometry::GTypeMask gtype_;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ unsigned int numPreviousPrimitives = 0;
+
+ BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+ : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {}
+
+ BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+ : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {}
+
+ // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+ void build()
+ {
+        /* we reset the allocator when the mesh size has changed */
+ if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+ bvh->alloc.clear();
+ }
+
+ /* skip build for empty scene */
+ const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+ numPreviousPrimitives = numPrimitives;
+ if (numPrimitives == 0) {
+ prims.clear();
+ bvh->clear();
+ return;
+ }
+
+ double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH");
+
+#if PROFILE
+ profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+ /* create primref array */
+ prims.resize(numPrimitives);
+ PrimInfo pinfo = mesh ?
+ createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) :
+ createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface);
+
+ /* enable os_malloc for two level build */
+ if (mesh)
+ bvh->alloc.setOSallocation(true);
+
+ /* call BVH builder */
+ const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N);
+ const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+ NodeRef root = BVHNBuilderQuantizedVirtual<N>::build(&bvh->alloc,CreateLeafQuantized<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+ bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+ //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!!
+#if PROFILE
+ });
+#endif
+
+ /* clear temporary data for static geometry */
+ if (scene && scene->isStaticAccel()) {
+ prims.clear();
+ }
+ bvh->cleanup();
+ bvh->postBuild(t0);
+ }
+
+ void clear() {
+ prims.clear();
+ }
+ };
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
+
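+    /* a grid leaf may mix subgrids from several grid meshes: primitives are
+       first grouped by geomID and one SubGridQBVHN<N> block is emitted per
+       group, with all blocks allocated as a single contiguous chunk */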
+ template<int N, typename Primitive>
+ struct CreateLeafGrid
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+
+ __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {}
+
+ __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+ {
+        const size_t items = set.size();
+ const size_t start = set.begin();
+
+ /* collect all subsets with unique geomIDs */
+ assert(items <= N);
+ unsigned int geomIDs[N];
+ unsigned int num_geomIDs = 1;
+ geomIDs[0] = prims[start].geomID();
+
+ for (size_t i=1;i<items;i++)
+ {
+ bool found = false;
+ const unsigned int new_geomID = prims[start+i].geomID();
+ for (size_t j=0;j<num_geomIDs;j++)
+ if (new_geomID == geomIDs[j])
+ { found = true; break; }
+ if (!found)
+ geomIDs[num_geomIDs++] = new_geomID;
+ }
+
+        /* allocate all leaf memory in a single block */
+ SubGridQBVHN<N>* accel = (SubGridQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridQBVHN<N>),BVH::byteAlignment);
+ typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,num_geomIDs);
+
+ for (size_t g=0;g<num_geomIDs;g++)
+ {
+ unsigned int x[N];
+ unsigned int y[N];
+ unsigned int primID[N];
+ BBox3fa bounds[N];
+ unsigned int pos = 0;
+ for (size_t i=0;i<items;i++)
+ {
+ if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+ const SubGridBuildData& sgrid_bd = sgrids[prims[start+i].primID()];
+ x[pos] = sgrid_bd.sx;
+ y[pos] = sgrid_bd.sy;
+ primID[pos] = sgrid_bd.primID;
+ bounds[pos] = prims[start+i].bounds();
+ pos++;
+ }
+ assert(pos <= N);
+ new (&accel[g]) SubGridQBVHN<N>(x,y,primID,bounds,geomIDs[g],pos);
+ }
+
+ return node;
+ }
+
+ BVH* bvh;
+ const SubGridBuildData * const sgrids;
+ };
+
+
+ template<int N>
+ struct BVHNBuilderSAHGrid : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVHN<N>::NodeRef NodeRef;
+
+ BVH* bvh;
+ Scene* scene;
+ GridMesh* mesh;
+ mvector<PrimRef> prims;
+ mvector<SubGridBuildData> sgrids;
+ GeneralBVHBuilder::Settings settings;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ unsigned int numPreviousPrimitives = 0;
+
+ BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+ : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {}
+
+ BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {}
+
+ void build()
+ {
+        /* we reset the allocator when the mesh size has changed */
+ if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+ bvh->alloc.clear();
+ }
+
+ /* if we use the primrefarray for allocations we have to take it back from the BVH */
+ if (settings.primrefarrayalloc != size_t(inf))
+ bvh->alloc.unshare(prims);
+
+ const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false);
+ numPreviousPrimitives = numGridPrimitives;
+
+ PrimInfo pinfo(empty);
+ size_t numPrimitives = 0;
+
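+      /* two prefix-sum passes: the first counts valid subgrids to size the
+         arrays, the second writes the PrimRefs and SubGridBuildData at stable
+         indices derived from the running prefix sum */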
+ if (!mesh)
+ {
+ /* first run to get #primitives */
+
+ ParallelForForPrefixSumState<PrimInfo> pstate;
+ Scene::Iterator<GridMesh,false> iter(scene);
+
+ pstate.init(iter,size_t(1024));
+
+ /* iterate over all meshes in the scene */
+ pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+            if (!mesh->valid(j)) continue;
+            BBox3fa bounds = empty;
+            const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j);
+            pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ numPrimitives = pinfo.size();
+
+ /* resize arrays */
+ sgrids.resize(numPrimitives);
+ prims.resize(numPrimitives);
+
+ /* second run to fill primrefs and SubGridBuildData arrays */
+ pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+          size_t p_index = base.size();
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!mesh->valid(j)) continue;
+ const GridMesh::Grid &g = mesh->grid(j);
+ for (unsigned int y=0; y<g.resY-1u; y+=2)
+ for (unsigned int x=0; x<g.resX-1u; x+=2)
+ {
+ BBox3fa bounds = empty;
+ if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+ const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index);
+ pinfo.add_center2(prim);
+ sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+ prims[p_index++] = prim;
+ }
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ assert(pinfo.size() == numPrimitives);
+ }
+ else
+ {
+ ParallelPrefixSumState<PrimInfo> pstate;
+ /* iterate over all grids in a single mesh */
+ pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+ {
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!mesh->valid(j)) continue;
+ BBox3fa bounds = empty;
+ const PrimRef prim(bounds,geomID_,unsigned(j));
+ pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+ numPrimitives = pinfo.size();
+ /* resize arrays */
+ sgrids.resize(numPrimitives);
+ prims.resize(numPrimitives);
+
+ /* second run to fill primrefs and SubGridBuildData arrays */
+ pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+ {
+
+ size_t p_index = base.size();
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!mesh->valid(j)) continue;
+ const GridMesh::Grid &g = mesh->grid(j);
+ for (unsigned int y=0; y<g.resY-1u; y+=2)
+ for (unsigned int x=0; x<g.resX-1u; x+=2)
+ {
+ BBox3fa bounds = empty;
+ if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+ const PrimRef prim(bounds,geomID_,unsigned(p_index));
+ pinfo.add_center2(prim);
+ sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+ prims[p_index++] = prim;
+ }
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ }
+
+ /* no primitives */
+ if (numPrimitives == 0) {
+ bvh->clear();
+ prims.clear();
+ sgrids.clear();
+ return;
+ }
+
+ double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH");
+
+ /* create primref array */
+ settings.primrefarrayalloc = numPrimitives/1000;
+ if (settings.primrefarrayalloc < 1000)
+ settings.primrefarrayalloc = inf;
+
+ /* enable os_malloc for two level build */
+ if (mesh)
+ bvh->alloc.setOSallocation(true);
+
+ /* initialize allocator */
+ const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N);
+ const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
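+
+      /* Illustrative arithmetic for the estimate above: with N = 4, the node
+         estimate assumes roughly one node per 4*N = 16 primitives, and the
+         leaf estimate reserves numPrimitives/N SubGridQBVHN leaves plus 20%
+         slack (the 1.2 factor). */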
+
+      /* pinfo might have zero size due to invalid geometry */
+ if (unlikely(pinfo.size() == 0))
+ {
+ bvh->clear();
+ sgrids.clear();
+ prims.clear();
+ return;
+ }
+
+ /* call BVH builder */
+ NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafGrid<N,SubGridQBVHN<N>>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+ bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+ bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+ /* clear temporary array */
+ sgrids.clear();
+
+ /* if we allocated using the primrefarray we have to keep it alive */
+ if (settings.primrefarrayalloc != size_t(inf))
+ bvh->alloc.share(prims);
+
+ /* for static geometries we can do some cleanups */
+ else if (scene && scene->isStaticAccel()) {
+ prims.clear();
+ }
+ bvh->cleanup();
+ bvh->postBuild(t0);
+ }
+
+ void clear() {
+ prims.clear();
+ }
+ };
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
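+  /* Usage sketch (hypothetical driver code, not part of this file): each
+     factory below allocates a builder bound to a BVH and a scene or a single
+     mesh; the caller owns it and drives it through the Builder interface,
+     e.g.
+
+       Builder* builder = BVH4GridSceneBuilderSAH(bvh,scene,0);
+       builder->build();   // constructs the BVH4 over all grid meshes
+       builder->clear();   // releases the temporary primref array
+       delete builder;
+  */
+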
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+ Builder* BVH4Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+
+
+ Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+#if defined(__AVX__)
+ Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+ Builder* BVH8Triangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH8Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH8Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+ Builder* BVH8QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+ Builder* BVH8QuantizedTriangle4SceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+ Builder* BVH4Quad4iMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+ Builder* BVH4Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+ Builder* BVH4Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+ Builder* BVH4QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+ Builder* BVH4QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#if defined(__AVX__)
+ Builder* BVH8Quad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+ Builder* BVH8Quad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+ Builder* BVH8QuantizedQuad4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+ Builder* BVH8QuantizedQuad4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+ Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+
+ Builder* BVH4VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+ int minLeafSize = scene->device->object_accel_min_leaf_size;
+ int maxLeafSize = scene->device->object_accel_max_leaf_size;
+ return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type);
+ }
+
+ Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+ return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type);
+ }
+#if defined(__AVX__)
+
+ Builder* BVH8VirtualSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+ int minLeafSize = scene->device->object_accel_min_leaf_size;
+ int maxLeafSize = scene->device->object_accel_max_leaf_size;
+ return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type);
+ }
+
+ Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+ return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type);
+ }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+ Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+ return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype);
+ }
+#if defined(__AVX__)
+ Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+ Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+ return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype);
+ }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+ Builder* BVH4GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); }
+ Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct
+
+#if defined(__AVX__)
+ Builder* BVH8GridMeshBuilderSAH (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); }
+ Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct
+#endif
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp
new file mode 100644
index 0000000000..9c01553ec6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp
@@ -0,0 +1,705 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/bvh_builder_msmblur.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+
+namespace embree
+{
+ namespace isa
+ {
+
+#if 0
+ template<int N, typename Primitive>
+ struct CreateMBlurLeaf
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+ __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {}
+
+ __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+ {
+ size_t items = Primitive::blocks(set.size());
+        size_t start = set.begin();
+        size_t end = set.end();
+        for (size_t i=start; i<end; i++) assert(prims[start].geomID() == prims[i].geomID()); // assert that all geomIDs are identical
+ Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+ NodeRef node = bvh->encodeLeaf((char*)accel,items);
+
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<items; i++)
+ allBounds.extend(accel[i].fillMB(prims, start, set.end(), bvh->scene, time));
+
+ return NodeRecordMB(node,allBounds);
+ }
+
+ BVH* bvh;
+ PrimRef* prims;
+ size_t time;
+ };
+#endif
+
+ template<int N, typename Mesh, typename Primitive>
+ struct CreateMSMBlurLeaf
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+ __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {}
+
+ __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+ {
+ size_t items = Primitive::blocks(current.prims.size());
+ size_t start = current.prims.begin();
+ size_t end = current.prims.end();
+ for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical
+ Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteNodeAlignment);
+ NodeRef node = bvh->encodeLeaf((char*)accel,items);
+ LBBox3fa allBounds = empty;
+ for (size_t i=0; i<items; i++)
+ allBounds.extend(accel[i].fillMB(current.prims.prims->data(), start, current.prims.end(), bvh->scene, current.prims.time_range));
+ return NodeRecordMB4D(node,allBounds,current.prims.time_range);
+ }
+
+ BVH* bvh;
+ };
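+
+    /* CreateMSMBlurLeaf packs Primitive::blocks(n) primitive blocks into a
+       single allocation, fills each block over the record's time range via
+       fillMB, and returns the leaf together with its linear bounds and time
+       range as a NodeRecordMB4D, which is what the 4D (space + time) builder
+       below consumes. */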
+
+ /* Motion blur BVH with 4D nodes and internal time splits */
+ template<int N, typename Mesh, typename Primitive>
+ struct BVHNBuilderMBlurSAH : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVHN<N>::NodeRef NodeRef;
+ typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+ typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+ BVH* bvh;
+ Scene* scene;
+ const size_t sahBlockSize;
+ const float intCost;
+ const size_t minLeafSize;
+ const size_t maxLeafSize;
+ const Geometry::GTypeMask gtype_;
+
+ BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+ : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {}
+
+ void build()
+ {
+ /* skip build for empty scene */
+ const size_t numPrimitives = scene->getNumPrimitives(gtype_,true);
+ if (numPrimitives == 0) { bvh->clear(); return; }
+
+ double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH");
+
+#if PROFILE
+ profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+
+ //const size_t numTimeSteps = scene->getNumTimeSteps<typename Mesh::type_t,true>();
+ //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1);
+
+ /*if (numTimeSegments == 1)
+ buildSingleSegment(numPrimitives);
+ else*/
+ buildMultiSegment(numPrimitives);
+
+#if PROFILE
+ });
+#endif
+
+ /* clear temporary data for static geometry */
+ bvh->cleanup();
+ bvh->postBuild(t0);
+ }
+
+#if 0 // No longer compatible when time_ranges are present for geometries: it would sometimes have to create temporal nodes and put only a single geometry into each leaf.
+ void buildSingleSegment(size_t numPrimitives)
+ {
+ /* create primref array */
+ mvector<PrimRef> prims(scene->device,numPrimitives);
+ const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,prims,bvh->scene->progressInterface,0);
+ /* early out if no valid primitives */
+ if (pinfo.size() == 0) { bvh->clear(); return; }
+ /* estimate acceleration structure size */
+ const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+ const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+ /* settings for BVH build */
+ GeneralBVHBuilder::Settings settings;
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+ settings.logBlockSize = bsr(sahBlockSize);
+ settings.minLeafSize = min(minLeafSize,maxLeafSize);
+ settings.maxLeafSize = maxLeafSize;
+ settings.travCost = travCost;
+ settings.intCost = intCost;
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+ /* build hierarchy */
+ auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+ (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(),
+ CreateMBlurLeaf<N,Primitive>(bvh,prims.data(),0),bvh->scene->progressInterface,
+ prims.data(),pinfo,settings);
+
+ bvh->set(root.ref,root.lbounds,pinfo.size());
+ }
+#endif
+
+ void buildMultiSegment(size_t numPrimitives)
+ {
+ /* create primref array */
+ mvector<PrimRefMB> prims(scene->device,numPrimitives);
+ PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,prims,bvh->scene->progressInterface);
+
+ /* early out if no valid primitives */
+ if (pinfo.size() == 0) { bvh->clear(); return; }
+
+ /* estimate acceleration structure size */
+ const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+ const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+ /* settings for BVH build */
+ BVHBuilderMSMBlur::Settings settings;
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxDepth;
+ settings.logBlockSize = bsr(sahBlockSize);
+ settings.minLeafSize = min(minLeafSize,maxLeafSize);
+ settings.maxLeafSize = maxLeafSize;
+ settings.travCost = travCost;
+ settings.intCost = intCost;
+ settings.singleLeafTimeSegment = Primitive::singleTimeSegment;
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+ /* build hierarchy */
+ auto root =
+ BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device,
+ RecalculatePrimRef<Mesh>(scene),
+ typename BVH::CreateAlloc(bvh),
+ typename BVH::AABBNodeMB4D::Create(),
+ typename BVH::AABBNodeMB4D::Set(),
+ CreateMSMBlurLeaf<N,Mesh,Primitive>(bvh),
+ bvh->scene->progressInterface,
+ settings);
+
+ bvh->set(root.ref,root.lbounds,pinfo.num_time_segments);
+ }
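+
+      /* Note on the settings above: singleLeafTimeSegment mirrors
+         Primitive::singleTimeSegment, i.e. primitive types that can store
+         only one time segment per leaf ask the builder to keep time-splitting
+         until each leaf spans a single segment; the grid variant below passes
+         false instead, since its leaves store bounds for an arbitrary time
+         range. */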
+
+ void clear() {
+ }
+ };
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
+ struct GridRecalculatePrimRef
+ {
+ Scene* scene;
+ const SubGridBuildData * const sgrids;
+
+ __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids)
+ : scene(scene), sgrids(sgrids) {}
+
+ __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+ {
+ const unsigned int geomID = prim.geomID();
+ const GridMesh* mesh = scene->get<GridMesh>(geomID);
+ const unsigned int buildID = prim.primID();
+ const SubGridBuildData &subgrid = sgrids[buildID];
+ const unsigned int primID = subgrid.primID;
+ const size_t x = subgrid.x();
+ const size_t y = subgrid.y();
+ const LBBox3fa lbounds = mesh->linearBounds(mesh->grid(primID),x,y,time_range);
+ const unsigned num_time_segments = mesh->numTimeSegments();
+ const range<int> tbounds = mesh->timeSegmentRange(time_range);
+ return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, num_time_segments, geomID, buildID);
+ }
+
+ __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const {
+ const unsigned int geomID = prim.geomID();
+ const GridMesh* mesh = scene->get<GridMesh>(geomID);
+ const unsigned int buildID = prim.primID();
+ const SubGridBuildData &subgrid = sgrids[buildID];
+ const unsigned int primID = subgrid.primID;
+ const size_t x = subgrid.x();
+ const size_t y = subgrid.y();
+ return mesh->linearBounds(mesh->grid(primID),x,y,time_range);
+ }
+
+ };
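+
+    /* Note that primID() of a grid PrimRefMB is a buildID, i.e. an index
+       into the sgrids side array rather than a mesh primitive index; the
+       functor above resolves buildID -> SubGridBuildData -> (grid primID,
+       x, y) before recomputing bounds for the narrowed time range. */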
+
+ template<int N>
+ struct CreateMSMBlurLeafGrid
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+ __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+ __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+ {
+ const size_t items = current.prims.size();
+ const size_t start = current.prims.begin();
+
+ const PrimRefMB* prims = current.prims.prims->data();
+ /* collect all subsets with unique geomIDs */
+ assert(items <= N);
+ unsigned int geomIDs[N];
+ unsigned int num_geomIDs = 1;
+ geomIDs[0] = prims[start].geomID();
+
+ for (size_t i=1;i<items;i++)
+ {
+ bool found = false;
+ const unsigned int new_geomID = prims[start+i].geomID();
+ for (size_t j=0;j<num_geomIDs;j++)
+ if (new_geomID == geomIDs[j])
+ { found = true; break; }
+ if (!found)
+ geomIDs[num_geomIDs++] = new_geomID;
+ }
+
+ /* allocate all leaf memory in one single block */
+ SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+ typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs);
+
+ LBBox3fa allBounds = empty;
+
+ for (size_t g=0;g<num_geomIDs;g++)
+ {
+ const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]);
+ unsigned int x[N];
+ unsigned int y[N];
+ unsigned int primID[N];
+ BBox3fa bounds0[N];
+ BBox3fa bounds1[N];
+ unsigned int pos = 0;
+ for (size_t i=0;i<items;i++)
+ {
+ if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+ const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()];
+ x[pos] = sgrid_bd.sx;
+ y[pos] = sgrid_bd.sy;
+ primID[pos] = sgrid_bd.primID;
+ const size_t x = sgrid_bd.x();
+ const size_t y = sgrid_bd.y();
+ LBBox3fa newBounds = mesh->linearBounds(mesh->grid(sgrid_bd.primID),x,y,current.prims.time_range);
+ allBounds.extend(newBounds);
+ bounds0[pos] = newBounds.bounds0;
+ bounds1[pos] = newBounds.bounds1;
+ pos++;
+ }
+ assert(pos <= N);
+ new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),pos);
+ }
+ return NodeRecordMB4D(node,allBounds,current.prims.time_range);
+ }
+
+ Scene *scene;
+ BVH* bvh;
+ const SubGridBuildData * const sgrids;
+ };
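+
+    /* Resulting leaf layout (as used above): one SubGridMBQBVHN<N> per
+       distinct geomID in the leaf range, each packing up to N subgrids with
+       their bounds at both ends of the leaf's time range plus the
+       1/time_range.size() scale needed to interpolate in between. */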
+
+#if 0
+ template<int N>
+ struct CreateLeafGridMB
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+ __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids)
+ : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+ __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+ {
+ const size_t items = set.size();
+ const size_t start = set.begin();
+
+ /* collect all subsets with unique geomIDs */
+ assert(items <= N);
+ unsigned int geomIDs[N];
+ unsigned int num_geomIDs = 1;
+ geomIDs[0] = prims[start].geomID();
+
+ for (size_t i=1;i<items;i++)
+ {
+ bool found = false;
+ const unsigned int new_geomID = prims[start+i].geomID();
+ for (size_t j=0;j<num_geomIDs;j++)
+ if (new_geomID == geomIDs[j])
+ { found = true; break; }
+ if (!found)
+ geomIDs[num_geomIDs++] = new_geomID;
+ }
+
+ /* allocate all leaf memory in one single block */
+ SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+ typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs);
+
+ LBBox3fa allBounds = empty;
+
+ for (size_t g=0;g<num_geomIDs;g++)
+ {
+ const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]);
+
+ unsigned int x[N];
+ unsigned int y[N];
+ unsigned int primID[N];
+ BBox3fa bounds0[N];
+ BBox3fa bounds1[N];
+ unsigned int pos = 0;
+ for (size_t i=0;i<items;i++)
+ {
+ if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+ const SubGridBuildData &sgrid_bd = sgrids[prims[start+i].primID()];
+ x[pos] = sgrid_bd.sx;
+ y[pos] = sgrid_bd.sy;
+ primID[pos] = sgrid_bd.primID;
+ const size_t x = sgrid_bd.x();
+ const size_t y = sgrid_bd.y();
+ bool MAYBE_UNUSED valid0 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]);
+ bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]);
+ assert(valid0);
+ assert(valid1);
+ allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos]));
+ pos++;
+ }
+ new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos);
+ }
+ return NodeRecordMB(node,allBounds);
+ }
+
+ Scene *scene;
+ BVH* bvh;
+ const SubGridBuildData * const sgrids;
+ };
+#endif
+
+
+ /* Motion blur BVH with 4D nodes and internal time splits */
+ template<int N>
+ struct BVHNBuilderMBlurSAHGrid : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVHN<N>::NodeRef NodeRef;
+ typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+ typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+ BVH* bvh;
+ Scene* scene;
+ const size_t sahBlockSize;
+ const float intCost;
+ const size_t minLeafSize;
+ const size_t maxLeafSize;
+ mvector<SubGridBuildData> sgrids;
+
+
+ BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize)
+ : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)), sgrids(scene->device,0) {}
+
+
+ PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+ {
+ /* first run to get #primitives */
+ ParallelForForPrefixSumState<PrimInfo> pstate;
+ Scene::Iterator<GridMesh,true> iter(scene);
+
+ pstate.init(iter,size_t(1024));
+
+ /* iterate over all meshes in the scene */
+ PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!mesh->valid(j,range<size_t>(0,1))) continue;
+ BBox3fa bounds = empty;
+ const PrimRef prim(bounds,unsigned(geomID),unsigned(j));
+ pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ size_t numPrimitives = pinfo.size();
+ if (numPrimitives == 0) return pinfo;
+
+ /* resize arrays */
+ sgrids.resize(numPrimitives);
+ prims.resize(numPrimitives);
+
+ /* second run to fill primrefs and SubGridBuildData arrays */
+ pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+
+ k = base.size();
+ size_t p_index = k;
+ PrimInfo pinfo(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+            if (!mesh->valid(j,range<size_t>(0,1))) continue;
+            const GridMesh::Grid &g = mesh->grid(j);
+
+ for (unsigned int y=0; y<g.resY-1u; y+=2)
+ for (unsigned int x=0; x<g.resX-1u; x+=2)
+ {
+ BBox3fa bounds = empty;
+ if (!mesh->buildBounds(g,x,y,itime,bounds)) continue; // get bounds of subgrid
+ const PrimRef prim(bounds,unsigned(geomID),unsigned(p_index));
+ pinfo.add_center2(prim);
+ sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+ prims[p_index++] = prim;
+ }
+ }
+ return pinfo;
+ }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+ assert(pinfo.size() == numPrimitives);
+ return pinfo;
+ }
+
+ PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f))
+ {
+ /* first run to get #primitives */
+ ParallelForForPrefixSumState<PrimInfoMB> pstate;
+ Scene::Iterator<GridMesh,true> iter(scene);
+
+ pstate.init(iter,size_t(1024));
+ /* iterate over all meshes in the scene */
+ PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t /*geomID*/) -> PrimInfoMB {
+
+ PrimInfoMB pinfoMB(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue;
+ LBBox3fa bounds(empty);
+ PrimInfoMB gridMB(0,mesh->getNumSubGrids(j));
+ pinfoMB.merge(gridMB);
+ }
+ return pinfoMB;
+ }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+
+ size_t numPrimitives = pinfoMB.size();
+ if (numPrimitives == 0) return pinfoMB;
+
+ /* resize arrays */
+ sgrids.resize(numPrimitives);
+ prims.resize(numPrimitives);
+ /* second run to fill primrefs and SubGridBuildData arrays */
+ pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+
+ k = base.size();
+ size_t p_index = k;
+ PrimInfoMB pinfoMB(empty);
+ for (size_t j=r.begin(); j<r.end(); j++)
+ {
+ if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue;
+ const GridMesh::Grid &g = mesh->grid(j);
+
+ for (unsigned int y=0; y<g.resY-1u; y+=2)
+ for (unsigned int x=0; x<g.resX-1u; x+=2)
+ {
+ const PrimRefMB prim(mesh->linearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(p_index));
+ pinfoMB.add_primref(prim);
+ sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+ prims[p_index++] = prim;
+ }
+ }
+ return pinfoMB;
+ }, [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); });
+
+ assert(pinfoMB.size() == numPrimitives);
+ pinfoMB.time_range = t0t1;
+ return pinfoMB;
+ }
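+
+    /* Like the non-motion-blur grid path, this is a two-pass prefix-sum
+       scheme: the first pass only counts valid subgrids per mesh, the second
+       writes PrimRefMB and SubGridBuildData entries at the offsets the
+       prefix sum handed out, so both arrays end up densely packed without
+       any locking. */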
+
+ void build()
+ {
+ /* skip build for empty scene */
+ const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true);
+ if (numPrimitives == 0) { bvh->clear(); return; }
+
+ double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid");
+
+ //const size_t numTimeSteps = scene->getNumTimeSteps<GridMesh,true>();
+ //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1);
+ //if (numTimeSegments == 1)
+ // buildSingleSegment(numPrimitives);
+ //else
+ buildMultiSegment(numPrimitives);
+
+ /* clear temporary data for static geometry */
+ bvh->cleanup();
+ bvh->postBuild(t0);
+ }
+
+#if 0
+ void buildSingleSegment(size_t numPrimitives)
+ {
+ /* create primref array */
+ mvector<PrimRef> prims(scene->device,numPrimitives);
+ const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0);
+ /* early out if no valid primitives */
+ if (pinfo.size() == 0) { bvh->clear(); return; }
+
+ /* estimate acceleration structure size */
+ const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+ //TODO: check leaf_bytes
+ const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+ /* settings for BVH build */
+ GeneralBVHBuilder::Settings settings;
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+ settings.logBlockSize = bsr(sahBlockSize);
+ settings.minLeafSize = min(minLeafSize,maxLeafSize);
+ settings.maxLeafSize = maxLeafSize;
+ settings.travCost = travCost;
+ settings.intCost = intCost;
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+ /* build hierarchy */
+ auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+ (typename BVH::CreateAlloc(bvh),
+ typename BVH::AABBNodeMB::Create(),
+ typename BVH::AABBNodeMB::Set(),
+ CreateLeafGridMB<N>(scene,bvh,sgrids.data()),
+ bvh->scene->progressInterface,
+ prims.data(),pinfo,settings);
+
+ bvh->set(root.ref,root.lbounds,pinfo.size());
+ }
+#endif
+
+ void buildMultiSegment(size_t numPrimitives)
+ {
+ /* create primref array */
+ mvector<PrimRefMB> prims(scene->device,numPrimitives);
+ PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface);
+
+ /* early out if no valid primitives */
+ if (pinfo.size() == 0) { bvh->clear(); return; }
+
+ GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data());
+
+ /* estimate acceleration structure size */
+ const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+ //FIXME: check leaf_bytes
+ //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN<N>));
+ const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+ /* settings for BVH build */
+ BVHBuilderMSMBlur::Settings settings;
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxDepth;
+ settings.logBlockSize = bsr(sahBlockSize);
+ settings.minLeafSize = min(minLeafSize,maxLeafSize);
+ settings.maxLeafSize = maxLeafSize;
+ settings.travCost = travCost;
+ settings.intCost = intCost;
+ settings.singleLeafTimeSegment = false;
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+ /* build hierarchy */
+ auto root =
+ BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device,
+ recalculatePrimRef,
+ typename BVH::CreateAlloc(bvh),
+ typename BVH::AABBNodeMB4D::Create(),
+ typename BVH::AABBNodeMB4D::Set(),
+ CreateMSMBlurLeafGrid<N>(scene,bvh,sgrids.data()),
+ bvh->scene->progressInterface,
+ settings);
+ bvh->set(root.ref,root.lbounds,pinfo.num_time_segments);
+ }
+
+ void clear() {
+ }
+ };
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+ Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+#if defined(__AVX__)
+ Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+ Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); }
+#if defined(__AVX__)
+ Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+ Builder* BVH4VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+ int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+ int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+ return new BVHNBuilderMBlurSAH<4,UserGeometry,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+ }
+#if defined(__AVX__)
+ Builder* BVH8VirtualMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+ int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+ int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+ return new BVHNBuilderMBlurSAH<8,UserGeometry,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+ }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+#if defined(__AVX__)
+ Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+ Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4); }
+#if defined(__AVX__)
+ Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderMBlurSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8); }
+#endif
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp
new file mode 100644
index 0000000000..285b38c39d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp
@@ -0,0 +1,201 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/primrefgen_presplit.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N, typename Primitive>
+ struct CreateLeafSpatial
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+
+ __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {}
+
+ __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+ {
+ size_t n = set.size();
+ size_t items = Primitive::blocks(n);
+ size_t start = set.begin();
+ Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+ typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+ for (size_t i=0; i<items; i++) {
+ accel[i].fill(prims,start,set.end(),bvh->scene);
+ }
+ return node;
+ }
+
+ BVH* bvh;
+ };
+
+ template<int N, typename Mesh, typename Primitive, typename Splitter>
+ struct BVHNBuilderFastSpatialSAH : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ BVH* bvh;
+ Scene* scene;
+ Mesh* mesh;
+ mvector<PrimRef> prims0;
+ GeneralBVHBuilder::Settings settings;
+ const float splitFactor;
+ unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+ unsigned int numPreviousPrimitives = 0;
+
+ BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+ : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+ splitFactor(scene->device->max_spatial_split_replications) {}
+
+      BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+          splitFactor(bvh->device->max_spatial_split_replications), geomID_(geomID) {} // note: 'scene' is nullptr in this constructor, so the split factor must be read through bvh->device
+
+ // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+ void build()
+ {
+ /* we reset the allocator when the mesh size changed */
+ if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+ bvh->alloc.clear();
+ }
+
+ /* skip build for empty scene */
+ const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false);
+ numPreviousPrimitives = numOriginalPrimitives;
+ if (numOriginalPrimitives == 0) {
+ prims0.clear();
+ bvh->clear();
+ return;
+ }
+
+ const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID<Mesh,false>();
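+        /* Spatial splits replicate primitives and reserve
+           RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS of the 32-bit geomID for
+           that purpose; scenes whose largest geomID does not fit into the
+           remaining bits fall back to the presplit path, which does not need
+           that encoding. */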
+ const bool usePreSplits = scene->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)));
+ double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH"));
+
+ /* create primref array */
+ const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives));
+ prims0.resize(numSplitPrimitives);
+
+ /* enable os_malloc for two level build */
+ if (mesh)
+ bvh->alloc.setOSallocation(true);
+
+ NodeRef root(0);
+ PrimInfo pinfo;
+
+
+ if (likely(usePreSplits))
+ {
+ /* spatial presplit SAH BVH builder */
+ pinfo = mesh ?
+ createPrimRefArray_presplit<Mesh,Splitter>(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) :
+ createPrimRefArray_presplit<Mesh,Splitter>(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface);
+
+ const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+ const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+ /* call BVH builder */
+ root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafSpatial<N,Primitive>(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings);
+ }
+ else
+ {
+ /* standard spatial split SAH BVH builder */
+ pinfo = mesh ?
+ createPrimRefArray(mesh,geomID_,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface) :
+ createPrimRefArray(scene,Mesh::geom_type,false,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface);
+
+ Splitter splitter(scene);
+
+ const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+ const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+ settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+ /* call BVH builder */
+ root = BVHBuilderBinnedFastSpatialSAH::build<NodeRef>(
+ typename BVH::CreateAlloc(bvh),
+ typename BVH::AABBNode::Create2(),
+ typename BVH::AABBNode::Set2(),
+ CreateLeafSpatial<N,Primitive>(bvh),
+ splitter,
+ bvh->scene->progressInterface,
+ prims0.data(),
+ numSplitPrimitives,
+ pinfo,settings);
+
+ }
+
+ bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+ bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+ /* clear temporary data for static geometry */
+ if (scene && scene->isStaticAccel()) {
+ prims0.clear();
+ }
+ bvh->cleanup();
+ bvh->postBuild(t0);
+ }
+
+ void clear() {
+ prims0.clear();
+ }
+ };
+
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+ /************************************************************************************/
+
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+
+ Builder* BVH4Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+ Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+ Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+
+#if defined(__AVX__)
+ Builder* BVH8Triangle4SceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+ Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ Builder* BVH4Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory>((BVH4*)bvh,scene,4,1.0f,4,inf,mode); }
+
+#if defined(__AVX__)
+ Builder* BVH8Quad4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory>((BVH8*)bvh,scene,4,1.0f,4,inf,mode); }
+#endif
+
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp
new file mode 100644
index 0000000000..1a78f347ac
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp
@@ -0,0 +1,377 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder_twolevel.h"
+#include "bvh_statistics.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../common/scene_line_segments.h"
+#include "../common/scene_triangle_mesh.h"
+#include "../common/scene_quad_mesh.h"
+
+#define PROFILE 0
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N, typename Mesh, typename Primitive>
+ BVHNBuilderTwoLevel<N,Mesh,Primitive>::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold)
+ : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {}
+
+ template<int N, typename Mesh, typename Primitive>
+ BVHNBuilderTwoLevel<N,Mesh,Primitive>::~BVHNBuilderTwoLevel () {
+ }
+
+ // ===========================================================================
+ // ===========================================================================
+ // ===========================================================================
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNBuilderTwoLevel<N,Mesh,Primitive>::build()
+ {
+ /* delete some objects */
+ size_t num = scene->size();
+ if (num < bvh->objects.size()) {
+ parallel_for(num, bvh->objects.size(), [&] (const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ builders[i].reset();
+ delete bvh->objects[i]; bvh->objects[i] = nullptr;
+ }
+ });
+ }
+
+#if PROFILE
+ while(1)
+#endif
+ {
+ /* reset memory allocator */
+ bvh->alloc.reset();
+
+ /* skip build for empty scene */
+ const size_t numPrimitives = scene->getNumPrimitives(gtype,false);
+
+ if (numPrimitives == 0) {
+ prims.resize(0);
+ bvh->set(BVH::emptyNode,empty,0);
+ return;
+ }
+
+ /* calculate the size of the entire BVH */
+ const size_t numLeafBlocks = Primitive::blocks(numPrimitives);
+ const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N;
+ const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive));
+ bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+ double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel");
+
+ /* resize object array if scene got larger */
+ if (bvh->objects.size() < num) bvh->objects.resize(num);
+ if (builders.size() < num) builders.resize(num);
+ resizeRefsList ();
+ nextRef.store(0);
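+
+      /* nextRef is the shared cursor into the refs array: the attachBuildRefs
+         calls below are expected to reserve slots via atomic increments of
+         nextRef, which is why it is reset to 0 before the parallel pass and
+         why refs is truncated to nextRef afterwards. */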
+
+ /* create acceleration structures */
+ parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+ {
+ for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+ {
+ Mesh* mesh = scene->getSafe<Mesh>(objectID);
+
+ /* ignore meshes we do not support */
+ if (mesh == nullptr || mesh->numTimeSteps != 1)
+ continue;
+
+ if (isSmallGeometry(mesh)) {
+ setupSmallBuildRefBuilder (objectID, mesh);
+ } else {
+ setupLargeBuildRefBuilder (objectID, mesh);
+ }
+ }
+ });
+
+ /* parallel build of acceleration structures */
+ parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+ {
+ for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+ {
+ /* ignore if no triangle mesh or not enabled */
+ Mesh* mesh = scene->getSafe<Mesh>(objectID);
+ if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1)
+ continue;
+
+ builders[objectID]->attachBuildRefs (this);
+ }
+ });
+
+
+#if PROFILE
+ double d0 = getSeconds();
+#endif
+ /* fast path for single geometry scenes */
+ if (nextRef == 1) {
+ bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives);
+ }
+
+ else
+ {
+ /* open all large nodes */
+ refs.resize(nextRef);
+
+ /* this probably needs some more tuning */
+ const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR));
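+
+          /* extSize reserves headroom for build refs created by node opening:
+             at least SPLIT_MIN_EXT_SPACE entries, at least
+             SPLIT_MEMORY_RESERVE_SCALE times the current ref count, and at
+             least numPrimitives/SPLIT_MEMORY_RESERVE_FACTOR, whichever is
+             largest. */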
+
+#if !ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+#if ENABLE_OPEN_SEQUENTIAL
+ open_sequential(extSize);
+#endif
+ /* compute PrimRefs */
+ prims.resize(refs.size());
+#endif
+
+#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL
+ tbb::task_arena limited(min(32,(int)TaskScheduler::threadCount()));
+ limited.execute([&]
+#endif
+ {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+ const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+ PrimInfo pinfo(empty);
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ pinfo.add_center2(refs[i]);
+ }
+ return pinfo;
+ }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+
+#else
+ const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(), PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+ PrimInfo pinfo(empty);
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ pinfo.add_center2(refs[i]);
+ prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node);
+ }
+ return pinfo;
+ }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+#endif
+
+          /* skip if all objects were empty */
+ if (pinfo.size() == 0)
+ bvh->set(BVH::emptyNode,empty,0);
+
+ /* otherwise build toplevel hierarchy */
+ else
+ {
+ /* settings for BVH build */
+ GeneralBVHBuilder::Settings settings;
+ settings.branchingFactor = N;
+ settings.maxDepth = BVH::maxBuildDepthLeaf;
+ settings.logBlockSize = bsr(N);
+ settings.minLeafSize = 1;
+ settings.maxLeafSize = 1;
+ settings.travCost = 1.0f;
+ settings.intCost = 1.0f;
+ settings.singleThreadThreshold = singleThreadThreshold;
+
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+ refs.resize(extSize);
+
+ NodeRef root = BVHBuilderBinnedOpenMergeSAH::build<NodeRef,BuildRef>(
+ typename BVH::CreateAlloc(bvh),
+ typename BVH::AABBNode::Create2(),
+ typename BVH::AABBNode::Set2(),
+
+ [&] (const BuildRef* refs, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef {
+ assert(range.size() == 1);
+ return (NodeRef) refs[range.begin()].node;
+ },
+ [&] (BuildRef &bref, BuildRef *refs) -> size_t {
+ return openBuildRef(bref,refs);
+ },
+ [&] (size_t dn) { bvh->scene->progressMonitor(0); },
+ refs.data(),extSize,pinfo,settings);
+#else
+ NodeRef root = BVHBuilderBinnedSAH::build<NodeRef>(
+ typename BVH::CreateAlloc(bvh),
+ typename BVH::AABBNode::Create2(),
+ typename BVH::AABBNode::Set2(),
+
+ [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef {
+ assert(range.size() == 1);
+ return (NodeRef) prims[range.begin()].ID();
+ },
+ [&] (size_t dn) { bvh->scene->progressMonitor(0); },
+ prims.data(),pinfo,settings);
+#endif
+
+
+ bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives);
+ }
+ }
+#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL
+ );
+#endif
+
+ }
+
+ bvh->alloc.cleanup();
+ bvh->postBuild(t0);
+#if PROFILE
+ double d1 = getSeconds();
+ std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl;
+#endif
+ }
+
+ }
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNBuilderTwoLevel<N,Mesh,Primitive>::deleteGeometry(size_t geomID)
+ {
+ if (geomID >= bvh->objects.size()) return;
+ if (builders[geomID]) builders[geomID].reset();
+ delete bvh->objects [geomID]; bvh->objects [geomID] = nullptr;
+ }
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNBuilderTwoLevel<N,Mesh,Primitive>::clear()
+ {
+ for (size_t i=0; i<bvh->objects.size(); i++)
+ if (bvh->objects[i]) bvh->objects[i]->clear();
+
+ for (size_t i=0; i<builders.size(); i++)
+ if (builders[i]) builders[i].reset();
+
+ refs.clear();
+ }
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNBuilderTwoLevel<N,Mesh,Primitive>::open_sequential(const size_t extSize)
+ {
+ if (refs.size() == 0)
+ return;
+
+ refs.reserve(extSize);
+
+#if 1
+ for (size_t i=0;i<refs.size();i++)
+ {
+ NodeRef ref = refs[i].node;
+ if (ref.isAABBNode())
+ BVH::prefetch(ref);
+ }
+#endif
+
+ std::make_heap(refs.begin(),refs.end());
+ while (refs.size()+N-1 <= extSize)
+ {
+ std::pop_heap (refs.begin(),refs.end());
+ NodeRef ref = refs.back().node;
+ if (ref.isLeaf()) break;
+ refs.pop_back();
+
+ AABBNode* node = ref.getAABBNode();
+ for (size_t i=0; i<N; i++) {
+ if (node->child(i) == BVH::emptyNode) continue;
+ refs.push_back(BuildRef(node->bounds(i),node->child(i)));
+
+#if 1
+ NodeRef ref_pre = node->child(i);
+ if (ref_pre.isAABBNode())
+ ref_pre.prefetch();
+#endif
+ std::push_heap (refs.begin(),refs.end());
+ }
+ }
+ }
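+
+      /* The heap pops the build ref with the largest bounds_area first
+         (leaves are pinned to 0.0f precisely so they open last, see the
+         BuildRef constructors in bvh_builder_twolevel.h), and the loop stops
+         as soon as the top of the heap is a leaf or opening another node
+         could overflow extSize. */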
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/)
+ {
+ if (builders[objectID] == nullptr || // new mesh
+ dynamic_cast<RefBuilderSmall*>(builders[objectID].get()) == nullptr) // size change resulted in large->small change
+ {
+ builders[objectID].reset (new RefBuilderSmall(objectID));
+ }
+ }
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh)
+ {
+ if (bvh->objects[objectID] == nullptr || // new mesh
+ builders[objectID]->meshQualityChanged (mesh->quality) || // changed build quality
+ dynamic_cast<RefBuilderLarge*>(builders[objectID].get()) == nullptr) // size change resulted in small->large change
+ {
+ Builder* builder = nullptr;
+ delete bvh->objects[objectID];
+ createMeshAccel(objectID, builder);
+ builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality));
+ }
+ }
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+ }
+ Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+ }
+ Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+ }
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+ }
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+ Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+ }
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder);
+ }
+#endif
+
+#if defined(__AVX__)
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+ }
+ Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+ }
+ Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+ }
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+ }
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+ Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+ }
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+ return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder);
+ }
+#endif
+
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h
new file mode 100644
index 0000000000..8f57c3b406
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <type_traits>
+
+#include "bvh_builder_twolevel_internal.h"
+#include "bvh.h"
+#include "../common/primref.h"
+#include "../builders/priminfo.h"
+#include "../builders/primrefgen.h"
+
+/* new open/merge builder */
+#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1
+#define ENABLE_OPEN_SEQUENTIAL 0
+#define SPLIT_MEMORY_RESERVE_FACTOR 1000
+#define SPLIT_MEMORY_RESERVE_SCALE 2
+#define SPLIT_MIN_EXT_SPACE 1000
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N, typename Mesh, typename Primitive>
+ class BVHNBuilderTwoLevel : public Builder
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::NodeRef NodeRef;
+
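+ /* meshes with at most 4 primitives are "small": their primitives are packed
+    directly into leaves of the top-level BVH instead of building a separate
+    per-object BVH for them */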
+ __forceinline static bool isSmallGeometry(Mesh* mesh) {
+ return mesh->size() <= 4;
+ }
+
+ public:
+
+ typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+ struct BuildRef : public PrimRef
+ {
+ public:
+ __forceinline BuildRef () {}
+
+ __forceinline BuildRef (const BBox3fa& bounds, NodeRef node)
+ : PrimRef(bounds,(size_t)node), node(node)
+ {
+ if (node.isLeaf())
+ bounds_area = 0.0f;
+ else
+ bounds_area = area(this->bounds());
+ }
+
+ /* used by the open/merge bvh builder */
+ __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives)
+ : PrimRef(bounds,geomID,numPrimitives), node(node)
+ {
+ /* important for relative buildref ordering */
+ if (node.isLeaf())
+ bounds_area = 0.0f;
+ else
+ bounds_area = area(this->bounds());
+ }
+
+ __forceinline size_t size() const {
+ return primID();
+ }
+
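+ /* heap ordering used by open_sequential: leaves get bounds_area 0.0f above,
+    so inner nodes with large bounds are opened first */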
+ friend bool operator< (const BuildRef& a, const BuildRef& b) {
+ return a.bounds_area < b.bounds_area;
+ }
+
+ friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) {
+ return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }";
+ }
+
+ __forceinline unsigned int numPrimitives() const { return primID(); }
+
+ public:
+ NodeRef node;
+ float bounds_area;
+ };
+
+
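+ /* replaces a build ref pointing to an inner node by build refs for its up
+    to N children; the parent's primitive budget is split evenly among them */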
+ __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) {
+ if (bref.node.isLeaf())
+ {
+ refs[0] = bref;
+ return 1;
+ }
+ NodeRef ref = bref.node;
+ unsigned int geomID = bref.geomID();
+ unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1);
+ AABBNode* node = ref.getAABBNode();
+ size_t n = 0;
+ for (size_t i=0; i<N; i++) {
+ if (node->child(i) == BVH::emptyNode) continue;
+ refs[i] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims);
+ n++;
+ }
+ assert(n > 1);
+ return n;
+ }
+
+ /*! Constructor. */
+ BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD);
+
+ /*! Destructor */
+ ~BVHNBuilderTwoLevel ();
+
+ /*! builder entry point */
+ void build();
+ void deleteGeometry(size_t geomID);
+ void clear();
+
+ void open_sequential(const size_t extSize);
+
+ private:
+
+ class RefBuilderBase {
+ public:
+ virtual ~RefBuilderBase () {}
+ virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0;
+ virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0;
+ };
+
+ class RefBuilderSmall : public RefBuilderBase {
+ public:
+
+ RefBuilderSmall (size_t objectID)
+ : objectID_ (objectID) {}
+
+ void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) {
+
+ Mesh* mesh = topBuilder->scene->template getSafe<Mesh>(objectID_);
+ size_t meshSize = mesh->size();
+ assert(isSmallGeometry(mesh));
+
+ mvector<PrimRef> prefs(topBuilder->scene->device, meshSize);
+ auto pinfo = createPrimRefArray(mesh,objectID_,prefs,topBuilder->bvh->scene->progressInterface);
+
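+ /* Primitive::fill advances 'begin' by one block per call, so this loop
+    packs all primrefs of the small mesh into leaf blocks */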
+ size_t begin=0;
+ while (begin < pinfo.size())
+ {
+ Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment);
+ typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1);
+ accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene);
+
+ /* create build primitive */
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+ topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1);
+#else
+ topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node);
+#endif
+ }
+ assert(begin == pinfo.size());
+ }
+
+ bool meshQualityChanged (RTCBuildQuality /*currQuality*/) {
+ return false;
+ }
+
+ size_t objectID_;
+ };
+
+ class RefBuilderLarge : public RefBuilderBase {
+ public:
+
+ RefBuilderLarge (size_t objectID, const Ref<Builder>& builder, RTCBuildQuality quality)
+ : objectID_ (objectID), builder_ (builder), quality_ (quality) {}
+
+ void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder)
+ {
+ BVH* object = topBuilder->getBVH(objectID_); assert(object);
+
+ /* build object if it got modified */
+ if (topBuilder->isGeometryModified(objectID_))
+ builder_->build();
+
+ /* create build primitive */
+ if (!object->getBounds().empty())
+ {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+ Mesh* mesh = topBuilder->getMesh(objectID_);
+ topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size());
+#else
+ topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root);
+#endif
+ }
+ }
+
+ bool meshQualityChanged (RTCBuildQuality currQuality) {
+ return currQuality != quality_;
+ }
+
+ private:
+ size_t objectID_;
+ Ref<Builder> builder_;
+ RTCBuildQuality quality_;
+ };
+
+ void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+ void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+
+ BVH* getBVH (size_t objectID) {
+ return this->bvh->objects[objectID];
+ }
+ Mesh* getMesh (size_t objectID) {
+ return this->scene->template getSafe<Mesh>(objectID);
+ }
+ bool isGeometryModified (size_t objectID) {
+ return this->scene->isGeometryModified(objectID);
+ }
+
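+ /* counts the build refs required: one per primitive block for small meshes,
+    one per object BVH root for large ones; null meshes and meshes with
+    multiple time steps are skipped */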
+ void resizeRefsList ()
+ {
+ size_t num = parallel_reduce (size_t(0), scene->size(), size_t(0),
+ [this](const range<size_t>& r)->size_t {
+ size_t c = 0;
+ for (auto i=r.begin(); i<r.end(); ++i) {
+ Mesh* mesh = scene->getSafe<Mesh>(i);
+ if (mesh == nullptr || mesh->numTimeSteps != 1)
+ continue;
+ size_t meshSize = mesh->size();
+ c += isSmallGeometry(mesh) ? Primitive::blocks(meshSize) : 1;
+ }
+ return c;
+ },
+ std::plus<size_t>()
+ );
+
+ if (refs.size() < num) {
+ refs.resize(num);
+ }
+ }
+
+ void createMeshAccel (size_t geomID, Builder*& builder)
+ {
+ bvh->objects[geomID] = new BVH(Primitive::type,scene);
+ BVH* accel = bvh->objects[geomID];
+ auto mesh = scene->getSafe<Mesh>(geomID);
+ if (nullptr == mesh) {
+ throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geometry for this geomID has an unexpected type");
+ return;
+ }
+
+ __internal_two_level_builder__::MeshBuilder<N,Mesh,Primitive>()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder);
+ }
+
+ using BuilderList = std::vector<std::unique_ptr<RefBuilderBase>>;
+
+ BuilderList builders;
+ BVH* bvh;
+ Scene* scene;
+ mvector<BuildRef> refs;
+ mvector<PrimRef> prims;
+ std::atomic<int> nextRef;
+ const size_t singleThreadThreshold;
+ Geometry::GTypeMask gtype;
+ bool useMortonBuilder_ = false;
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h
new file mode 100644
index 0000000000..1c1ae8d6a7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h
@@ -0,0 +1,267 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
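+ /* per-ISA entry points of the second-level (per-mesh) builders; the COMMA
+    macro forwards commas through the DECLARE_ISA_FUNCTION argument list */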
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+ DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+
+ namespace isa
+ {
+
+ namespace __internal_two_level_builder__ {
+
+ template<int N, typename Mesh, typename Primitive>
+ struct MortonBuilder {};
+ template<>
+ struct MortonBuilder<4,TriangleMesh,Triangle4> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<4,TriangleMesh,Triangle4v> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<4,TriangleMesh,Triangle4i> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<4,QuadMesh,Quad4v> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<4,UserGeometry,Object> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<4,Instance,InstancePrimitive> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<8,TriangleMesh,Triangle4> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<8,TriangleMesh,Triangle4v> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<8,TriangleMesh,Triangle4i> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<8,QuadMesh,Quad4v> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<8,UserGeometry,Object> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct MortonBuilder<8,Instance,InstancePrimitive> {
+ MortonBuilder () {}
+ Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+ };
+
+ template<int N, typename Mesh, typename Primitive>
+ struct SAHBuilder {};
+ template<>
+ struct SAHBuilder<4,TriangleMesh,Triangle4> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<4,TriangleMesh,Triangle4v> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<4,TriangleMesh,Triangle4i> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<4,QuadMesh,Quad4v> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<4,UserGeometry,Object> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<4,Instance,InstancePrimitive> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<8,TriangleMesh,Triangle4> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<8,TriangleMesh,Triangle4v> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<8,TriangleMesh,Triangle4i> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<8,QuadMesh,Quad4v> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<8,UserGeometry,Object> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct SAHBuilder<8,Instance,InstancePrimitive> {
+ SAHBuilder () {}
+ Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+ };
+
+ template<int N, typename Mesh, typename Primitive>
+ struct RefitBuilder {};
+ template<>
+ struct RefitBuilder<4,TriangleMesh,Triangle4> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<4,TriangleMesh,Triangle4v> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<4,TriangleMesh,Triangle4i> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<4,QuadMesh,Quad4v> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<4,UserGeometry,Object> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<4,Instance,InstancePrimitive> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<8,TriangleMesh,Triangle4> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<8,TriangleMesh,Triangle4v> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<8,TriangleMesh,Triangle4i> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<8,QuadMesh,Quad4v> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<8,UserGeometry,Object> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+ };
+ template<>
+ struct RefitBuilder<8,Instance,InstancePrimitive> {
+ RefitBuilder () {}
+ Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+ };
+
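+ /* dispatches to the per-mesh builder: Morton when explicitly requested or
+    for RTC_BUILD_QUALITY_LOW, SAH for medium/high quality, and a refitting
+    SAH builder for RTC_BUILD_QUALITY_REFIT */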
+ template<int N, typename Mesh, typename Primitive>
+ struct MeshBuilder {
+ MeshBuilder () {}
+ void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) {
+ if(useMortonBuilder) {
+ builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype);
+ return;
+ }
+ switch (mesh->quality) {
+ case RTC_BUILD_QUALITY_LOW: builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+ case RTC_BUILD_QUALITY_MEDIUM:
+ case RTC_BUILD_QUALITY_HIGH: builder = SAHBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+ case RTC_BUILD_QUALITY_REFIT: builder = RefitBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype); break;
+ default: throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality");
+ }
+ }
+ };
+ }
+ }
+} \ No newline at end of file
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp
new file mode 100644
index 0000000000..a27be8bae8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp
@@ -0,0 +1,375 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_collider.h"
+#include "../geometry/triangle_triangle_intersector.h"
+
+namespace embree
+{
+ namespace isa
+ {
+#define CSTAT(x)
+
+ size_t parallel_depth_threshold = 3;
+ CSTAT(std::atomic<size_t> bvh_collide_traversal_steps(0));
+ CSTAT(std::atomic<size_t> bvh_collide_leaf_pairs(0));
+ CSTAT(std::atomic<size_t> bvh_collide_leaf_iterations(0));
+ CSTAT(std::atomic<size_t> bvh_collide_prim_intersections1(0));
+ CSTAT(std::atomic<size_t> bvh_collide_prim_intersections2(0));
+ CSTAT(std::atomic<size_t> bvh_collide_prim_intersections3(0));
+ CSTAT(std::atomic<size_t> bvh_collide_prim_intersections4(0));
+ CSTAT(std::atomic<size_t> bvh_collide_prim_intersections5(0));
+ CSTAT(std::atomic<size_t> bvh_collide_prim_intersections(0));
+
+ struct Collision
+ {
+ __forceinline Collision() {}
+
+ __forceinline Collision (unsigned geomID0, unsigned primID0, unsigned geomID1, unsigned primID1)
+ : geomID0(geomID0), primID0(primID0), geomID1(geomID1), primID1(primID1) {}
+
+ unsigned geomID0;
+ unsigned primID0;
+ unsigned geomID1;
+ unsigned primID1;
+ };
+
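+ /* box/node overlap tests: one box is tested against all N child boxes at
+    once; the returned bit mask has one bit set per overlapping child */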
+ template<int N>
+ __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN<N>::AABBNode& node1)
+ {
+ const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),node1.lower_x);
+ const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),node1.lower_y);
+ const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),node1.lower_z);
+ const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),node1.upper_x);
+ const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),node1.upper_y);
+ const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),node1.upper_z);
+ return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z));
+ }
+
+ template<int N>
+ __forceinline size_t overlap(const BBox3fa& box0, const BBox<Vec3<vfloat<N>>>& box1)
+ {
+ const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x),box1.lower.x);
+ const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y),box1.lower.y);
+ const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z),box1.lower.z);
+ const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x),box1.upper.x);
+ const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y),box1.upper.y);
+ const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z),box1.upper.z);
+ return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z));
+ }
+
+ template<int N>
+ __forceinline size_t overlap(const BBox<Vec3<vfloat<N>>>& box0, size_t i, const BBox<Vec3<vfloat<N>>>& box1)
+ {
+ const vfloat<N> lower_x = max(vfloat<N>(box0.lower.x[i]),box1.lower.x);
+ const vfloat<N> lower_y = max(vfloat<N>(box0.lower.y[i]),box1.lower.y);
+ const vfloat<N> lower_z = max(vfloat<N>(box0.lower.z[i]),box1.lower.z);
+ const vfloat<N> upper_x = min(vfloat<N>(box0.upper.x[i]),box1.upper.x);
+ const vfloat<N> upper_y = min(vfloat<N>(box0.upper.y[i]),box1.upper.y);
+ const vfloat<N> upper_z = min(vfloat<N>(box0.upper.z[i]),box1.upper.z);
+ return movemask((lower_x <= upper_x) & (lower_y <= upper_y) & (lower_z <= upper_z));
+ }
+
+ bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1)
+ {
+ CSTAT(bvh_collide_prim_intersections1++);
+ const TriangleMesh* mesh0 = scene0->get<TriangleMesh>(geomID0);
+ const TriangleMesh* mesh1 = scene1->get<TriangleMesh>(geomID1);
+ const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0);
+ const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1);
+
+ /* special culling for scene intersection with itself */
+ if (scene0 == scene1 && geomID0 == geomID1)
+ {
+ /* ignore self intersections */
+ if (primID0 == primID1)
+ return false;
+ }
+ CSTAT(bvh_collide_prim_intersections2++);
+
+ if (scene0 == scene1 && geomID0 == geomID1)
+ {
+ /* ignore intersection with topological neighbors */
+ const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]);
+ if (any(vint4(tri1.v[0]) == t0)) return false;
+ if (any(vint4(tri1.v[1]) == t0)) return false;
+ if (any(vint4(tri1.v[2]) == t0)) return false;
+ }
+ CSTAT(bvh_collide_prim_intersections3++);
+
+ const Vec3fa a0 = mesh0->vertex(tri0.v[0]);
+ const Vec3fa a1 = mesh0->vertex(tri0.v[1]);
+ const Vec3fa a2 = mesh0->vertex(tri0.v[2]);
+ const Vec3fa b0 = mesh1->vertex(tri1.v[0]);
+ const Vec3fa b1 = mesh1->vertex(tri1.v[1]);
+ const Vec3fa b2 = mesh1->vertex(tri1.v[2]);
+
+ return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2);
+ }
+
+ template<int N>
+ __forceinline void BVHNColliderUserGeom<N>::processLeaf(NodeRef node0, NodeRef node1)
+ {
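+ /* collisions are buffered and reported to the user callback in batches of
+    at most 16 */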
+ Collision collisions[16];
+ size_t num_collisions = 0;
+
+ size_t N0; Object* leaf0 = (Object*) node0.leaf(N0);
+ size_t N1; Object* leaf1 = (Object*) node1.leaf(N1);
+ for (size_t i=0; i<N0; i++) {
+ for (size_t j=0; j<N1; j++) {
+ const unsigned geomID0 = leaf0[i].geomID();
+ const unsigned primID0 = leaf0[i].primID();
+ const unsigned geomID1 = leaf1[j].geomID();
+ const unsigned primID1 = leaf1[j].primID();
+ if (this->scene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue;
+ collisions[num_collisions++] = Collision(geomID0,primID0,geomID1,primID1);
+ if (num_collisions == 16) {
+ this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions);
+ num_collisions = 0;
+ }
+ }
+ }
+ if (num_collisions)
+ this->callback(this->userPtr,(RTCCollision*)&collisions,num_collisions);
+ }
+
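+ /* simultaneous depth-first traversal of both hierarchies: leaf/leaf pairs
+    are handed to processLeaf, otherwise traversal descends into the node
+    with the larger surface area */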
+ template<int N>
+ void BVHNCollider<N>::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1)
+ {
+ CSTAT(bvh_collide_traversal_steps++);
+ if (unlikely(ref0.isLeaf())) {
+ if (unlikely(ref1.isLeaf())) {
+ CSTAT(bvh_collide_leaf_pairs++);
+ processLeaf(ref0,ref1);
+ return;
+ } else goto recurse_node1;
+
+ } else {
+ if (unlikely(ref1.isLeaf())) {
+ goto recurse_node0;
+ } else {
+ if (area(bounds0) > area(bounds1)) {
+ goto recurse_node0;
+ }
+ else {
+ goto recurse_node1;
+ }
+ }
+ }
+
+ {
+ recurse_node0:
+ AABBNode* node0 = ref0.getAABBNode();
+ size_t mask = overlap<N>(bounds1,*node0);
+#if 0
+ if (depth0 < parallel_depth_threshold)
+ {
+ parallel_for(size_t(N), [&] ( size_t i ) {
+ if (mask & ( 1 << i)) {
+ BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+ collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1);
+ }
+ });
+ }
+ else
+#endif
+ {
+ for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+ BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+ collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1);
+ }
+ }
+ return;
+ }
+
+ {
+ recurse_node1:
+ AABBNode* node1 = ref1.getAABBNode();
+ size_t mask = overlap<N>(bounds0,*node1);
+#if 0
+ if (depth1 < parallel_depth_threshold)
+ {
+ parallel_for(size_t(N), [&] ( size_t i ) {
+ if (mask & ( 1 << i)) {
+ BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+ collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1);
+ }
+ });
+ }
+ else
+#endif
+ {
+ for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+ BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+ collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1);
+ }
+ }
+ return;
+ }
+ }
+
+ template<int N>
+ void BVHNCollider<N>::split(const CollideJob& job, jobvector& jobs)
+ {
+ if (unlikely(job.ref0.isLeaf())) {
+ if (unlikely(job.ref1.isLeaf())) {
+ jobs.push_back(job);
+ return;
+ } else goto recurse_node1;
+ } else {
+ if (unlikely(job.ref1.isLeaf())) {
+ goto recurse_node0;
+ } else {
+ if (area(job.bounds0) > area(job.bounds1)) {
+ goto recurse_node0;
+ }
+ else {
+ goto recurse_node1;
+ }
+ }
+ }
+
+ {
+ recurse_node0:
+ const AABBNode* node0 = job.ref0.getAABBNode();
+ size_t mask = overlap<N>(job.bounds1,*node0);
+ for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+ jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1));
+ }
+ return;
+ }
+
+ {
+ recurse_node1:
+ const AABBNode* node1 = job.ref1.getAABBNode();
+ size_t mask = overlap<N>(job.bounds0,*node1);
+ for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+ jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1));
+ }
+ return;
+ }
+ }
+
+ template<int N>
+ void BVHNCollider<N>::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1)
+ {
+ CSTAT(bvh_collide_traversal_steps = 0);
+ CSTAT(bvh_collide_leaf_pairs = 0);
+ CSTAT(bvh_collide_leaf_iterations = 0);
+ CSTAT(bvh_collide_prim_intersections1 = 0);
+ CSTAT(bvh_collide_prim_intersections2 = 0);
+ CSTAT(bvh_collide_prim_intersections3 = 0);
+ CSTAT(bvh_collide_prim_intersections4 = 0);
+ CSTAT(bvh_collide_prim_intersections5 = 0);
+ CSTAT(bvh_collide_prim_intersections = 0);
+#if 0
+ collide_recurse(ref0,bounds0,ref1,bounds1,0,0);
+#else
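+ /* instead of a purely recursive traversal, split the root pair breadth-first
+    into up to M independent jobs and process those in parallel */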
+ const int M = 2048;
+ jobvector jobs[2];
+ jobs[0].reserve(M);
+ jobs[1].reserve(M);
+ jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0));
+ int source = 0;
+ int target = 1;
+
+ /* try to split job until job list is full */
+ while (jobs[source].size()+8 <= M)
+ {
+ for (size_t i=0; i<jobs[source].size(); i++)
+ {
+ const CollideJob& job = jobs[source][i];
+ size_t remaining = jobs[source].size()-i;
+ if (jobs[target].size()+remaining+8 > M) {
+ jobs[target].push_back(job);
+ } else {
+ split(job,jobs[target]);
+ }
+ }
+
+ /* stop splitting jobs if we reached only leaves and cannot make progress anymore */
+ if (jobs[target].size() == jobs[source].size())
+ break;
+
+ jobs[source].resize(0);
+ std::swap(source,target);
+ }
+
+ /* parallel processing of all jobs */
+ parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) {
+ CollideJob& j = jobs[source][i];
+ collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1);
+ });
+
+
+ CSTAT(PRINT(bvh_collide_traversal_steps));
+ CSTAT(PRINT(bvh_collide_leaf_pairs));
+ CSTAT(PRINT(bvh_collide_leaf_iterations));
+ CSTAT(PRINT(bvh_collide_prim_intersections1));
+ CSTAT(PRINT(bvh_collide_prim_intersections2));
+ CSTAT(PRINT(bvh_collide_prim_intersections3));
+ CSTAT(PRINT(bvh_collide_prim_intersections4));
+ CSTAT(PRINT(bvh_collide_prim_intersections5));
+ CSTAT(PRINT(bvh_collide_prim_intersections));
+ }
+
+ template<int N>
+ void BVHNColliderUserGeom<N>::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr)
+ {
+ BVHNColliderUserGeom<N>(bvh0->scene,bvh1->scene,callback,userPtr).
+ collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds());
+ }
+
+#if defined (EMBREE_LOWEST_ISA)
+ struct collision_regression_test : public RegressionTest
+ {
+ collision_regression_test(const char* name) : RegressionTest(name) {
+ registerRegressionTest(this);
+ }
+
+ bool run ()
+ {
+ bool passed = true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f),
+ Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0),
+ Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0),
+ Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false;
+ passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0),
+ Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false;
+ return passed;
+ }
+ };
+
+ collision_regression_test collision_regression("collision_regression_test");
+#endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// Collider Definitions
+ ////////////////////////////////////////////////////////////////////////////////
+
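+ /* these symbols back the public rtcCollide(scene0,scene1,callback,userPtr)
+    entry point; a sketch of a user callback (names are illustrative):
+
+      void onCollision(void* userPtr, RTCCollision* collisions, unsigned int num) {
+        for (unsigned int i = 0; i < num; i++)
+          printf("%u/%u vs %u/%u\n", collisions[i].geomID0, collisions[i].primID0,
+                                     collisions[i].geomID1, collisions[i].primID1);
+      }
+ */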
+ DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>);
+
+#if defined(__AVX__)
+ DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>);
+#endif
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h
new file mode 100644
index 0000000000..ac4f99c96a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h
@@ -0,0 +1,72 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/object.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N>
+ class BVHNCollider
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::AABBNode AABBNode;
+
+ struct CollideJob
+ {
+ CollideJob () {}
+
+ CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0,
+ NodeRef ref1, const BBox3fa& bounds1, size_t depth1)
+ : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {}
+
+ NodeRef ref0;
+ BBox3fa bounds0;
+ size_t depth0;
+ NodeRef ref1;
+ BBox3fa bounds1;
+ size_t depth1;
+ };
+
+ typedef vector_t<CollideJob, aligned_allocator<CollideJob,16>> jobvector;
+
+ void split(const CollideJob& job, jobvector& jobs);
+
+ public:
+ __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+ : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {}
+
+ public:
+ virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0;
+ void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1);
+ void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1);
+
+ protected:
+ Scene* scene0;
+ Scene* scene1;
+ RTCCollideFunc callback;
+ void* userPtr;
+ };
+
+ template<int N>
+ class BVHNColliderUserGeom : public BVHNCollider<N>
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::AABBNode AABBNode;
+
+ __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+ : BVHNCollider<N>(scene0,scene1,callback,userPtr) {}
+
+ virtual void processLeaf(NodeRef leaf0, NodeRef leaf1);
+ public:
+ static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr);
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h
new file mode 100644
index 0000000000..54021ca6eb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../common/isa.h"
+#include "../common/accel.h"
+#include "../common/scene.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+ /*! BVH instantiations */
+ class BVHFactory
+ {
+ public:
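+ /* BuildVariant roughly selects between the standard static builder, the
+    dynamic two-level builder, and the high-quality (spatial split) builder;
+    IntersectVariant selects between the fast and the numerically robust
+    traversal kernels */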
+ enum class BuildVariant { STATIC, DYNAMIC, HIGH_QUALITY };
+ enum class IntersectVariant { FAST, ROBUST };
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp
new file mode 100644
index 0000000000..ea6adc2717
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp
@@ -0,0 +1,330 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.h"
+#include "node_intersector1.h"
+#include "bvh_traverser1.h"
+
+#include "../geometry/intersector_iterators.h"
+#include "../geometry/triangle_intersector.h"
+#include "../geometry/trianglev_intersector.h"
+#include "../geometry/trianglev_mb_intersector.h"
+#include "../geometry/trianglei_intersector.h"
+#include "../geometry/quadv_intersector.h"
+#include "../geometry/quadi_intersector.h"
+#include "../geometry/curveNv_intersector.h"
+#include "../geometry/curveNi_intersector.h"
+#include "../geometry/curveNi_mb_intersector.h"
+#include "../geometry/linei_intersector.h"
+#include "../geometry/subdivpatch1_intersector.h"
+#include "../geometry/object_intersector.h"
+#include "../geometry/instance_intersector.h"
+#include "../geometry/subgrid_intersector.h"
+#include "../geometry/subgrid_mb_intersector.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N, int types, bool robust, typename PrimitiveIntersector1>
+ void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This,
+ RayHit& __restrict__ ray,
+ IntersectContext* __restrict__ context)
+ {
+ const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+
+ /* we may traverse an empty BVH in case all geometry was invalid */
+ if (bvh->root == BVH::emptyNode)
+ return;
+
+ /* perform per ray precalculations required by the primitive intersector */
+ Precalculations pre(ray, bvh);
+
+ /* stack state */
+ StackItemT<NodeRef> stack[stackSize]; // stack of nodes
+ StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+ StackItemT<NodeRef>* stackEnd = stack+stackSize;
+ stack[0].ptr = bvh->root;
+ stack[0].dist = neg_inf;
+
+
+ /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ if (!ray.valid()) return;
+#endif
+ /* verify correct input */
+ assert(ray.valid());
+ assert(ray.tnear() >= 0.0f);
+ assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+ /* load the ray into SIMD registers */
+ TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f));
+
+ /* initialize the node traverser */
+ BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser;
+
+ /* pop loop */
+ while (true) pop:
+ {
+ /* pop next node */
+ if (unlikely(stackPtr == stack)) break;
+ stackPtr--;
+ NodeRef cur = NodeRef(stackPtr->ptr);
+
+ /* if popped node is too far, pop next one */
+#if defined(__AVX512ER__)
+ /* much faster on KNL */
+ if (unlikely(any(vfloat<Nx>(*(float*)&stackPtr->dist) > tray.tfar)))
+ continue;
+#else
+ if (unlikely(*(float*)&stackPtr->dist > ray.tfar))
+ continue;
+#endif
+
+ /* downtraversal loop */
+ while (true)
+ {
+ /* intersect node */
+ size_t mask; vfloat<Nx> tNear;
+ STAT3(normal.trav_nodes,1,1,1);
+ bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+ if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; }
+
+ /* if no child is hit, pop next node */
+ if (unlikely(mask == 0))
+ goto pop;
+
+ /* select next child and push other children */
+ nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+ }
+
+ /* this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(normal.trav_leaves,1,1,1);
+ size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+ size_t lazy_node = 0;
+ PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node);
+ tray.tfar = ray.tfar;
+
+ /* push lazy node onto stack */
+ if (unlikely(lazy_node)) {
+ stackPtr->ptr = lazy_node;
+ stackPtr->dist = neg_inf;
+ stackPtr++;
+ }
+ }
+ }
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector1>
+ void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This,
+ Ray& __restrict__ ray,
+ IntersectContext* __restrict__ context)
+ {
+ const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+
+ /* we may traverse an empty BVH in case all geometry was invalid */
+ if (bvh->root == BVH::emptyNode)
+ return;
+
+ /* early out for already occluded rays */
+ if (unlikely(ray.tfar < 0.0f))
+ return;
+
+ /* perform per ray precalculations required by the primitive intersector */
+ Precalculations pre(ray, bvh);
+
+ /* stack state */
+ NodeRef stack[stackSize]; // stack of nodes that still need to get traversed
+ NodeRef* stackPtr = stack+1; // current stack pointer
+ NodeRef* stackEnd = stack+stackSize;
+ stack[0] = bvh->root;
+
+ /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ if (!ray.valid()) return;
+#endif
+
+ /* verify correct input */
+ assert(ray.valid());
+ assert(ray.tnear() >= 0.0f);
+ assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+ /* load the ray into SIMD registers */
+ TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f));
+
+ /* initialize the node traverser */
+ BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser;
+
+ /* pop loop */
+ while (true) pop:
+ {
+ /* pop next node */
+ if (unlikely(stackPtr == stack)) break;
+ stackPtr--;
+ NodeRef cur = (NodeRef)*stackPtr;
+
+ /* downtraversal loop */
+ while (true)
+ {
+ /* intersect node */
+ size_t mask; vfloat<Nx> tNear;
+ STAT3(shadow.trav_nodes,1,1,1);
+ bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+ if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; }
+
+ /* if no child is hit, pop next node */
+ if (unlikely(mask == 0))
+ goto pop;
+
+ /* select next child and push other children */
+ nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd);
+ }
+
+ /* this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(shadow.trav_leaves,1,1,1);
+ size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+ size_t lazy_node = 0;
+ if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) {
+ ray.tfar = neg_inf;
+ break;
+ }
+
+ /* push lazy node onto stack */
+ if (unlikely(lazy_node)) {
+ *stackPtr = (NodeRef)lazy_node;
+ stackPtr++;
+ }
+ }
+ }
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector1>
+ struct PointQueryDispatch
+ {
+ typedef typename PrimitiveIntersector1::Precalculations Precalculations;
+ typedef typename PrimitiveIntersector1::Primitive Primitive;
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+
+ static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+
+ /* for now, the wider AVX512KNL SIMD width is used only for the standard node types (BVH_AN1 and BVH_QN1) */
+ static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N;
+
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context)
+ {
+ const BVH* __restrict__ bvh = (const BVH*)This->ptr;
+
+ /* we may traverse an empty BVH in case all geometry was invalid */
+ if (bvh->root == BVH::emptyNode)
+ return false;
+
+ /* stack state */
+ StackItemT<NodeRef> stack[stackSize]; // stack of nodes
+ StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+ StackItemT<NodeRef>* stackEnd = stack+stackSize;
+ stack[0].ptr = bvh->root;
+ stack[0].dist = neg_inf;
+
+ /* verify correct input */
+ assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f));
+
+ /* load the point query into SIMD registers */
+ TravPointQuery<N> tquery(query->p, context->query_radius);
+
+ /* initialize the node traverser */
+ BVHNNodeTraverser1Hit<N, N, types> nodeTraverser;
+
+ bool changed = false;
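+ /* cull against the squared query radius so no square root is needed per
+    traversal step */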
+ float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE
+ ? query->radius * query->radius
+ : dot(context->query_radius, context->query_radius);
+
+ /* pop loop */
+ while (true) pop:
+ {
+ /* pop next node */
+ if (unlikely(stackPtr == stack)) break;
+ stackPtr--;
+ NodeRef cur = NodeRef(stackPtr->ptr);
+
+ /* if popped node is too far, pop next one */
+ if (unlikely(*(float*)&stackPtr->dist > cull_radius))
+ continue;
+
+ /* downtraversal loop */
+ while (true)
+ {
+ /* intersect node */
+ size_t mask; vfloat<N> tNear;
+ STAT3(point_query.trav_nodes,1,1,1);
+ bool nodeIntersected;
+ if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+ nodeIntersected = BVHNNodePointQuerySphere1<N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+ } else {
+ nodeIntersected = BVHNNodePointQueryAABB1 <N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+ }
+ if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; }
+
+ /* if no child is hit, pop next node */
+ if (unlikely(mask == 0))
+ goto pop;
+
+ /* select next child and push other children */
+ nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+ }
+
+ /* this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(point_query.trav_leaves,1,1,1);
+ size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+ size_t lazy_node = 0;
+ if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node))
+ {
+ changed = true;
+ tquery.rad = context->query_radius;
+ cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE
+ ? query->radius * query->radius
+ : dot(context->query_radius, context->query_radius);
+ }
+
+ /* push lazy node onto stack */
+ if (unlikely(lazy_node)) {
+ stackPtr->ptr = lazy_node;
+ stackPtr->dist = neg_inf;
+ stackPtr++;
+ }
+ }
+ return changed;
+ }
+ };
+
+ /* disable point queries for not yet supported geometry types */
+ template<int N, int types, bool robust>
+ struct PointQueryDispatch<N, types, robust, VirtualCurveIntersector1> {
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+ };
+
+ template<int N, int types, bool robust>
+ struct PointQueryDispatch<N, types, robust, SubdivPatch1Intersector1> {
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+ };
+
+ template<int N, int types, bool robust>
+ struct PointQueryDispatch<N, types, robust, SubdivPatch1MBIntersector1> {
+ static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+ };
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector1>
+ bool BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::pointQuery(
+ const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context)
+ {
+ return PointQueryDispatch<N, types, robust, PrimitiveIntersector1>::pointQuery(This, query, context);
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h
new file mode 100644
index 0000000000..1a269c319a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! BVH single ray intersector. */
+ template<int N, int types, bool robust, typename PrimitiveIntersector1>
+ class BVHNIntersector1
+ {
+ /* shortcuts for frequently used types */
+ typedef typename PrimitiveIntersector1::Precalculations Precalculations;
+ typedef typename PrimitiveIntersector1::Primitive Primitive;
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+
+ static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+
+ /* for now, the wider AVX512KNL SIMD width is used only for the standard node types (BVH_AN1 and BVH_QN1) */
+ static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N;
+
+ public:
+ static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context);
+ static void occluded (const Accel::Intersectors* This, Ray& ray, IntersectContext* context);
+ static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp
new file mode 100644
index 0000000000..989f7354fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.cpp"
+
+namespace embree
+{
+ namespace isa
+ {
+ int getISA() {
+ return VerifyMultiTargetLinking::getISA();
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// BVH4Intersector1 Definitions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >));
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >));
+
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >));
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >));
+
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMvIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMvMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMiMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<TriangleMvMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<TriangleMiMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMvIntersector1Moeller <4 COMMA true> > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Moeller <4 COMMA true> > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMvIntersector1Pluecker<4 COMMA true> > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<QuadMiMBIntersector1Moeller <4 COMMA true> > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersector1<QuadMiMBIntersector1Pluecker<4 COMMA true> > >));
+
+ IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>));
+ IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>));
+
+ IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<ObjectIntersector1<false>> >));
+ IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<ObjectIntersector1<true>> >));
+
+ IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >));
+ IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >));
+
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+ IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >));
+ IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >));
+
+ IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >));
+ //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >));
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h
new file mode 100644
index 0000000000..d764cc928d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+#include "node_intersector_frustum.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int K, bool robust>
+ struct TravRayK;
+
+    /*! BVH hybrid packet intersector. Optionally switches from packet to single-ray traversal once too few rays remain active. */
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single = true>
+ class BVHNIntersectorKHybrid
+ {
+      /* for now, the wider AVX512KNL SIMD extension is used for the standard node types only */
+ static const size_t Nx = types == BVH_AN1 ? vextend<N>::size : N;
+
+ /* shortcuts for frequently used types */
+ typedef typename PrimitiveIntersectorK::Precalculations Precalculations;
+ typedef typename PrimitiveIntersectorK::Primitive Primitive;
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::BaseNode BaseNode;
+ typedef typename BVH::AABBNode AABBNode;
+
+ static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+ static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth;
+
+ static const size_t switchThresholdIncoherent = \
+ (K==4) ? 3 :
+ (K==8) ? ((N==4) ? 5 : 7) :
+ (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal
+ 0;
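+      // The hybrid scheme counts the active rays in the packet; once the
+      // population drops below this threshold it is typically cheaper to finish
+      // the remaining rays one by one via intersect1/occluded1 below than to
+      // keep paying for mostly-empty SIMD lanes. The constants are per-(K,N)
+      // tuning values.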
+
+ private:
+ static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+ RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+ static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+ RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context);
+
+ public:
+ static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
+ static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+
+ static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context);
+ static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+
+ };
+
+ /*! BVH packet intersector. */
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK>
+ class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, false> {};
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h
new file mode 100644
index 0000000000..83d1fb4d3d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h
@@ -0,0 +1,295 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector_packet_stream.h"
+#include "node_intersector_frustum.h"
+#include "bvh_traverser_stream.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! BVH ray stream intersector. */
+ template<int N, int Nx, int types, bool robust, typename PrimitiveIntersector>
+ class BVHNIntersectorStream
+ {
+ static const int Nxd = (Nx == N) ? N : Nx/2;
+
+ /* shortcuts for frequently used types */
+ template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>;
+ template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK;
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::BaseNode BaseNode;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::AABBNodeMB AABBNodeMB;
+
+ template<int K>
+ __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays,
+ TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant)
+ {
+ const size_t numPackets = (numOctantRays+K-1)/K;
+
+ Vec3vf<K> tmp_min_rdir(pos_inf);
+ Vec3vf<K> tmp_max_rdir(neg_inf);
+ Vec3vf<K> tmp_min_org(pos_inf);
+ Vec3vf<K> tmp_max_org(neg_inf);
+ vfloat<K> tmp_min_dist(pos_inf);
+ vfloat<K> tmp_max_dist(neg_inf);
+
+ size_t m_active = 0;
+ for (size_t i = 0; i < numPackets; i++)
+ {
+ const vfloat<K> tnear = inputPackets[i]->tnear();
+ const vfloat<K> tfar = inputPackets[i]->tfar;
+ vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ m_valid &= inputPackets[i]->valid();
+#endif
+
+ m_active |= (size_t)movemask(m_valid) << (i*K);
+
+ vfloat<K> packet_min_dist = max(tnear, 0.0f);
+ vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
+ tmp_min_dist = min(tmp_min_dist, packet_min_dist);
+ tmp_max_dist = max(tmp_max_dist, packet_max_dist);
+
+ const Vec3vf<K>& org = inputPackets[i]->org;
+ const Vec3vf<K>& dir = inputPackets[i]->dir;
+
+ new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist);
+
+ tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf)));
+ tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf)));
+ tmp_min_org = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf)));
+ tmp_max_org = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf)));
+ }
+
+ m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1);
+
+
+ const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x),
+ reduce_min(tmp_min_rdir.y),
+ reduce_min(tmp_min_rdir.z));
+
+ const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x),
+ reduce_max(tmp_max_rdir.y),
+ reduce_max(tmp_max_rdir.z));
+
+ const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x),
+ reduce_min(tmp_min_org.y),
+ reduce_min(tmp_min_org.z));
+
+ const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x),
+ reduce_max(tmp_max_org.y),
+ reduce_max(tmp_max_org.z));
+
+ commonOctant =
+ (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) &&
+ (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) &&
+ (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f);
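+      // All rays share one octant iff, per axis, every reciprocal direction has
+      // the same sign, i.e. the reduced min/max rdir do not straddle zero. Only
+      // then is a single near/far plane selection valid for the whole stream,
+      // which the coherent frustum traversal relies on.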
+
+ const float frustum_min_dist = reduce_min(tmp_min_dist);
+ const float frustum_max_dist = reduce_max(tmp_max_dist);
+
+ frustum.init(reduced_min_origin, reduced_max_origin,
+ reduced_min_rdir, reduced_max_rdir,
+ frustum_min_dist, frustum_max_dist,
+ N);
+
+ return m_active;
+ }
+
+ template<int K>
+ __forceinline static size_t intersectAABBNodePacket(size_t m_active,
+ const TravRayKStream<K,robust>* packets,
+ const AABBNode* __restrict__ node,
+ size_t boxID,
+ const NearFarPrecalculations& nf)
+ {
+ assert(m_active);
+ const size_t startPacketID = bsf(m_active) / K;
+ const size_t endPacketID = bsr(m_active) / K;
+ size_t m_trav_active = 0;
+ for (size_t i = startPacketID; i <= endPacketID; i++)
+ {
+ const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf);
+ m_trav_active |= m_hit << (i*K);
+ }
+ return m_trav_active;
+ }
+
+ template<int K>
+ __forceinline static size_t traverseCoherentStream(size_t m_active,
+ TravRayKStream<K, robust>* packets,
+ const AABBNode* __restrict__ node,
+ const Frustum<robust>& frustum,
+ size_t* maskK,
+ vfloat<Nx>& dist)
+ {
+ size_t m_node_hit = intersectNodeFrustum<N,Nx>(node, frustum, dist);
+ const size_t first_index = bsf(m_active);
+ const size_t first_packetID = first_index / K;
+ const size_t first_rayID = first_index % K;
+ size_t m_first_hit = intersectNode1<N,Nx>(node, packets[first_packetID], first_rayID, frustum.nf);
+
+      /* this makes traversal independent of the ordering of rays */
+ size_t m_node = m_node_hit ^ m_first_hit;
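+      // Boxes hit by the conservative frustum test but missed by the first ray
+      // (the XOR above) are re-tested against the whole packet below and dropped
+      // if no ray hits them, so the surviving child set does not depend on which
+      // ray happens to come first.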
+ while (unlikely(m_node))
+ {
+ const size_t boxID = bscf(m_node);
+ const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf);
+ m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID);
+ maskK[boxID] = m_current;
+ }
+ return m_node_hit;
+ }
+
+ // TODO: explicit 16-wide path for KNL
+ template<int K>
+ __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
+ TravRayKStreamFast<K>* __restrict__ packets,
+ const AABBNode* __restrict__ node,
+ const NearFarPrecalculations& nf,
+ const int shiftTable[32])
+ {
+ const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+ const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+ const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+ const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+ const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+ const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+ assert(m_active);
+ vint<Nx> vmask(zero);
+ do
+ {
+ STAT3(shadow.trav_nodes,1,1,1);
+ const size_t rayID = bscf(m_active);
+ assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+ TravRayKStream<K,robust> &p = packets[rayID / K];
+ const size_t i = rayID % K;
+ const vint<Nx> bitmask(shiftTable[rayID]);
+
+#if defined (__aarch64__)
+ const vfloat<Nx> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+ const vfloat<Nx> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+ const vfloat<Nx> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+ const vfloat<Nx> tFarX = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+ const vfloat<Nx> tFarY = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+ const vfloat<Nx> tFarZ = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+#else
+ const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
+ const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
+ const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
+ const vfloat<Nx> tFarX = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
+ const vfloat<Nx> tFarY = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
+ const vfloat<Nx> tFarZ = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]);
+#endif
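+      // Both branches compute the same slab distances, tNear = (bmin - org) * rdir:
+      // AArch64 stores the pre-negated product neg_org_rdir = -org*rdir so one fused
+      // madd(bmin, rdir, neg_org_rdir) suffices, while the x86 path folds the
+      // subtraction into msub(bmin, rdir, org_rdir) = bmin*rdir - org*rdir.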
+
+ const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
+ const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i]));
+
+#if defined(__AVX512ER__)
+ const vboolx m_node((1 << N)-1);
+ const vbool<Nx> hit_mask = le(m_node, tNear, tFar);
+ vmask = mask_or(hit_mask, vmask, vmask, bitmask);
+#else
+ const vbool<Nx> hit_mask = tNear <= tFar;
+#if defined(__AVX2__)
+ vmask = vmask | (bitmask & vint<Nx>(hit_mask));
+#else
+ vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+#endif
+ } while(m_active);
+ return vmask;
+ }
+
+ template<int K>
+ __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
+ TravRayKStreamRobust<K>* __restrict__ packets,
+ const AABBNode* __restrict__ node,
+ const NearFarPrecalculations& nf,
+ const int shiftTable[32])
+ {
+ const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+ const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+ const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+ const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+ const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+ const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+ assert(m_active);
+ vint<Nx> vmask(zero);
+ do
+ {
+ STAT3(shadow.trav_nodes,1,1,1);
+ const size_t rayID = bscf(m_active);
+ assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+ TravRayKStream<K,robust> &p = packets[rayID / K];
+ const size_t i = rayID % K;
+ const vint<Nx> bitmask(shiftTable[rayID]);
+ const vfloat<Nx> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
+ const vfloat<Nx> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
+ const vfloat<Nx> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
+ const vfloat<Nx> tFarX = (bmaxX - p.org.x[i]) * p.rdir.x[i];
+ const vfloat<Nx> tFarY = (bmaxY - p.org.y[i]) * p.rdir.y[i];
+ const vfloat<Nx> tFarZ = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
+ const vfloat<Nx> tNear = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
+ const vfloat<Nx> tFar = mini(tFarX , tFarY , tFarZ, vfloat<Nx>(p.tfar[i]));
+ const float round_down = 1.0f-2.0f*float(ulp);
+ const float round_up = 1.0f+2.0f*float(ulp);
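+        // Robust variant: tNear/tFar come from a multiply by rdir, which can round
+        // the wrong way. Scaling by 1 -/+ 2 ulp widens the interval conservatively,
+        // so a true hit is never rejected by floating-point rounding alone.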
+#if defined(__AVX512ER__)
+ const vboolx m_node((1 << N)-1);
+ const vbool<Nx> hit_mask = le(m_node, round_down*tNear, round_up*tFar);
+ vmask = mask_or(hit_mask, vmask, vmask, bitmask);
+#else
+ const vbool<Nx> hit_mask = round_down*tNear <= round_up*tFar;
+#if defined(__AVX2__)
+ vmask = vmask | (bitmask & vint<Nx>(hit_mask));
+#else
+ vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+#endif
+ } while(m_active);
+ return vmask;
+ }
+
+
+ static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;
+
+ public:
+ static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+ static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+
+ private:
+ template<int K>
+ static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+ template<int K>
+ static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+ template<int K>
+ static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+ };
+
+
+ /*! BVH ray stream intersector with direct fallback to packets. */
+ template<int N, int Nx>
+ class BVHNIntersectorStreamPacketFallback
+ {
+ public:
+ static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+ static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+
+ private:
+ template<int K>
+ static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+ template<int K>
+ static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h
new file mode 100644
index 0000000000..cdeb923637
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/ray.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ class RayStreamFilter
+ {
+ public:
+ static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context);
+ static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context);
+ static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+ static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context);
+
+ static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context);
+ static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context);
+ static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+ static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context);
+
+ private:
+ template<int K, bool intersect>
+ static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context);
+
+ template<int K, bool intersect>
+ static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context);
+
+ template<int K, bool intersect>
+ static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+
+ template<int K, bool intersect>
+ static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context);
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h
new file mode 100644
index 0000000000..baa4a8d805
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+ /*! BVHN AABBNode */
+ template<typename NodeRef, int N>
+ struct AABBNode_t : public BaseNode_t<NodeRef, N>
+ {
+ using BaseNode_t<NodeRef,N>::children;
+
+ struct Create
+ {
+ __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const
+ {
+ AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear();
+ return NodeRef::encodeNode(node);
+ }
+ };
+
+ struct Set
+ {
+ __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const {
+ node.getAABBNode()->setRef(i,child);
+ node.getAABBNode()->setBounds(i,bounds);
+ }
+ };
+
+ struct Create2
+ {
+ template<typename BuildRecord>
+ __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+ {
+ AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear();
+ for (size_t i=0; i<num; i++) node->setBounds(i,children[i].bounds());
+ return NodeRef::encodeNode(node);
+ }
+ };
+
+ struct Set2
+ {
+ template<typename BuildRecord>
+ __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+ {
+ AABBNode_t* node = ref.getAABBNode();
+ for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+ return ref;
+ }
+ };
+
+ struct Set3
+ {
+ Set3 (FastAllocator* allocator, PrimRef* prims)
+ : allocator(allocator), prims(prims) {}
+
+ template<typename BuildRecord>
+ __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+ {
+ AABBNode_t* node = ref.getAABBNode();
+ for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+
+ if (unlikely(precord.alloc_barrier))
+ {
+ PrimRef* begin = &prims[precord.prims.begin()];
+ PrimRef* end = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!!
+ size_t bytes = (size_t)end - (size_t)begin;
+ allocator->addBlock(begin,bytes);
+ }
+
+ return ref;
+ }
+
+ FastAllocator* const allocator;
+ PrimRef* const prims;
+ };
+
+ /*! Clears the node. */
+ __forceinline void clear() {
+ lower_x = lower_y = lower_z = pos_inf;
+ upper_x = upper_y = upper_z = neg_inf;
+ BaseNode_t<NodeRef,N>::clear();
+ }
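+    // An "empty" child therefore carries inverted bounds (lower = +inf, upper = -inf):
+    // any slab test of the form tNear <= tFar fails for it, so cleared slots are
+    // rejected by the traversal box test without a separate validity flag.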
+
+    /*! Sets ID of child. */
+ __forceinline void setRef(size_t i, const NodeRef& ref) {
+ assert(i < N);
+ children[i] = ref;
+ }
+
+ /*! Sets bounding box of child. */
+ __forceinline void setBounds(size_t i, const BBox3fa& bounds)
+ {
+ assert(i < N);
+ lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+ upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+ }
+
+ /*! Sets bounding box and ID of child. */
+ __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds) {
+ setBounds(i,bounds);
+ children[i] = ref;
+ }
+
+ /*! Returns bounds of node. */
+ __forceinline BBox3fa bounds() const {
+ const Vec3fa lower(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z));
+ const Vec3fa upper(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z));
+ return BBox3fa(lower,upper);
+ }
+
+ /*! Returns bounds of specified child. */
+ __forceinline BBox3fa bounds(size_t i) const
+ {
+ assert(i < N);
+ const Vec3fa lower(lower_x[i],lower_y[i],lower_z[i]);
+ const Vec3fa upper(upper_x[i],upper_y[i],upper_z[i]);
+ return BBox3fa(lower,upper);
+ }
+
+ /*! Returns extent of bounds of specified child. */
+ __forceinline Vec3fa extend(size_t i) const {
+ return bounds(i).size();
+ }
+
+ /*! Returns bounds of all children (implemented later as specializations) */
+ __forceinline void bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const;
+
+ /*! swap two children of the node */
+ __forceinline void swap(size_t i, size_t j)
+ {
+ assert(i<N && j<N);
+ std::swap(children[i],children[j]);
+ std::swap(lower_x[i],lower_x[j]);
+ std::swap(lower_y[i],lower_y[j]);
+ std::swap(lower_z[i],lower_z[j]);
+ std::swap(upper_x[i],upper_x[j]);
+ std::swap(upper_y[i],upper_y[j]);
+ std::swap(upper_z[i],upper_z[j]);
+ }
+
+ /*! swap the children of two nodes */
+ __forceinline static void swap(AABBNode_t* a, size_t i, AABBNode_t* b, size_t j)
+ {
+ assert(i<N && j<N);
+ std::swap(a->children[i],b->children[j]);
+ std::swap(a->lower_x[i],b->lower_x[j]);
+ std::swap(a->lower_y[i],b->lower_y[j]);
+ std::swap(a->lower_z[i],b->lower_z[j]);
+ std::swap(a->upper_x[i],b->upper_x[j]);
+ std::swap(a->upper_y[i],b->upper_y[j]);
+ std::swap(a->upper_z[i],b->upper_z[j]);
+ }
+
+ /*! compacts a node (moves empty children to the end) */
+ __forceinline static void compact(AABBNode_t* a)
+ {
+ /* find right most filled node */
+ ssize_t j=N;
+ for (j=j-1; j>=0; j--)
+ if (a->child(j) != NodeRef::emptyNode)
+ break;
+
+ /* replace empty nodes with filled nodes */
+ for (ssize_t i=0; i<j; i++) {
+ if (a->child(i) == NodeRef::emptyNode) {
+ a->swap(i,j);
+ for (j=j-1; j>i; j--)
+ if (a->child(j) != NodeRef::emptyNode)
+ break;
+ }
+ }
+ }
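+    // Example: children [A, empty, B, empty] become [A, B, empty, empty] after a
+    // single swap; keeping all filled slots at the front lets consumers stop at
+    // the first empty child instead of scanning all N slots.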
+
+ /*! Returns reference to specified child */
+ __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; }
+ __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+
+ /*! output operator */
+ friend embree_ostream operator<<(embree_ostream o, const AABBNode_t& n)
+ {
+ o << "AABBNode { " << embree_endl;
+ o << " lower_x " << n.lower_x << embree_endl;
+ o << " upper_x " << n.upper_x << embree_endl;
+ o << " lower_y " << n.lower_y << embree_endl;
+ o << " upper_y " << n.upper_y << embree_endl;
+ o << " lower_z " << n.lower_z << embree_endl;
+ o << " upper_z " << n.upper_z << embree_endl;
+ o << " children = ";
+ for (size_t i=0; i<N; i++) o << n.children[i] << " ";
+ o << embree_endl;
+ o << "}" << embree_endl;
+ return o;
+ }
+
+ public:
+ vfloat<N> lower_x; //!< X dimension of lower bounds of all N children.
+ vfloat<N> upper_x; //!< X dimension of upper bounds of all N children.
+ vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children.
+ vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children.
+ vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children.
+ vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children.
+ };
+
+ template<>
+ __forceinline void AABBNode_t<NodeRefPtr<4>,4>::bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const {
+ transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower);
+ transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper);
+ }
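+  // The 4-wide specialization above turns the six per-axis SoA vectors into four
+  // per-child boxes (lower/upper as (x,y,z,0) vectors) using 4x4 register
+  // transposes, handy wherever whole per-child BBox values are needed.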
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h
new file mode 100644
index 0000000000..501f4bce5b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+ /*! Motion Blur AABBNode */
+ template<typename NodeRef, int N>
+ struct AABBNodeMB_t : public BaseNode_t<NodeRef, N>
+ {
+ using BaseNode_t<NodeRef,N>::children;
+ typedef BVHNodeRecord<NodeRef> NodeRecord;
+ typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+ typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+ struct Create
+ {
+ template<typename BuildRecord>
+ __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+ {
+ AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+ return NodeRef::encodeNode(node);
+ }
+ };
+
+ struct Set
+ {
+ template<typename BuildRecord>
+ __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+ {
+ AABBNodeMB_t* node = ref.getAABBNodeMB();
+
+ LBBox3fa bounds = empty;
+ for (size_t i=0; i<num; i++) {
+ node->setRef(i,children[i].ref);
+ node->setBounds(i,children[i].lbounds);
+ bounds.extend(children[i].lbounds);
+ }
+ return NodeRecordMB(ref,bounds);
+ }
+ };
+
+ struct SetTimeRange
+ {
+ __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {}
+
+ template<typename BuildRecord>
+ __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+ {
+ AABBNodeMB_t* node = ref.getAABBNodeMB();
+
+ LBBox3fa bounds = empty;
+ for (size_t i=0; i<num; i++) {
+ node->setRef(i, children[i].ref);
+ node->setBounds(i, children[i].lbounds, tbounds);
+ bounds.extend(children[i].lbounds);
+ }
+ return NodeRecordMB(ref,bounds);
+ }
+
+ BBox1f tbounds;
+ };
+
+ /*! Clears the node. */
+ __forceinline void clear() {
+ lower_x = lower_y = lower_z = vfloat<N>(pos_inf);
+ upper_x = upper_y = upper_z = vfloat<N>(neg_inf);
+ lower_dx = lower_dy = lower_dz = vfloat<N>(0.0f);
+ upper_dx = upper_dy = upper_dz = vfloat<N>(0.0f);
+ BaseNode_t<NodeRef,N>::clear();
+ }
+
+ /*! Sets ID of child. */
+ __forceinline void setRef(size_t i, NodeRef ref) {
+ children[i] = ref;
+ }
+
+ /*! Sets bounding box of child. */
+ __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i)
+ {
+ /*! for empty bounds we have to avoid inf-inf=nan */
+ BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX)));
+ BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX)));
+ bounds0 = bounds0.enlarge_by(4.0f*float(ulp));
+ bounds1 = bounds1.enlarge_by(4.0f*float(ulp));
+ Vec3fa dlower = bounds1.lower-bounds0.lower;
+ Vec3fa dupper = bounds1.upper-bounds0.upper;
+
+ lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z;
+ upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z;
+
+ lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z;
+ upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z;
+ }
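+    // The node stores the time-0 bounds plus per-corner deltas, so the box at
+    // time t is reconstructed as, e.g., lower_x(t) = lower_x + t*lower_dx
+    // (equivalently lerp(bounds0, bounds1, t)); the ulp enlargement above keeps
+    // the interpolated box conservative for empty and degenerate inputs.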
+
+ /*! Sets bounding box of child. */
+ __forceinline void setBounds(size_t i, const LBBox3fa& bounds) {
+ setBounds(i, bounds.bounds0, bounds.bounds1);
+ }
+
+ /*! Sets bounding box of child. */
+ __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) {
+ setBounds(i, bounds.global(tbounds));
+ }
+
+ /*! Sets bounding box and ID of child. */
+ __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) {
+ lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+ upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+ children[i] = ref;
+ }
+
+ /*! Sets bounding box and ID of child. */
+ __forceinline void set(size_t i, const NodeRecordMB4D& child)
+ {
+ setRef(i, child.ref);
+ setBounds(i, child.lbounds, child.dt);
+ }
+
+ /*! Return bounding box for time 0 */
+ __forceinline BBox3fa bounds0(size_t i) const {
+ return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]),
+ Vec3fa(upper_x[i],upper_y[i],upper_z[i]));
+ }
+
+ /*! Return bounding box for time 1 */
+ __forceinline BBox3fa bounds1(size_t i) const {
+ return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]),
+ Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i]));
+ }
+
+ /*! Returns bounds of node. */
+ __forceinline BBox3fa bounds() const {
+ return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)),
+ reduce_min(min(lower_y,lower_y+lower_dy)),
+ reduce_min(min(lower_z,lower_z+lower_dz))),
+ Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)),
+ reduce_max(max(upper_y,upper_y+upper_dy)),
+ reduce_max(max(upper_z,upper_z+upper_dz))));
+ }
+
+ /*! Return bounding box of child i */
+ __forceinline BBox3fa bounds(size_t i) const {
+ return merge(bounds0(i),bounds1(i));
+ }
+
+ /*! Return linear bounding box of child i */
+ __forceinline LBBox3fa lbounds(size_t i) const {
+ return LBBox3fa(bounds0(i),bounds1(i));
+ }
+
+ /*! Return bounding box of child i at specified time */
+ __forceinline BBox3fa bounds(size_t i, float time) const {
+ return lerp(bounds0(i),bounds1(i),time);
+ }
+
+ /*! Returns the expected surface area when randomly sampling the time. */
+ __forceinline float expectedHalfArea(size_t i) const {
+ return lbounds(i).expectedHalfArea();
+ }
+
+ /*! Returns the expected surface area when randomly sampling the time. */
+ __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const {
+ return lbounds(i).expectedHalfArea(t0t1);
+ }
+
+ /*! swap two children of the node */
+ __forceinline void swap(size_t i, size_t j)
+ {
+ assert(i<N && j<N);
+ std::swap(children[i],children[j]);
+
+ std::swap(lower_x[i],lower_x[j]);
+ std::swap(upper_x[i],upper_x[j]);
+ std::swap(lower_y[i],lower_y[j]);
+ std::swap(upper_y[i],upper_y[j]);
+ std::swap(lower_z[i],lower_z[j]);
+ std::swap(upper_z[i],upper_z[j]);
+
+ std::swap(lower_dx[i],lower_dx[j]);
+ std::swap(upper_dx[i],upper_dx[j]);
+ std::swap(lower_dy[i],lower_dy[j]);
+ std::swap(upper_dy[i],upper_dy[j]);
+ std::swap(lower_dz[i],lower_dz[j]);
+ std::swap(upper_dz[i],upper_dz[j]);
+ }
+
+ /*! compacts a node (moves empty children to the end) */
+ __forceinline static void compact(AABBNodeMB_t* a)
+ {
+ /* find right most filled node */
+ ssize_t j=N;
+ for (j=j-1; j>=0; j--)
+ if (a->child(j) != NodeRef::emptyNode)
+ break;
+
+ /* replace empty nodes with filled nodes */
+ for (ssize_t i=0; i<j; i++) {
+ if (a->child(i) == NodeRef::emptyNode) {
+ a->swap(i,j);
+ for (j=j-1; j>i; j--)
+ if (a->child(j) != NodeRef::emptyNode)
+ break;
+ }
+ }
+ }
+
+ /*! Returns reference to specified child */
+ __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; }
+ __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+
+ /*! stream output operator */
+ friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB_t& n)
+ {
+ cout << "AABBNodeMB {" << embree_endl;
+ for (size_t i=0; i<N; i++)
+ {
+ const BBox3fa b0 = n.bounds0(i);
+ const BBox3fa b1 = n.bounds1(i);
+ cout << " child" << i << " { " << embree_endl;
+ cout << " bounds0 = " << b0 << ", " << embree_endl;
+ cout << " bounds1 = " << b1 << ", " << embree_endl;
+ cout << " }";
+ }
+ cout << "}";
+ return cout;
+ }
+
+ public:
+ vfloat<N> lower_x; //!< X dimension of lower bounds of all N children.
+ vfloat<N> upper_x; //!< X dimension of upper bounds of all N children.
+ vfloat<N> lower_y; //!< Y dimension of lower bounds of all N children.
+ vfloat<N> upper_y; //!< Y dimension of upper bounds of all N children.
+ vfloat<N> lower_z; //!< Z dimension of lower bounds of all N children.
+ vfloat<N> upper_z; //!< Z dimension of upper bounds of all N children.
+
+      vfloat<N> lower_dx;        //!< X dimension of the motion delta (time1 minus time0) of the lower bounds of all N children.
+      vfloat<N> upper_dx;        //!< X dimension of the motion delta of the upper bounds of all N children.
+      vfloat<N> lower_dy;        //!< Y dimension of the motion delta of the lower bounds of all N children.
+      vfloat<N> upper_dy;        //!< Y dimension of the motion delta of the upper bounds of all N children.
+      vfloat<N> lower_dz;        //!< Z dimension of the motion delta of the lower bounds of all N children.
+      vfloat<N> upper_dz;        //!< Z dimension of the motion delta of the upper bounds of all N children.
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h
new file mode 100644
index 0000000000..e968bbbc39
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h
@@ -0,0 +1,107 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_aabb_mb.h"
+
+namespace embree
+{
+ /*! Aligned 4D Motion Blur Node */
+ template<typename NodeRef, int N>
+ struct AABBNodeMB4D_t : public AABBNodeMB_t<NodeRef, N>
+ {
+ using BaseNode_t<NodeRef,N>::children;
+ using AABBNodeMB_t<NodeRef,N>::set;
+
+ typedef BVHNodeRecord<NodeRef> NodeRecord;
+ typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+ typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+ struct Create
+ {
+ template<typename BuildRecord>
+ __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const
+ {
+ if (hasTimeSplits)
+ {
+ AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear();
+ return NodeRef::encodeNode(node);
+ }
+ else
+ {
+ AABBNodeMB_t<NodeRef,N>* node = (AABBNodeMB_t<NodeRef,N>*) alloc.malloc0(sizeof(AABBNodeMB_t<NodeRef,N>),NodeRef::byteNodeAlignment); node->clear();
+ return NodeRef::encodeNode(node);
+ }
+ }
+ };
+
+ struct Set
+ {
+ template<typename BuildRecord>
+ __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const
+ {
+ if (likely(ref.isAABBNodeMB())) {
+ for (size_t i=0; i<num; i++)
+ ref.getAABBNodeMB()->set(i, children[i]);
+ } else {
+ for (size_t i=0; i<num; i++)
+ ref.getAABBNodeMB4D()->set(i, children[i]);
+ }
+ }
+ };
+
+ /*! Clears the node. */
+ __forceinline void clear() {
+ lower_t = vfloat<N>(pos_inf);
+ upper_t = vfloat<N>(neg_inf);
+ AABBNodeMB_t<NodeRef,N>::clear();
+ }
+
+ /*! Sets bounding box of child. */
+ __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds)
+ {
+ AABBNodeMB_t<NodeRef,N>::setBounds(i, bounds.global(tbounds));
+ lower_t[i] = tbounds.lower;
+ upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper;
+ }
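+    // Child i is active for times in the half-open interval [lower_t, upper_t).
+    // Nudging an exact upper bound of 1.0 up by one ulp keeps rays sampled at
+    // precisely time 1.0 inside the interval instead of falling through all
+    // children.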
+
+ /*! Sets bounding box and ID of child. */
+ __forceinline void set(size_t i, const NodeRecordMB4D& child) {
+ AABBNodeMB_t<NodeRef,N>::setRef(i,child.ref);
+ setBounds(i, child.lbounds, child.dt);
+ }
+
+ /*! Returns the expected surface area when randomly sampling the time. */
+ __forceinline float expectedHalfArea(size_t i) const {
+ return AABBNodeMB_t<NodeRef,N>::lbounds(i).expectedHalfArea(timeRange(i));
+ }
+
+ /*! returns time range for specified child */
+ __forceinline BBox1f timeRange(size_t i) const {
+ return BBox1f(lower_t[i],upper_t[i]);
+ }
+
+ /*! stream output operator */
+ friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n)
+ {
+ cout << "AABBNodeMB4D {" << embree_endl;
+ for (size_t i=0; i<N; i++)
+ {
+ const BBox3fa b0 = n.bounds0(i);
+ const BBox3fa b1 = n.bounds1(i);
+ cout << " child" << i << " { " << embree_endl;
+ cout << " bounds0 = " << lerp(b0,b1,n.lower_t[i]) << ", " << embree_endl;
+ cout << " bounds1 = " << lerp(b0,b1,n.upper_t[i]) << ", " << embree_endl;
+ cout << " time_bounds = " << n.lower_t[i] << ", " << n.upper_t[i] << embree_endl;
+ cout << " }";
+ }
+ cout << "}";
+ return cout;
+ }
+
+ public:
+ vfloat<N> lower_t; //!< time dimension of lower bounds of all N children
+ vfloat<N> upper_t; //!< time dimension of upper bounds of all N children
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h
new file mode 100644
index 0000000000..8268f3b932
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h
@@ -0,0 +1,43 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_ref.h"
+
+namespace embree
+{
+
+ /*! BVHN Base Node */
+ template<typename NodeRef, int N>
+ struct BaseNode_t
+ {
+ /*! Clears the node. */
+ __forceinline void clear()
+ {
+ for (size_t i=0; i<N; i++)
+ children[i] = NodeRef::emptyNode;
+ }
+
+ /*! Returns reference to specified child */
+ __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; }
+ __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+
+ /*! verifies the node */
+ __forceinline bool verify() const
+ {
+ for (size_t i=0; i<N; i++) {
+ if (child(i) == NodeRef::emptyNode) {
+ for (; i<N; i++) {
+ if (child(i) != NodeRef::emptyNode)
+ return false;
+ }
+ break;
+ }
+ }
+ return true;
+ }
+
+ NodeRef children[N]; //!< Pointer to the N children (can be a node or leaf)
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h
new file mode 100644
index 0000000000..fa7cc08211
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+ /*! Node with unaligned bounds */
+ template<typename NodeRef, int N>
+ struct OBBNode_t : public BaseNode_t<NodeRef, N>
+ {
+ using BaseNode_t<NodeRef,N>::children;
+
+ struct Create
+ {
+ __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+ {
+ OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear();
+ return NodeRef::encodeNode(node);
+ }
+ };
+
+ struct Set
+ {
+ __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const {
+ node.ungetAABBNode()->setRef(i,child);
+ node.ungetAABBNode()->setBounds(i,bounds);
+ }
+ };
+
+ /*! Clears the node. */
+ __forceinline void clear()
+ {
+ naabb.l.vx = Vec3fa(nan);
+ naabb.l.vy = Vec3fa(nan);
+ naabb.l.vz = Vec3fa(nan);
+ naabb.p = Vec3fa(nan);
+ BaseNode_t<NodeRef,N>::clear();
+ }
+
+ /*! Sets bounding box. */
+ __forceinline void setBounds(size_t i, const OBBox3fa& b)
+ {
+ assert(i < N);
+
+ AffineSpace3fa space = b.space;
+ space.p -= b.bounds.lower;
+ space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space;
+
+ naabb.l.vx.x[i] = space.l.vx.x;
+ naabb.l.vx.y[i] = space.l.vx.y;
+ naabb.l.vx.z[i] = space.l.vx.z;
+
+ naabb.l.vy.x[i] = space.l.vy.x;
+ naabb.l.vy.y[i] = space.l.vy.y;
+ naabb.l.vy.z[i] = space.l.vy.z;
+
+ naabb.l.vz.x[i] = space.l.vz.x;
+ naabb.l.vz.y[i] = space.l.vz.y;
+ naabb.l.vz.z[i] = space.l.vz.z;
+
+ naabb.p.x[i] = space.p.x;
+ naabb.p.y[i] = space.p.y;
+ naabb.p.z[i] = space.p.z;
+ }
+
+ /*! Sets ID of child. */
+ __forceinline void setRef(size_t i, const NodeRef& ref) {
+ assert(i < N);
+ children[i] = ref;
+ }
+
+ /*! Returns the extent of the bounds of the ith child */
+ __forceinline Vec3fa extent(size_t i) const {
+ assert(i<N);
+ const Vec3fa vx(naabb.l.vx.x[i],naabb.l.vx.y[i],naabb.l.vx.z[i]);
+ const Vec3fa vy(naabb.l.vy.x[i],naabb.l.vy.y[i],naabb.l.vy.z[i]);
+ const Vec3fa vz(naabb.l.vz.x[i],naabb.l.vz.y[i],naabb.l.vz.z[i]);
+ return rsqrt(vx*vx + vy*vy + vz*vz);
+ }
+
+ /*! Returns reference to specified child */
+ __forceinline NodeRef& child(size_t i) { assert(i<N); return children[i]; }
+ __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+
+ /*! output operator */
+ friend embree_ostream operator<<(embree_ostream o, const OBBNode_t& n)
+ {
+ o << "UnAABBNode { " << n.naabb << " } " << embree_endl;
+ return o;
+ }
+
+ public:
+ AffineSpace3vf<N> naabb; //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space)
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h
new file mode 100644
index 0000000000..834cf5ec28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h
@@ -0,0 +1,90 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+ template<typename NodeRef, int N>
+ struct OBBNodeMB_t : public BaseNode_t<NodeRef, N>
+ {
+ using BaseNode_t<NodeRef,N>::children;
+
+ struct Create
+ {
+ __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+ {
+ OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+ return NodeRef::encodeNode(node);
+ }
+ };
+
+ struct Set
+ {
+ __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const {
+ node.ungetAABBNodeMB()->setRef(i,child);
+ node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt));
+ }
+ };
+
+ /*! Clears the node. */
+ __forceinline void clear()
+ {
+ space0 = one;
+ //b0.lower = b0.upper = Vec3fa(nan);
+ b1.lower = b1.upper = Vec3fa(nan);
+ BaseNode_t<NodeRef,N>::clear();
+ }
+
+ /*! Sets space and bounding boxes. */
+ __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) {
+ setBounds(i,space,lbounds.bounds0,lbounds.bounds1);
+ }
+
+ /*! Sets space and bounding boxes. */
+ __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c)
+ {
+ assert(i < N);
+
+ AffineSpace3fa space = s0;
+ space.p -= a.lower;
+ Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower);
+ space = AffineSpace3fa::scale(scale)*space;
+ BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale);
+ BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale);
+
+ space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z;
+ space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z;
+ space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z;
+ space0.p .x[i] = space.p .x; space0.p .y[i] = space.p .y; space0.p .z[i] = space.p .z;
+
+ /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z;
+ b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/
+
+ b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z;
+ b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z;
+ }
+
+ /*! Sets ID of child. */
+ __forceinline void setRef(size_t i, const NodeRef& ref) {
+ assert(i < N);
+ children[i] = ref;
+ }
+
+ /*! Returns the extent of the bounds of the ith child */
+ __forceinline Vec3fa extent0(size_t i) const {
+ assert(i < N);
+ const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]);
+ const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]);
+ const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]);
+ return rsqrt(vx*vx + vy*vy + vz*vz);
+ }
+
+ public:
+ AffineSpace3vf<N> space0;
+ //BBox3vf<N> b0; // these are the unit bounds
+ BBox3vf<N> b1;
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h
new file mode 100644
index 0000000000..5212821f3f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h
@@ -0,0 +1,265 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+ /*! BVHN Quantized Node */
+ template<int N>
+ struct __aligned(8) QuantizedBaseNode_t
+ {
+ typedef unsigned char T;
+ static const T MIN_QUAN = 0;
+ static const T MAX_QUAN = 255;
+
+ /*! Clears the node. */
+ __forceinline void clear() {
+ for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN;
+ for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN;
+ }
+
+ /*! Returns bounds of specified child. */
+ __forceinline BBox3fa bounds(size_t i) const
+ {
+ assert(i < N);
+ const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x),
+ madd(scale.y,(float)lower_y[i],start.y),
+ madd(scale.z,(float)lower_z[i],start.z));
+ const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x),
+ madd(scale.y,(float)upper_y[i],start.y),
+ madd(scale.z,(float)upper_z[i],start.z));
+ return BBox3fa(lower,upper);
+ }
+
+ /*! Returns extent of bounds of specified child. */
+ __forceinline Vec3fa extent(size_t i) const {
+ return bounds(i).size();
+ }
+
+ static __forceinline void init_dim(const vfloat<N> &lower,
+ const vfloat<N> &upper,
+ T lower_quant[N],
+ T upper_quant[N],
+ float &start,
+ float &scale)
+ {
+ /* quantize bounds */
+ const vbool<N> m_valid = lower != vfloat<N>(pos_inf);
+ const float minF = reduce_min(lower);
+ const float maxF = reduce_max(upper);
+ float diff = (1.0f+2.0f*float(ulp))*(maxF - minF);
+ float decode_scale = diff / float(MAX_QUAN);
+ if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero
+ assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF);
+ const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f;
+ vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN);
+ vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN);
+
+ /* lower/upper correction */
+ vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower;
+ vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper;
+ ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN);
+ iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN);
+
+ /* disable invalid lanes */
+ ilower = select(m_valid,ilower,MAX_QUAN);
+ iupper = select(m_valid,iupper,MIN_QUAN);
+
+ /* store as uchar to memory */
+ vint<N>::store(lower_quant,ilower);
+ vint<N>::store(upper_quant,iupper);
+ start = minF;
+ scale = decode_scale;
+
+#if defined(DEBUG)
+ vfloat<N> extract_lower( vint<N>::loadu(lower_quant) );
+ vfloat<N> extract_upper( vint<N>::loadu(upper_quant) );
+ vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF);
+ vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF);
+ assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid));
+ assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid));
+#endif
+ }
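+    // Illustrative round trip, assuming minF = 2 and maxF = 10 (so diff ~ 8 and
+    // encode_scale ~ 255/8 = 31.875): a lower bound of 5 quantizes to
+    // floor(3*31.875) = 95 and decodes to 2 + 95*8/255 ~ 4.98 <= 5, i.e. the
+    // stored box only ever grows, which the DEBUG block above re-checks per lane.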
+
+ __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node)
+ {
+ init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x);
+ init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y);
+ init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z);
+ }
+
+ __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); }
+
+#if defined(__AVX512F__) // KNL
+ __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); }
+#endif
+ __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); }
+
+ __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); }
+
+ __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); }
+
+ __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); }
+
+ __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); }
+
+ __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); }
+
+ template <int M>
+ __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); }
+
+#if defined(__AVX512F__)
+ __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); }
+ __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); }
+ __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); }
+#endif
+
+ union {
+ struct {
+ T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children
+ T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children
+ T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children
+ T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children
+ T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children
+ T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children
+ };
+ T all_planes[6*N];
+ };
+
+ Vec3f start;
+ Vec3f scale;
+
+ friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n)
+ {
+ o << "QuantizedBaseNode { " << embree_endl;
+ o << " start " << n.start << embree_endl;
+ o << " scale " << n.scale << embree_endl;
+ o << " lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl;
+ o << " upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl;
+ o << " lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl;
+ o << " upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl;
+ o << " lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl;
+ o << " upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl;
+ o << "}" << embree_endl;
+ return o;
+ }
+
+ };
+
+ template<typename NodeRef, int N>
+ struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N>
+ {
+ using BaseNode_t<NodeRef,N>::children;
+ using QuantizedBaseNode_t<N>::lower_x;
+ using QuantizedBaseNode_t<N>::upper_x;
+ using QuantizedBaseNode_t<N>::lower_y;
+ using QuantizedBaseNode_t<N>::upper_y;
+ using QuantizedBaseNode_t<N>::lower_z;
+ using QuantizedBaseNode_t<N>::upper_z;
+ using QuantizedBaseNode_t<N>::start;
+ using QuantizedBaseNode_t<N>::scale;
+ using QuantizedBaseNode_t<N>::init_dim;
+
+ __forceinline void setRef(size_t i, const NodeRef& ref) {
+ assert(i < N);
+ children[i] = ref;
+ }
+
+ struct Create2
+ {
+ template<typename BuildRecord>
+ __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const
+ {
+ __aligned(64) AABBNode_t<NodeRef,N> node;
+ node.clear();
+ for (size_t i=0; i<n; i++) {
+ node.setBounds(i,children[i].bounds());
+ }
+ QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment);
+ qnode->init(node);
+
+ return (size_t)qnode | NodeRef::tyQuantizedNode;
+ }
+ };
+
+ struct Set2
+ {
+ template<typename BuildRecord>
+ __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+ {
+ QuantizedNode_t* node = ref.quantizedNode();
+ for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+ return ref;
+ }
+ };
+
+ __forceinline void init(AABBNode_t<NodeRef,N>& node)
+ {
+ for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode;
+ init_dim(node);
+ }
+
+ };
+
+  /*! BVHN Quantized Motion Blur Node */
+ template<int N>
+ struct __aligned(8) QuantizedBaseNodeMB_t
+ {
+ QuantizedBaseNode_t<N> node0;
+ QuantizedBaseNode_t<N> node1;
+
+ /*! Clears the node. */
+ __forceinline void clear() {
+ node0.clear();
+ node1.clear();
+ }
+
+ /*! Returns bounds of specified child. */
+ __forceinline BBox3fa bounds(size_t i) const
+ {
+ assert(i < N);
+ BBox3fa bounds0 = node0.bounds(i);
+ BBox3fa bounds1 = node1.bounds(i);
+ bounds0.extend(bounds1);
+ return bounds0;
+ }
+
+ /*! Returns extent of bounds of specified child. */
+ __forceinline Vec3fa extent(size_t i) const {
+ return bounds(i).size();
+ }
+
+ __forceinline vbool<N> validMask() const { return node0.validMask(); }
+
+ template<typename T>
+ __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); }
+ template<typename T>
+ __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); }
+ template<typename T>
+ __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); }
+ template<typename T>
+ __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); }
+ template<typename T>
+ __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); }
+ template<typename T>
+ __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); }
+
+
+ template<int M>
+ __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); }
+ template<int M>
+ __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); }
+ template<int M>
+ __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); }
+ template<int M>
+ __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); }
+ template<int M>
+ __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); }
+ template<int M>
+ __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); }
+
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h
new file mode 100644
index 0000000000..0f6d4dac7e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/alloc.h"
+#include "../common/accel.h"
+#include "../common/device.h"
+#include "../common/scene.h"
+#include "../geometry/primitive.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+ /* BVH node reference with bounds */
+ template<typename NodeRef>
+ struct BVHNodeRecord
+ {
+ __forceinline BVHNodeRecord() {}
+ __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {}
+ __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {}
+
+ NodeRef ref;
+ BBox3fx bounds;
+ };
+
+ template<typename NodeRef>
+ struct BVHNodeRecordMB
+ {
+ __forceinline BVHNodeRecordMB() {}
+ __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {}
+
+ NodeRef ref;
+ LBBox3fa lbounds;
+ };
+
+ template<typename NodeRef>
+ struct BVHNodeRecordMB4D
+ {
+ __forceinline BVHNodeRecordMB4D() {}
+ __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {}
+
+ NodeRef ref;
+ LBBox3fa lbounds;
+ BBox1f dt;
+ };
+
+ template<typename NodeRef, int N> struct BaseNode_t;
+ template<typename NodeRef, int N> struct AABBNode_t;
+ template<typename NodeRef, int N> struct AABBNodeMB_t;
+ template<typename NodeRef, int N> struct AABBNodeMB4D_t;
+ template<typename NodeRef, int N> struct OBBNode_t;
+ template<typename NodeRef, int N> struct OBBNodeMB_t;
+ template<typename NodeRef, int N> struct QuantizedNode_t;
+ template<typename NodeRef, int N> struct QuantizedNodeMB_t;
+
+ /*! Pointer that points to a node or a list of primitives */
+ template<int N>
+ struct NodeRefPtr
+ {
+ //template<int NN> friend class BVHN;
+
+ /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+ static const size_t byteAlignment = 16;
+ static const size_t byteNodeAlignment = 4*N;
+
+      /*! the highest address bit is used as a barrier marker by some algorithms */
+ static const size_t barrier_mask = (1LL << (8*sizeof(size_t)-1));
+
+ /*! Masks the bits that store the number of items per leaf. */
+ static const size_t align_mask = byteAlignment-1;
+ static const size_t items_mask = byteAlignment-1;
+
+ /*! different supported node types */
+ static const size_t tyAABBNode = 0;
+ static const size_t tyAABBNodeMB = 1;
+ static const size_t tyAABBNodeMB4D = 6;
+ static const size_t tyOBBNode = 2;
+ static const size_t tyOBBNodeMB = 3;
+ static const size_t tyQuantizedNode = 5;
+ static const size_t tyLeaf = 8;
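+
+      /* Nodes are allocated with at least 16-byte alignment, so the low 4
+       * bits of a node pointer are free to encode the node type. For leaves,
+       * bit 3 (tyLeaf) is set and the remaining low bits store the number of
+       * primitive blocks (see encodeLeaf/leaf below). */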
+
+ /*! Empty node */
+ static const size_t emptyNode = tyLeaf;
+
+ /*! Invalid node, used as marker in traversal */
+ static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0);
+ static const size_t popRay = (((size_t)-1) & (~items_mask)) | (tyLeaf+1);
+
+ /*! Maximum number of primitive blocks in a leaf. */
+ static const size_t maxLeafBlocks = items_mask-tyLeaf;
+
+ /*! Default constructor */
+ __forceinline NodeRefPtr () {}
+
+ /*! Construction from integer */
+ __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {}
+
+ /*! Cast to size_t */
+ __forceinline operator size_t() const { return ptr; }
+
+ /*! Sets the barrier bit. */
+ __forceinline void setBarrier() {
+#if defined(__X86_64__) || defined(__aarch64__)
+ assert(!isBarrier());
+ ptr |= barrier_mask;
+#else
+ assert(false);
+#endif
+ }
+
+ /*! Clears the barrier bit. */
+ __forceinline void clearBarrier() {
+#if defined(__X86_64__) || defined(__aarch64__)
+ ptr &= ~barrier_mask;
+#else
+ assert(false);
+#endif
+ }
+
+      /*! Checks if this is a barrier. A barrier tells the top-level tree rotations how deep to enter the tree. */
+ __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; }
+
+ /*! checks if this is a leaf */
+ __forceinline size_t isLeaf() const { return ptr & tyLeaf; }
+
+ /*! returns node type */
+ __forceinline int type() const { return ptr & (size_t)align_mask; }
+
+ /*! checks if this is a node */
+ __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; }
+
+ /*! checks if this is a motion blur node */
+ __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; }
+
+ /*! checks if this is a 4D motion blur node */
+ __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; }
+
+ /*! checks if this is a node with unaligned bounding boxes */
+ __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; }
+
+ /*! checks if this is a motion blur node with unaligned bounding boxes */
+ __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; }
+
+ /*! checks if this is a quantized node */
+ __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; }
+
+ /*! Encodes a node */
+ static __forceinline NodeRefPtr encodeNode(AABBNode_t<NodeRefPtr,N>* node) {
+ assert(!((size_t)node & align_mask));
+ return NodeRefPtr((size_t) node);
+ }
+
+ static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t<NodeRefPtr,N>* node) {
+ assert(!((size_t)node & align_mask));
+ return NodeRefPtr((size_t) node | tyAABBNodeMB);
+ }
+
+ static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t<NodeRefPtr,N>* node) {
+ assert(!((size_t)node & align_mask));
+ return NodeRefPtr((size_t) node | tyAABBNodeMB4D);
+ }
+
+ /*! Encodes an unaligned node */
+ static __forceinline NodeRefPtr encodeNode(OBBNode_t<NodeRefPtr,N>* node) {
+ return NodeRefPtr((size_t) node | tyOBBNode);
+ }
+
+ /*! Encodes an unaligned motion blur node */
+ static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t<NodeRefPtr,N>* node) {
+ return NodeRefPtr((size_t) node | tyOBBNodeMB);
+ }
+
+ /*! Encodes a leaf */
+ static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) {
+ assert(!((size_t)tri & align_mask));
+ assert(num <= maxLeafBlocks);
+ return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks)));
+ }
+
+      /*! Encodes a typed leaf */
+ static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) {
+ assert(!((size_t)ptr & align_mask));
+ return NodeRefPtr((size_t)ptr | (tyLeaf+ty));
+ }
+
+ /*! returns base node pointer */
+ __forceinline BaseNode_t<NodeRefPtr,N>* baseNode()
+ {
+ assert(!isLeaf());
+ return (BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+ }
+ __forceinline const BaseNode_t<NodeRefPtr,N>* baseNode() const
+ {
+ assert(!isLeaf());
+ return (const BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+ }
+
+ /*! returns node pointer */
+ __forceinline AABBNode_t<NodeRefPtr,N>* getAABBNode() { assert(isAABBNode()); return ( AABBNode_t<NodeRefPtr,N>*)ptr; }
+ __forceinline const AABBNode_t<NodeRefPtr,N>* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t<NodeRefPtr,N>*)ptr; }
+
+ /*! returns motion blur node pointer */
+ __forceinline AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() { assert(isAABBNodeMB() || isAABBNodeMB4D()); return ( AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+ __forceinline const AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+
+ /*! returns 4D motion blur node pointer */
+ __forceinline AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() { assert(isAABBNodeMB4D()); return ( AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+ __forceinline const AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+
+ /*! returns unaligned node pointer */
+ __forceinline OBBNode_t<NodeRefPtr,N>* ungetAABBNode() { assert(isOBBNode()); return ( OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+ __forceinline const OBBNode_t<NodeRefPtr,N>* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+
+ /*! returns unaligned motion blur node pointer */
+ __forceinline OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() { assert(isOBBNodeMB()); return ( OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+ __forceinline const OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+
+ /*! returns quantized node pointer */
+ __forceinline QuantizedNode_t<NodeRefPtr,N>* quantizedNode() { assert(isQuantizedNode()); return ( QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); }
+ __forceinline const QuantizedNode_t<NodeRefPtr,N>* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask ); }
+
+ /*! returns leaf pointer */
+ __forceinline char* leaf(size_t& num) const {
+ assert(isLeaf());
+ num = (ptr & (size_t)items_mask)-tyLeaf;
+ return (char*)(ptr & ~(size_t)align_mask);
+ }
+
+ /*! clear all bit flags */
+ __forceinline void clearFlags() {
+ ptr &= ~(size_t)align_mask;
+ }
+
+ /*! returns the wideness */
+ __forceinline size_t getN() const { return N; }
+
+ public:
+ size_t ptr;
+ };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp
new file mode 100644
index 0000000000..a273c21e8b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_refit.h"
+#include "bvh_statistics.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ static const size_t SINGLE_THREAD_THRESHOLD = 4*1024;
+
+ template<int N>
+ __forceinline bool compare(const typename BVHN<N>::NodeRef* a, const typename BVHN<N>::NodeRef* b)
+ {
+ size_t sa = *(size_t*)&a->node()->lower_x;
+ size_t sb = *(size_t*)&b->node()->lower_x;
+ return sa < sb;
+ }
+
+ template<int N>
+ BVHNRefitter<N>::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds)
+ : bvh(bvh), leafBounds(leafBounds), numSubTrees(0)
+ {
+ }
+
+ template<int N>
+ void BVHNRefitter<N>::refit()
+ {
+ if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) {
+ bvh->bounds = LBBox3fa(recurse_bottom(bvh->root));
+ }
+ else
+ {
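+        /* for larger BVHs: extract subtree roots at a fixed depth, refit
+         * those subtrees in parallel, then refit the top levels
+         * single-threaded using the cached subtree bounds */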
+ BBox3fa subTreeBounds[MAX_NUM_SUB_TREES];
+ numSubTrees = 0;
+ gather_subtree_refs(bvh->root,numSubTrees,0);
+ if (numSubTrees)
+ parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range<size_t>& r) {
+ for (size_t i=r.begin(); i<r.end(); i++) {
+ NodeRef& ref = subTrees[i];
+ subTreeBounds[i] = recurse_bottom(ref);
+ }
+ });
+
+ numSubTrees = 0;
+ bvh->bounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0));
+ }
+ }
+
+ template<int N>
+ void BVHNRefitter<N>::gather_subtree_refs(NodeRef& ref,
+ size_t &subtrees,
+ const size_t depth)
+ {
+ if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH)
+ {
+ assert(subtrees < MAX_NUM_SUB_TREES);
+ subTrees[subtrees++] = ref;
+ return;
+ }
+
+ if (ref.isAABBNode())
+ {
+ AABBNode* node = ref.getAABBNode();
+ for (size_t i=0; i<N; i++) {
+ NodeRef& child = node->child(i);
+ if (unlikely(child == BVH::emptyNode)) continue;
+ gather_subtree_refs(child,subtrees,depth+1);
+ }
+ }
+ }
+
+ template<int N>
+ BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref,
+ size_t &subtrees,
+ const BBox3fa *const subTreeBounds,
+ const size_t depth)
+ {
+ if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH)
+ {
+ assert(subtrees < MAX_NUM_SUB_TREES);
+ assert(subTrees[subtrees] == ref);
+ return subTreeBounds[subtrees++];
+ }
+
+ if (ref.isAABBNode())
+ {
+ AABBNode* node = ref.getAABBNode();
+ BBox3fa bounds[N];
+
+ for (size_t i=0; i<N; i++)
+ {
+ NodeRef& child = node->child(i);
+
+ if (unlikely(child == BVH::emptyNode))
+ bounds[i] = BBox3fa(empty);
+ else
+ bounds[i] = refit_toplevel(child,subtrees,subTreeBounds,depth+1);
+ }
+
+ BBox3vf<N> boundsT = transpose<N>(bounds);
+
+ /* set new bounds */
+ node->lower_x = boundsT.lower.x;
+ node->lower_y = boundsT.lower.y;
+ node->lower_z = boundsT.lower.z;
+ node->upper_x = boundsT.upper.x;
+ node->upper_y = boundsT.upper.y;
+ node->upper_z = boundsT.upper.z;
+
+ return merge<N>(bounds);
+ }
+ else
+ return leafBounds.leafBounds(ref);
+ }
+
+ // =========================================================
+ // =========================================================
+ // =========================================================
+
+
+ template<int N>
+ BBox3fa BVHNRefitter<N>::recurse_bottom(NodeRef& ref)
+ {
+ /* this is a leaf node */
+ if (unlikely(ref.isLeaf()))
+ return leafBounds.leafBounds(ref);
+
+ /* recurse if this is an internal node */
+ AABBNode* node = ref.getAABBNode();
+
+ /* enable exclusive prefetch for >= AVX platforms */
+#if defined(__AVX__)
+ BVH::prefetchW(ref);
+#endif
+ BBox3fa bounds[N];
+
+ for (size_t i=0; i<N; i++)
+ if (unlikely(node->child(i) == BVH::emptyNode))
+ {
+ bounds[i] = BBox3fa(empty);
+ }
+ else
+ bounds[i] = recurse_bottom(node->child(i));
+
+ /* AOS to SOA transform */
+ BBox3vf<N> boundsT = transpose<N>(bounds);
+
+ /* set new bounds */
+ node->lower_x = boundsT.lower.x;
+ node->lower_y = boundsT.lower.y;
+ node->lower_z = boundsT.lower.z;
+ node->upper_x = boundsT.upper.x;
+ node->upper_y = boundsT.upper.y;
+ node->upper_z = boundsT.upper.z;
+
+ return merge<N>(bounds);
+ }
+
+ template<int N, typename Mesh, typename Primitive>
+ BVHNRefitT<N,Mesh,Primitive>::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode)
+ : bvh(bvh), builder(builder), refitter(new BVHNRefitter<N>(bvh,*(typename BVHNRefitter<N>::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {}
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNRefitT<N,Mesh,Primitive>::clear()
+ {
+ if (builder)
+ builder->clear();
+ }
+
+ template<int N, typename Mesh, typename Primitive>
+ void BVHNRefitT<N,Mesh,Primitive>::build()
+ {
+ if (mesh->topologyChanged(topologyVersion)) {
+ topologyVersion = mesh->getTopologyVersion();
+ builder->build();
+ }
+ else
+ refitter->refit();
+ }
+
+ template class BVHNRefitter<4>;
+#if defined(__AVX__)
+ template class BVHNRefitter<8>;
+#endif
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+ Builder* BVH4Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+
+ Builder* BVH4Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); }
+ Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+ Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#if defined(__AVX__)
+ Builder* BVH8Triangle4MeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+
+ Builder* BVH8Triangle4MeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); }
+ Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+ Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+ Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+
+#if defined(__AVX__)
+ Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+ Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+
+#if defined(__AVX__)
+ Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+ Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+ Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+ Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+
+#if defined(__AVX__)
+ Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+ Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h
new file mode 100644
index 0000000000..4aa9bdd7cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h
@@ -0,0 +1,95 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N>
+ class BVHNRefitter
+ {
+ public:
+
+ /*! Type shortcuts */
+ typedef BVHN<N> BVH;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::NodeRef NodeRef;
+
+ struct LeafBoundsInterface {
+ virtual const BBox3fa leafBounds(NodeRef& ref) const = 0;
+ };
+
+ public:
+
+ /*! Constructor. */
+ BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds);
+
+ /*! refits the BVH */
+ void refit();
+
+ private:
+ /* single-threaded subtree extraction based on BVH depth */
+ void gather_subtree_refs(NodeRef& ref,
+ size_t &subtrees,
+ const size_t depth = 0);
+
+ /* single-threaded top-level refit */
+ BBox3fa refit_toplevel(NodeRef& ref,
+ size_t &subtrees,
+ const BBox3fa *const subTreeBounds,
+ const size_t depth = 0);
+
+ /* single-threaded subtree refit */
+ BBox3fa recurse_bottom(NodeRef& ref);
+
+ public:
+ BVH* bvh; //!< BVH to refit
+ const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves
+
+ static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4 : (N==8) ? 3 : 3;
+ static const size_t MAX_NUM_SUB_TREES = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH
+ size_t numSubTrees;
+ NodeRef subTrees[MAX_NUM_SUB_TREES];
+ };
+
+ template<int N, typename Mesh, typename Primitive>
+ class BVHNRefitT : public Builder, public BVHNRefitter<N>::LeafBoundsInterface
+ {
+ public:
+
+ /*! Type shortcuts */
+ typedef BVHN<N> BVH;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::NodeRef NodeRef;
+
+ public:
+ BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode);
+
+ virtual void build();
+
+ virtual void clear();
+
+ virtual const BBox3fa leafBounds (NodeRef& ref) const
+ {
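+        /* Primitive::update() refits each primitive from the current mesh
+         * vertices and returns its new bounds */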
+ size_t num; char* prim = ref.leaf(num);
+ if (unlikely(ref == BVH::emptyNode)) return empty;
+
+ BBox3fa bounds = empty;
+ for (size_t i=0; i<num; i++)
+ bounds.extend(((Primitive*)prim)[i].update(mesh));
+ return bounds;
+ }
+
+ private:
+ BVH* bvh;
+ std::unique_ptr<Builder> builder;
+ std::unique_ptr<BVHNRefitter<N>> refitter;
+ Mesh* mesh;
+ unsigned int topologyVersion;
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp
new file mode 100644
index 0000000000..2bb431bf0e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_rotate.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! Computes half surface area of box. */
+ __forceinline float halfArea3f(const BBox<vfloat4>& box) {
+ const vfloat4 d = box.size();
+ const vfloat4 a = d*shuffle<1,2,0,3>(d);
+ return a[0]+a[1]+a[2];
+ }
+
+ size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth)
+ {
+      /*! nothing to rotate if we reached a barrier or a leaf node. */
+ if (parentRef.isBarrier()) return 0;
+ if (parentRef.isLeaf()) return 0;
+ AABBNode* parent = parentRef.getAABBNode();
+
+ /*! rotate all children first */
+ vint4 cdepth;
+ for (size_t c=0; c<4; c++)
+ cdepth[c] = (int)rotate(parent->child(c),depth+1);
+
+ /* compute current areas of all children */
+ vfloat4 sizeX = parent->upper_x-parent->lower_x;
+ vfloat4 sizeY = parent->upper_y-parent->lower_y;
+ vfloat4 sizeZ = parent->upper_z-parent->lower_z;
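+      /* childArea[i] = dx*(dy+dz) + dy*dz, the half surface area of child i */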
+ vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ);
+
+ /*! get node bounds */
+ BBox<vfloat4> child1_0,child1_1,child1_2,child1_3;
+ parent->bounds(child1_0,child1_1,child1_2,child1_3);
+
+ /*! Find best rotation. We pick a first child (child1) and a sub-child
+ (child2child) of a different second child (child2), and swap child1
+ and child2child. We perform the best such swap. */
+ float bestArea = 0;
+ size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1;
+ for (size_t c2=0; c2<4; c2++)
+ {
+        /*! ignore leaf nodes as we cannot descend into them */
+ if (parent->child(c2).isBarrier()) continue;
+ if (parent->child(c2).isLeaf()) continue;
+ AABBNode* child2 = parent->child(c2).getAABBNode();
+
+ /*! transpose child bounds */
+ BBox<vfloat4> child2c0,child2c1,child2c2,child2c3;
+ child2->bounds(child2c0,child2c1,child2c2,child2c3);
+
+ /*! put child1_0 at each child2 position */
+ float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3));
+ float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3));
+ float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3));
+ float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0));
+ vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03);
+ vfloat4 min0 = vreduce_min(cost0);
+ int pos0 = (int)bsf(movemask(min0 == cost0));
+
+ /*! put child1_1 at each child2 position */
+ float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3));
+ float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3));
+ float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3));
+ float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1));
+ vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13);
+ vfloat4 min1 = vreduce_min(cost1);
+ int pos1 = (int)bsf(movemask(min1 == cost1));
+
+ /*! put child1_2 at each child2 position */
+ float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3));
+ float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3));
+ float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3));
+ float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2));
+ vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23);
+ vfloat4 min2 = vreduce_min(cost2);
+ int pos2 = (int)bsf(movemask(min2 == cost2));
+
+ /*! put child1_3 at each child2 position */
+ float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3));
+ float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3));
+ float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3));
+ float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3));
+ vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33);
+ vfloat4 min3 = vreduce_min(cost3);
+ int pos3 = (int)bsf(movemask(min3 == cost3));
+
+ /*! find best other child */
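+        /* subtracting childArea[c2] turns the merged areas into the change in
+         * child2's surface area caused by the swap; negative values reduce
+         * SAH cost */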
+ vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]);
+ int pos[4] = { pos0,pos1,pos2,pos3 };
+ const size_t mbd = BVH4::maxBuildDepth;
+ vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints
+ valid &= vint4(int(c2)) != vint4(step);
+ if (none(valid)) continue;
+ size_t c1 = select_min(valid,area0123);
+ float area = area0123[c1];
+        if (c1 == c2) continue; // can happen if bounds are NaNs
+
+ /*! accept a swap when it reduces cost and is not swapping a node with itself */
+ if (area < bestArea) {
+ bestArea = area;
+ bestChild1 = c1;
+ bestChild2 = c2;
+ bestChild2Child = pos[c1];
+ }
+ }
+
+ /*! if we did not find a swap that improves the SAH then do nothing */
+ if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth);
+
+ /*! perform the best found tree rotation */
+ AABBNode* child2 = parent->child(bestChild2).getAABBNode();
+ AABBNode::swap(parent,bestChild1,child2,bestChild2Child);
+ parent->setBounds(bestChild2,child2->bounds());
+ AABBNode::compact(parent);
+ AABBNode::compact(child2);
+
+ /*! This returned depth is conservative as the child that was
+ * pulled up in the tree could have been on the critical path. */
+ cdepth[bestChild1]++; // bestChild1 was pushed down one level
+ return 1+reduce_max(cdepth);
+ }
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h
new file mode 100644
index 0000000000..009bef339e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N>
+ class BVHNRotate
+ {
+ typedef typename BVHN<N>::NodeRef NodeRef;
+
+ public:
+ static const bool enabled = false;
+
+ static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; }
+ static __forceinline void restructure(NodeRef ref, size_t depth = 1) {}
+ };
+
+ /* BVH4 tree rotations */
+ template<>
+ class BVHNRotate<4>
+ {
+ typedef BVH4::AABBNode AABBNode;
+ typedef BVH4::NodeRef NodeRef;
+
+ public:
+ static const bool enabled = true;
+
+ static size_t rotate(NodeRef parentRef, size_t depth = 1);
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
new file mode 100644
index 0000000000..aa56035026
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
@@ -0,0 +1,168 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_statistics.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+ template<int N>
+ BVHNStatistics<N>::BVHNStatistics (BVH* bvh) : bvh(bvh)
+ {
+ double A = max(0.0f,bvh->getLinearBounds().expectedHalfArea());
+ stat = statistics(bvh->root,A,BBox1f(0.0f,1.0f));
+ }
+
+ template<int N>
+ std::string BVHNStatistics<N>::str()
+ {
+ std::ostringstream stream;
+ stream.setf(std::ios::fixed, std::ios::floatfield);
+ stream << " primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl;
+ size_t totalBytes = stat.bytes(bvh);
+ double totalSAH = stat.sah(bvh);
+ stream << " total : sah = " << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), ";
+ stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), ";
+ stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), ";
+ stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl;
+ if (stat.statAABBNodes.numNodes ) stream << " getAABBNodes : " << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+ if (stat.statOBBNodes.numNodes ) stream << " ungetAABBNodes : " << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+ if (stat.statAABBNodesMB.numNodes ) stream << " getAABBNodesMB : " << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl;
+ if (stat.statAABBNodesMB4D.numNodes) stream << " getAABBNodesMB4D : " << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl;
+ if (stat.statOBBNodesMB.numNodes) stream << " ungetAABBNodesMB : " << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl;
+ if (stat.statQuantizedNodes.numNodes ) stream << " quantizedNodes : " << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+ if (true) stream << " leaves : " << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl;
+ if (true) stream << " histogram : " << stat.statLeaf.histToString() << std::endl;
+ return stream.str();
+ }
+
+ template<int N>
+ typename BVHNStatistics<N>::Statistics BVHNStatistics<N>::statistics(NodeRef node, const double A, const BBox1f t0t1)
+ {
+ Statistics s;
+ assert(t0t1.size() > 0.0f);
+ double dt = max(0.0f,t0t1.size());
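+    /* recursively accumulate per-node-type statistics; each node's SAH
+     * contribution is its expected half area weighted by the length of the
+     * time range it is valid for */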
+ if (node.isAABBNode())
+ {
+ AABBNode* n = node.getAABBNode();
+ s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+ if (n->child(i) == BVH::emptyNode) return Statistics();
+ const double Ai = max(0.0f,halfArea(n->extend(i)));
+ Statistics s = statistics(n->child(i),Ai,t0t1);
+ s.statAABBNodes.numChildren++;
+ return s;
+ }, Statistics::add);
+ s.statAABBNodes.numNodes++;
+ s.statAABBNodes.nodeSAH += dt*A;
+ s.depth++;
+ }
+ else if (node.isOBBNode())
+ {
+ OBBNode* n = node.ungetAABBNode();
+ s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+ if (n->child(i) == BVH::emptyNode) return Statistics();
+ const double Ai = max(0.0f,halfArea(n->extent(i)));
+ Statistics s = statistics(n->child(i),Ai,t0t1);
+ s.statOBBNodes.numChildren++;
+ return s;
+ }, Statistics::add);
+ s.statOBBNodes.numNodes++;
+ s.statOBBNodes.nodeSAH += dt*A;
+ s.depth++;
+ }
+ else if (node.isAABBNodeMB())
+ {
+ AABBNodeMB* n = node.getAABBNodeMB();
+ s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+ if (n->child(i) == BVH::emptyNode) return Statistics();
+ const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1));
+ Statistics s = statistics(n->child(i),Ai,t0t1);
+ s.statAABBNodesMB.numChildren++;
+ return s;
+ }, Statistics::add);
+ s.statAABBNodesMB.numNodes++;
+ s.statAABBNodesMB.nodeSAH += dt*A;
+ s.depth++;
+ }
+ else if (node.isAABBNodeMB4D())
+ {
+ AABBNodeMB4D* n = node.getAABBNodeMB4D();
+ s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+ if (n->child(i) == BVH::emptyNode) return Statistics();
+ const BBox1f t0t1i = intersect(t0t1,n->timeRange(i));
+ assert(!t0t1i.empty());
+ const double Ai = n->AABBNodeMB::expectedHalfArea(i,t0t1i);
+ Statistics s = statistics(n->child(i),Ai,t0t1i);
+ s.statAABBNodesMB4D.numChildren++;
+ return s;
+ }, Statistics::add);
+ s.statAABBNodesMB4D.numNodes++;
+ s.statAABBNodesMB4D.nodeSAH += dt*A;
+ s.depth++;
+ }
+ else if (node.isOBBNodeMB())
+ {
+ OBBNodeMB* n = node.ungetAABBNodeMB();
+ s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+ if (n->child(i) == BVH::emptyNode) return Statistics();
+ const double Ai = max(0.0f,halfArea(n->extent0(i)));
+ Statistics s = statistics(n->child(i),Ai,t0t1);
+ s.statOBBNodesMB.numChildren++;
+ return s;
+ }, Statistics::add);
+ s.statOBBNodesMB.numNodes++;
+ s.statOBBNodesMB.nodeSAH += dt*A;
+ s.depth++;
+ }
+ else if (node.isQuantizedNode())
+ {
+ QuantizedNode* n = node.quantizedNode();
+ s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+ if (n->child(i) == BVH::emptyNode) return Statistics();
+ const double Ai = max(0.0f,halfArea(n->extent(i)));
+ Statistics s = statistics(n->child(i),Ai,t0t1);
+ s.statQuantizedNodes.numChildren++;
+ return s;
+ }, Statistics::add);
+ s.statQuantizedNodes.numNodes++;
+ s.statQuantizedNodes.nodeSAH += dt*A;
+ s.depth++;
+ }
+ else if (node.isLeaf())
+ {
+ size_t num; const char* tri = node.leaf(num);
+ if (num)
+ {
+ for (size_t i=0; i<num; i++)
+ {
+ const size_t bytes = bvh->primTy->getBytes(tri);
+ s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri);
+ s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri);
+ s.statLeaf.numBytes += bytes;
+ tri+=bytes;
+ }
+ s.statLeaf.numLeaves++;
+ s.statLeaf.numPrimBlocks += num;
+ s.statLeaf.leafSAH += dt*A*num;
+ if (num-1 < Statistics::LeafStat::NHIST) {
+ s.statLeaf.numPrimBlocksHistogram[num-1]++;
+ }
+ }
+ }
+ else {
+ // -- GODOT start --
+ // throw std::runtime_error("not supported node type in bvh_statistics");
+ abort();
+ // -- GODOT end --
+ }
+ return s;
+ }
+
+#if defined(__AVX__)
+ template class BVHNStatistics<8>;
+#endif
+
+#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
+ template class BVHNStatistics<4>;
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h
new file mode 100644
index 0000000000..73dfc6fbcc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h
@@ -0,0 +1,285 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include <sstream>
+
+namespace embree
+{
+ template<int N>
+ class BVHNStatistics
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::AABBNode AABBNode;
+ typedef typename BVH::OBBNode OBBNode;
+ typedef typename BVH::AABBNodeMB AABBNodeMB;
+ typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+ typedef typename BVH::OBBNodeMB OBBNodeMB;
+ typedef typename BVH::QuantizedNode QuantizedNode;
+
+ typedef typename BVH::NodeRef NodeRef;
+
+ struct Statistics
+ {
+ template<typename Node>
+ struct NodeStat
+ {
+ NodeStat ( double nodeSAH = 0,
+ size_t numNodes = 0,
+ size_t numChildren = 0)
+ : nodeSAH(nodeSAH),
+ numNodes(numNodes),
+ numChildren(numChildren) {}
+
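+        /* SAH contributions are normalized by the expected half area of the
+         * root bounds */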
+ double sah(BVH* bvh) const {
+ return nodeSAH/bvh->getLinearBounds().expectedHalfArea();
+ }
+
+ size_t bytes() const {
+ return numNodes*sizeof(Node);
+ }
+
+ size_t size() const {
+ return numNodes;
+ }
+
+ double fillRateNom () const { return double(numChildren); }
+ double fillRateDen () const { return double(numNodes*N); }
+ double fillRate () const { return fillRateNom()/fillRateDen(); }
+
+ __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b)
+ {
+ return NodeStat(a.nodeSAH + b.nodeSAH,
+ a.numNodes+b.numNodes,
+ a.numChildren+b.numChildren);
+ }
+
+ std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+ {
+ std::ostringstream stream;
+ stream.setf(std::ios::fixed, std::ios::floatfield);
+ stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh);
+ stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), ";
+ stream << "#bytes = " << std::setw(7) << std::setprecision(2) << bytes()/1E6 << " MB ";
+ stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes())/double(bytesTotal) << "%), ";
+ stream << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), ";
+ stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes())/double(bvh->numPrimitives);
+ return stream.str();
+ }
+
+ public:
+ double nodeSAH;
+ size_t numNodes;
+ size_t numChildren;
+ };
+
+ struct LeafStat
+ {
+ static const int NHIST = 8;
+
+ LeafStat ( double leafSAH = 0.0f,
+ size_t numLeaves = 0,
+ size_t numPrimsActive = 0,
+ size_t numPrimsTotal = 0,
+ size_t numPrimBlocks = 0,
+ size_t numBytes = 0)
+ : leafSAH(leafSAH),
+ numLeaves(numLeaves),
+ numPrimsActive(numPrimsActive),
+ numPrimsTotal(numPrimsTotal),
+ numPrimBlocks(numPrimBlocks),
+ numBytes(numBytes)
+ {
+ for (size_t i=0; i<NHIST; i++)
+ numPrimBlocksHistogram[i] = 0;
+ }
+
+ double sah(BVH* bvh) const {
+ return leafSAH/bvh->getLinearBounds().expectedHalfArea();
+ }
+
+ size_t bytes(BVH* bvh) const {
+ return numBytes;
+ }
+
+ size_t size() const {
+ return numLeaves;
+ }
+
+ double fillRateNom (BVH* bvh) const { return double(numPrimsActive); }
+ double fillRateDen (BVH* bvh) const { return double(numPrimsTotal); }
+ double fillRate (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); }
+
+ __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b)
+ {
+ LeafStat stat(a.leafSAH + b.leafSAH,
+ a.numLeaves+b.numLeaves,
+ a.numPrimsActive+b.numPrimsActive,
+ a.numPrimsTotal+b.numPrimsTotal,
+ a.numPrimBlocks+b.numPrimBlocks,
+ a.numBytes+b.numBytes);
+ for (size_t i=0; i<NHIST; i++) {
+ stat.numPrimBlocksHistogram[i] += a.numPrimBlocksHistogram[i];
+ stat.numPrimBlocksHistogram[i] += b.numPrimBlocksHistogram[i];
+ }
+ return stat;
+ }
+
+ std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+ {
+ std::ostringstream stream;
+ stream.setf(std::ios::fixed, std::ios::floatfield);
+ stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh);
+ stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), ";
+ stream << "#bytes = " << std::setw(7) << std::setprecision(2) << double(bytes(bvh))/1E6 << " MB ";
+ stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes(bvh))/double(bytesTotal) << "%), ";
+ stream << "#nodes = " << std::setw(7) << numLeaves << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate(bvh) << "% filled), ";
+ stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes(bvh))/double(bvh->numPrimitives);
+ return stream.str();
+ }
+
+ std::string histToString() const
+ {
+ std::ostringstream stream;
+ stream.setf(std::ios::fixed, std::ios::floatfield);
+ for (size_t i=0; i<NHIST; i++)
+ stream << std::setw(6) << std::setprecision(2) << 100.0f*float(numPrimBlocksHistogram[i])/float(numLeaves) << "% ";
+ return stream.str();
+ }
+
+ public:
+ double leafSAH; //!< SAH of the leaves only
+ size_t numLeaves; //!< Number of leaf nodes.
+        size_t numPrimsActive;           //!< Number of active primitives.
+ size_t numPrimsTotal; //!< Number of active and inactive primitives
+ size_t numPrimBlocks; //!< Number of primitive blocks.
+ size_t numBytes; //!< Number of bytes of leaves.
+ size_t numPrimBlocksHistogram[8];
+ };
+
+ public:
+ Statistics (size_t depth = 0,
+ LeafStat statLeaf = LeafStat(),
+ NodeStat<AABBNode> statAABBNodes = NodeStat<AABBNode>(),
+ NodeStat<OBBNode> statOBBNodes = NodeStat<OBBNode>(),
+ NodeStat<AABBNodeMB> statAABBNodesMB = NodeStat<AABBNodeMB>(),
+ NodeStat<AABBNodeMB4D> statAABBNodesMB4D = NodeStat<AABBNodeMB4D>(),
+ NodeStat<OBBNodeMB> statOBBNodesMB = NodeStat<OBBNodeMB>(),
+ NodeStat<QuantizedNode> statQuantizedNodes = NodeStat<QuantizedNode>())
+
+ : depth(depth),
+ statLeaf(statLeaf),
+ statAABBNodes(statAABBNodes),
+ statOBBNodes(statOBBNodes),
+ statAABBNodesMB(statAABBNodesMB),
+ statAABBNodesMB4D(statAABBNodesMB4D),
+ statOBBNodesMB(statOBBNodesMB),
+ statQuantizedNodes(statQuantizedNodes) {}
+
+ double sah(BVH* bvh) const
+ {
+ return statLeaf.sah(bvh) +
+ statAABBNodes.sah(bvh) +
+ statOBBNodes.sah(bvh) +
+ statAABBNodesMB.sah(bvh) +
+ statAABBNodesMB4D.sah(bvh) +
+ statOBBNodesMB.sah(bvh) +
+ statQuantizedNodes.sah(bvh);
+ }
+
+ size_t bytes(BVH* bvh) const {
+ return statLeaf.bytes(bvh) +
+ statAABBNodes.bytes() +
+ statOBBNodes.bytes() +
+ statAABBNodesMB.bytes() +
+ statAABBNodesMB4D.bytes() +
+ statOBBNodesMB.bytes() +
+ statQuantizedNodes.bytes();
+ }
+
+ size_t size() const
+ {
+ return statLeaf.size() +
+ statAABBNodes.size() +
+ statOBBNodes.size() +
+ statAABBNodesMB.size() +
+ statAABBNodesMB4D.size() +
+ statOBBNodesMB.size() +
+ statQuantizedNodes.size();
+ }
+
+ double fillRate (BVH* bvh) const
+ {
+ double nom = statLeaf.fillRateNom(bvh) +
+ statAABBNodes.fillRateNom() +
+ statOBBNodes.fillRateNom() +
+ statAABBNodesMB.fillRateNom() +
+ statAABBNodesMB4D.fillRateNom() +
+ statOBBNodesMB.fillRateNom() +
+ statQuantizedNodes.fillRateNom();
+ double den = statLeaf.fillRateDen(bvh) +
+ statAABBNodes.fillRateDen() +
+ statOBBNodes.fillRateDen() +
+ statAABBNodesMB.fillRateDen() +
+ statAABBNodesMB4D.fillRateDen() +
+ statOBBNodesMB.fillRateDen() +
+ statQuantizedNodes.fillRateDen();
+ return nom/den;
+ }
+
+ friend Statistics operator+ ( const Statistics& a, const Statistics& b )
+ {
+ return Statistics(max(a.depth,b.depth),
+ a.statLeaf + b.statLeaf,
+ a.statAABBNodes + b.statAABBNodes,
+ a.statOBBNodes + b.statOBBNodes,
+ a.statAABBNodesMB + b.statAABBNodesMB,
+ a.statAABBNodesMB4D + b.statAABBNodesMB4D,
+ a.statOBBNodesMB + b.statOBBNodesMB,
+ a.statQuantizedNodes + b.statQuantizedNodes);
+ }
+
+ static Statistics add ( const Statistics& a, const Statistics& b ) {
+ return a+b;
+ }
+
+ public:
+ size_t depth;
+ LeafStat statLeaf;
+ NodeStat<AABBNode> statAABBNodes;
+ NodeStat<OBBNode> statOBBNodes;
+ NodeStat<AABBNodeMB> statAABBNodesMB;
+ NodeStat<AABBNodeMB4D> statAABBNodesMB4D;
+ NodeStat<OBBNodeMB> statOBBNodesMB;
+ NodeStat<QuantizedNode> statQuantizedNodes;
+ };
+
+ public:
+
+ /* Constructor gathers statistics. */
+ BVHNStatistics (BVH* bvh);
+
+ /*! Convert statistics into a string */
+ std::string str();
+
+ double sah() const {
+ return stat.sah(bvh);
+ }
+
+ size_t bytesUsed() const {
+ return stat.bytes(bvh);
+ }
+
+ private:
+ Statistics statistics(NodeRef node, const double A, const BBox1f dt);
+
+ private:
+ BVH* bvh;
+ Statistics stat;
+ };
+
+ typedef BVHNStatistics<4> BVH4Statistics;
+ typedef BVHNStatistics<8> BVH8Statistics;
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h
new file mode 100644
index 0000000000..7f17084b81
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h
@@ -0,0 +1,676 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "node_intersector1.h"
+#include "../common/stack_item.h"
+
+#define NEW_SORTING_CODE 1
+
+namespace embree
+{
+ namespace isa
+ {
+ /*! BVH regular node traversal for single rays. */
+ template<int N, int Nx, int types>
+ class BVHNNodeTraverser1Hit;
+
+ /*! Helper functions for fast sorting using AVX512 instructions. */
+#if defined(__AVX512ER__)
+
+ /* KNL code path */
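+    /* one step of a SIMD insertion sort: dist/ptr are kept sorted in
+     * descending order; lanes holding smaller distances than the new pair
+     * are shifted up one lane and the pair is inserted at the freed slot */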
+ __forceinline void isort_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p)
+ {
+ const vfloat16 dist_shift = align_shift_right<15>(dist,dist);
+ const vllong8 ptr_shift = align_shift_right<7>(ptr,ptr);
+ const vbool16 m_geq = d >= dist;
+ const vbool16 m_geq_shift = m_geq << 1;
+ dist = select(m_geq,d,dist);
+ ptr = select(vboold8(m_geq),p,ptr);
+ dist = select(m_geq_shift,dist_shift,dist);
+ ptr = select(vboold8(m_geq_shift),ptr_shift,ptr);
+ }
+
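+    /* unconditional front insert: valid because pairs are inserted in
+     * ascending distance order, so the new pair is the farthest so far and
+     * belongs at lane 0 */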
+ __forceinline void isort_quick_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p)
+ {
+ //dist = align_shift_right<15>(dist,d);
+ //ptr = align_shift_right<7>(ptr,p);
+ dist = align_shift_right<15>(dist,permute(d,vint16(zero)));
+ ptr = align_shift_right<7>(ptr,permute(p,vllong8(zero)));
+ }
+
+ template<int N, int Nx, int types, class NodeRef, class BaseNode>
+ __forceinline void traverseClosestHitAVX512(NodeRef& cur,
+ size_t mask,
+ const vfloat<Nx>& tNear,
+ StackItemT<NodeRef>*& stackPtr,
+ StackItemT<NodeRef>* stackEnd)
+ {
+ assert(mask != 0);
+ const BaseNode* node = cur.baseNode();
+
+ vllong8 children( vllong<N>::loadu((void*)node->children) );
+ children = vllong8::compact((int)mask,children);
+ vfloat16 distance = tNear;
+ distance = vfloat16::compact((int)mask,distance,tNear);
+
+ cur = toScalar(children);
+ BVHN<N>::prefetch(cur,types);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) return;
+
+ /* 2 hits: order A0 B0 */
+ const vllong8 c0(children);
+ const vfloat16 d0(distance);
+ children = align_shift_right<1>(children,children);
+ distance = align_shift_right<1>(distance,distance);
+ const vllong8 c1(children);
+ const vfloat16 d1(distance);
+
+ cur = toScalar(children);
+ BVHN<N>::prefetch(cur,types);
+
+      /* using '<' preserves the order of equal distances; scenes like powerplant benefit significantly from this */
+ const vboolf16 m_dist = d0 < d1;
+ const vfloat16 dist_A0 = select(m_dist, d0, d1);
+ const vfloat16 dist_B0 = select(m_dist, d1, d0);
+ const vllong8 ptr_A0 = select(vboold8(m_dist), c0, c1);
+ const vllong8 ptr_B0 = select(vboold8(m_dist), c1, c0);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) {
+ cur = toScalar(ptr_A0);
+ stackPtr[0].ptr = toScalar(ptr_B0);
+ *(float*)&stackPtr[0].dist = toScalar(dist_B0);
+ stackPtr++;
+ return;
+ }
+
+ /* 3 hits: order A1 B1 C1 */
+
+ children = align_shift_right<1>(children,children);
+ distance = align_shift_right<1>(distance,distance);
+
+ const vllong8 c2(children);
+ const vfloat16 d2(distance);
+
+ cur = toScalar(children);
+ BVHN<N>::prefetch(cur,types);
+
+ const vboolf16 m_dist1 = dist_A0 <= d2;
+ const vfloat16 dist_tmp_B1 = select(m_dist1, d2, dist_A0);
+ const vllong8 ptr_A1 = select(vboold8(m_dist1), ptr_A0, c2);
+ const vllong8 ptr_tmp_B1 = select(vboold8(m_dist1), c2, ptr_A0);
+
+ const vboolf16 m_dist2 = dist_B0 <= dist_tmp_B1;
+ const vfloat16 dist_B1 = select(m_dist2, dist_B0 , dist_tmp_B1);
+ const vfloat16 dist_C1 = select(m_dist2, dist_tmp_B1, dist_B0);
+ const vllong8 ptr_B1 = select(vboold8(m_dist2), ptr_B0, ptr_tmp_B1);
+ const vllong8 ptr_C1 = select(vboold8(m_dist2), ptr_tmp_B1, ptr_B0);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) {
+ cur = toScalar(ptr_A1);
+ stackPtr[0].ptr = toScalar(ptr_C1);
+ *(float*)&stackPtr[0].dist = toScalar(dist_C1);
+ stackPtr[1].ptr = toScalar(ptr_B1);
+ *(float*)&stackPtr[1].dist = toScalar(dist_B1);
+ stackPtr+=2;
+ return;
+ }
+
+ /* 4 hits: order A2 B2 C2 D2 */
+
+ const vfloat16 dist_A1 = select(m_dist1, dist_A0, d2);
+
+ children = align_shift_right<1>(children,children);
+ distance = align_shift_right<1>(distance,distance);
+
+ const vllong8 c3(children);
+ const vfloat16 d3(distance);
+
+ cur = toScalar(children);
+ BVHN<N>::prefetch(cur,types);
+
+ const vboolf16 m_dist3 = dist_A1 <= d3;
+ const vfloat16 dist_tmp_B2 = select(m_dist3, d3, dist_A1);
+ const vllong8 ptr_A2 = select(vboold8(m_dist3), ptr_A1, c3);
+ const vllong8 ptr_tmp_B2 = select(vboold8(m_dist3), c3, ptr_A1);
+
+ const vboolf16 m_dist4 = dist_B1 <= dist_tmp_B2;
+ const vfloat16 dist_B2 = select(m_dist4, dist_B1 , dist_tmp_B2);
+ const vfloat16 dist_tmp_C2 = select(m_dist4, dist_tmp_B2, dist_B1);
+ const vllong8 ptr_B2 = select(vboold8(m_dist4), ptr_B1, ptr_tmp_B2);
+ const vllong8 ptr_tmp_C2 = select(vboold8(m_dist4), ptr_tmp_B2, ptr_B1);
+
+ const vboolf16 m_dist5 = dist_C1 <= dist_tmp_C2;
+ const vfloat16 dist_C2 = select(m_dist5, dist_C1 , dist_tmp_C2);
+ const vfloat16 dist_D2 = select(m_dist5, dist_tmp_C2, dist_C1);
+ const vllong8 ptr_C2 = select(vboold8(m_dist5), ptr_C1, ptr_tmp_C2);
+ const vllong8 ptr_D2 = select(vboold8(m_dist5), ptr_tmp_C2, ptr_C1);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) {
+ cur = toScalar(ptr_A2);
+ stackPtr[0].ptr = toScalar(ptr_D2);
+ *(float*)&stackPtr[0].dist = toScalar(dist_D2);
+ stackPtr[1].ptr = toScalar(ptr_C2);
+ *(float*)&stackPtr[1].dist = toScalar(dist_C2);
+ stackPtr[2].ptr = toScalar(ptr_B2);
+ *(float*)&stackPtr[2].dist = toScalar(dist_B2);
+ stackPtr+=3;
+ return;
+ }
+
+ /* >=5 hits: reverse to descending order for writing to stack */
+
+ const size_t hits = 4 + popcnt(mask);
+ const vfloat16 dist_A2 = select(m_dist3, dist_A1, d3);
+ vfloat16 dist(neg_inf);
+ vllong8 ptr(zero);
+
+
+ isort_quick_update(dist,ptr,dist_A2,ptr_A2);
+ isort_quick_update(dist,ptr,dist_B2,ptr_B2);
+ isort_quick_update(dist,ptr,dist_C2,ptr_C2);
+ isort_quick_update(dist,ptr,dist_D2,ptr_D2);
+
+ do {
+
+ children = align_shift_right<1>(children,children);
+ distance = align_shift_right<1>(distance,distance);
+
+ cur = toScalar(children);
+ BVHN<N>::prefetch(cur,types);
+
+ const vfloat16 new_dist(permute(distance,vint16(zero)));
+ const vllong8 new_ptr(permute(children,vllong8(zero)));
+
+ mask &= mask-1;
+ isort_update(dist,ptr,new_dist,new_ptr);
+
+ } while(mask);
+
+ const vboold8 m_stack_ptr(0x55); // 10101010 (lsb -> msb)
+ const vboolf16 m_stack_dist(0x4444); // 0010001000100010 (lsb -> msb)
+
+ /* extract current noderef */
+ cur = toScalar(permute(ptr,vllong8(hits-1)));
+      /* rearrange the pointers to the beginning of each 16-byte stack entry */
+ vllong8 stackElementA0;
+ stackElementA0 = vllong8::expand(m_stack_ptr,ptr,stackElementA0);
+ /* put distances in between */
+ vuint16 stackElementA1((__m512i)stackElementA0);
+ stackElementA1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementA1);
+ /* write out first 4 x 16 bytes block to stack */
+ vuint16::storeu(stackPtr,stackElementA1);
+ /* get upper half of dist and ptr */
+ dist = align_shift_right<4>(dist,dist);
+ ptr = align_shift_right<4>(ptr,ptr);
+ /* assemble and write out second block */
+ vllong8 stackElementB0;
+ stackElementB0 = vllong8::expand(m_stack_ptr,ptr,stackElementB0);
+ vuint16 stackElementB1((__m512i)stackElementB0);
+ stackElementB1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementB1);
+ vuint16::storeu(stackPtr + 4,stackElementB1);
+ /* increase stack pointer */
+ stackPtr += hits-1;
+ }
+#endif
+
+#if defined(__AVX512VL__) // SKX
+
+ template<int N>
+ __forceinline void isort_update(vint<N> &dist, const vint<N> &d)
+ {
+ const vint<N> dist_shift = align_shift_right<N-1>(dist,dist);
+ const vboolf<N> m_geq = d >= dist;
+ const vboolf<N> m_geq_shift = m_geq << 1;
+ dist = select(m_geq,d,dist);
+ dist = select(m_geq_shift,dist_shift,dist);
+ }
+
+ template<int N>
+ __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) {
+ dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero)));
+ }
+
+ __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) {
+ return toScalar(permutex2var((__m256i)index,n0,n1));
+ }
+
+ __forceinline float permuteExtract(const vint8& index, const vfloat8& n) {
+ return toScalar(permute(n,index));
+ }
+
+#endif
+
+ /* Specialization for BVH4. */
+ template<int Nx, int types>
+ class BVHNNodeTraverser1Hit<4, Nx, types>
+ {
+ typedef BVH4 BVH;
+ typedef BVH4::NodeRef NodeRef;
+ typedef BVH4::BaseNode BaseNode;
+
+
+ public:
+ /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */
+ static __forceinline void traverseClosestHit(NodeRef& cur,
+ size_t mask,
+ const vfloat<Nx>& tNear,
+ StackItemT<NodeRef>*& stackPtr,
+ StackItemT<NodeRef>* stackEnd)
+ {
+ assert(mask != 0);
+#if defined(__AVX512ER__)
+ traverseClosestHitAVX512<4,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#else
+ const BaseNode* node = cur.baseNode();
+
+ /*! one child is hit, continue with that child */
+ size_t r = bscf(mask);
+ cur = node->child(r);
+ BVH::prefetch(cur,types);
+ if (likely(mask == 0)) {
+ assert(cur != BVH::emptyNode);
+ return;
+ }
+
+ /*! two children are hit, push far child, and continue with closer child */
+ NodeRef c0 = cur;
+ const unsigned int d0 = ((unsigned int*)&tNear)[r];
+ r = bscf(mask);
+ NodeRef c1 = node->child(r);
+ BVH::prefetch(c1,types);
+ const unsigned int d1 = ((unsigned int*)&tNear)[r];
+ assert(c0 != BVH::emptyNode);
+ assert(c1 != BVH::emptyNode);
+ if (likely(mask == 0)) {
+ assert(stackPtr < stackEnd);
+ if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+ else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+ }
+
+#if NEW_SORTING_CODE == 1
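+      /* pack each (pointer,distance) pair into a vint4 stack item so that
+       * sort3/sort4 can order whole items with SIMD comparisons */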
+ vint4 s0((size_t)c0,(size_t)d0);
+ vint4 s1((size_t)c1,(size_t)d1);
+ r = bscf(mask);
+ NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r];
+ vint4 s2((size_t)c2,(size_t)d2);
+ /* 3 hits */
+ if (likely(mask == 0)) {
+ StackItemT<NodeRef>::sort3(s0,s1,s2);
+ *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
+ cur = toSizeT(s2);
+ stackPtr+=2;
+ return;
+ }
+ r = bscf(mask);
+ NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r];
+ vint4 s3((size_t)c3,(size_t)d3);
+ /* 4 hits */
+ StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
+ *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+ cur = toSizeT(s3);
+ stackPtr+=3;
+#else
+ /*! Here starts the slow path for 3 or 4 hit children. We push
+ * all nodes onto the stack to sort them there. */
+ assert(stackPtr < stackEnd);
+ stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+ assert(stackPtr < stackEnd);
+ stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+ /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+ assert(stackPtr < stackEnd);
+ r = bscf(mask);
+ NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+ assert(c != BVH::emptyNode);
+ if (likely(mask == 0)) {
+ sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+ cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+ return;
+ }
+
+ /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+ assert(stackPtr < stackEnd);
+ r = bscf(mask);
+ c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+ assert(c != BVH::emptyNode);
+ sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+ cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif
+#endif
+ }
+
+ /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */
+ static __forceinline void traverseAnyHit(NodeRef& cur,
+ size_t mask,
+ const vfloat<Nx>& tNear,
+ NodeRef*& stackPtr,
+ NodeRef* stackEnd)
+ {
+ const BaseNode* node = cur.baseNode();
+
+ /*! one child is hit, continue with that child */
+ size_t r = bscf(mask);
+ cur = node->child(r);
+ BVH::prefetch(cur,types);
+
+      /* occlusion rays need no sorting, so traverse the remaining children in simple sequence order */
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) return;
+ assert(stackPtr < stackEnd);
+ *stackPtr = cur; stackPtr++;
+
+ for (; ;)
+ {
+ r = bscf(mask);
+ cur = node->child(r); BVH::prefetch(cur,types);
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) return;
+ assert(stackPtr < stackEnd);
+ *stackPtr = cur; stackPtr++;
+ }
+ }
+ };
+
+ /* Specialization for BVH8. */
+ template<int Nx, int types>
+ class BVHNNodeTraverser1Hit<8, Nx, types>
+ {
+ typedef BVH8 BVH;
+ typedef BVH8::NodeRef NodeRef;
+ typedef BVH8::BaseNode BaseNode;
+
+#if defined(__AVX512VL__)
+ template<class NodeRef, class BaseNode>
+ static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur,
+ size_t mask,
+ const vfloat8& tNear,
+ StackItemT<NodeRef>*& stackPtr,
+ StackItemT<NodeRef>* stackEnd)
+ {
+ assert(mask != 0);
+ const BaseNode* node = cur.baseNode();
+ const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]);
+ const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]);
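+        /* pack the child slot (vint8(step)) into the low 3 bits of each distance,
+           so sorting these keys also carries the child index along */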
+ vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step);
+ distance_i = vint8::compact((int)mask,distance_i,distance_i);
+ cur = permuteExtract(distance_i,n0,n1);
+ BVH::prefetch(cur,types);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) return;
+
+ /* 2 hits: order A0 B0 */
+ const vint8 d0(distance_i);
+ const vint8 d1(shuffle<1>(distance_i));
+ cur = permuteExtract(d1,n0,n1);
+ BVH::prefetch(cur,types);
+
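+        /* each min/max pair below acts as a vectorized compare-exchange, merging
+           the new distance into the already-ordered set */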
+ const vint8 dist_A0 = min(d0, d1);
+ const vint8 dist_B0 = max(d0, d1);
+ assert(dist_A0[0] < dist_B0[0]);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) {
+ cur = permuteExtract(dist_A0,n0,n1);
+ stackPtr[0].ptr = permuteExtract(dist_B0,n0,n1);
+ *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear);
+ stackPtr++;
+ return;
+ }
+
+ /* 3 hits: order A1 B1 C1 */
+
+ const vint8 d2(shuffle<2>(distance_i));
+ cur = permuteExtract(d2,n0,n1);
+ BVH::prefetch(cur,types);
+
+ const vint8 dist_A1 = min(dist_A0,d2);
+ const vint8 dist_tmp_B1 = max(dist_A0,d2);
+ const vint8 dist_B1 = min(dist_B0,dist_tmp_B1);
+ const vint8 dist_C1 = max(dist_B0,dist_tmp_B1);
+ assert(dist_A1[0] < dist_B1[0]);
+ assert(dist_B1[0] < dist_C1[0]);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) {
+ cur = permuteExtract(dist_A1,n0,n1);
+ stackPtr[0].ptr = permuteExtract(dist_C1,n0,n1);
+ *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear);
+ stackPtr[1].ptr = permuteExtract(dist_B1,n0,n1);
+ *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear);
+ stackPtr+=2;
+ return;
+ }
+
+ /* 4 hits: order A2 B2 C2 D2 */
+
+ const vint8 d3(shuffle<3>(distance_i));
+ cur = permuteExtract(d3,n0,n1);
+ BVH::prefetch(cur,types);
+
+ const vint8 dist_A2 = min(dist_A1,d3);
+ const vint8 dist_tmp_B2 = max(dist_A1,d3);
+ const vint8 dist_B2 = min(dist_B1,dist_tmp_B2);
+ const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2);
+ const vint8 dist_C2 = min(dist_C1,dist_tmp_C2);
+ const vint8 dist_D2 = max(dist_C1,dist_tmp_C2);
+ assert(dist_A2[0] < dist_B2[0]);
+ assert(dist_B2[0] < dist_C2[0]);
+ assert(dist_C2[0] < dist_D2[0]);
+
+ mask &= mask-1;
+ if (likely(mask == 0)) {
+ cur = permuteExtract(dist_A2,n0,n1);
+ stackPtr[0].ptr = permuteExtract(dist_D2,n0,n1);
+ *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear);
+ stackPtr[1].ptr = permuteExtract(dist_C2,n0,n1);
+ *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear);
+ stackPtr[2].ptr = permuteExtract(dist_B2,n0,n1);
+ *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear);
+ stackPtr+=3;
+ return;
+ }
+
+ /* >=5 hits: reverse to descending order for writing to stack */
+
+ distance_i = align_shift_right<3>(distance_i,distance_i);
+ const size_t hits = 4 + popcnt(mask);
+      vint8 dist(INT_MIN); // the INT_MIN sentinel also works when a distance is -0.0f (0x80000000), since isort_update inserts with >=
+
+ isort_quick_update(dist,dist_A2);
+ isort_quick_update(dist,dist_B2);
+ isort_quick_update(dist,dist_C2);
+ isort_quick_update(dist,dist_D2);
+
+ do {
+
+ distance_i = align_shift_right<1>(distance_i,distance_i);
+ cur = permuteExtract(distance_i,n0,n1);
+ BVH::prefetch(cur,types);
+ const vint8 new_dist(permute(distance_i,vint8(zero)));
+ mask &= mask-1;
+ isort_update(dist,new_dist);
+
+ } while(mask);
+
+ for (size_t i=0; i<7; i++)
+ assert(dist[i+0]>=dist[i+1]);
+
+ for (size_t i=0;i<hits-1;i++)
+ {
+ stackPtr->ptr = permuteExtract(dist,n0,n1);
+ *(float*)&stackPtr->dist = permuteExtract(dist,tNear);
+ dist = align_shift_right<1>(dist,dist);
+ stackPtr++;
+ }
+ cur = permuteExtract(dist,n0,n1);
+ }
+#endif
+
+ public:
+ static __forceinline void traverseClosestHit(NodeRef& cur,
+ size_t mask,
+ const vfloat<Nx>& tNear,
+ StackItemT<NodeRef>*& stackPtr,
+ StackItemT<NodeRef>* stackEnd)
+ {
+ assert(mask != 0);
+#if defined(__AVX512ER__)
+ traverseClosestHitAVX512<8,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#elif defined(__AVX512VL__)
+ traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#else
+
+ const BaseNode* node = cur.baseNode();
+
+ /*! one child is hit, continue with that child */
+ size_t r = bscf(mask);
+ cur = node->child(r);
+ BVH::prefetch(cur,types);
+ if (likely(mask == 0)) {
+ assert(cur != BVH::emptyNode);
+ return;
+ }
+
+ /*! two children are hit, push far child, and continue with closer child */
+ NodeRef c0 = cur;
+ const unsigned int d0 = ((unsigned int*)&tNear)[r];
+ r = bscf(mask);
+ NodeRef c1 = node->child(r);
+ BVH::prefetch(c1,types);
+ const unsigned int d1 = ((unsigned int*)&tNear)[r];
+
+ assert(c0 != BVH::emptyNode);
+ assert(c1 != BVH::emptyNode);
+ if (likely(mask == 0)) {
+ assert(stackPtr < stackEnd);
+ if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+ else { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+ }
+#if NEW_SORTING_CODE == 1
+ vint4 s0((size_t)c0,(size_t)d0);
+ vint4 s1((size_t)c1,(size_t)d1);
+
+ r = bscf(mask);
+ NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r];
+ vint4 s2((size_t)c2,(size_t)d2);
+ /* 3 hits */
+ if (likely(mask == 0)) {
+ StackItemT<NodeRef>::sort3(s0,s1,s2);
+ *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
+ cur = toSizeT(s2);
+ stackPtr+=2;
+ return;
+ }
+ r = bscf(mask);
+ NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r];
+ vint4 s3((size_t)c3,(size_t)d3);
+ /* 4 hits */
+ if (likely(mask == 0)) {
+ StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
+ *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+ cur = toSizeT(s3);
+ stackPtr+=3;
+ return;
+ }
+ *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3;
+ /*! fallback case if more than 4 children are hit */
+ StackItemT<NodeRef>* stackFirst = stackPtr;
+ stackPtr+=4;
+ while (1)
+ {
+ assert(stackPtr < stackEnd);
+ r = bscf(mask);
+ NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r];
+ const vint4 s((size_t)c,(size_t)d);
+ *(vint4*)stackPtr++ = s;
+ assert(c != BVH::emptyNode);
+ if (unlikely(mask == 0)) break;
+ }
+ sort(stackFirst,stackPtr);
+ cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#else
+ /*! Here starts the slow path for 3 or 4 hit children. We push
+ * all nodes onto the stack to sort them there. */
+ assert(stackPtr < stackEnd);
+ stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+ assert(stackPtr < stackEnd);
+ stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+ /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+ assert(stackPtr < stackEnd);
+ r = bscf(mask);
+ NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+ assert(c != BVH::emptyNode);
+ if (likely(mask == 0)) {
+ sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+ cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+ return;
+ }
+
+ /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+ assert(stackPtr < stackEnd);
+ r = bscf(mask);
+ c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+ assert(c != BVH::emptyNode);
+ if (likely(mask == 0)) {
+ sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+ cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+ return;
+ }
+ /*! fallback case if more than 4 children are hit */
+ StackItemT<NodeRef>* stackFirst = stackPtr-4;
+ while (1)
+ {
+ assert(stackPtr < stackEnd);
+ r = bscf(mask);
+ c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+ assert(c != BVH::emptyNode);
+ if (unlikely(mask == 0)) break;
+ }
+ sort(stackFirst,stackPtr);
+ cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif
+#endif
+ }
+
+ static __forceinline void traverseAnyHit(NodeRef& cur,
+ size_t mask,
+ const vfloat<Nx>& tNear,
+ NodeRef*& stackPtr,
+ NodeRef* stackEnd)
+ {
+ const BaseNode* node = cur.baseNode();
+
+ /*! one child is hit, continue with that child */
+ size_t r = bscf(mask);
+ cur = node->child(r);
+ BVH::prefetch(cur,types);
+
+      /* occlusion rays need no sorting, so traverse the remaining children in simple sequence order */
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) return;
+ assert(stackPtr < stackEnd);
+ *stackPtr = cur; stackPtr++;
+
+ for (; ;)
+ {
+ r = bscf(mask);
+ cur = node->child(r); BVH::prefetch(cur,types);
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) return;
+ assert(stackPtr < stackEnd);
+ *stackPtr = cur; stackPtr++;
+ }
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h
new file mode 100644
index 0000000000..9c603babf0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h
@@ -0,0 +1,154 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N, int Nx, int types>
+ class BVHNNodeTraverserStreamHitCoherent
+ {
+ typedef BVHN<N> BVH;
+ typedef typename BVH::NodeRef NodeRef;
+ typedef typename BVH::BaseNode BaseNode;
+
+ public:
+ template<class T>
+ static __forceinline void traverseClosestHit(NodeRef& cur,
+ size_t& m_trav_active,
+ const vbool<Nx>& vmask,
+ const vfloat<Nx>& tNear,
+ const T* const tMask,
+ StackItemMaskCoherent*& stackPtr)
+ {
+ const NodeRef parent = cur;
+ size_t mask = movemask(vmask);
+ assert(mask != 0);
+ const BaseNode* node = cur.baseNode();
+
+ /*! one child is hit, continue with that child */
+ const size_t r0 = bscf(mask);
+ assert(r0 < 8);
+ cur = node->child(r0);
+ BVHN<N>::prefetch(cur,types);
+ m_trav_active = tMask[r0];
+ assert(cur != BVH::emptyNode);
+ if (unlikely(mask == 0)) return;
+
+ const unsigned int* const tNear_i = (unsigned int*)&tNear;
+
+ /*! two children are hit, push far child, and continue with closer child */
+ NodeRef c0 = cur;
+ unsigned int d0 = tNear_i[r0];
+ const size_t r1 = bscf(mask);
+ assert(r1 < 8);
+ NodeRef c1 = node->child(r1);
+ BVHN<N>::prefetch(c1,types);
+ unsigned int d1 = tNear_i[r1];
+
+ assert(c0 != BVH::emptyNode);
+ assert(c1 != BVH::emptyNode);
+ if (likely(mask == 0)) {
+ if (d0 < d1) {
+ assert(tNear[r1] >= 0.0f);
+ stackPtr->mask = tMask[r1];
+ stackPtr->parent = parent;
+ stackPtr->child = c1;
+ stackPtr++;
+ cur = c0;
+ m_trav_active = tMask[r0];
+ return;
+ }
+ else {
+ assert(tNear[r0] >= 0.0f);
+ stackPtr->mask = tMask[r0];
+ stackPtr->parent = parent;
+ stackPtr->child = c0;
+ stackPtr++;
+ cur = c1;
+ m_trav_active = tMask[r1];
+ return;
+ }
+ }
+
+ /*! slow path for more than two hits */
+ size_t hits = movemask(vmask);
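+        /* as in the single-ray traverser, the child slot is packed into the low
+           3 bits of each distance so one sort orders both */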
+ const vint<Nx> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<Nx>(step), 0);
+ #if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+ const vint<N> tmp = extractN<N,0>(dist_i);
+ const vint<Nx> dist_i_sorted = usort_descending(tmp);
+ #else
+ const vint<Nx> dist_i_sorted = usort_descending(dist_i);
+ #endif
+ const vint<Nx> sorted_index = dist_i_sorted & 7;
+
+ size_t i = 0;
+ for (;;)
+ {
+ const unsigned int index = sorted_index[i];
+ assert(index < 8);
+ cur = node->child(index);
+ m_trav_active = tMask[index];
+ assert(m_trav_active);
+ BVHN<N>::prefetch(cur,types);
+ bscf(hits);
+ if (unlikely(hits==0)) break;
+ i++;
+ assert(cur != BVH::emptyNode);
+ assert(tNear[index] >= 0.0f);
+ stackPtr->mask = m_trav_active;
+ stackPtr->parent = parent;
+ stackPtr->child = cur;
+ stackPtr++;
+ }
+ }
+
+ template<class T>
+ static __forceinline void traverseAnyHit(NodeRef& cur,
+ size_t& m_trav_active,
+ const vbool<Nx>& vmask,
+ const T* const tMask,
+ StackItemMaskCoherent*& stackPtr)
+ {
+ const NodeRef parent = cur;
+ size_t mask = movemask(vmask);
+ assert(mask != 0);
+ const BaseNode* node = cur.baseNode();
+
+ /*! one child is hit, continue with that child */
+ size_t r = bscf(mask);
+ cur = node->child(r);
+ BVHN<N>::prefetch(cur,types);
+ m_trav_active = tMask[r];
+
+        /* occlusion rays need no sorting, so traverse the remaining children in simple sequence order */
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) return;
+ stackPtr->mask = m_trav_active;
+ stackPtr->parent = parent;
+ stackPtr->child = cur;
+ stackPtr++;
+
+ for (; ;)
+ {
+ r = bscf(mask);
+ cur = node->child(r);
+ BVHN<N>::prefetch(cur,types);
+ m_trav_active = tMask[r];
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) return;
+ stackPtr->mask = m_trav_active;
+ stackPtr->parent = parent;
+ stackPtr->child = cur;
+ stackPtr++;
+ }
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h
new file mode 100644
index 0000000000..a978c0c459
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+ namespace isa
+ {
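+    /* Byte offsets into a node's bounds arrays, chosen per axis from the ray
+       direction sign; each far offset is the near offset XOR-ed with the size
+       of one bounds vector. */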
+ struct NearFarPrecalculations
+ {
+ size_t nearX, nearY, nearZ;
+ size_t farX, farY, farZ;
+
+ __forceinline NearFarPrecalculations() {}
+
+ __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N)
+ {
+ const size_t size = sizeof(float)*N;
+ nearX = (dir.x < 0.0f) ? 1*size : 0*size;
+ nearY = (dir.y < 0.0f) ? 3*size : 2*size;
+ nearZ = (dir.z < 0.0f) ? 5*size : 4*size;
+ farX = nearX ^ size;
+ farY = nearY ^ size;
+ farZ = nearZ ^ size;
+ }
+ };
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h
new file mode 100644
index 0000000000..aa0d4ba4d7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h
@@ -0,0 +1,1788 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
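+/* fused multiply-add style traversal paths are enabled on AVX2 and on
+   aarch64 (NEON provides fused multiply-add) */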
+#if defined(__AVX2__)
+#define __FMA_X4__
+#endif
+
+#if defined(__aarch64__)
+#define __FMA_X4__
+#endif
+
+namespace embree
+{
+ namespace isa
+ {
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Ray structure used in single-ray traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx, bool robust>
+ struct TravRayBase;
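+
+    /* Per-ray data precomputed for single-ray traversal: SIMD-replicated origin,
+       direction and reciprocal direction, plus near/far plane byte offsets. The
+       robust variant replaces the fused org*rdir term with conservatively
+       rounded reciprocals. */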
+
+ /* Base (without tnear and tfar) */
+ template<int N, int Nx>
+ struct TravRayBase<N,Nx,false>
+ {
+ __forceinline TravRayBase() {}
+
+ __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir)
+ : org_xyz(ray_org), dir_xyz(ray_dir)
+ {
+ const Vec3fa ray_rdir = rcp_safe(ray_dir);
+ org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
+ dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+ rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
+#if defined(__FMA_X4__)
+ const Vec3fa ray_org_rdir = ray_org*ray_rdir;
+#if !defined(__aarch64__)
+ org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
+#else
+        // aarch64 has no msub equivalent, so we store the negated org*rdir and use madd;
+        // x86 uses msub directly
+ neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
+#endif
+#endif
+ nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
+ nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
+ nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+ farX = nearX ^ sizeof(vfloat<N>);
+ farY = nearY ^ sizeof(vfloat<N>);
+ farZ = nearZ ^ sizeof(vfloat<N>);
+
+#if defined(__AVX512ER__) // KNL+
+ /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+ const vint<16> id(step);
+ const vint<16> id2 = align_shift_right<16/2>(id, id);
+ permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+ permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+ permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+
+ }
+
+ template<int K>
+ __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+ const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+ size_t flip = sizeof(vfloat<N>))
+ {
+ org = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
+ dir = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+ rdir = Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+ org_rdir = org*rdir;
+#else
+ neg_org_rdir = -(org*rdir);
+#endif
+#endif
+ nearX = nearXYZ.x[k];
+ nearY = nearXYZ.y[k];
+ nearZ = nearXYZ.z[k];
+ farX = nearX ^ flip;
+ farY = nearY ^ flip;
+ farZ = nearZ ^ flip;
+
+#if defined(__AVX512ER__) // KNL+
+ /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+ const vint<16> id(step);
+ const vint<16> id2 = align_shift_right<16/2>(id, id);
+ permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+ permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+ permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+ }
+
+ Vec3fa org_xyz, dir_xyz;
+ Vec3vf<Nx> org, dir, rdir;
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+ Vec3vf<Nx> org_rdir;
+#else
+      // the aarch64 version keeps the negated org_rdir and uses madd;
+      // x86 uses msub
+ Vec3vf<Nx> neg_org_rdir;
+#endif
+#endif
+#if defined(__AVX512ER__) // KNL+
+ vint16 permX, permY, permZ;
+#endif
+
+ size_t nearX, nearY, nearZ;
+ size_t farX, farY, farZ;
+ };
+
+ /* Base (without tnear and tfar) */
+ template<int N, int Nx>
+ struct TravRayBase<N,Nx,true>
+ {
+ __forceinline TravRayBase() {}
+
+ __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir)
+ : org_xyz(ray_org), dir_xyz(ray_dir)
+ {
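+        /* widen the reciprocal direction by 3 ulp each way so the slab test
+           stays conservative under floating-point rounding */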
+ const float round_down = 1.0f-3.0f*float(ulp);
+ const float round_up = 1.0f+3.0f*float(ulp);
+ const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir);
+ const Vec3fa ray_rdir_near = round_down*ray_rdir;
+ const Vec3fa ray_rdir_far = round_up *ray_rdir;
+ org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
+ dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+ rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z);
+ rdir_far = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z);
+ nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>);
+ nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>);
+ nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+ farX = nearX ^ sizeof(vfloat<N>);
+ farY = nearY ^ sizeof(vfloat<N>);
+ farZ = nearZ ^ sizeof(vfloat<N>);
+
+#if defined(__AVX512ER__) // KNL+
+ /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+ const vint<16> id(step);
+ const vint<16> id2 = align_shift_right<16/2>(id, id);
+ permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+ permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+ permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+ }
+
+ template<int K>
+ __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+ const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+ size_t flip = sizeof(vfloat<N>))
+ {
+ const vfloat<Nx> round_down = 1.0f-3.0f*float(ulp);
+ const vfloat<Nx> round_up = 1.0f+3.0f*float(ulp);
+ org = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
+ dir = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+ rdir_near = round_down*Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+ rdir_far = round_up *Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+
+ nearX = nearXYZ.x[k];
+ nearY = nearXYZ.y[k];
+ nearZ = nearXYZ.z[k];
+ farX = nearX ^ flip;
+ farY = nearY ^ flip;
+ farZ = nearZ ^ flip;
+
+#if defined(__AVX512ER__) // KNL+
+ /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+ const vint<16> id(step);
+ const vint<16> id2 = align_shift_right<16/2>(id, id);
+ permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+ permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+ permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+ }
+
+ Vec3fa org_xyz, dir_xyz;
+ Vec3vf<Nx> org, dir, rdir_near, rdir_far;
+#if defined(__AVX512ER__) // KNL+
+ vint16 permX, permY, permZ;
+#endif
+
+ size_t nearX, nearY, nearZ;
+ size_t farX, farY, farZ;
+ };
+
+ /* Full (with tnear and tfar) */
+ template<int N, int Nx, bool robust>
+ struct TravRay : TravRayBase<N,Nx,robust>
+ {
+ __forceinline TravRay() {}
+
+ __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar)
+ : TravRayBase<N,Nx,robust>(ray_org, ray_dir),
+ tnear(ray_tnear), tfar(ray_tfar) {}
+
+ template<int K>
+ __forceinline TravRay(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+ const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+ float ray_tnear, float ray_tfar,
+ size_t flip = sizeof(vfloat<N>))
+ : TravRayBase<N,Nx,robust>(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip),
+ tnear(ray_tnear), tfar(ray_tfar) {}
+
+ vfloat<Nx> tnear;
+ vfloat<Nx> tfar;
+ };
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Point Query structure used in single-ray traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N>
+ struct TravPointQuery
+ {
+ __forceinline TravPointQuery() {}
+
+ __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad)
+ {
+ org = Vec3vf<N>(query_org.x, query_org.y, query_org.z);
+ rad = Vec3vf<N>(query_rad.x, query_rad.y, query_rad.z);
+ }
+
+ __forceinline vfloat<N> const& tfar() const {
+ return rad.x;
+ }
+
+ Vec3vf<N> org, rad;
+ };
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // point query
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N>
+ __forceinline size_t pointQuerySphereDistAndMask(
+ const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX,
+ vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+ {
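+      /* squared distance from the query origin to its closest point inside each
+         box; a child is reported when this lies within the query radius, and
+         minX <= maxX filters out invalid (empty) children */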
+ const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x;
+ const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y;
+ const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+ dist = vX * vX + vY * vY + vZ * vZ;
+ const vbool<N> vmask = dist <= query.tfar()*query.tfar();
+ const vbool<N> valid = minX <= maxX;
+ return movemask(vmask) & movemask(valid);
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x));
+ const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+ const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+ const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+ const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+ const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+ return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x);
+ const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+ const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+ const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+ const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+ const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
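+      /* the MB node stores each bound at time 0 followed, six vectors later, by
+         its linear delta, so madd(time,p[6],p[0]) interpolates the bound at 'time' */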
+ const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0]));
+ const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+ const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+ const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+ const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+ const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+ return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+ size_t mask = pointQueryNodeSphere(node, query, time, dist);
+
+ if (unlikely(ref.isAABBNodeMB4D())) {
+ const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+ const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t);
+ mask &= movemask(vmask);
+ }
+
+ return mask;
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ const vfloat<N> start_x(node->start.x);
+ const vfloat<N> scale_x(node->scale.x);
+ const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+ const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+ const vfloat<N> start_y(node->start.y);
+ const vfloat<N> scale_y(node->scale.y);
+ const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+ const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+ const vfloat<N> start_z(node->start.z);
+ const vfloat<N> scale_z(node->scale.z);
+ const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+ const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+ return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask());
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ const vfloat<N> minX = node->dequantizeLowerX(time);
+ const vfloat<N> maxX = node->dequantizeUpperX(time);
+ const vfloat<N> minY = node->dequantizeLowerY(time);
+ const vfloat<N> maxY = node->dequantizeUpperY(time);
+ const vfloat<N> minZ = node->dequantizeLowerZ(time);
+ const vfloat<N> maxZ = node->dequantizeUpperZ(time);
+ return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask());
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ // TODO: point query - implement
+ const vbool<N> vmask = vbool<N>(true);
+ const size_t mask = movemask(vmask) & ((1<<N)-1);
+ dist = vfloat<N>(0.0f);
+ return mask;
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ // TODO: point query - implement
+ const vbool<N> vmask = vbool<N>(true);
+ const size_t mask = movemask(vmask) & ((1<<N)-1);
+ dist = vfloat<N>(0.0f);
+ return mask;
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryAABBDistAndMask(
+ const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX,
+ vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+ {
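+      /* same closest-point distance as the sphere variant, but the mask comes
+         from a conservative box-vs-box overlap test against the query extent */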
+ const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x;
+ const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y;
+ const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+ dist = vX * vX + vY * vY + vZ * vZ;
+ const vbool<N> valid = minX <= maxX;
+ const vbool<N> vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) |
+ (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) |
+ (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z));
+ return movemask(vmask) & movemask(valid);
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x));
+ const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+ const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+ const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+ const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+ const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+ return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x);
+ const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+ const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+ const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+ const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+ const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
+ const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0]));
+ const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+ const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+ const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+ const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+ const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+ return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+ size_t mask = pointQueryNodeAABB(node, query, time, dist);
+
+ if (unlikely(ref.isAABBNodeMB4D())) {
+ const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+ const vbool<N> vmask = (node1->lower_t <= time) & (time < node1->upper_t);
+ mask &= movemask(vmask);
+ }
+
+ return mask;
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat<N> start_x(node->start.x);
+ const vfloat<N> scale_x(node->scale.x);
+ const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+ const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+ const vfloat<N> start_y(node->start.y);
+ const vfloat<N> scale_y(node->scale.y);
+ const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+ const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+ const vfloat<N> start_z(node->start.z);
+ const vfloat<N> scale_z(node->scale.z);
+ const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+ const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+ return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid;
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat<N> minX = node->dequantizeLowerX(time);
+ const vfloat<N> maxX = node->dequantizeUpperX(time);
+ const vfloat<N> minY = node->dequantizeLowerY(time);
+ const vfloat<N> maxY = node->dequantizeUpperY(time);
+ const vfloat<N> minZ = node->dequantizeLowerZ(time);
+ const vfloat<N> maxZ = node->dequantizeUpperZ(time);
+ return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid;
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ // TODO: point query - implement
+ const vbool<N> vmask = vbool<N>(true);
+ const size_t mask = movemask(vmask) & ((1<<N)-1);
+ dist = vfloat<N>(0.0f);
+ return mask;
+ }
+
+ template<int N>
+ __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ // TODO: point query - implement
+ const vbool<N> vmask = vbool<N>(true);
+ const size_t mask = movemask(vmask) & ((1<<N)-1);
+ dist = vfloat<N>(0.0f);
+ return mask;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx, bool robust>
+ __forceinline size_t intersectNode(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist);
+
+ template<>
+ __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)
+ {
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+ const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat4 tFarX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat4 tFarY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat4 tFarZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+ const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+ const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+ const vfloat4 tFarX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+ const vfloat4 tFarY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+ const vfloat4 tFarZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+ const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+ const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+ const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+ const vfloat4 tFarX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+ const vfloat4 tFarY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+ const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+
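+      /* tNear and tFar are expected to be non-negative here, so comparing their
+         bit patterns as integers orders the same as the float compare */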
+#if defined(__aarch64__)
+ const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear);
+ const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar);
+ const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+ const size_t mask = movemask(vmask);
+#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+ const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool4 vmask = asInt(tNear) > asInt(tFar);
+ const size_t mask = movemask(vmask) ^ ((1<<4)-1);
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+ const size_t mask = movemask(vmask);
+#else
+ const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool4 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+#endif
+ dist = tNear;
+ return mask;
+ }
+
+#if defined(__AVX__)
+
+ template<>
+ __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)
+ {
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+ const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat8 tFarX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat8 tFarY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat8 tFarZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+ const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+ const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+ const vfloat8 tFarX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+ const vfloat8 tFarY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+ const vfloat8 tFarZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+
+#else
+ const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+ const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+ const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+ const vfloat8 tFarX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+ const vfloat8 tFarY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+ const vfloat8 tFarZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+ const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool8 vmask = asInt(tNear) > asInt(tFar);
+ const size_t mask = movemask(vmask) ^ ((1<<8)-1);
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+ const size_t mask = movemask(vmask);
+#else
+ const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool8 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+#endif
+ dist = tNear;
+ return mask;
+ }
+
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+ template<>
+ __forceinline size_t intersectNode<4,16>(const typename BVH4::AABBNode* node, const TravRay<4,16,false>& ray, vfloat16& dist)
+ {
+ const vfloat16 tNearX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tNearY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tNearZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+ const vfloat16 tFarX = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tFarY = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tFarZ = msub(vfloat16(*(vfloat4*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+ const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
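+      /* vbool16(0xf) restricts the comparison to the 4 real child slots held in
+         the low lanes of the 16-wide registers */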
+ const vbool16 vmask = le(vbool16(0xf),tNear,tFar);
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+ template<>
+ __forceinline size_t intersectNode<8,16>(const typename BVH8::AABBNode* node, const TravRay<8,16,false>& ray, vfloat16& dist)
+ {
+ const vllong8 invalid((size_t)BVH8::emptyNode);
+ const vboold8 m_valid(invalid != vllong8::loadu(node->children));
+ const vfloat16 bminmaxX = permute(vfloat16::load((const float*)&node->lower_x), ray.permX);
+ const vfloat16 bminmaxY = permute(vfloat16::load((const float*)&node->lower_y), ray.permY);
+ const vfloat16 bminmaxZ = permute(vfloat16::load((const float*)&node->lower_z), ray.permZ);
+ const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z);
+ const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear);
+ const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+ const vbool16 vmask = le(vboolf16(m_valid),tNear,align_shift_right<8>(tFar, tFar));
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+#endif
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx>
+ __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist)
+ {
+ const vfloat<N> tNearX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x;
+ const vfloat<N> tNearY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+ const vfloat<N> tNearZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+ const vfloat<N> tFarX = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+ const vfloat<N> tFarY = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+ const vfloat<N> tFarZ = (vfloat<N>::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+ const vfloat<N> tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat<N> tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool<N> vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+ template<>
+ __forceinline size_t intersectNodeRobust<4,16>(const typename BVHN<4>::AABBNode* node, const TravRay<4,16,true>& ray, vfloat<16>& dist)
+ {
+ const vfloat16 tNearX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x;
+ const vfloat16 tNearY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+ const vfloat16 tNearZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+ const vfloat16 tFarX = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+ const vfloat16 tFarY = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+ const vfloat16 tFarZ = (vfloat16(*(vfloat<4>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+ const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool16 vmask = le((1 << 4)-1,tNear,tFar);
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+ template<>
+ __forceinline size_t intersectNodeRobust<8,16>(const typename BVHN<8>::AABBNode* node, const TravRay<8,16,true>& ray, vfloat<16>& dist)
+ {
+ const vfloat16 tNearX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir_near.x;
+ const vfloat16 tNearY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+ const vfloat16 tNearZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+ const vfloat16 tFarX = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+ const vfloat16 tFarY = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+ const vfloat16 tFarZ = (vfloat16(*(vfloat<8>*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+ const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool16 vmask = le((1 << 8)-1,tNear,tFar);
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+#endif
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNodeMB intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N>
+ __forceinline size_t intersectNode(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist)
+ {
+ const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+ const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+ const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+ const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+ const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+ const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+ const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+ const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+ const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+ const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+ const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+ const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+ const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+ const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+ const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+ const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+ const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+ const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
+ const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool<N> vmask = asInt(tNear) > asInt(tFar);
+ const size_t mask = movemask(vmask) ^ ((1<<N)-1);
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat<N> tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool<N> vmask = asInt(tNear) <= asInt(tFar);
+ const size_t mask = movemask(vmask);
+#else
+ const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+ const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+ const vbool<N> vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+#endif
+ dist = tNear;
+ return mask;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNodeMB intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N>
+ __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist)
+ {
+ const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+ const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+ const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+ const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x;
+ const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y;
+ const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z;
+ const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+ const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+ const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+ const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+ const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x;
+ const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y;
+ const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z;
+ const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ);
+ const size_t mask = movemask(tNear <= tFar);
+ dist = tNear;
+ return mask;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNodeMB4D intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N>
+ __forceinline size_t intersectNodeMB4D(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist)
+ {
+ const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+ const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+ const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+ const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+ const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+ const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+ const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined (__FMA_X4__)
+#if defined(__aarch64__)
+ const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<N> tFarX = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<N> tFarY = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<N> tFarZ = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+ const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+ const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+ const vfloat<N> tFarX = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+ const vfloat<N> tFarY = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+ const vfloat<N> tFarZ = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+ const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+ const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+ const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+ const vfloat<N> tFarX = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+ const vfloat<N> tFarY = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+ const vfloat<N> tFarZ = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__FMA_X4__) && !defined(__AVX512F__)
+ const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
+ const vfloat<N> tFar = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
+#else
+ const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+ const vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+#endif
+ vbool<N> vmask = tNear <= tFar;
+ if (unlikely(ref.isAABBNodeMB4D())) {
+ const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+ vmask &= (node1->lower_t <= time) & (time < node1->upper_t);
+ }
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
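+
+#if 0
+    // Documentation-only sketch (not compiled): a minimal scalar version of
+    // the MB4D test above, for a single ray and one child box. All names are
+    // illustrative. The vector kernel avoids the per-axis min/max by
+    // pre-selecting near/far planes through the ray.nearX/farX byte offsets.
+    inline bool intersectBoxMB4D_scalar(const float lower[3], const float d_lower[3],
+                                        const float upper[3], const float d_upper[3],
+                                        float lower_t, float upper_t, // temporal bounds (AABBNodeMB4D only)
+                                        const float org[3], const float rdir[3],
+                                        float tnear, float tfar, float time)
+    {
+      float t0 = tnear, t1 = tfar;
+      for (int i = 0; i < 3; i++) {
+        // bounds linearly interpolated to 'time', as in madd(time,pNear[6],pNear[0])
+        const float tLo = (lower[i] + time*d_lower[i] - org[i]) * rdir[i];
+        const float tHi = (upper[i] + time*d_upper[i] - org[i]) * rdir[i];
+        t0 = max(t0, min(tLo, tHi));
+        t1 = min(t1, max(tLo, tHi));
+      }
+      // the 4D extension additionally requires the time sample to lie inside
+      // the node's temporal bounds, exactly as in the vmask update above
+      return (t0 <= t1) && (lower_t <= time) && (time < upper_t);
+    }
+#endif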
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNodeMB4D intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N>
+ __forceinline size_t intersectNodeMB4DRobust(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist)
+ {
+ const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+ const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);
+ const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+ const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+ const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir_near.x;
+ const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir_near.y;
+ const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir_near.z;
+ const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+ const vfloat<N>* pFarX = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+ const vfloat<N>* pFarY = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+ const vfloat<N>* pFarZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+ const vfloat<N> tFarX = (madd(time,pFarX[6],vfloat<N>(pFarX[0])) - ray.org.x) * ray.rdir_far.x;
+ const vfloat<N> tFarY = (madd(time,pFarY[6],vfloat<N>(pFarY[0])) - ray.org.y) * ray.rdir_far.y;
+ const vfloat<N> tFarZ = (madd(time,pFarZ[6],vfloat<N>(pFarZ[0])) - ray.org.z) * ray.rdir_far.z;
+ const vfloat<N> tFar = min(ray.tfar,tFarX,tFarY,tFarZ);
+ vbool<N> vmask = tNear <= tFar;
+ if (unlikely(ref.isAABBNodeMB4D())) {
+ const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+ vmask &= (node1->lower_t <= time) & (time < node1->upper_t);
+ }
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast QuantizedBaseNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx, bool robust>
+ __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist);
+
+ template<>
+ __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat4 start_x(node->start.x);
+ const vfloat4 scale_x(node->scale.x);
+ const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x);
+ const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x);
+ const vfloat4 start_y(node->start.y);
+ const vfloat4 scale_y(node->scale.y);
+ const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y);
+ const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y);
+ const vfloat4 start_z(node->start.z);
+ const vfloat4 scale_z(node->scale.z);
+ const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
+ const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z);
+
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+ const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat4 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat4 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat4 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat4 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat4 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat4 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+ const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+ const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+ const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+ const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir.x;
+ const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir.y;
+ const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || (defined(__SSE4_1__) && !defined(__AVX512F__)) // up to HSW
+ const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool4 vmask = asInt(tNear) > asInt(tFar);
+ const size_t mask = movemask(vmask) ^ ((1<<4)-1);
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+ const size_t mask = movemask(vmask);
+#else
+ const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool4 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+#endif
+ dist = tNear;
+ return mask & mvalid;
+ }
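+
+#if 0
+    // Documentation-only sketch (not compiled): how one quantized plane is
+    // decoded before the slab test above. Each axis stores low-precision
+    // bounds that are mapped back to world space through a per-node affine
+    // transform, mirroring madd(node->dequantize(...), scale, start). Names
+    // are illustrative.
+    inline float dequantizePlane_scalar(unsigned char q, float start, float scale)
+    {
+      return start + scale * float(q); // world-space plane position
+    }
+#endif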
+
+ template<>
+ __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,true>& ray, vfloat4& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat4 start_x(node->start.x);
+ const vfloat4 scale_x(node->scale.x);
+ const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x);
+ const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX >> 2),scale_x,start_x);
+ const vfloat4 start_y(node->start.y);
+ const vfloat4 scale_y(node->scale.y);
+ const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y);
+ const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY >> 2),scale_y,start_y);
+ const vfloat4 start_z(node->start.z);
+ const vfloat4 scale_z(node->scale.z);
+ const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
+ const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z);
+
+ const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+ const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+ const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+ const vfloat4 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x;
+ const vfloat4 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y;
+ const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+ const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat4 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool4 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask & mvalid;
+ }
+
+
+#if defined(__AVX__)
+
+ template<>
+ __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat8 start_x(node->start.x);
+ const vfloat8 scale_x(node->scale.x);
+ const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x);
+ const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x);
+ const vfloat8 start_y(node->start.y);
+ const vfloat8 scale_y(node->scale.y);
+ const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+ const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y);
+ const vfloat8 start_z(node->start.z);
+ const vfloat8 scale_z(node->scale.z);
+ const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+ const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z);
+
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+ const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat8 tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat8 tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat8 tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat8 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat8 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat8 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+ const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+ const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+ const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+ const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir.x;
+ const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir.y;
+ const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+ const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool8 vmask = asInt(tNear) > asInt(tFar);
+ const size_t mask = movemask(vmask) ^ ((1<<8)-1);
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat8 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+ const size_t mask = movemask(vmask);
+#else
+ const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool8 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+#endif
+ dist = tNear;
+ return mask & mvalid;
+ }
+
+ template<>
+ __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,true>& ray, vfloat8& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat8 start_x(node->start.x);
+ const vfloat8 scale_x(node->scale.x);
+ const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x);
+ const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX >> 2),scale_x,start_x);
+ const vfloat8 start_y(node->start.y);
+ const vfloat8 scale_y(node->scale.y);
+ const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+ const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY >> 2),scale_y,start_y);
+ const vfloat8 start_z(node->start.z);
+ const vfloat8 scale_z(node->scale.z);
+ const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+ const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z);
+
+ const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+ const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+ const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+ const vfloat8 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x;
+ const vfloat8 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y;
+ const vfloat8 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+ const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat8 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool8 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask);
+
+ dist = tNear;
+ return mask & mvalid;
+ }
+
+
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+ template<>
+ __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,false>& ray, vfloat16& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat16 start_x(node->start.x);
+ const vfloat16 scale_x(node->scale.x);
+ const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x);
+ const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x);
+ const vfloat16 start_y(node->start.y);
+ const vfloat16 scale_y(node->scale.y);
+ const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y);
+ const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y);
+ const vfloat16 start_z(node->start.z);
+ const vfloat16 scale_z(node->scale.z);
+ const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z);
+ const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z);
+
+ const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool16 vmask = le(vbool16(0xf),tNear,tFar);
+ const size_t mask = movemask(vmask) & mvalid;
+ dist = tNear;
+ return mask;
+ }
+
+ template<>
+ __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,true>& ray, vfloat16& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat16 start_x(node->start.x);
+ const vfloat16 scale_x(node->scale.x);
+ const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x);
+ const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX >> 2)),scale_x,start_x);
+ const vfloat16 start_y(node->start.y);
+ const vfloat16 scale_y(node->scale.y);
+ const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y);
+ const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY >> 2)),scale_y,start_y);
+ const vfloat16 start_z(node->start.z);
+ const vfloat16 scale_z(node->scale.z);
+ const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z);
+ const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ >> 2)),scale_z,start_z);
+
+ const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+ const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+ const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+ const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x;
+ const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y;
+ const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+ const vfloat16 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+ const vfloat16 tFar = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+ const vbool16 vmask = le(vbool16(0xf),tNear,tFar);
+ const size_t mask = movemask(vmask) & mvalid;
+ dist = tNear;
+ return mask;
+ }
+
+ template<>
+ __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,false>& ray, vfloat16& dist)
+ {
+ const vbool16 m_valid(node->validMask16());
+ const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX);
+ const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY);
+ const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ);
+ const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z);
+ const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear);
+ const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+ const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar));
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+ template<>
+ __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,true>& ray, vfloat16& dist)
+ {
+ const vbool16 m_valid(node->validMask16());
+ const vfloat16 bminmaxX = node->dequantizeLowerUpperX(ray.permX);
+ const vfloat16 bminmaxY = node->dequantizeLowerUpperY(ray.permY);
+ const vfloat16 bminmaxZ = node->dequantizeLowerUpperZ(ray.permZ);
+      const vfloat16 tNearFarX = (bminmaxX - ray.org.x) * ray.rdir_far.x; // FIXME: not conservative; a robust test would compute the near half with rdir_near
+ const vfloat16 tNearFarY = (bminmaxY - ray.org.y) * ray.rdir_far.y;
+ const vfloat16 tNearFarZ = (bminmaxZ - ray.org.z) * ray.rdir_far.z;
+ const vfloat16 tNear = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear);
+ const vfloat16 tFar = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+ const vbool16 vmask = le(m_valid,tNear,align_shift_right<8>(tFar, tFar));
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+
+#endif
+
+
+ template<int N, int Nx>
+ __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist)
+ {
+ const vboolf<N> mvalid = node->validMask();
+ const vfloat<N> lower_x = node->dequantizeLowerX(time);
+ const vfloat<N> upper_x = node->dequantizeUpperX(time);
+ const vfloat<N> lower_y = node->dequantizeLowerY(time);
+ const vfloat<N> upper_y = node->dequantizeUpperY(time);
+ const vfloat<N> lower_z = node->dequantizeLowerZ(time);
+ const vfloat<N> upper_z = node->dequantizeUpperZ(time);
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+ const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<N> tFarX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<N> tFarY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<N> tFarZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat<N> tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<N> tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<N> tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+ const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+ const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+ const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+ const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir.x;
+ const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir.y;
+ const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+ const vfloat<N> tminX = mini(tNearX,tFarX);
+ const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+ const vfloat<N> tminY = mini(tNearY,tFarY);
+ const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+ const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+ const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+ const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear);
+ const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar));
+#else
+ const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid;
+#endif
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
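+
+#if 0
+    // Documentation-only note (not compiled): unlike the plain quantized node
+    // test, the time-interpolated bounds here are not pre-selected by ray
+    // direction sign, so the kernel above orders the two slab distances of
+    // each axis explicitly before the final max/min reduction. Scalar sketch:
+    inline void orderSlabs_scalar(float tLo, float tHi, float& tmin, float& tmax)
+    {
+      tmin = min(tLo, tHi); // entry distance for this axis
+      tmax = max(tLo, tHi); // exit distance for this axis
+    }
+#endif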
+
+ template<int N, int Nx>
+ __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist)
+ {
+ const vboolf<N> mvalid = node->validMask();
+ const vfloat<N> lower_x = node->dequantizeLowerX(time);
+ const vfloat<N> upper_x = node->dequantizeUpperX(time);
+ const vfloat<N> lower_y = node->dequantizeLowerY(time);
+ const vfloat<N> upper_y = node->dequantizeUpperY(time);
+ const vfloat<N> lower_z = node->dequantizeLowerZ(time);
+ const vfloat<N> upper_z = node->dequantizeUpperZ(time);
+ const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+ const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+ const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+ const vfloat<N> tFarX = (upper_x - ray.org.x) * ray.rdir_far.x;
+ const vfloat<N> tFarY = (upper_y - ray.org.y) * ray.rdir_far.y;
+ const vfloat<N> tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+ const vfloat<N> tminX = mini(tNearX,tFarX);
+ const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+ const vfloat<N> tminY = mini(tNearY,tFarY);
+ const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+ const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+ const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+ const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear);
+ const vfloat<N> tFar = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vbool<N> vmask = le(mvalid,asInt(tNear),asInt(tFar));
+#else
+ const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid;
+#endif
+ const size_t mask = movemask(vmask);
+ dist = tNear;
+ return mask;
+ }
+
+
+#if defined(__AVX512ER__)
+ // for KNL
+ template<>
+ __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,false>& ray, const float time, vfloat<4>& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat16 lower_x = node->dequantizeLowerX(time);
+ const vfloat16 upper_x = node->dequantizeUpperX(time);
+ const vfloat16 lower_y = node->dequantizeLowerY(time);
+ const vfloat16 upper_y = node->dequantizeUpperY(time);
+ const vfloat16 lower_z = node->dequantizeLowerZ(time);
+ const vfloat16 upper_z = node->dequantizeUpperZ(time);
+
+ const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat16 tFarX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat16 tFarY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat16 tFarZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+
+ const vfloat16 tminX = min(tNearX,tFarX);
+ const vfloat16 tmaxX = max(tNearX,tFarX);
+ const vfloat16 tminY = min(tNearY,tFarY);
+ const vfloat16 tmaxY = max(tNearY,tFarY);
+ const vfloat16 tminZ = min(tNearZ,tFarZ);
+ const vfloat16 tmaxZ = max(tNearZ,tFarZ);
+ const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear);
+ const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar );
+ const vbool16 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask) & mvalid;
+ dist = extractN<4,0>(tNear);
+ return mask;
+ }
+
+
+ // for KNL
+ template<>
+ __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,true>& ray, const float time, vfloat<4>& dist)
+ {
+ const size_t mvalid = movemask(node->validMask());
+ const vfloat16 lower_x = node->dequantizeLowerX(time);
+ const vfloat16 upper_x = node->dequantizeUpperX(time);
+ const vfloat16 lower_y = node->dequantizeLowerY(time);
+ const vfloat16 upper_y = node->dequantizeUpperY(time);
+ const vfloat16 lower_z = node->dequantizeLowerZ(time);
+ const vfloat16 upper_z = node->dequantizeUpperZ(time);
+
+ const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+ const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+ const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+ const vfloat16 tFarX = (upper_x - ray.org.x) * ray.rdir_far.x;
+ const vfloat16 tFarY = (upper_y - ray.org.y) * ray.rdir_far.y;
+ const vfloat16 tFarZ = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+ const vfloat16 tminX = min(tNearX,tFarX);
+ const vfloat16 tmaxX = max(tNearX,tFarX);
+ const vfloat16 tminY = min(tNearY,tFarY);
+ const vfloat16 tmaxY = max(tNearY,tFarY);
+ const vfloat16 tminZ = min(tNearZ,tFarZ);
+ const vfloat16 tmaxZ = max(tNearZ,tFarZ);
+ const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear);
+ const vfloat16 tFar = min(tmaxX,tmaxY,tmaxZ,ray.tfar );
+ const vbool16 vmask = tNear <= tFar;
+ const size_t mask = movemask(vmask) & mvalid;
+ dist = extractN<4,0>(tNear);
+ return mask;
+ }
+
+#endif
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast OBBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, bool robust>
+ __forceinline size_t intersectNode(const typename BVHN<N>::OBBNode* node, const TravRay<N,N,robust>& ray, vfloat<N>& dist)
+ {
+ const Vec3vf<N> dir = xfmVector(node->naabb,ray.dir);
+ //const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))/dir;
+ const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))*rcp_safe(dir);
+ const Vec3vf<N> org = xfmPoint(node->naabb,ray.org);
+ const Vec3vf<N> tLowerXYZ = org * nrdir; // (Vec3fa(zero) - org) * rdir;
+ const Vec3vf<N> tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir;
+
+ const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+ const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+ const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+ const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x);
+ const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y);
+ const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z);
+ vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ);
+ vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+ if (robust) {
+ tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+ tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+ }
+ const vbool<N> vmask = tNear <= tFar;
+ dist = tNear;
+ return movemask(vmask);
+ }
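+
+#if 0
+    // Documentation-only sketch (not compiled): why the OBB test above needs
+    // only org and nrdir. node->naabb maps the oriented box to the unit box
+    // [0,1]^3, so with nrdir = -1/dir the slab distances collapse to:
+    //   tLower = (0 - org) / dir =  org * nrdir
+    //   tUpper = (1 - org) / dir = tLower - nrdir
+    inline void unitBoxSlabs_scalar(float org, float nrdir, float& tLower, float& tUpper)
+    {
+      tLower = org * nrdir;    // (0 - org) * rdir
+      tUpper = tLower - nrdir; // (1 - org) * rdir
+    }
+#endif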
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast OBBNodeMB intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, bool robust>
+ __forceinline size_t intersectNode(const typename BVHN<N>::OBBNodeMB* node, const TravRay<N,N,robust>& ray, const float time, vfloat<N>& dist)
+ {
+ const AffineSpace3vf<N> xfm = node->space0;
+ const Vec3vf<N> b0_lower = zero;
+ const Vec3vf<N> b0_upper = one;
+ const Vec3vf<N> lower = lerp(b0_lower,node->b1.lower,vfloat<N>(time));
+ const Vec3vf<N> upper = lerp(b0_upper,node->b1.upper,vfloat<N>(time));
+
+ const BBox3vf<N> bounds(lower,upper);
+ const Vec3vf<N> dir = xfmVector(xfm,ray.dir);
+ const Vec3vf<N> rdir = rcp_safe(dir);
+ const Vec3vf<N> org = xfmPoint(xfm,ray.org);
+
+ const Vec3vf<N> tLowerXYZ = (bounds.lower - org) * rdir;
+ const Vec3vf<N> tUpperXYZ = (bounds.upper - org) * rdir;
+
+ const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+ const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+ const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+ const vfloat<N> tFarX = maxi(tLowerXYZ.x,tUpperXYZ.x);
+ const vfloat<N> tFarY = maxi(tLowerXYZ.y,tUpperXYZ.y);
+ const vfloat<N> tFarZ = maxi(tLowerXYZ.z,tUpperXYZ.z);
+ vfloat<N> tNear = max(ray.tnear, tNearX,tNearY,tNearZ);
+ vfloat<N> tFar = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+ if (robust) {
+ tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+ tFar = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+ }
+ const vbool<N> vmask = tNear <= tFar;
+ dist = tNear;
+ return movemask(vmask);
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in point query traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ /*! Computes traversal information for N nodes with 1 point query */
+ template<int N, int types>
+ struct BVHNNodePointQuerySphere1;
+
+ template<int N>
+ struct BVHNNodePointQuerySphere1<N, BVH_AN1>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeSphere(node.getAABBNode(), query, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQuerySphere1<N, BVH_AN2>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeSphereMB4D<N>(node, query, time, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQuerySphere1<N, BVH_AN1_UN1>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNode())) mask = pointQueryNodeSphere(node.getAABBNode(), query, dist);
+ else if (unlikely(node.isOBBNode())) mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQuerySphere1<N, BVH_AN2_UN2>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNodeMB())) mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);
+ else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D_UN2>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist);
+ else mask = pointQueryNodeSphereMB4D(node, query, time, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQuerySphere1<N, BVH_QN1>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeSphere((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNQuantizedBaseNodePointQuerySphere1
+ {
+ static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ return pointQueryNodeSphere(node,query,dist);
+ }
+
+ static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ return pointQueryNodeSphere(node,query,time,dist);
+ }
+ };
+
+ /*! Computes traversal information for N nodes with 1 point query */
+ template<int N, int types>
+ struct BVHNNodePointQueryAABB1;
+
+ template<int N>
+ struct BVHNNodePointQueryAABB1<N, BVH_AN1>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeAABB(node.getAABBNode(), query, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQueryAABB1<N, BVH_AN2>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeAABBMB4D<N>(node, query, time, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQueryAABB1<N, BVH_AN1_UN1>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNode())) mask = pointQueryNodeAABB(node.getAABBNode(), query, dist);
+ else if (unlikely(node.isOBBNode())) mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQueryAABB1<N, BVH_AN2_UN2>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNodeMB())) mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);
+ else if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D_UN2>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ if (unlikely(node.isOBBNodeMB())) mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist);
+ else mask = pointQueryNodeAABBMB4D(node, query, time, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNNodePointQueryAABB1<N, BVH_QN1>
+ {
+ static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node, const TravPointQuery<N>& query, float time, vfloat<N>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = pointQueryNodeAABB((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+ return true;
+ }
+ };
+
+ template<int N>
+ struct BVHNQuantizedBaseNodePointQueryAABB1
+ {
+ static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+ {
+ return pointQueryNodeAABB(node,query,dist);
+ }
+
+ static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+ {
+ return pointQueryNodeAABB(node,query,time,dist);
+ }
+ };
+
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Node intersectors used in ray traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ /*! Intersects N nodes with 1 ray */
+ template<int N, int Nx, int types, bool robust>
+ struct BVHNNodeIntersector1;
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN1, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNode(node.getAABBNode(), ray, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN1, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNodeRobust(node.getAABBNode(), ray, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNode(node.getAABBNodeMB(), ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNodeMB4D<N>(node, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNodeMB4DRobust<N>(node, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNode())) mask = intersectNode(node.getAABBNode(), ray, dist);
+ else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNode())) mask = intersectNodeRobust(node.getAABBNode(), ray, dist);
+ else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNodeMB())) mask = intersectNode(node.getAABBNodeMB(), ray, time, dist);
+ else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (likely(node.isAABBNodeMB())) mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist);
+ else if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+ else return false;
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+ else mask = intersectNodeMB4D(node, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist);
+ else mask = intersectNodeMB4DRobust(node, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_QN1, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNode((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist);
+ return true;
+ }
+ };
+
+ template<int N, int Nx>
+ struct BVHNNodeIntersector1<N, Nx, BVH_QN1, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask)
+ {
+ if (unlikely(node.isLeaf())) return false;
+ mask = intersectNodeRobust((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist);
+ return true;
+ }
+ };
+
+    /*! Intersects N quantized nodes with 1 ray */
+ template<int N, int Nx, bool robust>
+ struct BVHNQuantizedBaseNodeIntersector1;
+
+ template<int N, int Nx>
+ struct BVHNQuantizedBaseNodeIntersector1<N, Nx, false>
+ {
+ static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,false>& ray, vfloat<Nx>& dist)
+ {
+ return intersectNode(node,ray,dist);
+ }
+
+ static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist)
+ {
+ return intersectNode(node,ray,time,dist);
+ }
+
+ };
+
+ template<int N, int Nx>
+ struct BVHNQuantizedBaseNodeIntersector1<N, Nx, true>
+ {
+ static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist)
+ {
+ return intersectNode(node,ray,dist);
+ }
+
+ static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist)
+ {
+ return intersectNode(node,ray,time,dist);
+ }
+
+ };
+
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h
new file mode 100644
index 0000000000..800ac8b478
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h
@@ -0,0 +1,269 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Frustum structure used in hybrid and stream traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ /*
+       Optimized frustum test. In the ray/box intersection we compute
+       t = (p - org) / dir. The rays are assumed to be split by octant,
+       so the direction interval of a packet is either entirely positive
+       or entirely negative in each dimension.
+
+ Case 1: dir.min >= 0 && dir.max >= 0:
+ t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+ t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+
+ Case 2: dir.min < 0 && dir.max < 0:
+ t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+ t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+ */
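+
+#if 0
+    // Documentation-only sketch (not compiled): one axis of the fast frustum
+    // test for a positive-direction octant (case 1 above). The vector kernels
+    // below evaluate this for all N children and both planes via msub/madd
+    // with the precomputed min/max terms.
+    inline void frustumSlabX_scalar(float p_min, float p_max,
+                                    float org_min, float org_max,
+                                    float rdir_min, float rdir_max,
+                                    float& t_min, float& t_max)
+    {
+      t_min = p_min * rdir_min - org_max * rdir_min; // conservative entry
+      t_max = p_max * rdir_max - org_min * rdir_max; // conservative exit
+    }
+#endif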
+
+ template<bool robust>
+ struct Frustum;
+
+ /* Fast variant */
+ template<>
+ struct Frustum<false>
+ {
+ __forceinline Frustum() {}
+
+ template<int K>
+ __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+ {
+ init(valid, org, rdir, ray_tnear, ray_tfar, N);
+ }
+
+ template<int K>
+ __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+ {
+ const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)),
+ reduce_min(select(valid, org.y, pos_inf)),
+ reduce_min(select(valid, org.z, pos_inf)));
+
+ const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)),
+ reduce_max(select(valid, org.y, neg_inf)),
+ reduce_max(select(valid, org.z, neg_inf)));
+
+ const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)),
+ reduce_min(select(valid, rdir.y, pos_inf)),
+ reduce_min(select(valid, rdir.z, pos_inf)));
+
+ const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)),
+ reduce_max(select(valid, rdir.y, neg_inf)),
+ reduce_max(select(valid, rdir.z, neg_inf)));
+
+ const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf)));
+ const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf)));
+
+ init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N);
+ }
+
+ __forceinline void init(const Vec3fa& reduced_min_org,
+ const Vec3fa& reduced_max_org,
+ const Vec3fa& reduced_min_rdir,
+ const Vec3fa& reduced_max_rdir,
+ float reduced_min_dist,
+ float reduced_max_dist,
+ int N)
+ {
+ const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero));
+
+ min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
+ max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+#if defined (__aarch64__)
+ neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org));
+ neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
+#else
+ min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org);
+ max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
+#endif
+ min_dist = reduced_min_dist;
+ max_dist = reduced_max_dist;
+
+ nf = NearFarPrecalculations(min_rdir, N);
+ }
+
+ template<int K>
+ __forceinline void updateMaxDist(const vfloat<K>& ray_tfar)
+ {
+ max_dist = reduce_max(ray_tfar);
+ }
+
+ NearFarPrecalculations nf;
+
+ Vec3fa min_rdir;
+ Vec3fa max_rdir;
+
+#if defined (__aarch64__)
+ Vec3fa neg_min_org_rdir;
+ Vec3fa neg_max_org_rdir;
+#else
+ Vec3fa min_org_rdir;
+ Vec3fa max_org_rdir;
+#endif
+ float min_dist;
+ float max_dist;
+ };
+
+ typedef Frustum<false> FrustumFast;
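+
+#if 0
+    // Documentation-only usage sketch (not compiled): hybrid/stream traversal
+    // builds one FrustumFast per octant-coherent packet and reuses it for all
+    // node tests; only max_dist is tightened as hits shorten the rays. The
+    // traversal loop itself is omitted here.
+    template<int K>
+    void frustumUsage(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir,
+                      const vfloat<K>& tnear, vfloat<K>& tfar)
+    {
+      FrustumFast frustum(valid, org, rdir, tnear, tfar, 4 /* N */);
+      // ... test nodes with intersectNodeFrustum<4,4>(node, frustum, dist) ...
+      frustum.updateMaxDist(tfar); // after leaf hits reduce tfar
+    }
+#endif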
+
+ /* Robust variant */
+ template<>
+ struct Frustum<true>
+ {
+ __forceinline Frustum() {}
+
+ template<int K>
+ __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+ {
+ init(valid, org, rdir, ray_tnear, ray_tfar, N);
+ }
+
+ template<int K>
+ __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+ {
+ const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)),
+ reduce_min(select(valid, org.y, pos_inf)),
+ reduce_min(select(valid, org.z, pos_inf)));
+
+ const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)),
+ reduce_max(select(valid, org.y, neg_inf)),
+ reduce_max(select(valid, org.z, neg_inf)));
+
+ const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)),
+ reduce_min(select(valid, rdir.y, pos_inf)),
+ reduce_min(select(valid, rdir.z, pos_inf)));
+
+ const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)),
+ reduce_max(select(valid, rdir.y, neg_inf)),
+ reduce_max(select(valid, rdir.z, neg_inf)));
+
+ const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf)));
+ const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf)));
+
+ init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N);
+ }
+
+ __forceinline void init(const Vec3fa& reduced_min_org,
+ const Vec3fa& reduced_max_org,
+ const Vec3fa& reduced_min_rdir,
+ const Vec3fa& reduced_max_rdir,
+ float reduced_min_dist,
+ float reduced_max_dist,
+ int N)
+ {
+ const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero));
+ min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir);
+ max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+ min_org = select(pos_rdir, reduced_max_org, reduced_min_org);
+ max_org = select(pos_rdir, reduced_min_org, reduced_max_org);
+
+ min_dist = reduced_min_dist;
+ max_dist = reduced_max_dist;
+
+ nf = NearFarPrecalculations(min_rdir, N);
+ }
+
+ template<int K>
+ __forceinline void updateMaxDist(const vfloat<K>& ray_tfar)
+ {
+ max_dist = reduce_max(ray_tfar);
+ }
+
+ NearFarPrecalculations nf;
+
+ Vec3fa min_rdir;
+ Vec3fa max_rdir;
+
+ Vec3fa min_org;
+ Vec3fa max_org;
+
+ float min_dist;
+ float max_dist;
+ };
+
+ typedef Frustum<true> FrustumRobust;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx>
+ __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+ const FrustumFast& frustum, vfloat<Nx>& dist)
+ {
+ const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX);
+ const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+ const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+ const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+ const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+ const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+#if defined (__aarch64__)
+ const vfloat<Nx> fminX = madd(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.neg_min_org_rdir.x));
+ const vfloat<Nx> fminY = madd(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.neg_min_org_rdir.y));
+ const vfloat<Nx> fminZ = madd(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.neg_min_org_rdir.z));
+ const vfloat<Nx> fmaxX = madd(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.neg_max_org_rdir.x));
+ const vfloat<Nx> fmaxY = madd(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.neg_max_org_rdir.y));
+ const vfloat<Nx> fmaxZ = madd(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.neg_max_org_rdir.z));
+#else
+ const vfloat<Nx> fminX = msub(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.min_org_rdir.x));
+ const vfloat<Nx> fminY = msub(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.min_org_rdir.y));
+ const vfloat<Nx> fminZ = msub(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.min_org_rdir.z));
+ const vfloat<Nx> fmaxX = msub(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.max_org_rdir.x));
+ const vfloat<Nx> fmaxY = msub(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.max_org_rdir.y));
+ const vfloat<Nx> fmaxZ = msub(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.max_org_rdir.z));
+#endif
+ const vfloat<Nx> fmin = maxi(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist));
+ dist = fmin;
+ const vfloat<Nx> fmax = mini(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist));
+ const vbool<Nx> vmask_node_hit = fmin <= fmax;
+ size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1);
+ return m_node;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx>
+ __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+ const FrustumRobust& frustum, vfloat<Nx>& dist)
+ {
+ const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX);
+ const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+ const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+ const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+ const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+ const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+ const vfloat<Nx> fminX = (bminX - vfloat<Nx>(frustum.min_org.x)) * vfloat<Nx>(frustum.min_rdir.x);
+ const vfloat<Nx> fminY = (bminY - vfloat<Nx>(frustum.min_org.y)) * vfloat<Nx>(frustum.min_rdir.y);
+ const vfloat<Nx> fminZ = (bminZ - vfloat<Nx>(frustum.min_org.z)) * vfloat<Nx>(frustum.min_rdir.z);
+ const vfloat<Nx> fmaxX = (bmaxX - vfloat<Nx>(frustum.max_org.x)) * vfloat<Nx>(frustum.max_rdir.x);
+ const vfloat<Nx> fmaxY = (bmaxY - vfloat<Nx>(frustum.max_org.y)) * vfloat<Nx>(frustum.max_rdir.y);
+ const vfloat<Nx> fmaxZ = (bmaxZ - vfloat<Nx>(frustum.max_org.z)) * vfloat<Nx>(frustum.max_rdir.z);
+
+ const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+ const float round_up = 1.0f+2.0f*float(ulp);
+ const vfloat<Nx> fmin = max(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist));
+ dist = fmin;
+ const vfloat<Nx> fmax = min(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist));
+ const vbool<Nx> vmask_node_hit = (round_down*fmin <= round_up*fmax);
+ size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1);
+ return m_node;
+ }
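+
+#if 0
+    // Documentation-only sketch (not compiled): the conservative acceptance
+    // test used by the robust variant above. Widening both sides by ~2 ulp
+    // compensates for rounding error in the unfused (sub, mul) slab
+    // computation.
+    inline bool acceptRobust_scalar(float fmin, float fmax)
+    {
+      const float round_down = 1.0f - 2.0f*float(ulp);
+      const float round_up   = 1.0f + 2.0f*float(ulp);
+      return round_down*fmin <= round_up*fmax;
+    }
+#endif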
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h
new file mode 100644
index 0000000000..0543e56f8e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h
@@ -0,0 +1,843 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Ray packet structure used in hybrid traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int K, bool robust>
+ struct TravRayK;
+
+ /* Fast variant */
+ template<int K>
+ struct TravRayK<K, false>
+ {
+ __forceinline TravRayK() {}
+
+ __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+ {
+ init(ray_org, ray_dir, N);
+ }
+
+ __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+ {
+ init(ray_org, ray_dir, N);
+ tnear = ray_tnear;
+ tfar = ray_tfar;
+ }
+
+ __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+ {
+ org = ray_org;
+ dir = ray_dir;
+ rdir = rcp_safe(ray_dir);
+#if defined(__aarch64__)
+ neg_org_rdir = -(org * rdir);
+#elif defined(__AVX2__)
+ org_rdir = org * rdir;
+#endif
+ if (N)
+ {
+ const int size = sizeof(float)*N;
+ nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size));
+ nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size));
+ nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size));
+ }
+ }
+
+ Vec3vf<K> org;
+ Vec3vf<K> dir;
+ Vec3vf<K> rdir;
+#if defined(__aarch64__)
+ Vec3vf<K> neg_org_rdir;
+#elif defined(__AVX2__)
+ Vec3vf<K> org_rdir;
+#endif
+ Vec3vi<K> nearXYZ;
+ vfloat<K> tnear;
+ vfloat<K> tfar;
+ };
+
+ template<int K>
+ using TravRayKFast = TravRayK<K, false>;
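+
+#if 0
+    // Documentation-only sketch (not compiled): the nearXYZ values computed in
+    // init() are byte offsets into an AABBNode whose planes are stored as six
+    // consecutive arrays of N floats (lower_x, upper_x, lower_y, upper_y,
+    // lower_z, upper_z). Adding the offset to &node->lower_x selects the near
+    // plane for the lane's direction sign; the far plane is the paired array
+    // (offset XOR size), mirroring the single-ray TravRay precomputation.
+    inline const float* selectPlaneX(const char* node_lower_x, int offset_bytes)
+    {
+      return (const float*)(node_lower_x + offset_bytes);
+    }
+#endif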
+
+ /* Robust variant */
+ template<int K>
+ struct TravRayK<K, true>
+ {
+ __forceinline TravRayK() {}
+
+ __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+ {
+ init(ray_org, ray_dir, N);
+ }
+
+ __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N)
+ {
+ init(ray_org, ray_dir, N);
+ tnear = ray_tnear;
+ tfar = ray_tfar;
+ }
+
+ __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N)
+ {
+ org = ray_org;
+ dir = ray_dir;
+ rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir));
+
+ if (N)
+ {
+ const int size = sizeof(float)*N;
+ nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size));
+ nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size));
+ nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size));
+ }
+ }
+
+ Vec3vf<K> org;
+ Vec3vf<K> dir;
+ Vec3vf<K> rdir;
+ Vec3vi<K> nearXYZ;
+ vfloat<K> tnear;
+ vfloat<K> tfar;
+ };
+
+ template<int K>
+ using TravRayKRobust = TravRayK<K, true>;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNode* node, size_t i,
+ const TravRayKFast<K>& ray, vfloat<K>& dist)
+ {
+#if defined(__aarch64__)
+ const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+ const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
+ const vfloat<K> lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z);
+#else
+ const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+#endif
+
+ #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ if (K == 16)
+ {
+ /* use mixed float/int min/max */
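+ // (integer compares are cheaper on SKX; this presumably relies on the
+ // clipped distances being non-negative IEEE floats, whose bit patterns
+ // order the same way as their values)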
+ const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+ dist = lnearP;
+ return lhit;
+ }
+ else
+ #endif
+ {
+ const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+ #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+ #else
+ const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+ #endif
+ dist = lnearP;
+ return lhit;
+ }
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNode* node, size_t i,
+ const TravRayKRobust<K>& ray, vfloat<K>& dist)
+ {
+ // FIXME: use per instruction rounding for AVX512
+ const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+ const float round_up = 1.0f+3.0f*float(ulp);
+ const float round_down = 1.0f-3.0f*float(ulp);
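+ // Widening the interval by 3 ulp on each side absorbs the rounding error
+ // of the subtract/multiply above, so a true hit is never culled.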
+ const vfloat<K> lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = round_up *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNodeMB intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNodeMB* node, const size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
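+ // Motion-blur nodes store bounds at t=0 plus per-bound deltas; one fma
+ // per bound interpolates them to the ray time.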
+ const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+ const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+ const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+ const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+ const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+ const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+#if defined(__aarch64__)
+ const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+ const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+ const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ if (K == 16)
+ {
+ /* use mixed float/int min/max */
+ const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+ dist = lnearP;
+ return lhit;
+ }
+ else
+#endif
+ {
+ const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+#else
+ const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+#endif
+ dist = lnearP;
+ return lhit;
+ }
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNodeMB intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNodeMB* node, const size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+ const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+ const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+ const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+ const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+ const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+ const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+
+ const float round_up = 1.0f+3.0f*float(ulp);
+ const float round_down = 1.0f-3.0f*float(ulp);
+
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ if (K == 16)
+ {
+ const vfloat<K> lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = round_up *mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+ else
+#endif
+ {
+ const vfloat<K> lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = round_up *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNodeMB4D intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectNodeKMB4D(const typename BVHN<N>::NodeRef ref, const size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+ const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+ const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+ const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+ const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+ const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+ const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+#if defined(__aarch64__)
+ const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+ const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+ const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+ const vfloat<K> lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
+ vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
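+ // MB4D nodes additionally bound the time dimension: mask out lanes whose
+ // time lies outside the node's [lower_t, upper_t) interval.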
+ if (unlikely(ref.isAABBNodeMB4D())) {
+ const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+ lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i]));
+ }
+ dist = lnearP;
+ return lhit;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNodeMB4D intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectNodeKMB4DRobust(const typename BVHN<N>::NodeRef ref, const size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+
+ const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+ const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+ const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+ const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+ const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+ const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+ const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+
+ const float round_up = 1.0f+3.0f*float(ulp);
+ const float round_down = 1.0f-3.0f*float(ulp);
+ const vfloat<K> lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = round_up *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
+ vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+
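+ // Same 4D time-interval filter as in the fast MB4D variant above.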
+ if (unlikely(ref.isAABBNodeMB4D())) {
+ const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+ lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i]));
+ }
+ dist = lnearP;
+ return lhit;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast OBBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K, bool robust>
+ __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNode* node, const size_t i,
+ const TravRayK<K,robust>& ray, vfloat<K>& dist)
+ {
+ const AffineSpace3vf<K> naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]),
+ Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]),
+ Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]),
+ Vec3f(node->naabb.p .x[i], node->naabb.p .y[i], node->naabb.p .z[i]));
+
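+ // Transform the ray into the node's local space, where the box is the
+ // unit cube [0,1]^3 and the slab planes are simply 0 and 1.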
+ const Vec3vf<K> dir = xfmVector(naabb, ray.dir);
+ const Vec3vf<K> nrdir = Vec3vf<K>(vfloat<K>(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1?
+ const Vec3vf<K> org = xfmPoint(naabb, ray.org);
+
+ const vfloat<K> lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir;
+ const vfloat<K> lclipMinY = org.y * nrdir.y;
+ const vfloat<K> lclipMinZ = org.z * nrdir.z;
+ const vfloat<K> lclipMaxX = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir;
+ const vfloat<K> lclipMaxY = lclipMinY - nrdir.y;
+ const vfloat<K> lclipMaxZ = lclipMinZ - nrdir.z;
+
+ vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+ vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+ if (robust) {
+ lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp));
+ lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp));
+ }
+ const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast OBBNodeMB intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K, bool robust>
+ __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNodeMB* node, const size_t i,
+ const TravRayK<K,robust>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ const AffineSpace3vf<K> xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]),
+ Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]),
+ Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]),
+ Vec3f(node->space0.p .x[i], node->space0.p .y[i], node->space0.p .z[i]));
+
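+ // b0 is the unit box at t=0 and b1 the bounds at t=1 in the same local
+ // space; both are lerped to the ray time before the slab test.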
+ const Vec3vf<K> b0_lower = zero;
+ const Vec3vf<K> b0_upper = one;
+ const Vec3vf<K> b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]);
+ const Vec3vf<K> b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]);
+ const Vec3vf<K> lower = lerp(b0_lower, b1_lower, time);
+ const Vec3vf<K> upper = lerp(b0_upper, b1_upper, time);
+
+ const Vec3vf<K> dir = xfmVector(xfm, ray.dir);
+ const Vec3vf<K> rdir = rcp_safe(dir);
+ const Vec3vf<K> org = xfmPoint(xfm, ray.org);
+
+ const vfloat<K> lclipMinX = (lower.x - org.x) * rdir.x;
+ const vfloat<K> lclipMinY = (lower.y - org.y) * rdir.y;
+ const vfloat<K> lclipMinZ = (lower.z - org.z) * rdir.z;
+ const vfloat<K> lclipMaxX = (upper.x - org.x) * rdir.x;
+ const vfloat<K> lclipMaxY = (upper.y - org.y) * rdir.y;
+ const vfloat<K> lclipMaxZ = (upper.z - org.z) * rdir.z;
+
+ vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+ vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+ if (robust) {
+ lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp));
+ lfarP = lfarP *vfloat<K>(1.0f+3.0f*float(ulp));
+ }
+
+ const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // QuantizedBaseNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+ const TravRayK<K,false>& ray, vfloat<K>& dist)
+ {
+ assert(movemask(node->validMask()) & ((size_t)1 << i));
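+ // Bounds are stored quantized (a per-node start/scale plus small integer
+ // offsets); dequantize them to full floats once, then run the usual slab test.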
+ const vfloat<N> lower_x = node->dequantizeLowerX();
+ const vfloat<N> upper_x = node->dequantizeUpperX();
+ const vfloat<N> lower_y = node->dequantizeLowerY();
+ const vfloat<N> upper_y = node->dequantizeUpperY();
+ const vfloat<N> lower_z = node->dequantizeLowerZ();
+ const vfloat<N> upper_z = node->dequantizeUpperZ();
+
+ #if defined(__aarch64__)
+ const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+ #elif defined(__AVX2__)
+ const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
+ const vfloat<K> lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z);
+ #else
+ const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z;
+ #endif
+
+ #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ if (K == 16)
+ {
+ /* use mixed float/int min/max */
+ const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+ dist = lnearP;
+ return lhit;
+ }
+ else
+ #endif
+ {
+ const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+ #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+ const vbool<K> lhit = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+ #else
+ const vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+ #endif
+ dist = lnearP;
+ return lhit;
+ }
+ }
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+ const TravRayK<K,true>& ray, vfloat<K>& dist)
+ {
+ assert(movemask(node->validMask()) & ((size_t)1 << i));
+ const vfloat<N> lower_x = node->dequantizeLowerX();
+ const vfloat<N> upper_x = node->dequantizeUpperX();
+ const vfloat<N> lower_y = node->dequantizeLowerY();
+ const vfloat<N> upper_y = node->dequantizeUpperY();
+ const vfloat<N> lower_z = node->dequantizeLowerZ();
+ const vfloat<N> upper_z = node->dequantizeUpperZ();
+
+ const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z;
+
+ const float round_up = 1.0f+3.0f*float(ulp);
+ const float round_down = 1.0f-3.0f*float(ulp);
+
+ const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+ const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ assert(movemask(node->validMask()) & ((size_t)1 << i));
+
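+ // The MB variant dequantizes child i's bounds directly at each ray's time.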
+ const vfloat<K> lower_x = node->dequantizeLowerX(i,time);
+ const vfloat<K> upper_x = node->dequantizeUpperX(i,time);
+ const vfloat<K> lower_y = node->dequantizeLowerY(i,time);
+ const vfloat<K> upper_y = node->dequantizeUpperY(i,time);
+ const vfloat<K> lower_z = node->dequantizeLowerZ(i,time);
+ const vfloat<K> upper_z = node->dequantizeUpperZ(i,time);
+
+#if defined(__aarch64__)
+ const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+ const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+ const vfloat<K> lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+ const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+ const vfloat<K> lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+
+ template<int N, int K>
+ __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+ const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ assert(movemask(node->validMask()) & ((size_t)1 << i));
+
+ const vfloat<K> lower_x = node->dequantizeLowerX(i,time);
+ const vfloat<K> upper_x = node->dequantizeUpperX(i,time);
+ const vfloat<K> lower_y = node->dequantizeLowerY(i,time);
+ const vfloat<K> upper_y = node->dequantizeUpperY(i,time);
+ const vfloat<K> lower_z = node->dequantizeLowerZ(i,time);
+ const vfloat<K> upper_z = node->dequantizeUpperZ(i,time);
+
+ const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z;
+ const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x;
+ const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y;
+ const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z;
+
+ const float round_up = 1.0f+3.0f*float(ulp);
+ const float round_down = 1.0f-3.0f*float(ulp);
+
+ const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+ const vfloat<K> lfarP = round_up *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+ const vbool<K> lhit = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+ dist = lnearP;
+ return lhit;
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Node intersectors used in hybrid traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ /*! Intersects N nodes with K rays */
+ template<int N, int K, int types, bool robust>
+ struct BVHNNodeIntersectorK;
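+ /* The 'types' parameter encodes which node layouts may occur in the BVH
+ (AN = aligned, UN = unaligned/OBB, AN2 = two-timestep motion blur,
+ AN4D = motion blur with time bounds), so each specialization below can
+ dispatch to the matching node test at compile time. */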
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN1, false>
+ {
+ /* vmask is both an input and an output parameter! Its initial value should be the parent node
+ hit mask, which is used for correctly computing the current hit mask. The parent hit mask
+ is actually required only for motion blur node intersections (because different rays may
+ have different times), so for regular nodes vmask is simply overwritten. */
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN1, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ if (likely(node.isAABBNode())) vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist);
+ else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ if (likely(node.isAABBNode())) vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist);
+ else /*if (unlikely(node.isOBBNode()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ if (likely(node.isAABBNodeMB())) vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+ else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ if (likely(node.isAABBNodeMB())) vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist);
+ else /*if (unlikely(node.isOBBNodeMB()))*/ vmask = intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist);
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, false>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) {
+ vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist);
+ } else /*if (unlikely(node.isOBBNodeMB()))*/ {
+ assert(node.isOBBNodeMB());
+ vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+ }
+ return true;
+ }
+ };
+
+ template<int N, int K>
+ struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, true>
+ {
+ static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+ const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+ {
+ if (likely(node.isAABBNodeMB() || node.isAABBNodeMB4D())) {
+ vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist);
+ } else /*if (unlikely(node.isOBBNodeMB()))*/ {
+ assert(node.isOBBNodeMB());
+ vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+ }
+ return true;
+ }
+ };
+
+
+ /*! Intersects N nodes with K rays */
+ template<int N, int K, bool robust>
+ struct BVHNQuantizedBaseNodeIntersectorK;
+
+ template<int N, int K>
+ struct BVHNQuantizedBaseNodeIntersectorK<N, K, false>
+ {
+ static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+ const TravRayK<K,false>& ray, vfloat<K>& dist)
+ {
+ return intersectQuantizedNodeK<N,K>(node,i,ray,dist);
+ }
+
+ static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+ const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist);
+ }
+
+ };
+
+ template<int N, int K>
+ struct BVHNQuantizedBaseNodeIntersectorK<N, K, true>
+ {
+ static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+ const TravRayK<K,true>& ray, vfloat<K>& dist)
+ {
+ return intersectQuantizedNodeK<N,K>(node,i,ray,dist);
+ }
+
+ static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+ const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+ {
+ return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist);
+ }
+ };
+
+ }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h
new file mode 100644
index 0000000000..f379b57aea
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Ray packet structure used in stream traversal
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int K, bool robust>
+ struct TravRayKStream;
+
+ /* Fast variant */
+ template<int K>
+ struct TravRayKStream<K, false>
+ {
+ __forceinline TravRayKStream() {}
+
+ __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+ {
+ init(ray_org, ray_dir);
+ tnear = ray_tnear;
+ tfar = ray_tfar;
+ }
+
+ __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
+ {
+ rdir = rcp_safe(ray_dir);
+#if defined(__aarch64__)
+ neg_org_rdir = -(ray_org * rdir);
+#else
+ org_rdir = ray_org * rdir;
+#endif
+ }
+
+ Vec3vf<K> rdir;
+#if defined(__aarch64__)
+ Vec3vf<K> neg_org_rdir;
+#else
+ Vec3vf<K> org_rdir;
+#endif
+ vfloat<K> tnear;
+ vfloat<K> tfar;
+ };
+
+ template<int K>
+ using TravRayKStreamFast = TravRayKStream<K, false>;
+
+ /* Robust variant */
+ template<int K>
+ struct TravRayKStream<K, true>
+ {
+ __forceinline TravRayKStream() {}
+
+ __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+ {
+ init(ray_org, ray_dir);
+ tnear = ray_tnear;
+ tfar = ray_tfar;
+ }
+
+ __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
+ {
+ rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir));
+ org = ray_org;
+ }
+
+ Vec3vf<K> rdir;
+ Vec3vf<K> org;
+ vfloat<K> tnear;
+ vfloat<K> tfar;
+ };
+
+ template<int K>
+ using TravRayKStreamRobust = TravRayKStream<K, true>;
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Fast AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx, int K>
+ __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+ const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf)
+ {
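+ // nf holds the near/far byte offsets derived from the ray direction
+ // signs, so the six bounds loads below are plain pointer adds rather
+ // than per-node sign selects.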
+ const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+ const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+ const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+ const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+ const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+ const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+
+#if defined (__aarch64__)
+ const vfloat<Nx> rminX = madd(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+ const vfloat<Nx> rminY = madd(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+ const vfloat<Nx> rminZ = madd(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+ const vfloat<Nx> rmaxX = madd(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+ const vfloat<Nx> rmaxY = madd(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+ const vfloat<Nx> rmaxZ = madd(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+#else
+ const vfloat<Nx> rminX = msub(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
+ const vfloat<Nx> rminY = msub(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
+ const vfloat<Nx> rminZ = msub(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
+ const vfloat<Nx> rmaxX = msub(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
+ const vfloat<Nx> rmaxY = msub(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
+ const vfloat<Nx> rmaxZ = msub(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
+#endif
+ const vfloat<Nx> rmin = maxi(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
+ const vfloat<Nx> rmax = mini(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k]));
+
+ const vbool<Nx> vmask_first_hit = rmin <= rmax;
+
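+ // Nx may be wider than the node; keep only the low N lanes of the mask.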
+ return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
+ }
+
+ template<int N, int K>
+ __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+ const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf)
+ {
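+ // Here a single child i is tested against all K rays: its six scalar
+ // bounds are broadcast into K-wide registers.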
+ char* ptr = (char*)&node->lower_x + i*sizeof(float);
+ const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
+ const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+ const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+ const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+ const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+ const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+
+#if defined (__aarch64__)
+ const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
+ const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
+ const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
+ const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+ const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
+ const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
+ const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
+ const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
+#endif
+
+ const vfloat<K> rmin = maxi(rminX, rminY, rminZ, ray.tnear);
+ const vfloat<K> rmax = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
+
+ const vbool<K> vmask_first_hit = rmin <= rmax;
+
+ return movemask(vmask_first_hit);
+ }
+
+ //////////////////////////////////////////////////////////////////////////////////////
+ // Robust AABBNode intersection
+ //////////////////////////////////////////////////////////////////////////////////////
+
+ template<int N, int Nx, int K>
+ __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+ const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf)
+ {
+ const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX));
+ const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+ const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+ const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+ const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+ const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+
+ const vfloat<Nx> rminX = (bminX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
+ const vfloat<Nx> rminY = (bminY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
+ const vfloat<Nx> rminZ = (bminZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
+ const vfloat<Nx> rmaxX = (bmaxX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
+ const vfloat<Nx> rmaxY = (bmaxY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
+ const vfloat<Nx> rmaxZ = (bmaxZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
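+ // Widening only the exit distance by 3 ulp keeps the slab test
+ // conservative against the rounding error of the subtract/multiply above.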
+ const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+ const vfloat<Nx> rmin = max(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
+ const vfloat<Nx> rmax = round_up *min(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k]));
+
+ const vbool<Nx> vmask_first_hit = rmin <= rmax;
+
+ return movemask(vmask_first_hit) & (((size_t)1 << N)-1);
+ }
+
+ template<int N, int K>
+ __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+ const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf)
+ {
+ char *ptr = (char*)&node->lower_x + i*sizeof(float);
+ const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
+ const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+ const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+ const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+ const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+ const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+
+ const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x;
+ const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y;
+ const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z;
+ const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x;
+ const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y;
+ const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z;
+
+ const float round_up = 1.0f+3.0f*float(ulp);
+ const vfloat<K> rmin = max(rminX, rminY, rminZ, vfloat<K>(ray.tnear));
+ const vfloat<K> rmax = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar));
+
+ const vbool<K> vmask_first_hit = rmin <= rmax;
+
+ return movemask(vmask_first_hit);
+ }
+ }
+}