summaryrefslogtreecommitdiff
path: root/thirdparty/embree/kernels
diff options
context:
space:
mode:
authorJoan Fons <joan.fonssanchez@gmail.com>2021-09-13 15:05:21 +0200
committerJoan Fons <joan.fonssanchez@gmail.com>2021-09-13 16:17:28 +0200
commit595cbacdf1c4fe9a75f55d9e5b78e49765b28c86 (patch)
tree034f131acf644068537e3c7bea5991e328575ba4 /thirdparty/embree/kernels
parentdde48ebed6303b53a912005570179382e3bd250b (diff)
Upgrade Embree and enable ray packets
Minor patch upgrade. Enabling ray packets results in faster processing of ray streams (i.e. occlusion culling buffer updates) at the cost of slightly larger binary sizes.
Diffstat (limited to 'thirdparty/embree/kernels')
-rw-r--r--thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp917
-rw-r--r--thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp59
-rw-r--r--thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp528
-rw-r--r--thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp36
-rw-r--r--thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp657
-rw-r--r--thirdparty/embree/kernels/config.h2
-rw-r--r--thirdparty/embree/kernels/hash.h2
7 files changed, 2199 insertions, 2 deletions
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
new file mode 100644
index 0000000000..6e9a5a538e
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp
@@ -0,0 +1,917 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector_hybrid.h"
+#include "bvh_traverser1.h"
+#include "node_intersector1.h"
+#include "node_intersector_packet.h"
+
+#include "../geometry/intersector_iterators.h"
+#include "../geometry/triangle_intersector.h"
+#include "../geometry/trianglev_intersector.h"
+#include "../geometry/trianglev_mb_intersector.h"
+#include "../geometry/trianglei_intersector.h"
+#include "../geometry/quadv_intersector.h"
+#include "../geometry/quadi_intersector.h"
+#include "../geometry/curveNv_intersector.h"
+#include "../geometry/curveNi_intersector.h"
+#include "../geometry/curveNi_mb_intersector.h"
+#include "../geometry/linei_intersector.h"
+#include "../geometry/subdivpatch1_intersector.h"
+#include "../geometry/object_intersector.h"
+#include "../geometry/instance_intersector.h"
+#include "../geometry/subgrid_intersector.h"
+#include "../geometry/subgrid_mb_intersector.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+#define SWITCH_DURING_DOWN_TRAVERSAL 1
+#define FORCE_SINGLE_MODE 0
+
+#define ENABLE_FAST_COHERENT_CODEPATHS 1
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single>
+ void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersect1(Accel::Intersectors* This,
+ const BVH* bvh,
+ NodeRef root,
+ size_t k,
+ Precalculations& pre,
+ RayHitK<K>& ray,
+ const TravRayK<K, robust>& tray,
+ IntersectContext* context)
+ {
+ /* stack state */
+ StackItemT<NodeRef> stack[stackSizeSingle]; // stack of nodes
+ StackItemT<NodeRef>* stackPtr = stack + 1; // current stack pointer
+ StackItemT<NodeRef>* stackEnd = stack + stackSizeSingle;
+ stack[0].ptr = root;
+ stack[0].dist = neg_inf;
+
+ /* load the ray into SIMD registers */
+ TravRay<N,robust> tray1;
+ tray1.template init<K>(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]);
+
+ /* pop loop */
+ while (true) pop:
+ {
+ /* pop next node */
+ if (unlikely(stackPtr == stack)) break;
+ stackPtr--;
+ NodeRef cur = NodeRef(stackPtr->ptr);
+
+ /* if popped node is too far, pop next one */
+ if (unlikely(*(float*)&stackPtr->dist > ray.tfar[k]))
+ continue;
+
+ /* downtraversal loop */
+ while (true)
+ {
+ /* intersect node */
+ size_t mask; vfloat<N> tNear;
+ STAT3(normal.trav_nodes, 1, 1, 1);
+ bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray1, ray.time()[k], tNear, mask);
+ if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; }
+
+ /* if no child is hit, pop next node */
+ if (unlikely(mask == 0))
+ goto pop;
+
+ /* select next child and push other children */
+ BVHNNodeTraverser1Hit<N, types>::traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+ }
+
+ /* this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(normal.trav_leaves, 1, 1, 1);
+ size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+
+ size_t lazy_node = 0;
+ PrimitiveIntersectorK::intersect(This, pre, ray, k, context, prim, num, tray1, lazy_node);
+
+ tray1.tfar = ray.tfar[k];
+
+ if (unlikely(lazy_node)) {
+ stackPtr->ptr = lazy_node;
+ stackPtr->dist = neg_inf;
+ stackPtr++;
+ }
+ }
+ }
+
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single>
+ void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersect(vint<K>* __restrict__ valid_i,
+ Accel::Intersectors* __restrict__ This,
+ RayHitK<K>& __restrict__ ray,
+ IntersectContext* __restrict__ context)
+ {
+ BVH* __restrict__ bvh = (BVH*)This->ptr;
+
+ /* we may traverse an empty BVH in case all geometry was invalid */
+ if (bvh->root == BVH::emptyNode)
+ return;
+
+#if ENABLE_FAST_COHERENT_CODEPATHS == 1
+ assert(context);
+ if (unlikely(types == BVH_AN1 && context->user && context->isCoherent()))
+ {
+ intersectCoherent(valid_i, This, ray, context);
+ return;
+ }
+#endif
+
+ /* filter out invalid rays */
+ vbool<K> valid = *valid_i == -1;
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ valid &= ray.valid();
+#endif
+
+ /* return if there are no valid rays */
+ size_t valid_bits = movemask(valid);
+
+#if defined(__AVX__)
+ STAT3(normal.trav_hit_boxes[popcnt(movemask(valid))], 1, 1, 1);
+#endif
+
+ if (unlikely(valid_bits == 0)) return;
+
+ /* verify correct input */
+ assert(all(valid, ray.valid()));
+ assert(all(valid, ray.tnear() >= 0.0f));
+ assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f)));
+ Precalculations pre(valid, ray);
+
+ /* load ray */
+ TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0);
+ const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f);
+ const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f);
+
+ if (single)
+ {
+ tray.tnear = select(valid, org_ray_tnear, vfloat<K>(pos_inf));
+ tray.tfar = select(valid, org_ray_tfar , vfloat<K>(neg_inf));
+
+ for (; valid_bits!=0; ) {
+ const size_t i = bscf(valid_bits);
+ intersect1(This, bvh, bvh->root, i, pre, ray, tray, context);
+ }
+ return;
+ }
+
+ /* determine switch threshold based on flags */
+ const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent;
+
+ vint<K> octant = ray.octant();
+ octant = select(valid, octant, vint<K>(0xffffffff));
+
+ /* test whether we have ray with opposing direction signs in the packet */
+ bool split = false;
+ {
+ size_t bits = valid_bits;
+ vbool<K> vsplit( false );
+ do
+ {
+ const size_t valid_index = bsf(bits);
+ vbool<K> octant_valid = octant[valid_index] == octant;
+ bits &= ~(size_t)movemask(octant_valid);
+ vsplit |= vint<K>(octant[valid_index]) == (octant^vint<K>(0x7));
+ } while (bits);
+ if (any(vsplit)) split = true;
+ }
+
+ do
+ {
+ const size_t valid_index = bsf(valid_bits);
+ const vint<K> diff_octant = vint<K>(octant[valid_index])^octant;
+ const vint<K> count_diff_octant = \
+ ((diff_octant >> 2) & 1) +
+ ((diff_octant >> 1) & 1) +
+ ((diff_octant >> 0) & 1);
+
+ vbool<K> octant_valid = (count_diff_octant <= 1) & (octant != vint<K>(0xffffffff));
+ if (!single || !split) octant_valid = valid; // deactivate octant sorting in pure chunk mode, otherwise instance traversal performance goes down
+
+
+ octant = select(octant_valid,vint<K>(0xffffffff),octant);
+ valid_bits &= ~(size_t)movemask(octant_valid);
+
+ tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf));
+ tray.tfar = select(octant_valid, org_ray_tfar , vfloat<K>(neg_inf));
+
+ /* allocate stack and push root node */
+ vfloat<K> stack_near[stackSizeChunk];
+ NodeRef stack_node[stackSizeChunk];
+ stack_node[0] = BVH::invalidNode;
+ stack_near[0] = inf;
+ stack_node[1] = bvh->root;
+ stack_near[1] = tray.tnear;
+ NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk;
+ NodeRef* __restrict__ sptr_node = stack_node + 2;
+ vfloat<K>* __restrict__ sptr_near = stack_near + 2;
+
+ while (1) pop:
+ {
+ /* pop next node from stack */
+ assert(sptr_node > stack_node);
+ sptr_node--;
+ sptr_near--;
+ NodeRef cur = *sptr_node;
+ if (unlikely(cur == BVH::invalidNode)) {
+ assert(sptr_node == stack_node);
+ break;
+ }
+
+ /* cull node if behind closest hit point */
+ vfloat<K> curDist = *sptr_near;
+ const vbool<K> active = curDist < tray.tfar;
+ if (unlikely(none(active)))
+ continue;
+
+ /* switch to single ray traversal */
+#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
+#if FORCE_SINGLE_MODE == 0
+ if (single)
+#endif
+ {
+ size_t bits = movemask(active);
+#if FORCE_SINGLE_MODE == 0
+ if (unlikely(popcnt(bits) <= switchThreshold))
+#endif
+ {
+ for (; bits!=0; ) {
+ const size_t i = bscf(bits);
+ intersect1(This, bvh, cur, i, pre, ray, tray, context);
+ }
+ tray.tfar = min(tray.tfar, ray.tfar);
+ continue;
+ }
+ }
+#endif
+ while (likely(!cur.isLeaf()))
+ {
+ /* process nodes */
+ const vbool<K> valid_node = tray.tfar > curDist;
+ STAT3(normal.trav_nodes, 1, popcnt(valid_node), K);
+ const NodeRef nodeRef = cur;
+ const BaseNode* __restrict__ const node = nodeRef.baseNode();
+
+ /* set cur to invalid */
+ cur = BVH::emptyNode;
+ curDist = pos_inf;
+
+ size_t num_child_hits = 0;
+
+ for (unsigned i = 0; i < N; i++)
+ {
+ const NodeRef child = node->children[i];
+ if (unlikely(child == BVH::emptyNode)) break;
+ vfloat<K> lnearP;
+ vbool<K> lhit = valid_node;
+ BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit);
+
+ /* if we hit the child we choose to continue with that child if it
+ is closer than the current next child, or we push it onto the stack */
+ if (likely(any(lhit)))
+ {
+ assert(sptr_node < stackEnd);
+ assert(child != BVH::emptyNode);
+ const vfloat<K> childDist = select(lhit, lnearP, inf);
+ /* push cur node onto stack and continue with hit child */
+ if (any(childDist < curDist))
+ {
+ if (likely(cur != BVH::emptyNode)) {
+ num_child_hits++;
+ *sptr_node = cur; sptr_node++;
+ *sptr_near = curDist; sptr_near++;
+ }
+ curDist = childDist;
+ cur = child;
+ }
+
+ /* push hit child onto stack */
+ else {
+ num_child_hits++;
+ *sptr_node = child; sptr_node++;
+ *sptr_near = childDist; sptr_near++;
+ }
+ }
+ }
+
+#if defined(__AVX__)
+ //STAT3(normal.trav_hit_boxes[num_child_hits], 1, 1, 1);
+#endif
+
+ if (unlikely(cur == BVH::emptyNode))
+ goto pop;
+
+ /* improved distance sorting for 3 or more hits */
+ if (unlikely(num_child_hits >= 2))
+ {
+ if (any(sptr_near[-2] < sptr_near[-1]))
+ {
+ std::swap(sptr_near[-2],sptr_near[-1]);
+ std::swap(sptr_node[-2],sptr_node[-1]);
+ }
+ if (unlikely(num_child_hits >= 3))
+ {
+ if (any(sptr_near[-3] < sptr_near[-1]))
+ {
+ std::swap(sptr_near[-3],sptr_near[-1]);
+ std::swap(sptr_node[-3],sptr_node[-1]);
+ }
+ if (any(sptr_near[-3] < sptr_near[-2]))
+ {
+ std::swap(sptr_near[-3],sptr_near[-2]);
+ std::swap(sptr_node[-3],sptr_node[-2]);
+ }
+ }
+ }
+
+#if SWITCH_DURING_DOWN_TRAVERSAL == 1
+ if (single)
+ {
+ // seems to be the best place for testing utilization
+ if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold))
+ {
+ *sptr_node++ = cur;
+ *sptr_near++ = curDist;
+ goto pop;
+ }
+ }
+#endif
+ }
+
+ /* return if stack is empty */
+ if (unlikely(cur == BVH::invalidNode)) {
+ assert(sptr_node == stack_node);
+ break;
+ }
+
+ /* intersect leaf */
+ assert(cur != BVH::emptyNode);
+ const vbool<K> valid_leaf = tray.tfar > curDist;
+ STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K);
+ if (unlikely(none(valid_leaf))) continue;
+ size_t items; const Primitive* prim = (Primitive*)cur.leaf(items);
+
+ size_t lazy_node = 0;
+ PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node);
+ tray.tfar = select(valid_leaf, ray.tfar, tray.tfar);
+
+ if (unlikely(lazy_node)) {
+ *sptr_node = lazy_node; sptr_node++;
+ *sptr_near = neg_inf; sptr_near++;
+ }
+ }
+ } while(valid_bits);
+ }
+
+
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single>
+ void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersectCoherent(vint<K>* __restrict__ valid_i,
+ Accel::Intersectors* __restrict__ This,
+ RayHitK<K>& __restrict__ ray,
+ IntersectContext* context)
+ {
+ BVH* __restrict__ bvh = (BVH*)This->ptr;
+
+ /* filter out invalid rays */
+ vbool<K> valid = *valid_i == -1;
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ valid &= ray.valid();
+#endif
+
+ /* return if there are no valid rays */
+ size_t valid_bits = movemask(valid);
+ if (unlikely(valid_bits == 0)) return;
+
+ /* verify correct input */
+ assert(all(valid, ray.valid()));
+ assert(all(valid, ray.tnear() >= 0.0f));
+ assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f)));
+ Precalculations pre(valid, ray);
+
+ /* load ray */
+ TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0);
+ const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f);
+ const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f);
+
+ vint<K> octant = ray.octant();
+ octant = select(valid, octant, vint<K>(0xffffffff));
+
+ do
+ {
+ const size_t valid_index = bsf(valid_bits);
+ const vbool<K> octant_valid = octant[valid_index] == octant;
+ valid_bits &= ~(size_t)movemask(octant_valid);
+
+ tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf));
+ tray.tfar = select(octant_valid, org_ray_tfar , vfloat<K>(neg_inf));
+
+ Frustum<robust> frustum;
+ frustum.template init<K>(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N);
+
+ StackItemT<NodeRef> stack[stackSizeSingle]; // stack of nodes
+ StackItemT<NodeRef>* stackPtr = stack + 1; // current stack pointer
+ stack[0].ptr = bvh->root;
+ stack[0].dist = neg_inf;
+
+ while (1) pop:
+ {
+ /* pop next node from stack */
+ if (unlikely(stackPtr == stack)) break;
+
+ stackPtr--;
+ NodeRef cur = NodeRef(stackPtr->ptr);
+
+ /* cull node if behind closest hit point */
+ vfloat<K> curDist = *(float*)&stackPtr->dist;
+ const vbool<K> active = curDist < tray.tfar;
+ if (unlikely(none(active))) continue;
+
+ while (likely(!cur.isLeaf()))
+ {
+ /* process nodes */
+ //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K);
+ const NodeRef nodeRef = cur;
+ const AABBNode* __restrict__ const node = nodeRef.getAABBNode();
+
+ vfloat<N> fmin;
+ size_t m_frustum_node = intersectNodeFrustum<N>(node, frustum, fmin);
+
+ if (unlikely(!m_frustum_node)) goto pop;
+ cur = BVH::emptyNode;
+ curDist = pos_inf;
+
+#if defined(__AVX__)
+ //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1);
+#endif
+ size_t num_child_hits = 0;
+ do {
+ const size_t i = bscf(m_frustum_node);
+ vfloat<K> lnearP;
+ vbool<K> lhit = false; // motion blur is not supported, so the initial value will be ignored
+ STAT3(normal.trav_nodes, 1, 1, 1);
+ BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit);
+
+ if (likely(any(lhit)))
+ {
+ const vfloat<K> childDist = fmin[i];
+ const NodeRef child = node->child(i);
+ BVHN<N>::prefetch(child);
+ if (any(childDist < curDist))
+ {
+ if (likely(cur != BVH::emptyNode)) {
+ num_child_hits++;
+ stackPtr->ptr = cur;
+ *(float*)&stackPtr->dist = toScalar(curDist);
+ stackPtr++;
+ }
+ curDist = childDist;
+ cur = child;
+ }
+ /* push hit child onto stack */
+ else {
+ num_child_hits++;
+ stackPtr->ptr = child;
+ *(float*)&stackPtr->dist = toScalar(childDist);
+ stackPtr++;
+ }
+ }
+ } while(m_frustum_node);
+
+ if (unlikely(cur == BVH::emptyNode)) goto pop;
+
+ /* improved distance sorting for 3 or more hits */
+ if (unlikely(num_child_hits >= 2))
+ {
+ if (stackPtr[-2].dist < stackPtr[-1].dist)
+ std::swap(stackPtr[-2],stackPtr[-1]);
+ if (unlikely(num_child_hits >= 3))
+ {
+ if (stackPtr[-3].dist < stackPtr[-1].dist)
+ std::swap(stackPtr[-3],stackPtr[-1]);
+ if (stackPtr[-3].dist < stackPtr[-2].dist)
+ std::swap(stackPtr[-3],stackPtr[-2]);
+ }
+ }
+ }
+
+ /* intersect leaf */
+ assert(cur != BVH::invalidNode);
+ assert(cur != BVH::emptyNode);
+ const vbool<K> valid_leaf = tray.tfar > curDist;
+ STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K);
+ if (unlikely(none(valid_leaf))) continue;
+ size_t items; const Primitive* prim = (Primitive*)cur.leaf(items);
+
+ size_t lazy_node = 0;
+ PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node);
+
+ /* reduce max distance interval on successful intersection */
+ if (likely(any((ray.tfar < tray.tfar) & valid_leaf)))
+ {
+ tray.tfar = select(valid_leaf, ray.tfar, tray.tfar);
+ frustum.template updateMaxDist<K>(tray.tfar);
+ }
+
+ if (unlikely(lazy_node)) {
+ stackPtr->ptr = lazy_node;
+ stackPtr->dist = neg_inf;
+ stackPtr++;
+ }
+ }
+
+ } while(valid_bits);
+ }
+
+ // ===================================================================================================================================================================
+ // ===================================================================================================================================================================
+ // ===================================================================================================================================================================
+
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single>
+ bool BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occluded1(Accel::Intersectors* This,
+ const BVH* bvh,
+ NodeRef root,
+ size_t k,
+ Precalculations& pre,
+ RayK<K>& ray,
+ const TravRayK<K, robust>& tray,
+ IntersectContext* context)
+ {
+ /* stack state */
+ NodeRef stack[stackSizeSingle]; // stack of nodes that still need to get traversed
+ NodeRef* stackPtr = stack+1; // current stack pointer
+ NodeRef* stackEnd = stack+stackSizeSingle;
+ stack[0] = root;
+
+ /* load the ray into SIMD registers */
+ TravRay<N,robust> tray1;
+ tray1.template init<K>(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]);
+
+ /* pop loop */
+ while (true) pop:
+ {
+ /* pop next node */
+ if (unlikely(stackPtr == stack)) break;
+ stackPtr--;
+ NodeRef cur = (NodeRef)*stackPtr;
+
+ /* downtraversal loop */
+ while (true)
+ {
+ /* intersect node */
+ size_t mask; vfloat<N> tNear;
+ STAT3(shadow.trav_nodes, 1, 1, 1);
+ bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray1, ray.time()[k], tNear, mask);
+ if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; }
+
+ /* if no child is hit, pop next node */
+ if (unlikely(mask == 0))
+ goto pop;
+
+ /* select next child and push other children */
+ BVHNNodeTraverser1Hit<N, types>::traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd);
+ }
+
+ /* this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(shadow.trav_leaves, 1, 1, 1);
+ size_t num; Primitive* prim = (Primitive*)cur.leaf(num);
+
+ size_t lazy_node = 0;
+ if (PrimitiveIntersectorK::occluded(This, pre, ray, k, context, prim, num, tray1, lazy_node)) {
+ ray.tfar[k] = neg_inf;
+ return true;
+ }
+
+ if (unlikely(lazy_node)) {
+ *stackPtr = lazy_node;
+ stackPtr++;
+ }
+ }
+ return false;
+ }
+
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single>
+ void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occluded(vint<K>* __restrict__ valid_i,
+ Accel::Intersectors* __restrict__ This,
+ RayK<K>& __restrict__ ray,
+ IntersectContext* context)
+ {
+ BVH* __restrict__ bvh = (BVH*)This->ptr;
+
+ /* we may traverse an empty BVH in case all geometry was invalid */
+ if (bvh->root == BVH::emptyNode)
+ return;
+
+#if ENABLE_FAST_COHERENT_CODEPATHS == 1
+ assert(context);
+ if (unlikely(types == BVH_AN1 && context->user && context->isCoherent()))
+ {
+ occludedCoherent(valid_i, This, ray, context);
+ return;
+ }
+#endif
+
+ /* filter out already occluded and invalid rays */
+ vbool<K> valid = (*valid_i == -1) & (ray.tfar >= 0.0f);
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ valid &= ray.valid();
+#endif
+
+ /* return if there are no valid rays */
+ const size_t valid_bits = movemask(valid);
+ if (unlikely(valid_bits == 0)) return;
+
+ /* verify correct input */
+ assert(all(valid, ray.valid()));
+ assert(all(valid, ray.tnear() >= 0.0f));
+ assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f)));
+ Precalculations pre(valid, ray);
+
+ /* load ray */
+ TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0);
+ const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f);
+ const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f);
+
+ tray.tnear = select(valid, org_ray_tnear, vfloat<K>(pos_inf));
+ tray.tfar = select(valid, org_ray_tfar , vfloat<K>(neg_inf));
+
+ vbool<K> terminated = !valid;
+ const vfloat<K> inf = vfloat<K>(pos_inf);
+
+ /* determine switch threshold based on flags */
+ const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent;
+
+ /* allocate stack and push root node */
+ vfloat<K> stack_near[stackSizeChunk];
+ NodeRef stack_node[stackSizeChunk];
+ stack_node[0] = BVH::invalidNode;
+ stack_near[0] = inf;
+ stack_node[1] = bvh->root;
+ stack_near[1] = tray.tnear;
+ NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk;
+ NodeRef* __restrict__ sptr_node = stack_node + 2;
+ vfloat<K>* __restrict__ sptr_near = stack_near + 2;
+
+ while (1) pop:
+ {
+ /* pop next node from stack */
+ assert(sptr_node > stack_node);
+ sptr_node--;
+ sptr_near--;
+ NodeRef cur = *sptr_node;
+ if (unlikely(cur == BVH::invalidNode)) {
+ assert(sptr_node == stack_node);
+ break;
+ }
+
+ /* cull node if behind closest hit point */
+ vfloat<K> curDist = *sptr_near;
+ const vbool<K> active = curDist < tray.tfar;
+ if (unlikely(none(active)))
+ continue;
+
+ /* switch to single ray traversal */
+#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__)
+#if FORCE_SINGLE_MODE == 0
+ if (single)
+#endif
+ {
+ size_t bits = movemask(active);
+#if FORCE_SINGLE_MODE == 0
+ if (unlikely(popcnt(bits) <= switchThreshold))
+#endif
+ {
+ for (; bits!=0; ) {
+ const size_t i = bscf(bits);
+ if (occluded1(This, bvh, cur, i, pre, ray, tray, context))
+ set(terminated, i);
+ }
+ if (all(terminated)) break;
+ tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar);
+ continue;
+ }
+ }
+#endif
+
+ while (likely(!cur.isLeaf()))
+ {
+ /* process nodes */
+ const vbool<K> valid_node = tray.tfar > curDist;
+ STAT3(shadow.trav_nodes, 1, popcnt(valid_node), K);
+ const NodeRef nodeRef = cur;
+ const BaseNode* __restrict__ const node = nodeRef.baseNode();
+
+ /* set cur to invalid */
+ cur = BVH::emptyNode;
+ curDist = pos_inf;
+
+ for (unsigned i = 0; i < N; i++)
+ {
+ const NodeRef child = node->children[i];
+ if (unlikely(child == BVH::emptyNode)) break;
+ vfloat<K> lnearP;
+ vbool<K> lhit = valid_node;
+ BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit);
+
+ /* if we hit the child we push the previously hit node onto the stack, and continue with the currently hit child */
+ if (likely(any(lhit)))
+ {
+ assert(sptr_node < stackEnd);
+ assert(child != BVH::emptyNode);
+ const vfloat<K> childDist = select(lhit, lnearP, inf);
+
+ /* push 'cur' node onto stack and continue with hit child */
+ if (likely(cur != BVH::emptyNode)) {
+ *sptr_node = cur; sptr_node++;
+ *sptr_near = curDist; sptr_near++;
+ }
+ curDist = childDist;
+ cur = child;
+ }
+ }
+ if (unlikely(cur == BVH::emptyNode))
+ goto pop;
+
+#if SWITCH_DURING_DOWN_TRAVERSAL == 1
+ if (single)
+ {
+ // seems to be the best place for testing utilization
+ if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold))
+ {
+ *sptr_node++ = cur;
+ *sptr_near++ = curDist;
+ goto pop;
+ }
+ }
+#endif
+ }
+
+ /* return if stack is empty */
+ if (unlikely(cur == BVH::invalidNode)) {
+ assert(sptr_node == stack_node);
+ break;
+ }
+
+
+ /* intersect leaf */
+ assert(cur != BVH::emptyNode);
+ const vbool<K> valid_leaf = tray.tfar > curDist;
+ STAT3(shadow.trav_leaves, 1, popcnt(valid_leaf), K);
+ if (unlikely(none(valid_leaf))) continue;
+ size_t items; const Primitive* prim = (Primitive*) cur.leaf(items);
+
+ size_t lazy_node = 0;
+ terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node);
+ if (all(terminated)) break;
+ tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); // ignore node intersections for terminated rays
+
+ if (unlikely(lazy_node)) {
+ *sptr_node = lazy_node; sptr_node++;
+ *sptr_near = neg_inf; sptr_near++;
+ }
+ }
+
+ vfloat<K>::store(valid & terminated, &ray.tfar, neg_inf);
+ }
+
+
+ template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single>
+ void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occludedCoherent(vint<K>* __restrict__ valid_i,
+ Accel::Intersectors* __restrict__ This,
+ RayK<K>& __restrict__ ray,
+ IntersectContext* context)
+ {
+ BVH* __restrict__ bvh = (BVH*)This->ptr;
+
+ /* filter out invalid rays */
+ vbool<K> valid = *valid_i == -1;
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ valid &= ray.valid();
+#endif
+
+ /* return if there are no valid rays */
+ size_t valid_bits = movemask(valid);
+ if (unlikely(valid_bits == 0)) return;
+
+ /* verify correct input */
+ assert(all(valid, ray.valid()));
+ assert(all(valid, ray.tnear() >= 0.0f));
+ assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f)));
+ Precalculations pre(valid,ray);
+
+ /* load ray */
+ TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0);
+ const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f);
+ const vfloat<K> org_ray_tfar = max(ray.tfar , 0.0f);
+
+ vbool<K> terminated = !valid;
+
+ vint<K> octant = ray.octant();
+ octant = select(valid, octant, vint<K>(0xffffffff));
+
+ do
+ {
+ const size_t valid_index = bsf(valid_bits);
+ vbool<K> octant_valid = octant[valid_index] == octant;
+ valid_bits &= ~(size_t)movemask(octant_valid);
+
+ tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf));
+ tray.tfar = select(octant_valid, org_ray_tfar, vfloat<K>(neg_inf));
+
+ Frustum<robust> frustum;
+ frustum.template init<K>(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N);
+
+ StackItemMaskT<NodeRef> stack[stackSizeSingle]; // stack of nodes
+ StackItemMaskT<NodeRef>* stackPtr = stack + 1; // current stack pointer
+ stack[0].ptr = bvh->root;
+ stack[0].mask = movemask(octant_valid);
+
+ while (1) pop:
+ {
+ /* pop next node from stack */
+ if (unlikely(stackPtr == stack)) break;
+
+ stackPtr--;
+ NodeRef cur = NodeRef(stackPtr->ptr);
+
+ /* cull node of active rays have already been terminated */
+ size_t m_active = (size_t)stackPtr->mask & (~(size_t)movemask(terminated));
+
+ if (unlikely(m_active == 0)) continue;
+
+ while (likely(!cur.isLeaf()))
+ {
+ /* process nodes */
+ //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K);
+ const NodeRef nodeRef = cur;
+ const AABBNode* __restrict__ const node = nodeRef.getAABBNode();
+
+ vfloat<N> fmin;
+ size_t m_frustum_node = intersectNodeFrustum<N>(node, frustum, fmin);
+
+ if (unlikely(!m_frustum_node)) goto pop;
+ cur = BVH::emptyNode;
+ m_active = 0;
+
+#if defined(__AVX__)
+ //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1);
+#endif
+ size_t num_child_hits = 0;
+ do {
+ const size_t i = bscf(m_frustum_node);
+ vfloat<K> lnearP;
+ vbool<K> lhit = false; // motion blur is not supported, so the initial value will be ignored
+ STAT3(normal.trav_nodes, 1, 1, 1);
+ BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit);
+
+ if (likely(any(lhit)))
+ {
+ const NodeRef child = node->child(i);
+ assert(child != BVH::emptyNode);
+ BVHN<N>::prefetch(child);
+ if (likely(cur != BVH::emptyNode)) {
+ num_child_hits++;
+ stackPtr->ptr = cur;
+ stackPtr->mask = m_active;
+ stackPtr++;
+ }
+ cur = child;
+ m_active = movemask(lhit);
+ }
+ } while(m_frustum_node);
+
+ if (unlikely(cur == BVH::emptyNode)) goto pop;
+ }
+
+ /* intersect leaf */
+ assert(cur != BVH::invalidNode);
+ assert(cur != BVH::emptyNode);
+#if defined(__AVX__)
+ STAT3(normal.trav_leaves, 1, popcnt(m_active), K);
+#endif
+ if (unlikely(!m_active)) continue;
+ size_t items; const Primitive* prim = (Primitive*)cur.leaf(items);
+
+ size_t lazy_node = 0;
+ terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node);
+ octant_valid &= !terminated;
+ if (unlikely(none(octant_valid))) break;
+ tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); // ignore node intersections for terminated rays
+
+ if (unlikely(lazy_node)) {
+ stackPtr->ptr = lazy_node;
+ stackPtr->mask = movemask(octant_valid);
+ stackPtr++;
+ }
+ }
+ } while(valid_bits);
+
+ vfloat<K>::store(valid & terminated, &ray.tfar, neg_inf);
+ }
+ }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp
new file mode 100644
index 0000000000..2137da6a25
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp
@@ -0,0 +1,59 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector_hybrid.cpp"
+
+namespace embree
+{
+ namespace isa
+ {
+ ////////////////////////////////////////////////////////////////////////////////
+ /// BVH4Intersector4 Definitions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoellerNoFilter, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller <4 COMMA 4 COMMA false> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKMoeller <4 COMMA 4 COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvIntersectorKPluecker<4 COMMA 4 COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKPluecker<4 COMMA 4 COMMA true> > >));
+
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >));
+
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA true > > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoellerNoFilter,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA false> > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKMoeller <4 COMMA 4 COMMA true > > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKPluecker<4 COMMA 4 COMMA true > > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKPluecker<4 COMMA 4 COMMA true > > >));
+
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKMoeller <4 COMMA 4 COMMA true > > >));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridPluecker,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKPluecker<4 COMMA 4 COMMA true > > >));
+
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersectorK<4> >));
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersectorK<4> >));
+
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersectorK<4> >));
+ IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersectorK<4> >));
+
+ //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>));
+ IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>));
+ IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>));
+ //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>));
+
+ IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4> >));
+ IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4MB> >));
+
+ IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorK<4>> >));
+ IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorKMB<4>> >));
+
+ IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >));
+ //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >));
+
+ IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersectorKPluecker <4 COMMA 4 COMMA true> >));
+ IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersectorKPluecker <4 COMMA 4 COMMA true> >));
+
+ }
+}
+
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp
new file mode 100644
index 0000000000..4a74d8468d
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp
@@ -0,0 +1,528 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector_stream.h"
+
+#include "../geometry/intersector_iterators.h"
+#include "../geometry/triangle_intersector.h"
+#include "../geometry/trianglev_intersector.h"
+#include "../geometry/trianglev_mb_intersector.h"
+#include "../geometry/trianglei_intersector.h"
+#include "../geometry/quadv_intersector.h"
+#include "../geometry/quadi_intersector.h"
+#include "../geometry/linei_intersector.h"
+#include "../geometry/subdivpatch1_intersector.h"
+#include "../geometry/object_intersector.h"
+#include "../geometry/instance_intersector.h"
+
+#include "../common/scene.h"
+#include <bitset>
+
+namespace embree
+{
+ namespace isa
+ {
+ __aligned(64) static const int shiftTable[32] = {
+ (int)1 << 0, (int)1 << 1, (int)1 << 2, (int)1 << 3, (int)1 << 4, (int)1 << 5, (int)1 << 6, (int)1 << 7,
+ (int)1 << 8, (int)1 << 9, (int)1 << 10, (int)1 << 11, (int)1 << 12, (int)1 << 13, (int)1 << 14, (int)1 << 15,
+ (int)1 << 16, (int)1 << 17, (int)1 << 18, (int)1 << 19, (int)1 << 20, (int)1 << 21, (int)1 << 22, (int)1 << 23,
+ (int)1 << 24, (int)1 << 25, (int)1 << 26, (int)1 << 27, (int)1 << 28, (int)1 << 29, (int)1 << 30, (int)1 << 31
+ };
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector>
+ __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersect(Accel::Intersectors* __restrict__ This,
+ RayHitN** inputPackets,
+ size_t numOctantRays,
+ IntersectContext* context)
+ {
+ /* we may traverse an empty BVH in case all geometry was invalid */
+ BVH* __restrict__ bvh = (BVH*) This->ptr;
+ if (bvh->root == BVH::emptyNode)
+ return;
+
+ // Only the coherent code path is implemented
+ assert(context->isCoherent());
+ intersectCoherent(This, (RayHitK<VSIZEL>**)inputPackets, numOctantRays, context);
+ }
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector>
+ template<int K>
+ __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersectCoherent(Accel::Intersectors* __restrict__ This,
+ RayHitK<K>** inputPackets,
+ size_t numOctantRays,
+ IntersectContext* context)
+ {
+ assert(context->isCoherent());
+
+ BVH* __restrict__ bvh = (BVH*) This->ptr;
+ __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes
+ assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE);
+
+ __aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K];
+ __aligned(64) Frustum<robust> frustum;
+
+ bool commonOctant = true;
+ const size_t m_active = initPacketsAndFrustum((RayK<K>**)inputPackets, numOctantRays, packets, frustum, commonOctant);
+ if (unlikely(m_active == 0)) return;
+
+ /* case of non-common origin */
+ if (unlikely(!commonOctant))
+ {
+ const size_t numPackets = (numOctantRays+K-1)/K;
+ for (size_t i = 0; i < numPackets; i++)
+ This->intersect(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context);
+ return;
+ }
+
+ stack[0].mask = m_active;
+ stack[0].parent = 0;
+ stack[0].child = bvh->root;
+
+ ///////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////
+
+ StackItemMaskCoherent* stackPtr = stack + 1;
+
+ while (1) pop:
+ {
+ if (unlikely(stackPtr == stack)) break;
+
+ STAT3(normal.trav_stack_pop,1,1,1);
+ stackPtr--;
+ /*! pop next node */
+ NodeRef cur = NodeRef(stackPtr->child);
+ size_t m_trav_active = stackPtr->mask;
+ assert(m_trav_active);
+ NodeRef parent = stackPtr->parent;
+
+ while (1)
+ {
+ if (unlikely(cur.isLeaf())) break;
+ const AABBNode* __restrict__ const node = cur.getAABBNode();
+ parent = cur;
+
+ __aligned(64) size_t maskK[N];
+ for (size_t i = 0; i < N; i++)
+ maskK[i] = m_trav_active;
+ vfloat<N> dist;
+ const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist);
+ if (unlikely(m_node_hit == 0)) goto pop;
+
+ BVHNNodeTraverserStreamHitCoherent<N, types>::traverseClosestHit(cur, m_trav_active, vbool<N>((int)m_node_hit), dist, (size_t*)maskK, stackPtr);
+ assert(m_trav_active);
+ }
+
+ /* non-root and leaf => full culling test for all rays */
+ if (unlikely(parent != 0 && cur.isLeaf()))
+ {
+ const AABBNode* __restrict__ const node = parent.getAABBNode();
+ size_t boxID = 0xff;
+ for (size_t i = 0; i < N; i++)
+ if (node->child(i) == cur) { boxID = i; break; }
+ assert(boxID < N);
+ assert(cur == node->child(boxID));
+ m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf);
+ }
+
+ /*! this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(normal.trav_leaves, 1, 1, 1);
+ size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num);
+
+ size_t bits = m_trav_active;
+
+ /*! intersect stream of rays with all primitives */
+ size_t lazy_node = 0;
+#if defined(__SSE4_2__)
+ STAT_USER(1,(popcnt(bits)+K-1)/K*4);
+#endif
+ while(bits)
+ {
+ size_t i = bsf(bits) / K;
+ const size_t m_isec = ((((size_t)1 << K)-1) << (i*K));
+ assert(m_isec & bits);
+ bits &= ~m_isec;
+
+ TravRayKStream<K, robust>& p = packets[i];
+ vbool<K> m_valid = p.tnear <= p.tfar;
+ PrimitiveIntersectorK<K>::intersectK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node);
+ p.tfar = min(p.tfar, inputPackets[i]->tfar);
+ };
+
+ } // traversal + intersection
+ }
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector>
+ __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occluded(Accel::Intersectors* __restrict__ This,
+ RayN** inputPackets,
+ size_t numOctantRays,
+ IntersectContext* context)
+ {
+ /* we may traverse an empty BVH in case all geometry was invalid */
+ BVH* __restrict__ bvh = (BVH*) This->ptr;
+ if (bvh->root == BVH::emptyNode)
+ return;
+
+ if (unlikely(context->isCoherent()))
+ occludedCoherent(This, (RayK<VSIZEL>**)inputPackets, numOctantRays, context);
+ else
+ occludedIncoherent(This, (RayK<VSIZEX>**)inputPackets, numOctantRays, context);
+ }
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector>
+ template<int K>
+ __noinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedCoherent(Accel::Intersectors* __restrict__ This,
+ RayK<K>** inputPackets,
+ size_t numOctantRays,
+ IntersectContext* context)
+ {
+ assert(context->isCoherent());
+
+ BVH* __restrict__ bvh = (BVH*)This->ptr;
+ __aligned(64) StackItemMaskCoherent stack[stackSizeSingle]; // stack of nodes
+ assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE);
+
+ /* inactive rays should have been filtered out before */
+ __aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K];
+ __aligned(64) Frustum<robust> frustum;
+
+ bool commonOctant = true;
+ size_t m_active = initPacketsAndFrustum(inputPackets, numOctantRays, packets, frustum, commonOctant);
+
+ /* valid rays */
+ if (unlikely(m_active == 0)) return;
+
+ /* case of non-common origin */
+ if (unlikely(!commonOctant))
+ {
+ const size_t numPackets = (numOctantRays+K-1)/K;
+ for (size_t i = 0; i < numPackets; i++)
+ This->occluded(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context);
+ return;
+ }
+
+ stack[0].mask = m_active;
+ stack[0].parent = 0;
+ stack[0].child = bvh->root;
+
+ ///////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////////
+
+ StackItemMaskCoherent* stackPtr = stack + 1;
+
+ while (1) pop:
+ {
+ if (unlikely(stackPtr == stack)) break;
+
+ STAT3(normal.trav_stack_pop,1,1,1);
+ stackPtr--;
+ /*! pop next node */
+ NodeRef cur = NodeRef(stackPtr->child);
+ size_t m_trav_active = stackPtr->mask & m_active;
+ if (unlikely(!m_trav_active)) continue;
+ assert(m_trav_active);
+ NodeRef parent = stackPtr->parent;
+
+ while (1)
+ {
+ if (unlikely(cur.isLeaf())) break;
+ const AABBNode* __restrict__ const node = cur.getAABBNode();
+ parent = cur;
+
+ __aligned(64) size_t maskK[N];
+ for (size_t i = 0; i < N; i++)
+ maskK[i] = m_trav_active;
+
+ vfloat<N> dist;
+ const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist);
+ if (unlikely(m_node_hit == 0)) goto pop;
+
+ BVHNNodeTraverserStreamHitCoherent<N, types>::traverseAnyHit(cur, m_trav_active, vbool<N>((int)m_node_hit), (size_t*)maskK, stackPtr);
+ assert(m_trav_active);
+ }
+
+ /* non-root and leaf => full culling test for all rays */
+ if (unlikely(parent != 0 && cur.isLeaf()))
+ {
+ const AABBNode* __restrict__ const node = parent.getAABBNode();
+ size_t boxID = 0xff;
+ for (size_t i = 0; i < N; i++)
+ if (node->child(i) == cur) { boxID = i; break; }
+ assert(boxID < N);
+ assert(cur == node->child(boxID));
+ m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf);
+ }
+
+ /*! this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(normal.trav_leaves, 1, 1, 1);
+ size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num);
+
+ size_t bits = m_trav_active & m_active;
+ /*! intersect stream of rays with all primitives */
+ size_t lazy_node = 0;
+#if defined(__SSE4_2__)
+ STAT_USER(1,(popcnt(bits)+K-1)/K*4);
+#endif
+ while (bits)
+ {
+ size_t i = bsf(bits) / K;
+ const size_t m_isec = ((((size_t)1 << K)-1) << (i*K));
+ assert(m_isec & bits);
+ bits &= ~m_isec;
+ TravRayKStream<K, robust>& p = packets[i];
+ vbool<K> m_valid = p.tnear <= p.tfar;
+ vbool<K> m_hit = PrimitiveIntersectorK<K>::occludedK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node);
+ inputPackets[i]->tfar = select(m_hit & m_valid, vfloat<K>(neg_inf), inputPackets[i]->tfar);
+ m_active &= ~((size_t)movemask(m_hit) << (i*K));
+ }
+
+ } // traversal + intersection
+ }
+
+
+ template<int N, int types, bool robust, typename PrimitiveIntersector>
+ template<int K>
+ __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedIncoherent(Accel::Intersectors* __restrict__ This,
+ RayK<K>** inputPackets,
+ size_t numOctantRays,
+ IntersectContext* context)
+ {
+ assert(!context->isCoherent());
+ assert(types & BVH_FLAG_ALIGNED_NODE);
+
+ __aligned(64) TravRayKStream<K,robust> packet[MAX_INTERNAL_STREAM_SIZE/K];
+
+ assert(numOctantRays <= 32);
+ const size_t numPackets = (numOctantRays+K-1)/K;
+ size_t m_active = 0;
+ for (size_t i = 0; i < numPackets; i++)
+ {
+ const vfloat<K> tnear = inputPackets[i]->tnear();
+ const vfloat<K> tfar = inputPackets[i]->tfar;
+ vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f);
+ m_active |= (size_t)movemask(m_valid) << (K*i);
+ const Vec3vf<K>& org = inputPackets[i]->org;
+ const Vec3vf<K>& dir = inputPackets[i]->dir;
+ vfloat<K> packet_min_dist = max(tnear, 0.0f);
+ vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
+ new (&packet[i]) TravRayKStream<K,robust>(org, dir, packet_min_dist, packet_max_dist);
+ }
+
+ BVH* __restrict__ bvh = (BVH*)This->ptr;
+
+ StackItemMaskT<NodeRef> stack[stackSizeSingle]; // stack of nodes
+ StackItemMaskT<NodeRef>* stackPtr = stack + 1; // current stack pointer
+ stack[0].ptr = bvh->root;
+ stack[0].mask = m_active;
+
+ size_t terminated = ~m_active;
+
+ /* near/far offsets based on first ray */
+ const NearFarPrecalculations nf(Vec3fa(packet[0].rdir.x[0], packet[0].rdir.y[0], packet[0].rdir.z[0]), N);
+
+ while (1) pop:
+ {
+ if (unlikely(stackPtr == stack)) break;
+ STAT3(shadow.trav_stack_pop,1,1,1);
+ stackPtr--;
+ NodeRef cur = NodeRef(stackPtr->ptr);
+ size_t cur_mask = stackPtr->mask & (~terminated);
+ if (unlikely(cur_mask == 0)) continue;
+
+ while (true)
+ {
+ /*! stop if we found a leaf node */
+ if (unlikely(cur.isLeaf())) break;
+ const AABBNode* __restrict__ const node = cur.getAABBNode();
+
+ const vint<N> vmask = traverseIncoherentStream(cur_mask, packet, node, nf, shiftTable);
+
+ size_t mask = movemask(vmask != vint<N>(zero));
+ if (unlikely(mask == 0)) goto pop;
+
+ __aligned(64) unsigned int child_mask[N];
+ vint<N>::storeu(child_mask, vmask); // this explicit store here causes much better code generation
+
+ /*! one child is hit, continue with that child */
+ size_t r = bscf(mask);
+ assert(r < N);
+ cur = node->child(r);
+ BVHN<N>::prefetch(cur,types);
+ cur_mask = child_mask[r];
+
+ /* simple in order sequence */
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) continue;
+ stackPtr->ptr = cur;
+ stackPtr->mask = cur_mask;
+ stackPtr++;
+
+ for (; ;)
+ {
+ r = bscf(mask);
+ assert(r < N);
+
+ cur = node->child(r);
+ BVHN<N>::prefetch(cur,types);
+ cur_mask = child_mask[r];
+ assert(cur != BVH::emptyNode);
+ if (likely(mask == 0)) break;
+ stackPtr->ptr = cur;
+ stackPtr->mask = cur_mask;
+ stackPtr++;
+ }
+ }
+
+ /*! this is a leaf node */
+ assert(cur != BVH::emptyNode);
+ STAT3(shadow.trav_leaves,1,1,1);
+ size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num);
+
+ size_t bits = cur_mask;
+ size_t lazy_node = 0;
+
+ for (; bits != 0;)
+ {
+ const size_t rayID = bscf(bits);
+
+ RayK<K> &ray = *inputPackets[rayID / K];
+ const size_t k = rayID % K;
+ if (PrimitiveIntersectorK<K>::occluded(This, ray, k, context, prim, num, lazy_node))
+ {
+ ray.tfar[k] = neg_inf;
+ terminated |= (size_t)1 << rayID;
+ }
+
+ /* lazy node */
+ if (unlikely(lazy_node))
+ {
+ stackPtr->ptr = lazy_node;
+ stackPtr->mask = cur_mask;
+ stackPtr++;
+ }
+ }
+
+ if (unlikely(terminated == (size_t)-1)) break;
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// ArrayIntersectorKStream Definitions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<bool filter>
+ struct Triangle4IntersectorStreamMoeller {
+ template<int K> using Type = ArrayIntersectorKStream<K,TriangleMIntersectorKMoeller<4 COMMA K COMMA true>>;
+ };
+
+ template<bool filter>
+ struct Triangle4vIntersectorStreamPluecker {
+ template<int K> using Type = ArrayIntersectorKStream<K,TriangleMvIntersectorKPluecker<4 COMMA K COMMA true>>;
+ };
+
+ template<bool filter>
+ struct Triangle4iIntersectorStreamMoeller {
+ template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKMoeller<4 COMMA K COMMA true>>;
+ };
+
+ template<bool filter>
+ struct Triangle4iIntersectorStreamPluecker {
+ template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKPluecker<4 COMMA K COMMA true>>;
+ };
+
+ template<bool filter>
+ struct Quad4vIntersectorStreamMoeller {
+ template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKMoeller<4 COMMA K COMMA true>>;
+ };
+
+ template<bool filter>
+ struct Quad4iIntersectorStreamMoeller {
+ template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKMoeller<4 COMMA K COMMA true>>;
+ };
+
+ template<bool filter>
+ struct Quad4vIntersectorStreamPluecker {
+ template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKPluecker<4 COMMA K COMMA true>>;
+ };
+
+ template<bool filter>
+ struct Quad4iIntersectorStreamPluecker {
+ template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKPluecker<4 COMMA K COMMA true>>;
+ };
+
+ struct ObjectIntersectorStream {
+ template<int K> using Type = ArrayIntersectorKStream<K,ObjectIntersectorK<K COMMA false>>;
+ };
+
+ struct InstanceIntersectorStream {
+ template<int K> using Type = ArrayIntersectorKStream<K,InstanceIntersectorK<K>>;
+ };
+
+ // =====================================================================================================
+ // =====================================================================================================
+ // =====================================================================================================
+
+ template<int N>
+ void BVHNIntersectorStreamPacketFallback<N>::intersect(Accel::Intersectors* __restrict__ This,
+ RayHitN** inputRays,
+ size_t numTotalRays,
+ IntersectContext* context)
+ {
+ if (unlikely(context->isCoherent()))
+ intersectK(This, (RayHitK<VSIZEL>**)inputRays, numTotalRays, context);
+ else
+ intersectK(This, (RayHitK<VSIZEX>**)inputRays, numTotalRays, context);
+ }
+
+ template<int N>
+ void BVHNIntersectorStreamPacketFallback<N>::occluded(Accel::Intersectors* __restrict__ This,
+ RayN** inputRays,
+ size_t numTotalRays,
+ IntersectContext* context)
+ {
+ if (unlikely(context->isCoherent()))
+ occludedK(This, (RayK<VSIZEL>**)inputRays, numTotalRays, context);
+ else
+ occludedK(This, (RayK<VSIZEX>**)inputRays, numTotalRays, context);
+ }
+
+ template<int N>
+ template<int K>
+ __noinline void BVHNIntersectorStreamPacketFallback<N>::intersectK(Accel::Intersectors* __restrict__ This,
+ RayHitK<K>** inputRays,
+ size_t numTotalRays,
+ IntersectContext* context)
+ {
+ /* fallback to packets */
+ for (size_t i = 0; i < numTotalRays; i += K)
+ {
+ const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
+ vbool<K> valid = vi < vint<K>(int(numTotalRays));
+ RayHitK<K>& ray = *(inputRays[i / K]);
+ valid &= ray.tnear() <= ray.tfar;
+ This->intersect(valid, ray, context);
+ }
+ }
+
+ template<int N>
+ template<int K>
+ __noinline void BVHNIntersectorStreamPacketFallback<N>::occludedK(Accel::Intersectors* __restrict__ This,
+ RayK<K>** inputRays,
+ size_t numTotalRays,
+ IntersectContext* context)
+ {
+ /* fallback to packets */
+ for (size_t i = 0; i < numTotalRays; i += K)
+ {
+ const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
+ vbool<K> valid = vi < vint<K>(int(numTotalRays));
+ RayK<K>& ray = *(inputRays[i / K]);
+ valid &= ray.tnear() <= ray.tfar;
+ This->occluded(valid, ray, context);
+ }
+ }
+ }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp
new file mode 100644
index 0000000000..c3e5f137b8
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp
@@ -0,0 +1,36 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector_stream.cpp"
+
+namespace embree
+{
+ namespace isa
+ {
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// General BVHIntersectorStreamPacketFallback Intersector
+ ////////////////////////////////////////////////////////////////////////////////
+
+ DEFINE_INTERSECTORN(BVH4IntersectorStreamPacketFallback,BVHNIntersectorStreamPacketFallback<4>);
+
+ ////////////////////////////////////////////////////////////////////////////////
+ /// BVH4IntersectorStream Definitions
+ ////////////////////////////////////////////////////////////////////////////////
+
+ IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4iIntersectorStreamMoeller<true>>));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4vIntersectorStreamPluecker<true>>));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Triangle4iIntersectorStreamPluecker<true>>));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<true>>));
+ IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoellerNoFilter, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<false>>));
+
+ IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<true>>));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoellerNoFilter,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<false>>));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamMoeller, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4iIntersectorStreamMoeller<true>>));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4vIntersectorStreamPluecker<true>>));
+ IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamPluecker, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true COMMA Quad4iIntersectorStreamPluecker<true>>));
+
+ IF_ENABLED_USER(DEFINE_INTERSECTORN(BVH4VirtualIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA ObjectIntersectorStream>));
+ IF_ENABLED_INSTANCE(DEFINE_INTERSECTORN(BVH4InstanceIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA InstanceIntersectorStream>));
+ }
+}
diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp
new file mode 100644
index 0000000000..b858eb163f
--- /dev/null
+++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp
@@ -0,0 +1,657 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector_stream_filters.h"
+#include "bvh_intersector_stream.h"
+
+namespace embree
+{
+ namespace isa
+ {
+ template<int K, bool intersect>
+ __noinline void RayStreamFilter::filterAOS(Scene* scene, void* _rayN, size_t N, size_t stride, IntersectContext* context)
+ {
+ RayStreamAOS rayN(_rayN);
+
+ /* use fast path for coherent ray mode */
+ if (unlikely(context->isCoherent()))
+ {
+ __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K];
+ __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE)
+ {
+ const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE);
+
+ /* convert from AOS to SOA */
+ for (size_t j = 0; j < size; j += K)
+ {
+ const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
+ const vbool<K> valid = vij < vint<K>(int(N));
+ const vint<K> offset = vij * int(stride);
+ const size_t packetIndex = j / K;
+
+ RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
+ ray.tnear() = select(valid, ray.tnear(), zero);
+ ray.tfar = select(valid, ray.tfar, neg_inf);
+
+ rays[packetIndex] = ray;
+ rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN
+ }
+
+ /* trace stream */
+ scene->intersectors.intersectN(rayPtrs, size, context);
+
+ /* convert from SOA to AOS */
+ for (size_t j = 0; j < size; j += K)
+ {
+ const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
+ const vbool<K> valid = vij < vint<K>(int(N));
+ const vint<K> offset = vij * int(stride);
+ const size_t packetIndex = j / K;
+ rayN.setHitByOffset(valid, offset, rays[packetIndex]);
+ }
+ }
+ }
+ else if (unlikely(!intersect))
+ {
+ /* octant sorting for occlusion rays */
+ __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
+ __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
+ __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ unsigned int raysInOctant[8];
+ for (unsigned int i = 0; i < 8; i++)
+ raysInOctant[i] = 0;
+ size_t inputRayID = 0;
+
+ for (;;)
+ {
+ int curOctant = -1;
+
+ /* sort rays into octants */
+ for (; inputRayID < N;)
+ {
+ const Ray& ray = rayN.getRayByOffset(inputRayID * stride);
+
+ /* skip invalid rays */
+ if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ if (unlikely(!ray.valid())) { inputRayID++; continue; }
+#endif
+
+ const unsigned int octantID = movemask(vfloat4(Vec3fa(ray.dir)) < 0.0f) & 0x7;
+
+ assert(octantID < 8);
+ octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID;
+ inputRayID++;
+ if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
+ {
+ curOctant = octantID;
+ break;
+ }
+ }
+
+ /* need to flush rays in octant? */
+ if (unlikely(curOctant == -1))
+ {
+ for (unsigned int i = 0; i < 8; i++)
+ if (raysInOctant[i]) { curOctant = i; break; }
+ }
+
+ /* all rays traced? */
+ if (unlikely(curOctant == -1))
+ break;
+
+ unsigned int* const rayIDs = &octants[curOctant][0];
+ const unsigned int numOctantRays = raysInOctant[curOctant];
+ assert(numOctantRays);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride);
+ RayK<K>& ray = rays[j/K];
+ rayPtrs[j/K] = &ray;
+ ray = rayN.getRayByOffset<K>(valid, offset);
+ ray.tnear() = select(valid, ray.tnear(), zero);
+ ray.tfar = select(valid, ray.tfar, neg_inf);
+ }
+
+ scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride);
+ rayN.setHitByOffset<K>(valid, offset, rays[j/K]);
+ }
+
+ raysInOctant[curOctant] = 0;
+ }
+ }
+ else
+ {
+ /* fallback to packets */
+ for (size_t i = 0; i < N; i += K)
+ {
+ const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
+ vbool<K> valid = vi < vint<K>(int(N));
+ const vint<K> offset = vi * int(stride);
+
+ RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
+ valid &= ray.tnear() <= ray.tfar;
+
+ scene->intersectors.intersect(valid, ray, context);
+
+ rayN.setHitByOffset<K>(valid, offset, ray);
+ }
+ }
+ }
+
+ template<int K, bool intersect>
+ __noinline void RayStreamFilter::filterAOP(Scene* scene, void** _rayN, size_t N, IntersectContext* context)
+ {
+ RayStreamAOP rayN(_rayN);
+
+ /* use fast path for coherent ray mode */
+ if (unlikely(context->isCoherent()))
+ {
+ __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K];
+ __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE)
+ {
+ const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE);
+
+ /* convert from AOP to SOA */
+ for (size_t j = 0; j < size; j += K)
+ {
+ const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
+ const vbool<K> valid = vij < vint<K>(int(N));
+ const size_t packetIndex = j / K;
+
+ RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vij);
+ ray.tnear() = select(valid, ray.tnear(), zero);
+ ray.tfar = select(valid, ray.tfar, neg_inf);
+
+ rays[packetIndex] = ray;
+ rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN
+ }
+
+ /* trace stream */
+ scene->intersectors.intersectN(rayPtrs, size, context);
+
+ /* convert from SOA to AOP */
+ for (size_t j = 0; j < size; j += K)
+ {
+ const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
+ const vbool<K> valid = vij < vint<K>(int(N));
+ const size_t packetIndex = j / K;
+
+ rayN.setHitByIndex<K>(valid, vij, rays[packetIndex]);
+ }
+ }
+ }
+ else if (unlikely(!intersect))
+ {
+ /* octant sorting for occlusion rays */
+ __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
+ __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
+ __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ unsigned int raysInOctant[8];
+ for (unsigned int i = 0; i < 8; i++)
+ raysInOctant[i] = 0;
+ size_t inputRayID = 0;
+
+ for (;;)
+ {
+ int curOctant = -1;
+
+ /* sort rays into octants */
+ for (; inputRayID < N;)
+ {
+ const Ray& ray = rayN.getRayByIndex(inputRayID);
+
+ /* skip invalid rays */
+ if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ if (unlikely(!ray.valid())) { inputRayID++; continue; }
+#endif
+
+ const unsigned int octantID = movemask(lt_mask(ray.dir,Vec3fa(0.0f)));
+
+ assert(octantID < 8);
+ octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID;
+ inputRayID++;
+ if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
+ {
+ curOctant = octantID;
+ break;
+ }
+ }
+
+ /* need to flush rays in octant? */
+ if (unlikely(curOctant == -1))
+ {
+ for (unsigned int i = 0; i < 8; i++)
+ if (raysInOctant[i]) { curOctant = i; break; }
+ }
+
+ /* all rays traced? */
+ if (unlikely(curOctant == -1))
+ break;
+
+ unsigned int* const rayIDs = &octants[curOctant][0];
+ const unsigned int numOctantRays = raysInOctant[curOctant];
+ assert(numOctantRays);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> index = *(vint<K>*)&rayIDs[j];
+ RayK<K>& ray = rays[j/K];
+ rayPtrs[j/K] = &ray;
+ ray = rayN.getRayByIndex<K>(valid, index);
+ ray.tnear() = select(valid, ray.tnear(), zero);
+ ray.tfar = select(valid, ray.tfar, neg_inf);
+ }
+
+ scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> index = *(vint<K>*)&rayIDs[j];
+ rayN.setHitByIndex<K>(valid, index, rays[j/K]);
+ }
+
+ raysInOctant[curOctant] = 0;
+ }
+ }
+ else
+ {
+ /* fallback to packets */
+ for (size_t i = 0; i < N; i += K)
+ {
+ const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
+ vbool<K> valid = vi < vint<K>(int(N));
+
+ RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vi);
+ valid &= ray.tnear() <= ray.tfar;
+
+ scene->intersectors.intersect(valid, ray, context);
+
+ rayN.setHitByIndex<K>(valid, vi, ray);
+ }
+ }
+ }
+
+ template<int K, bool intersect>
+ __noinline void RayStreamFilter::filterSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context)
+ {
+ const size_t rayDataAlignment = (size_t)rayData % (K*sizeof(float));
+ const size_t offsetAlignment = (size_t)stride % (K*sizeof(float));
+
+ /* fast path for packets with the correct width and data alignment */
+ if (likely(N == K &&
+ !rayDataAlignment &&
+ !offsetAlignment))
+ {
+ if (unlikely(context->isCoherent()))
+ {
+ __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ size_t packetIndex = 0;
+ for (size_t i = 0; i < numPackets; i++)
+ {
+ const size_t offset = i * stride;
+ RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset);
+ rayPtrs[packetIndex++] = &ray;
+
+ /* trace as stream */
+ if (unlikely(packetIndex == MAX_INTERNAL_STREAM_SIZE / K))
+ {
+ const size_t size = packetIndex*K;
+ scene->intersectors.intersectN(rayPtrs, size, context);
+ packetIndex = 0;
+ }
+ }
+
+ /* flush remaining packets */
+ if (unlikely(packetIndex > 0))
+ {
+ const size_t size = packetIndex*K;
+ scene->intersectors.intersectN(rayPtrs, size, context);
+ }
+ }
+ else if (unlikely(!intersect))
+ {
+ /* octant sorting for occlusion rays */
+ RayStreamSOA rayN(rayData, K);
+
+ __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
+ __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
+ __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ unsigned int raysInOctant[8];
+ for (unsigned int i = 0; i < 8; i++)
+ raysInOctant[i] = 0;
+ size_t inputRayID = 0;
+
+ for (;;)
+ {
+ int curOctant = -1;
+
+ /* sort rays into octants */
+ for (; inputRayID < N*numPackets;)
+ {
+ const size_t offset = (inputRayID / K) * stride + (inputRayID % K) * sizeof(float);
+
+ /* skip invalid rays */
+ if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays
+ #if defined(EMBREE_IGNORE_INVALID_RAYS)
+ __aligned(64) Ray ray = rayN.getRayByOffset(offset);
+ if (unlikely(!ray.valid())) { inputRayID++; continue; }
+ #endif
+
+ const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset);
+
+ assert(octantID < 8);
+ octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset;
+ inputRayID++;
+ if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
+ {
+ curOctant = octantID;
+ break;
+ }
+ }
+
+ /* need to flush rays in octant? */
+ if (unlikely(curOctant == -1))
+ {
+ for (unsigned int i = 0; i < 8; i++)
+ if (raysInOctant[i]) { curOctant = i; break; }
+ }
+
+ /* all rays traced? */
+ if (unlikely(curOctant == -1))
+ break;
+
+ unsigned int* const rayOffsets = &octants[curOctant][0];
+ const unsigned int numOctantRays = raysInOctant[curOctant];
+ assert(numOctantRays);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> offset = *(vint<K>*)&rayOffsets[j];
+ RayK<K>& ray = rays[j/K];
+ rayPtrs[j/K] = &ray;
+ ray = rayN.getRayByOffset<K>(valid, offset);
+ ray.tnear() = select(valid, ray.tnear(), zero);
+ ray.tfar = select(valid, ray.tfar, neg_inf);
+ }
+
+ scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> offset = *(vint<K>*)&rayOffsets[j];
+ rayN.setHitByOffset(valid, offset, rays[j/K]);
+ }
+ raysInOctant[curOctant] = 0;
+ }
+ }
+ else
+ {
+ /* fallback to packets */
+ for (size_t i = 0; i < numPackets; i++)
+ {
+ const size_t offset = i * stride;
+ RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset);
+ const vbool<K> valid = ray.tnear() <= ray.tfar;
+
+ scene->intersectors.intersect(valid, ray, context);
+ }
+ }
+ }
+ else
+ {
+ /* fallback to packets for arbitrary packet size and alignment */
+ for (size_t i = 0; i < numPackets; i++)
+ {
+ const size_t offsetN = i * stride;
+ RayStreamSOA rayN(rayData + offsetN, N);
+
+ for (size_t j = 0; j < N; j += K)
+ {
+ const size_t offset = j * sizeof(float);
+ vbool<K> valid = (vint<K>(int(j)) + vint<K>(step)) < vint<K>(int(N));
+ RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
+ valid &= ray.tnear() <= ray.tfar;
+
+ scene->intersectors.intersect(valid, ray, context);
+
+ rayN.setHitByOffset(valid, offset, ray);
+ }
+ }
+ }
+ }
+
+ template<int K, bool intersect>
+ __noinline void RayStreamFilter::filterSOP(Scene* scene, const void* _rayN, size_t N, IntersectContext* context)
+ {
+ RayStreamSOP& rayN = *(RayStreamSOP*)_rayN;
+
+ /* use fast path for coherent ray mode */
+ if (unlikely(context->isCoherent()))
+ {
+ __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K];
+ __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE)
+ {
+ const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE);
+
+ /* convert from SOP to SOA */
+ for (size_t j = 0; j < size; j += K)
+ {
+ const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
+ const vbool<K> valid = vij < vint<K>(int(N));
+ const size_t offset = (i+j) * sizeof(float);
+ const size_t packetIndex = j / K;
+
+ RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
+ ray.tnear() = select(valid, ray.tnear(), zero);
+ ray.tfar = select(valid, ray.tfar, neg_inf);
+
+ rays[packetIndex] = ray;
+ rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN
+ }
+
+ /* trace stream */
+ scene->intersectors.intersectN(rayPtrs, size, context);
+
+ /* convert from SOA to SOP */
+ for (size_t j = 0; j < size; j += K)
+ {
+ const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step);
+ const vbool<K> valid = vij < vint<K>(int(N));
+ const size_t offset = (i+j) * sizeof(float);
+ const size_t packetIndex = j / K;
+
+ rayN.setHitByOffset(valid, offset, rays[packetIndex]);
+ }
+ }
+ }
+ else if (unlikely(!intersect))
+ {
+ /* octant sorting for occlusion rays */
+ __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE];
+ __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K];
+ __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K];
+
+ unsigned int raysInOctant[8];
+ for (unsigned int i = 0; i < 8; i++)
+ raysInOctant[i] = 0;
+ size_t inputRayID = 0;
+
+ for (;;)
+ {
+ int curOctant = -1;
+
+ /* sort rays into octants */
+ for (; inputRayID < N;)
+ {
+ const size_t offset = inputRayID * sizeof(float);
+ /* skip invalid rays */
+ if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+ __aligned(64) Ray ray = rayN.getRayByOffset(offset);
+ if (unlikely(!ray.valid())) { inputRayID++; continue; }
+#endif
+
+ const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset);
+
+ assert(octantID < 8);
+ octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset;
+ inputRayID++;
+ if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE))
+ {
+ curOctant = octantID;
+ break;
+ }
+ }
+
+ /* need to flush rays in octant? */
+ if (unlikely(curOctant == -1))
+ {
+ for (unsigned int i = 0; i < 8; i++)
+ if (raysInOctant[i]) { curOctant = i; break; }
+ }
+
+ /* all rays traced? */
+ if (unlikely(curOctant == -1))
+ break;
+
+ unsigned int* const rayOffsets = &octants[curOctant][0];
+ const unsigned int numOctantRays = raysInOctant[curOctant];
+ assert(numOctantRays);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> offset = *(vint<K>*)&rayOffsets[j];
+ RayK<K>& ray = rays[j/K];
+ rayPtrs[j/K] = &ray;
+ ray = rayN.getRayByOffset<K>(valid, offset);
+ ray.tnear() = select(valid, ray.tnear(), zero);
+ ray.tfar = select(valid, ray.tfar, neg_inf);
+ }
+
+ scene->intersectors.occludedN(rayPtrs, numOctantRays, context);
+
+ for (unsigned int j = 0; j < numOctantRays; j += K)
+ {
+ const vint<K> vi = vint<K>(int(j)) + vint<K>(step);
+ const vbool<K> valid = vi < vint<K>(int(numOctantRays));
+ const vint<K> offset = *(vint<K>*)&rayOffsets[j];
+ rayN.setHitByOffset(valid, offset, rays[j/K]);
+ }
+
+ raysInOctant[curOctant] = 0;
+ }
+ }
+ else
+ {
+ /* fallback to packets */
+ for (size_t i = 0; i < N; i += K)
+ {
+ const vint<K> vi = vint<K>(int(i)) + vint<K>(step);
+ vbool<K> valid = vi < vint<K>(int(N));
+ const size_t offset = i * sizeof(float);
+
+ RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset);
+ valid &= ray.tnear() <= ray.tfar;
+
+ scene->intersectors.intersect(valid, ray, context);
+
+ rayN.setHitByOffset(valid, offset, ray);
+ }
+ }
+ }
+
+
+ void RayStreamFilter::intersectAOS(Scene* scene, RTCRayHit* _rayN, size_t N, size_t stride, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterAOS<VSIZEL, true>(scene, _rayN, N, stride, context);
+ else
+ filterAOS<VSIZEX, true>(scene, _rayN, N, stride, context);
+ }
+
+ void RayStreamFilter::occludedAOS(Scene* scene, RTCRay* _rayN, size_t N, size_t stride, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterAOS<VSIZEL, false>(scene, _rayN, N, stride, context);
+ else
+ filterAOS<VSIZEX, false>(scene, _rayN, N, stride, context);
+ }
+
+ void RayStreamFilter::intersectAOP(Scene* scene, RTCRayHit** _rayN, size_t N, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterAOP<VSIZEL, true>(scene, (void**)_rayN, N, context);
+ else
+ filterAOP<VSIZEX, true>(scene, (void**)_rayN, N, context);
+ }
+
+ void RayStreamFilter::occludedAOP(Scene* scene, RTCRay** _rayN, size_t N, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterAOP<VSIZEL, false>(scene, (void**)_rayN, N, context);
+ else
+ filterAOP<VSIZEX, false>(scene, (void**)_rayN, N, context);
+ }
+
+ void RayStreamFilter::intersectSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterSOA<VSIZEL, true>(scene, rayData, N, numPackets, stride, context);
+ else
+ filterSOA<VSIZEX, true>(scene, rayData, N, numPackets, stride, context);
+ }
+
+ void RayStreamFilter::occludedSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterSOA<VSIZEL, false>(scene, rayData, N, numPackets, stride, context);
+ else
+ filterSOA<VSIZEX, false>(scene, rayData, N, numPackets, stride, context);
+ }
+
+ void RayStreamFilter::intersectSOP(Scene* scene, const RTCRayHitNp* _rayN, size_t N, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterSOP<VSIZEL, true>(scene, _rayN, N, context);
+ else
+ filterSOP<VSIZEX, true>(scene, _rayN, N, context);
+ }
+
+ void RayStreamFilter::occludedSOP(Scene* scene, const RTCRayNp* _rayN, size_t N, IntersectContext* context) {
+ if (unlikely(context->isCoherent()))
+ filterSOP<VSIZEL, false>(scene, _rayN, N, context);
+ else
+ filterSOP<VSIZEX, false>(scene, _rayN, N, context);
+ }
+
+
+ RayStreamFilterFuncs rayStreamFilterFuncs() {
+ return RayStreamFilterFuncs(RayStreamFilter::intersectAOS, RayStreamFilter::intersectAOP, RayStreamFilter::intersectSOA, RayStreamFilter::intersectSOP,
+ RayStreamFilter::occludedAOS, RayStreamFilter::occludedAOP, RayStreamFilter::occludedSOA, RayStreamFilter::occludedSOP);
+ }
+ };
+};
diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h
index 80a8ab2a56..2bf7e93587 100644
--- a/thirdparty/embree/kernels/config.h
+++ b/thirdparty/embree/kernels/config.h
@@ -16,7 +16,7 @@
/* #undef EMBREE_GEOMETRY_INSTANCE */
/* #undef EMBREE_GEOMETRY_GRID */
/* #undef EMBREE_GEOMETRY_POINT */
-/* #undef EMBREE_RAY_PACKETS */
+#define EMBREE_RAY_PACKETS
/* #undef EMBREE_COMPACT_POLYS */
#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h
index 10f315cee7..470e15f03e 100644
--- a/thirdparty/embree/kernels/hash.h
+++ b/thirdparty/embree/kernels/hash.h
@@ -2,4 +2,4 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
-#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab"
+#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd"