diff options
| -rw-r--r-- | modules/raycast/SCsub | 3 | ||||
| -rw-r--r-- | modules/raycast/godot_update_embree.py | 12 | ||||
| -rw-r--r-- | thirdparty/embree/include/embree3/rtcore_config.h | 6 | ||||
| -rw-r--r-- | thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp | 917 | ||||
| -rw-r--r-- | thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp | 59 | ||||
| -rw-r--r-- | thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp | 528 | ||||
| -rw-r--r-- | thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp | 36 | ||||
| -rw-r--r-- | thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp | 657 | ||||
| -rw-r--r-- | thirdparty/embree/kernels/config.h | 2 | ||||
| -rw-r--r-- | thirdparty/embree/kernels/hash.h | 2 | 
10 files changed, 2216 insertions, 6 deletions
diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub index 6e7b3e7b8d..1fdc8fe1b3 100644 --- a/modules/raycast/SCsub +++ b/modules/raycast/SCsub @@ -55,6 +55,9 @@ if env["builtin_embree"]:          "kernels/bvh/bvh_builder_sah_mb.cpp",          "kernels/bvh/bvh_builder_twolevel.cpp",          "kernels/bvh/bvh_intersector1_bvh4.cpp", +        "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", +        "kernels/bvh/bvh_intersector_stream_bvh4.cpp", +        "kernels/bvh/bvh_intersector_stream_filters.cpp",      ]      thirdparty_sources = [thirdparty_dir + file for file in embree_src] diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py index 31a25a318f..e31d88b741 100644 --- a/modules/raycast/godot_update_embree.py +++ b/modules/raycast/godot_update_embree.py @@ -61,6 +61,11 @@ cpp_files = [      "kernels/bvh/bvh_builder_twolevel.cpp",      "kernels/bvh/bvh_intersector1.cpp",      "kernels/bvh/bvh_intersector1_bvh4.cpp", +    "kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp", +    "kernels/bvh/bvh_intersector_stream_bvh4.cpp", +    "kernels/bvh/bvh_intersector_stream_filters.cpp", +    "kernels/bvh/bvh_intersector_hybrid.cpp", +    "kernels/bvh/bvh_intersector_stream.cpp",  ]  os.chdir("../../thirdparty") @@ -117,7 +122,7 @@ with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:  /* #undef EMBREE_GEOMETRY_INSTANCE */  /* #undef EMBREE_GEOMETRY_GRID */  /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS  /* #undef EMBREE_COMPACT_POLYS */  #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 @@ -249,3 +254,8 @@ with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as con  os.chdir("..")  shutil.rmtree("embree-tmp") + +subprocess.run(["git", "restore", "embree/patches"]) + +for patch in os.listdir("embree/patches"): +    subprocess.run(["git", "apply", "embree/patches/" + patch]) diff --git a/thirdparty/embree/include/embree3/rtcore_config.h b/thirdparty/embree/include/embree3/rtcore_config.h index 3a9819c9f1..62b7b6f4dc 100644 --- a/thirdparty/embree/include/embree3/rtcore_config.h +++ b/thirdparty/embree/include/embree3/rtcore_config.h @@ -6,9 +6,9 @@  #define RTC_VERSION_MAJOR 3  #define RTC_VERSION_MINOR 13 -#define RTC_VERSION_PATCH 0 -#define RTC_VERSION 31300 -#define RTC_VERSION_STRING "3.13.0" +#define RTC_VERSION_PATCH 1 +#define RTC_VERSION 31301 +#define RTC_VERSION_STRING "3.13.1"  #define RTC_MAX_INSTANCE_LEVEL_COUNT 1 diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp new file mode 100644 index 0000000000..6e9a5a538e --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid.cpp @@ -0,0 +1,917 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.h" +#include "bvh_traverser1.h" +#include "node_intersector1.h" +#include "node_intersector_packet.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/curveNv_intersector.h" +#include "../geometry/curveNi_intersector.h" +#include "../geometry/curveNi_mb_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" +#include "../geometry/subgrid_intersector.h" +#include "../geometry/subgrid_mb_intersector.h" +#include "../geometry/curve_intersector_virtual.h" + +#define SWITCH_DURING_DOWN_TRAVERSAL 1 +#define FORCE_SINGLE_MODE 0 + +#define ENABLE_FAST_COHERENT_CODEPATHS 1 + +namespace embree +{ +  namespace isa +  { +    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> +    void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersect1(Accel::Intersectors* This, +                                                                                                const BVH* bvh, +                                                                                                NodeRef root, +                                                                                                size_t k, +                                                                                                Precalculations& pre, +                                                                                                RayHitK<K>& ray, +                                                                                                const TravRayK<K, robust>& tray, +                                                                                                IntersectContext* context) +    { +      /* stack state */ +      StackItemT<NodeRef> stack[stackSizeSingle];  // stack of nodes +      StackItemT<NodeRef>* stackPtr = stack + 1;   // current stack pointer +      StackItemT<NodeRef>* stackEnd = stack + stackSizeSingle; +      stack[0].ptr = root; +      stack[0].dist = neg_inf; + +      /* load the ray into SIMD registers */ +      TravRay<N,robust> tray1; +      tray1.template init<K>(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + +      /* pop loop */ +      while (true) pop: +      { +        /* pop next node */ +        if (unlikely(stackPtr == stack)) break; +        stackPtr--; +        NodeRef cur = NodeRef(stackPtr->ptr); + +        /* if popped node is too far, pop next one */ +        if (unlikely(*(float*)&stackPtr->dist > ray.tfar[k])) +          continue; + +        /* downtraversal loop */ +        while (true) +        { +          /* intersect node */ +          size_t mask; vfloat<N> tNear; +          STAT3(normal.trav_nodes, 1, 1, 1); +          bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray1, ray.time()[k], tNear, mask); +          if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } + +          /* if no child is hit, pop next node */ +          if (unlikely(mask == 0)) +            goto pop; + +          /* select next child and push other children */ +          BVHNNodeTraverser1Hit<N, types>::traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd); +        } + +        /* this is a leaf node */ +        assert(cur != BVH::emptyNode); +        STAT3(normal.trav_leaves, 1, 1, 1); +        size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + +        size_t lazy_node = 0; +        PrimitiveIntersectorK::intersect(This, pre, ray, k, context, prim, num, tray1, lazy_node); + +        tray1.tfar = ray.tfar[k]; + +        if (unlikely(lazy_node)) { +          stackPtr->ptr = lazy_node; +          stackPtr->dist = neg_inf; +          stackPtr++; +        } +      } +    } + +    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> +    void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersect(vint<K>* __restrict__ valid_i, +                                                                                               Accel::Intersectors* __restrict__ This, +                                                                                               RayHitK<K>& __restrict__ ray, +                                                                                               IntersectContext* __restrict__ context) +    { +      BVH* __restrict__ bvh = (BVH*)This->ptr; +       +      /* we may traverse an empty BVH in case all geometry was invalid */ +      if (bvh->root == BVH::emptyNode) +        return; +             +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 +      assert(context); +      if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) +      { +        intersectCoherent(valid_i, This, ray, context); +        return; +      } +#endif + +      /* filter out invalid rays */ +      vbool<K> valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) +      valid &= ray.valid(); +#endif + +      /* return if there are no valid rays */ +      size_t valid_bits = movemask(valid); + +#if defined(__AVX__) +      STAT3(normal.trav_hit_boxes[popcnt(movemask(valid))], 1, 1, 1); +#endif + +      if (unlikely(valid_bits == 0)) return; + +      /* verify correct input */ +      assert(all(valid, ray.valid())); +      assert(all(valid, ray.tnear() >= 0.0f)); +      assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); +      Precalculations pre(valid, ray); + +      /* load ray */ +      TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0); +      const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); +      const vfloat<K> org_ray_tfar  = max(ray.tfar , 0.0f); + +      if (single) +      { +        tray.tnear = select(valid, org_ray_tnear, vfloat<K>(pos_inf)); +        tray.tfar  = select(valid, org_ray_tfar , vfloat<K>(neg_inf)); +         +        for (; valid_bits!=0; ) { +          const size_t i = bscf(valid_bits); +          intersect1(This, bvh, bvh->root, i, pre, ray, tray, context); +        } +        return; +      } + +       /* determine switch threshold based on flags */ +      const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + +      vint<K> octant = ray.octant(); +      octant = select(valid, octant, vint<K>(0xffffffff)); + +      /* test whether we have ray with opposing direction signs in the packet */ +      bool split = false; +      { +        size_t bits = valid_bits; +        vbool<K> vsplit( false ); +        do +        { +          const size_t valid_index = bsf(bits); +          vbool<K> octant_valid = octant[valid_index] == octant; +          bits &= ~(size_t)movemask(octant_valid); +          vsplit |= vint<K>(octant[valid_index]) == (octant^vint<K>(0x7)); +        } while (bits); +        if (any(vsplit)) split = true; +      } + +      do +      { +        const size_t valid_index = bsf(valid_bits); +        const vint<K> diff_octant = vint<K>(octant[valid_index])^octant; +        const vint<K> count_diff_octant = \ +          ((diff_octant >> 2) & 1) + +          ((diff_octant >> 1) & 1) + +          ((diff_octant >> 0) & 1); + +        vbool<K> octant_valid = (count_diff_octant <= 1) & (octant != vint<K>(0xffffffff)); +        if (!single || !split) octant_valid = valid; // deactivate octant sorting in pure chunk mode, otherwise instance traversal performance goes down  + + +        octant = select(octant_valid,vint<K>(0xffffffff),octant); +        valid_bits &= ~(size_t)movemask(octant_valid); + +        tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf)); +        tray.tfar  = select(octant_valid, org_ray_tfar , vfloat<K>(neg_inf)); + +        /* allocate stack and push root node */ +        vfloat<K> stack_near[stackSizeChunk]; +        NodeRef stack_node[stackSizeChunk]; +        stack_node[0] = BVH::invalidNode; +        stack_near[0] = inf; +        stack_node[1] = bvh->root; +        stack_near[1] = tray.tnear; +        NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; +        NodeRef* __restrict__ sptr_node = stack_node + 2; +        vfloat<K>* __restrict__ sptr_near = stack_near + 2; + +        while (1) pop: +        { +          /* pop next node from stack */ +          assert(sptr_node > stack_node); +          sptr_node--; +          sptr_near--; +          NodeRef cur = *sptr_node; +          if (unlikely(cur == BVH::invalidNode)) { +            assert(sptr_node == stack_node); +            break; +          } + +          /* cull node if behind closest hit point */ +          vfloat<K> curDist = *sptr_near; +          const vbool<K> active = curDist < tray.tfar; +          if (unlikely(none(active))) +            continue; + +          /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 +          if (single) +#endif +          { +            size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 +            if (unlikely(popcnt(bits) <= switchThreshold)) +#endif +            { +              for (; bits!=0; ) { +                const size_t i = bscf(bits); +                intersect1(This, bvh, cur, i, pre, ray, tray, context); +              } +              tray.tfar = min(tray.tfar, ray.tfar); +              continue; +            } +          } +#endif +          while (likely(!cur.isLeaf())) +          { +            /* process nodes */ +            const vbool<K> valid_node = tray.tfar > curDist; +            STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); +            const NodeRef nodeRef = cur; +            const BaseNode* __restrict__ const node = nodeRef.baseNode(); + +            /* set cur to invalid */ +            cur = BVH::emptyNode; +            curDist = pos_inf; + +            size_t num_child_hits = 0; + +            for (unsigned i = 0; i < N; i++) +            { +              const NodeRef child = node->children[i]; +              if (unlikely(child == BVH::emptyNode)) break; +              vfloat<K> lnearP; +              vbool<K> lhit = valid_node; +              BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + +              /* if we hit the child we choose to continue with that child if it +                 is closer than the current next child, or we push it onto the stack */ +              if (likely(any(lhit))) +              {                                 +                assert(sptr_node < stackEnd); +                assert(child != BVH::emptyNode); +                const vfloat<K> childDist = select(lhit, lnearP, inf); +                /* push cur node onto stack and continue with hit child */ +                if (any(childDist < curDist)) +                { +                  if (likely(cur != BVH::emptyNode)) { +                    num_child_hits++; +                    *sptr_node = cur; sptr_node++; +                    *sptr_near = curDist; sptr_near++; +                  } +                  curDist = childDist; +                  cur = child; +                } + +                /* push hit child onto stack */ +                else { +                  num_child_hits++;                   +                  *sptr_node = child; sptr_node++; +                  *sptr_near = childDist; sptr_near++; +                } +              } +            } + +#if defined(__AVX__) +            //STAT3(normal.trav_hit_boxes[num_child_hits], 1, 1, 1); +#endif + +            if (unlikely(cur == BVH::emptyNode)) +              goto pop; +             +            /* improved distance sorting for 3 or more hits */ +            if (unlikely(num_child_hits >= 2)) +            { +              if (any(sptr_near[-2] < sptr_near[-1])) +              { +                std::swap(sptr_near[-2],sptr_near[-1]); +                std::swap(sptr_node[-2],sptr_node[-1]); +              } +              if (unlikely(num_child_hits >= 3)) +              { +                if (any(sptr_near[-3] < sptr_near[-1])) +                { +                  std::swap(sptr_near[-3],sptr_near[-1]); +                  std::swap(sptr_node[-3],sptr_node[-1]); +                } +                if (any(sptr_near[-3] < sptr_near[-2])) +                { +                  std::swap(sptr_near[-3],sptr_near[-2]); +                  std::swap(sptr_node[-3],sptr_node[-2]); +                } +              } +            } + +#if SWITCH_DURING_DOWN_TRAVERSAL == 1 +            if (single) +            { +              // seems to be the best place for testing utilization +              if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) +              { +                *sptr_node++ = cur; +                *sptr_near++ = curDist; +                goto pop; +              } +            } +#endif +          } + +          /* return if stack is empty */ +          if (unlikely(cur == BVH::invalidNode)) { +            assert(sptr_node == stack_node); +            break; +          } + +          /* intersect leaf */ +          assert(cur != BVH::emptyNode); +          const vbool<K> valid_leaf = tray.tfar > curDist; +          STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); +          if (unlikely(none(valid_leaf))) continue; +          size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + +          size_t lazy_node = 0; +          PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); +          tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); + +          if (unlikely(lazy_node)) { +            *sptr_node = lazy_node; sptr_node++; +            *sptr_near = neg_inf;   sptr_near++; +          } +        } +      } while(valid_bits); +    } + + +    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> +    void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::intersectCoherent(vint<K>* __restrict__ valid_i, +                                                                                                       Accel::Intersectors* __restrict__ This, +                                                                                                       RayHitK<K>& __restrict__ ray, +                                                                                                       IntersectContext* context) +    { +      BVH* __restrict__ bvh = (BVH*)This->ptr; +       +      /* filter out invalid rays */ +      vbool<K> valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) +      valid &= ray.valid(); +#endif + +      /* return if there are no valid rays */ +      size_t valid_bits = movemask(valid); +      if (unlikely(valid_bits == 0)) return; + +      /* verify correct input */ +      assert(all(valid, ray.valid())); +      assert(all(valid, ray.tnear() >= 0.0f)); +      assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); +      Precalculations pre(valid, ray); + +      /* load ray */ +      TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0); +      const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); +      const vfloat<K> org_ray_tfar  = max(ray.tfar , 0.0f); + +      vint<K> octant = ray.octant(); +      octant = select(valid, octant, vint<K>(0xffffffff)); + +      do +      { +        const size_t valid_index = bsf(valid_bits); +        const vbool<K> octant_valid = octant[valid_index] == octant; +        valid_bits &= ~(size_t)movemask(octant_valid); + +        tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf)); +        tray.tfar  = select(octant_valid, org_ray_tfar , vfloat<K>(neg_inf)); + +        Frustum<robust> frustum; +        frustum.template init<K>(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + +        StackItemT<NodeRef> stack[stackSizeSingle];  // stack of nodes +        StackItemT<NodeRef>* stackPtr = stack + 1;   // current stack pointer +        stack[0].ptr  = bvh->root; +        stack[0].dist = neg_inf; + +        while (1) pop: +        { +          /* pop next node from stack */ +          if (unlikely(stackPtr == stack)) break; + +          stackPtr--; +          NodeRef cur = NodeRef(stackPtr->ptr); + +          /* cull node if behind closest hit point */ +          vfloat<K> curDist = *(float*)&stackPtr->dist; +          const vbool<K> active = curDist < tray.tfar; +          if (unlikely(none(active))) continue; + +          while (likely(!cur.isLeaf())) +          { +            /* process nodes */ +            //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); +            const NodeRef nodeRef = cur; +            const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + +            vfloat<N> fmin; +            size_t m_frustum_node = intersectNodeFrustum<N>(node, frustum, fmin); + +            if (unlikely(!m_frustum_node)) goto pop; +            cur = BVH::emptyNode; +            curDist = pos_inf; +             +#if defined(__AVX__) +            //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif +            size_t num_child_hits = 0; +            do { +              const size_t i = bscf(m_frustum_node); +              vfloat<K> lnearP; +              vbool<K> lhit = false; // motion blur is not supported, so the initial value will be ignored +              STAT3(normal.trav_nodes, 1, 1, 1); +              BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + +              if (likely(any(lhit))) +              {                                 +                const vfloat<K> childDist = fmin[i]; +                const NodeRef child = node->child(i); +                BVHN<N>::prefetch(child); +                if (any(childDist < curDist)) +                { +                  if (likely(cur != BVH::emptyNode)) { +                    num_child_hits++; +                    stackPtr->ptr = cur; +                    *(float*)&stackPtr->dist = toScalar(curDist); +                    stackPtr++; +                  } +                  curDist = childDist; +                  cur = child; +                } +                /* push hit child onto stack */ +                else { +                  num_child_hits++; +                  stackPtr->ptr = child; +                  *(float*)&stackPtr->dist = toScalar(childDist); +                  stackPtr++; +                }                 +              } +            } while(m_frustum_node); + +            if (unlikely(cur == BVH::emptyNode)) goto pop; + +            /* improved distance sorting for 3 or more hits */ +            if (unlikely(num_child_hits >= 2)) +            { +              if (stackPtr[-2].dist < stackPtr[-1].dist) +                std::swap(stackPtr[-2],stackPtr[-1]); +              if (unlikely(num_child_hits >= 3)) +              { +                if (stackPtr[-3].dist < stackPtr[-1].dist) +                  std::swap(stackPtr[-3],stackPtr[-1]); +                if (stackPtr[-3].dist < stackPtr[-2].dist) +                  std::swap(stackPtr[-3],stackPtr[-2]); +              } +            } +          } + +          /* intersect leaf */ +          assert(cur != BVH::invalidNode); +          assert(cur != BVH::emptyNode); +          const vbool<K> valid_leaf = tray.tfar > curDist; +          STAT3(normal.trav_leaves, 1, popcnt(valid_leaf), K); +          if (unlikely(none(valid_leaf))) continue; +          size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + +          size_t lazy_node = 0; +          PrimitiveIntersectorK::intersect(valid_leaf, This, pre, ray, context, prim, items, tray, lazy_node); + +          /* reduce max distance interval on successful intersection */ +          if (likely(any((ray.tfar < tray.tfar) & valid_leaf))) +          { +            tray.tfar = select(valid_leaf, ray.tfar, tray.tfar); +            frustum.template updateMaxDist<K>(tray.tfar); +          } + +          if (unlikely(lazy_node)) { +            stackPtr->ptr = lazy_node; +            stackPtr->dist = neg_inf; +            stackPtr++; +          } +        } +         +      } while(valid_bits); +    } + +    // =================================================================================================================================================================== +    // =================================================================================================================================================================== +    // =================================================================================================================================================================== + +    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> +    bool BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occluded1(Accel::Intersectors* This, +                                                                                               const BVH* bvh, +                                                                                               NodeRef root, +                                                                                               size_t k, +                                                                                               Precalculations& pre, +                                                                                               RayK<K>& ray, +                                                                                               const TravRayK<K, robust>& tray, +                                                                                               IntersectContext* context) +      { +        /* stack state */ +        NodeRef stack[stackSizeSingle];  // stack of nodes that still need to get traversed +        NodeRef* stackPtr = stack+1;     // current stack pointer +	NodeRef* stackEnd = stack+stackSizeSingle; +        stack[0] = root; + +        /* load the ray into SIMD registers */ +        TravRay<N,robust> tray1; +        tray1.template init<K>(k, tray.org, tray.dir, tray.rdir, tray.nearXYZ, tray.tnear[k], tray.tfar[k]); + +	/* pop loop */ +	while (true) pop: +	{ +          /* pop next node */ +	  if (unlikely(stackPtr == stack)) break; +	  stackPtr--; +          NodeRef cur = (NodeRef)*stackPtr; + +          /* downtraversal loop */ +          while (true) +          { +            /* intersect node */ +            size_t mask; vfloat<N> tNear; +            STAT3(shadow.trav_nodes, 1, 1, 1); +            bool nodeIntersected = BVHNNodeIntersector1<N, types, robust>::intersect(cur, tray1, ray.time()[k], tNear, mask); +            if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } + +            /* if no child is hit, pop next node */ +            if (unlikely(mask == 0)) +              goto pop; + +            /* select next child and push other children */ +            BVHNNodeTraverser1Hit<N, types>::traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd); +          } + +          /* this is a leaf node */ +          assert(cur != BVH::emptyNode); +          STAT3(shadow.trav_leaves, 1, 1, 1); +          size_t num; Primitive* prim = (Primitive*)cur.leaf(num); + +          size_t lazy_node = 0; +          if (PrimitiveIntersectorK::occluded(This, pre, ray, k, context, prim, num, tray1, lazy_node)) { +	    ray.tfar[k] = neg_inf; +	    return true; +	  } + +          if (unlikely(lazy_node)) { +            *stackPtr = lazy_node; +            stackPtr++; +          } +	} +	return false; +      } + +    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> +    void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occluded(vint<K>* __restrict__ valid_i, +                                                                                              Accel::Intersectors* __restrict__ This, +                                                                                              RayK<K>& __restrict__ ray, +                                                                                              IntersectContext* context) +    { +      BVH* __restrict__ bvh = (BVH*)This->ptr; +       +      /* we may traverse an empty BVH in case all geometry was invalid */ +      if (bvh->root == BVH::emptyNode) +        return; +       +#if ENABLE_FAST_COHERENT_CODEPATHS == 1 +      assert(context); +      if (unlikely(types == BVH_AN1 && context->user && context->isCoherent())) +      { +        occludedCoherent(valid_i, This, ray, context); +        return; +      } +#endif + +      /* filter out already occluded and invalid rays */ +      vbool<K> valid = (*valid_i == -1) & (ray.tfar >= 0.0f); +#if defined(EMBREE_IGNORE_INVALID_RAYS) +      valid &= ray.valid(); +#endif + +      /* return if there are no valid rays */ +      const size_t valid_bits = movemask(valid); +      if (unlikely(valid_bits == 0)) return; + +      /* verify correct input */ +      assert(all(valid, ray.valid())); +      assert(all(valid, ray.tnear() >= 0.0f)); +      assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); +      Precalculations pre(valid, ray); + +      /* load ray */ +      TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0); +      const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); +      const vfloat<K> org_ray_tfar  = max(ray.tfar , 0.0f); + +      tray.tnear = select(valid, org_ray_tnear, vfloat<K>(pos_inf)); +      tray.tfar  = select(valid, org_ray_tfar , vfloat<K>(neg_inf)); + +      vbool<K> terminated = !valid; +      const vfloat<K> inf = vfloat<K>(pos_inf); + +      /* determine switch threshold based on flags */ +      const size_t switchThreshold = (context->user && context->isCoherent()) ? 2 : switchThresholdIncoherent; + +      /* allocate stack and push root node */ +      vfloat<K> stack_near[stackSizeChunk]; +      NodeRef stack_node[stackSizeChunk]; +      stack_node[0] = BVH::invalidNode; +      stack_near[0] = inf; +      stack_node[1] = bvh->root; +      stack_near[1] = tray.tnear; +      NodeRef* stackEnd MAYBE_UNUSED = stack_node+stackSizeChunk; +      NodeRef* __restrict__ sptr_node = stack_node + 2; +      vfloat<K>* __restrict__ sptr_near = stack_near + 2; + +      while (1) pop: +      { +        /* pop next node from stack */ +        assert(sptr_node > stack_node); +        sptr_node--; +        sptr_near--; +        NodeRef cur = *sptr_node; +        if (unlikely(cur == BVH::invalidNode)) { +          assert(sptr_node == stack_node); +          break; +        } + +        /* cull node if behind closest hit point */ +        vfloat<K> curDist = *sptr_near; +        const vbool<K> active = curDist < tray.tfar; +        if (unlikely(none(active))) +          continue; + +        /* switch to single ray traversal */ +#if (!defined(__WIN32__) || defined(__X86_64__)) && defined(__SSE4_2__) +#if FORCE_SINGLE_MODE == 0 +        if (single) +#endif +        { +          size_t bits = movemask(active); +#if FORCE_SINGLE_MODE == 0 +          if (unlikely(popcnt(bits) <= switchThreshold))  +#endif +          { +            for (; bits!=0; ) { +              const size_t i = bscf(bits); +              if (occluded1(This, bvh, cur, i, pre, ray, tray, context)) +                set(terminated, i); +            } +            if (all(terminated)) break; +            tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); +            continue; +          } +        } +#endif + +        while (likely(!cur.isLeaf())) +        { +          /* process nodes */ +          const vbool<K> valid_node = tray.tfar > curDist; +          STAT3(shadow.trav_nodes, 1, popcnt(valid_node), K); +          const NodeRef nodeRef = cur; +          const BaseNode* __restrict__ const node = nodeRef.baseNode(); + +          /* set cur to invalid */ +          cur = BVH::emptyNode; +          curDist = pos_inf; + +          for (unsigned i = 0; i < N; i++) +          { +            const NodeRef child = node->children[i]; +            if (unlikely(child == BVH::emptyNode)) break; +            vfloat<K> lnearP; +            vbool<K> lhit = valid_node; +            BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + +            /* if we hit the child we push the previously hit node onto the stack, and continue with the currently hit child */ +            if (likely(any(lhit))) +            { +              assert(sptr_node < stackEnd); +              assert(child != BVH::emptyNode); +              const vfloat<K> childDist = select(lhit, lnearP, inf); + +              /* push 'cur' node onto stack and continue with hit child */ +              if (likely(cur != BVH::emptyNode)) { +                *sptr_node = cur; sptr_node++; +                *sptr_near = curDist; sptr_near++; +              } +              curDist = childDist; +              cur = child; +            } +          } +          if (unlikely(cur == BVH::emptyNode)) +            goto pop; + +#if SWITCH_DURING_DOWN_TRAVERSAL == 1 +          if (single) +          { +            // seems to be the best place for testing utilization +            if (unlikely(popcnt(tray.tfar > curDist) <= switchThreshold)) +            { +              *sptr_node++ = cur; +              *sptr_near++ = curDist; +              goto pop; +            } +          } +#endif +	} + +        /* return if stack is empty */ +        if (unlikely(cur == BVH::invalidNode)) { +          assert(sptr_node == stack_node); +          break; +        } + + +        /* intersect leaf */ +        assert(cur != BVH::emptyNode); +        const vbool<K> valid_leaf = tray.tfar > curDist; +        STAT3(shadow.trav_leaves, 1, popcnt(valid_leaf), K); +        if (unlikely(none(valid_leaf))) continue; +        size_t items; const Primitive* prim = (Primitive*) cur.leaf(items); + +        size_t lazy_node = 0; +        terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); +        if (all(terminated)) break; +        tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); // ignore node intersections for terminated rays + +        if (unlikely(lazy_node)) { +          *sptr_node = lazy_node; sptr_node++; +          *sptr_near = neg_inf;   sptr_near++; +        } +      } + +      vfloat<K>::store(valid & terminated, &ray.tfar, neg_inf); +    } + + +    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single> +    void BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, single>::occludedCoherent(vint<K>* __restrict__ valid_i, +                                                                                                      Accel::Intersectors* __restrict__ This, +                                                                                                      RayK<K>& __restrict__ ray, +                                                                                                      IntersectContext* context) +    { +      BVH* __restrict__ bvh = (BVH*)This->ptr; +       +      /* filter out invalid rays */ +      vbool<K> valid = *valid_i == -1; +#if defined(EMBREE_IGNORE_INVALID_RAYS) +      valid &= ray.valid(); +#endif + +      /* return if there are no valid rays */ +      size_t valid_bits = movemask(valid); +      if (unlikely(valid_bits == 0)) return; + +      /* verify correct input */ +      assert(all(valid, ray.valid())); +      assert(all(valid, ray.tnear() >= 0.0f)); +      assert(!(types & BVH_MB) || all(valid, (ray.time() >= 0.0f) & (ray.time() <= 1.0f))); +      Precalculations pre(valid,ray); + +      /* load ray */ +      TravRayK<K, robust> tray(ray.org, ray.dir, single ? N : 0); +      const vfloat<K> org_ray_tnear = max(ray.tnear(), 0.0f); +      const vfloat<K> org_ray_tfar  = max(ray.tfar , 0.0f); + +      vbool<K> terminated = !valid; + +      vint<K> octant = ray.octant(); +      octant = select(valid, octant, vint<K>(0xffffffff)); + +      do +      { +        const size_t valid_index = bsf(valid_bits); +        vbool<K> octant_valid = octant[valid_index] == octant; +        valid_bits &= ~(size_t)movemask(octant_valid); + +        tray.tnear = select(octant_valid, org_ray_tnear, vfloat<K>(pos_inf)); +        tray.tfar  = select(octant_valid, org_ray_tfar,  vfloat<K>(neg_inf)); + +        Frustum<robust> frustum; +        frustum.template init<K>(octant_valid, tray.org, tray.rdir, tray.tnear, tray.tfar, N); + +        StackItemMaskT<NodeRef> stack[stackSizeSingle];  // stack of nodes +        StackItemMaskT<NodeRef>* stackPtr = stack + 1;   // current stack pointer +        stack[0].ptr  = bvh->root; +        stack[0].mask = movemask(octant_valid); + +        while (1) pop: +        { +          /* pop next node from stack */ +          if (unlikely(stackPtr == stack)) break; + +          stackPtr--; +          NodeRef cur = NodeRef(stackPtr->ptr); + +          /* cull node of active rays have already been terminated */ +          size_t m_active = (size_t)stackPtr->mask & (~(size_t)movemask(terminated)); + +          if (unlikely(m_active == 0)) continue; + +          while (likely(!cur.isLeaf())) +          { +            /* process nodes */ +            //STAT3(normal.trav_nodes, 1, popcnt(valid_node), K); +            const NodeRef nodeRef = cur; +            const AABBNode* __restrict__ const node = nodeRef.getAABBNode(); + +            vfloat<N> fmin; +            size_t m_frustum_node = intersectNodeFrustum<N>(node, frustum, fmin); + +            if (unlikely(!m_frustum_node)) goto pop; +            cur = BVH::emptyNode; +            m_active = 0; + +#if defined(__AVX__) +            //STAT3(normal.trav_hit_boxes[popcnt(m_frustum_node)], 1, 1, 1); +#endif +            size_t num_child_hits = 0; +            do { +              const size_t i = bscf(m_frustum_node); +              vfloat<K> lnearP; +              vbool<K> lhit = false; // motion blur is not supported, so the initial value will be ignored +              STAT3(normal.trav_nodes, 1, 1, 1); +              BVHNNodeIntersectorK<N, K, types, robust>::intersect(nodeRef, i, tray, ray.time(), lnearP, lhit); + +              if (likely(any(lhit))) +              {                                 +                const NodeRef child = node->child(i); +                assert(child != BVH::emptyNode); +                BVHN<N>::prefetch(child); +                if (likely(cur != BVH::emptyNode)) { +                  num_child_hits++; +                  stackPtr->ptr  = cur; +                  stackPtr->mask = m_active; +                  stackPtr++; +                } +                cur = child; +                m_active = movemask(lhit); +              } +            } while(m_frustum_node); + +            if (unlikely(cur == BVH::emptyNode)) goto pop; +          } + +          /* intersect leaf */ +          assert(cur != BVH::invalidNode); +          assert(cur != BVH::emptyNode); +#if defined(__AVX__) +          STAT3(normal.trav_leaves, 1, popcnt(m_active), K); +#endif +          if (unlikely(!m_active)) continue; +          size_t items; const Primitive* prim = (Primitive*)cur.leaf(items); + +          size_t lazy_node = 0; +          terminated |= PrimitiveIntersectorK::occluded(!terminated, This, pre, ray, context, prim, items, tray, lazy_node); +          octant_valid &= !terminated; +          if (unlikely(none(octant_valid))) break; +          tray.tfar = select(terminated, vfloat<K>(neg_inf), tray.tfar); // ignore node intersections for terminated rays + +          if (unlikely(lazy_node)) { +            stackPtr->ptr  = lazy_node; +            stackPtr->mask = movemask(octant_valid); +            stackPtr++; +          } +        } +      } while(valid_bits); + +      vfloat<K>::store(valid & terminated, &ray.tfar, neg_inf); +    } +  } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp new file mode 100644 index 0000000000..2137da6a25 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_hybrid4_bvh4.cpp @@ -0,0 +1,59 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_hybrid.cpp" + +namespace embree +{ +  namespace isa +  { +    //////////////////////////////////////////////////////////////////////////////// +    /// BVH4Intersector4 Definitions +    //////////////////////////////////////////////////////////////////////////////// + +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoeller,         BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller  <4 COMMA 4 COMMA true> > >)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4Intersector4HybridMoellerNoFilter, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMIntersectorKMoeller  <4 COMMA 4 COMMA false> > >)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridMoeller,        BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vIntersector4HybridPluecker,       BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersectorK_1<4 COMMA TriangleMvIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iIntersector4HybridPluecker,       BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersectorK_1<4 COMMA TriangleMiIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridMoeller,  BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridMoeller,  BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKMoeller <4 COMMA 4 COMMA true> > >)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4vMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersectorK_1<4 COMMA TriangleMvMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTOR4(BVH4Triangle4iMBIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersectorK_1<4 COMMA TriangleMiMBIntersectorKPluecker<4 COMMA 4 COMMA true> > >)); + +    IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoeller,        BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridMoellerNoFilter,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKMoeller <4 COMMA 4 COMMA false> > >)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridMoeller,        BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4vIntersector4HybridPluecker,       BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersectorK_1<4 COMMA QuadMvIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iIntersector4HybridPluecker,       BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersectorK_1<4 COMMA QuadMiIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + +    IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKMoeller <4 COMMA 4 COMMA true > > >)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTOR4(BVH4Quad4iMBIntersector4HybridPluecker,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersectorK_1<4 COMMA QuadMiMBIntersectorKPluecker<4 COMMA 4 COMMA true > > >)); + +    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersectorK<4> >)); +    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersector4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersectorK<4> >)); + +    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4Hybrid, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersectorK<4> >)); +    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR4(BVH4OBBVirtualCurveIntersectorRobust4HybridMB,BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersectorK<4> >)); +   +    //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); +    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1Intersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector4>)); +    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); +    //IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR4(BVH4SubdivPatch1MBIntersector4, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA SubdivPatch1MBIntersector4>)); + +    IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4> >)); +    IF_ENABLED_USER(DEFINE_INTERSECTOR4(BVH4VirtualMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA ObjectIntersector4MB> >)); + +    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorK<4>> >)); +    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR4(BVH4InstanceMBIntersector4Chunk, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersectorK_1<4 COMMA InstanceIntersectorKMB<4>> >)); + +    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); +    //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridMoeller, BVHNIntersectorKChunk<4 COMMA 4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersectorKMoeller <4 COMMA 4 COMMA true> >)); +     +    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridMBIntersector4HybridMoeller, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersectorKPluecker <4 COMMA 4 COMMA true> >)); +    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR4(BVH4GridIntersector4HybridPluecker, BVHNIntersectorKHybrid<4 COMMA 4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersectorKPluecker <4 COMMA 4 COMMA true> >)); + +  } +} + diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp new file mode 100644 index 0000000000..4a74d8468d --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream.cpp @@ -0,0 +1,528 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.h" + +#include "../geometry/intersector_iterators.h" +#include "../geometry/triangle_intersector.h" +#include "../geometry/trianglev_intersector.h" +#include "../geometry/trianglev_mb_intersector.h" +#include "../geometry/trianglei_intersector.h" +#include "../geometry/quadv_intersector.h" +#include "../geometry/quadi_intersector.h" +#include "../geometry/linei_intersector.h" +#include "../geometry/subdivpatch1_intersector.h" +#include "../geometry/object_intersector.h" +#include "../geometry/instance_intersector.h" + +#include "../common/scene.h" +#include <bitset> + +namespace embree +{ +  namespace isa +  { +    __aligned(64) static const int shiftTable[32] = {  +      (int)1 << 0, (int)1 << 1, (int)1 << 2, (int)1 << 3, (int)1 << 4, (int)1 << 5, (int)1 << 6, (int)1 << 7,   +      (int)1 << 8, (int)1 << 9, (int)1 << 10, (int)1 << 11, (int)1 << 12, (int)1 << 13, (int)1 << 14, (int)1 << 15,   +      (int)1 << 16, (int)1 << 17, (int)1 << 18, (int)1 << 19, (int)1 << 20, (int)1 << 21, (int)1 << 22, (int)1 << 23,   +      (int)1 << 24, (int)1 << 25, (int)1 << 26, (int)1 << 27, (int)1 << 28, (int)1 << 29, (int)1 << 30, (int)1 << 31 +    }; + +    template<int N, int types, bool robust, typename PrimitiveIntersector> +    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersect(Accel::Intersectors* __restrict__ This, +                                                                                                    RayHitN** inputPackets, +                                                                                                    size_t numOctantRays, +                                                                                                    IntersectContext* context) +    { +      /* we may traverse an empty BVH in case all geometry was invalid */ +      BVH* __restrict__ bvh = (BVH*) This->ptr; +      if (bvh->root == BVH::emptyNode) +        return; +       +      // Only the coherent code path is implemented +      assert(context->isCoherent()); +      intersectCoherent(This, (RayHitK<VSIZEL>**)inputPackets, numOctantRays, context); +    } + +    template<int N, int types, bool robust, typename PrimitiveIntersector> +    template<int K> +    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::intersectCoherent(Accel::Intersectors* __restrict__ This, +                                                                                                            RayHitK<K>** inputPackets, +                                                                                                            size_t numOctantRays, +                                                                                                            IntersectContext* context) +    { +      assert(context->isCoherent()); + +      BVH* __restrict__ bvh = (BVH*) This->ptr; +      __aligned(64) StackItemMaskCoherent stack[stackSizeSingle];  // stack of nodes +      assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + +      __aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K]; +      __aligned(64) Frustum<robust> frustum; + +      bool commonOctant = true; +      const size_t m_active = initPacketsAndFrustum((RayK<K>**)inputPackets, numOctantRays, packets, frustum, commonOctant); +      if (unlikely(m_active == 0)) return; + +      /* case of non-common origin */ +      if (unlikely(!commonOctant)) +      { +        const size_t numPackets = (numOctantRays+K-1)/K;  +        for (size_t i = 0; i < numPackets; i++) +          This->intersect(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); +        return; +      } + +      stack[0].mask   = m_active; +      stack[0].parent = 0; +      stack[0].child  = bvh->root; + +      /////////////////////////////////////////////////////////////////////////////////// +      /////////////////////////////////////////////////////////////////////////////////// +      /////////////////////////////////////////////////////////////////////////////////// + +      StackItemMaskCoherent* stackPtr = stack + 1; + +      while (1) pop: +      { +        if (unlikely(stackPtr == stack)) break; + +        STAT3(normal.trav_stack_pop,1,1,1); +        stackPtr--; +        /*! pop next node */ +        NodeRef cur = NodeRef(stackPtr->child); +        size_t m_trav_active = stackPtr->mask; +        assert(m_trav_active); +        NodeRef parent = stackPtr->parent; + +        while (1) +        { +          if (unlikely(cur.isLeaf())) break; +          const AABBNode* __restrict__ const node = cur.getAABBNode(); +          parent = cur; + +          __aligned(64) size_t maskK[N]; +          for (size_t i = 0; i < N; i++) +            maskK[i] = m_trav_active; +          vfloat<N> dist; +          const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); +          if (unlikely(m_node_hit == 0)) goto pop; + +          BVHNNodeTraverserStreamHitCoherent<N, types>::traverseClosestHit(cur, m_trav_active, vbool<N>((int)m_node_hit), dist, (size_t*)maskK, stackPtr); +          assert(m_trav_active); +        } + +        /* non-root and leaf => full culling test for all rays */ +        if (unlikely(parent != 0 && cur.isLeaf())) +        { +          const AABBNode* __restrict__ const node = parent.getAABBNode(); +          size_t boxID = 0xff; +          for (size_t i = 0; i < N; i++) +            if (node->child(i) == cur) { boxID = i; break; } +          assert(boxID < N); +          assert(cur == node->child(boxID)); +          m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); +        } + +        /*! this is a leaf node */ +        assert(cur != BVH::emptyNode); +        STAT3(normal.trav_leaves, 1, 1, 1); +        size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num); + +        size_t bits = m_trav_active; + +        /*! intersect stream of rays with all primitives */ +        size_t lazy_node = 0; +#if defined(__SSE4_2__) +        STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif +        while(bits) +        { +          size_t i = bsf(bits) / K; +          const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); +          assert(m_isec & bits); +          bits &= ~m_isec; + +          TravRayKStream<K, robust>& p = packets[i]; +          vbool<K> m_valid = p.tnear <= p.tfar; +          PrimitiveIntersectorK<K>::intersectK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); +          p.tfar = min(p.tfar, inputPackets[i]->tfar); +        }; + +      } // traversal + intersection +    } + +    template<int N, int types, bool robust, typename PrimitiveIntersector> +    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occluded(Accel::Intersectors* __restrict__ This, +                                                                                                   RayN** inputPackets, +                                                                                                   size_t numOctantRays, +                                                                                                   IntersectContext* context) +    { +      /* we may traverse an empty BVH in case all geometry was invalid */ +      BVH* __restrict__ bvh = (BVH*) This->ptr; +      if (bvh->root == BVH::emptyNode) +        return; +       +      if (unlikely(context->isCoherent())) +        occludedCoherent(This, (RayK<VSIZEL>**)inputPackets, numOctantRays, context); +      else +        occludedIncoherent(This, (RayK<VSIZEX>**)inputPackets, numOctantRays, context); +    } + +    template<int N, int types, bool robust, typename PrimitiveIntersector> +    template<int K> +    __noinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedCoherent(Accel::Intersectors* __restrict__ This, +                                                                                                        RayK<K>** inputPackets, +                                                                                                        size_t numOctantRays, +                                                                                                        IntersectContext* context) +    { +      assert(context->isCoherent()); + +      BVH* __restrict__ bvh = (BVH*)This->ptr; +      __aligned(64) StackItemMaskCoherent stack[stackSizeSingle];  // stack of nodes +      assert(numOctantRays <= MAX_INTERNAL_STREAM_SIZE); + +      /* inactive rays should have been filtered out before */ +      __aligned(64) TravRayKStream<K, robust> packets[MAX_INTERNAL_STREAM_SIZE/K]; +      __aligned(64) Frustum<robust> frustum; + +      bool commonOctant = true; +      size_t m_active = initPacketsAndFrustum(inputPackets, numOctantRays, packets, frustum, commonOctant); + +      /* valid rays */ +      if (unlikely(m_active == 0)) return; + +      /* case of non-common origin */ +      if (unlikely(!commonOctant)) +      { +        const size_t numPackets = (numOctantRays+K-1)/K;  +        for (size_t i = 0; i < numPackets; i++) +          This->occluded(inputPackets[i]->tnear() <= inputPackets[i]->tfar, *inputPackets[i], context); +        return; +      } + +      stack[0].mask   = m_active; +      stack[0].parent = 0; +      stack[0].child  = bvh->root; + +      /////////////////////////////////////////////////////////////////////////////////// +      /////////////////////////////////////////////////////////////////////////////////// +      /////////////////////////////////////////////////////////////////////////////////// + +      StackItemMaskCoherent* stackPtr = stack + 1; + +      while (1) pop: +      { +        if (unlikely(stackPtr == stack)) break; + +        STAT3(normal.trav_stack_pop,1,1,1); +        stackPtr--; +        /*! pop next node */ +        NodeRef cur = NodeRef(stackPtr->child); +        size_t m_trav_active = stackPtr->mask & m_active; +        if (unlikely(!m_trav_active)) continue; +        assert(m_trav_active); +        NodeRef parent = stackPtr->parent; + +        while (1) +        { +          if (unlikely(cur.isLeaf())) break; +          const AABBNode* __restrict__ const node = cur.getAABBNode(); +          parent = cur; + +          __aligned(64) size_t maskK[N]; +          for (size_t i = 0; i < N; i++) +            maskK[i] = m_trav_active; + +          vfloat<N> dist; +          const size_t m_node_hit = traverseCoherentStream(m_trav_active, packets, node, frustum, maskK, dist); +          if (unlikely(m_node_hit == 0)) goto pop; + +          BVHNNodeTraverserStreamHitCoherent<N, types>::traverseAnyHit(cur, m_trav_active, vbool<N>((int)m_node_hit), (size_t*)maskK, stackPtr); +          assert(m_trav_active); +        } + +        /* non-root and leaf => full culling test for all rays */ +        if (unlikely(parent != 0 && cur.isLeaf())) +        { +          const AABBNode* __restrict__ const node = parent.getAABBNode(); +          size_t boxID = 0xff; +          for (size_t i = 0; i < N; i++) +            if (node->child(i) == cur) { boxID = i; break; } +          assert(boxID < N); +          assert(cur == node->child(boxID)); +          m_trav_active = intersectAABBNodePacket(m_trav_active, packets, node, boxID, frustum.nf); +        } + +        /*! this is a leaf node */ +        assert(cur != BVH::emptyNode); +        STAT3(normal.trav_leaves, 1, 1, 1); +        size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num); + +        size_t bits = m_trav_active & m_active; +        /*! intersect stream of rays with all primitives */ +        size_t lazy_node = 0; +#if defined(__SSE4_2__) +        STAT_USER(1,(popcnt(bits)+K-1)/K*4); +#endif +        while (bits) +        { +          size_t i = bsf(bits) / K; +          const size_t m_isec = ((((size_t)1 << K)-1) << (i*K)); +          assert(m_isec & bits); +          bits &= ~m_isec; +          TravRayKStream<K, robust>& p = packets[i]; +          vbool<K> m_valid = p.tnear <= p.tfar; +          vbool<K> m_hit = PrimitiveIntersectorK<K>::occludedK(m_valid, This, *inputPackets[i], context, prim, num, lazy_node); +          inputPackets[i]->tfar = select(m_hit & m_valid, vfloat<K>(neg_inf), inputPackets[i]->tfar); +          m_active &= ~((size_t)movemask(m_hit) << (i*K)); +        } + +      } // traversal + intersection +    } + + +    template<int N, int types, bool robust, typename PrimitiveIntersector> +    template<int K> +    __forceinline void BVHNIntersectorStream<N, types, robust, PrimitiveIntersector>::occludedIncoherent(Accel::Intersectors* __restrict__ This, +                                                                                                             RayK<K>** inputPackets, +                                                                                                             size_t numOctantRays, +                                                                                                             IntersectContext* context) +    { +      assert(!context->isCoherent()); +      assert(types & BVH_FLAG_ALIGNED_NODE); + +      __aligned(64) TravRayKStream<K,robust> packet[MAX_INTERNAL_STREAM_SIZE/K]; + +      assert(numOctantRays <= 32); +      const size_t numPackets = (numOctantRays+K-1)/K; +      size_t m_active = 0; +      for (size_t i = 0; i < numPackets; i++) +      { +        const vfloat<K> tnear = inputPackets[i]->tnear(); +        const vfloat<K> tfar  = inputPackets[i]->tfar; +        vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); +        m_active |= (size_t)movemask(m_valid) << (K*i); +        const Vec3vf<K>& org = inputPackets[i]->org; +        const Vec3vf<K>& dir = inputPackets[i]->dir; +        vfloat<K> packet_min_dist = max(tnear, 0.0f); +        vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf); +        new (&packet[i]) TravRayKStream<K,robust>(org, dir, packet_min_dist, packet_max_dist); +      } + +      BVH* __restrict__ bvh = (BVH*)This->ptr; + +      StackItemMaskT<NodeRef> stack[stackSizeSingle]; // stack of nodes +      StackItemMaskT<NodeRef>* stackPtr = stack + 1;  // current stack pointer +      stack[0].ptr = bvh->root; +      stack[0].mask = m_active; + +      size_t terminated = ~m_active; + +      /* near/far offsets based on first ray */ +      const NearFarPrecalculations nf(Vec3fa(packet[0].rdir.x[0], packet[0].rdir.y[0], packet[0].rdir.z[0]), N); + +      while (1) pop: +      { +        if (unlikely(stackPtr == stack)) break; +        STAT3(shadow.trav_stack_pop,1,1,1); +        stackPtr--; +        NodeRef cur = NodeRef(stackPtr->ptr); +        size_t cur_mask = stackPtr->mask & (~terminated); +        if (unlikely(cur_mask == 0)) continue; + +        while (true) +        { +          /*! stop if we found a leaf node */ +          if (unlikely(cur.isLeaf())) break; +          const AABBNode* __restrict__ const node = cur.getAABBNode(); + +          const vint<N> vmask = traverseIncoherentStream(cur_mask, packet, node, nf, shiftTable); + +          size_t mask = movemask(vmask != vint<N>(zero)); +          if (unlikely(mask == 0)) goto pop; + +          __aligned(64) unsigned int child_mask[N]; +          vint<N>::storeu(child_mask, vmask); // this explicit store here causes much better code generation +           +          /*! one child is hit, continue with that child */ +          size_t r = bscf(mask); +          assert(r < N); +          cur = node->child(r);          +          BVHN<N>::prefetch(cur,types); +          cur_mask = child_mask[r]; + +          /* simple in order sequence */ +          assert(cur != BVH::emptyNode); +          if (likely(mask == 0)) continue; +          stackPtr->ptr  = cur; +          stackPtr->mask = cur_mask; +          stackPtr++; + +          for (; ;) +          { +            r = bscf(mask); +            assert(r < N); + +            cur = node->child(r);           +            BVHN<N>::prefetch(cur,types); +            cur_mask = child_mask[r];             +            assert(cur != BVH::emptyNode); +            if (likely(mask == 0)) break; +            stackPtr->ptr  = cur; +            stackPtr->mask = cur_mask; +            stackPtr++; +          } +        } +         +        /*! this is a leaf node */ +        assert(cur != BVH::emptyNode); +        STAT3(shadow.trav_leaves,1,1,1); +        size_t num; PrimitiveK<K>* prim = (PrimitiveK<K>*)cur.leaf(num); + +        size_t bits = cur_mask; +        size_t lazy_node = 0; + +        for (; bits != 0;) +        { +          const size_t rayID = bscf(bits); + +          RayK<K> &ray = *inputPackets[rayID / K]; +          const size_t k = rayID % K; +          if (PrimitiveIntersectorK<K>::occluded(This, ray, k, context, prim, num, lazy_node)) +          { +            ray.tfar[k] = neg_inf; +            terminated |= (size_t)1 << rayID; +          } + +          /* lazy node */ +          if (unlikely(lazy_node)) +          { +            stackPtr->ptr = lazy_node; +            stackPtr->mask = cur_mask; +            stackPtr++; +          } +        } + +        if (unlikely(terminated == (size_t)-1)) break; +      } +    } + +    //////////////////////////////////////////////////////////////////////////////// +    /// ArrayIntersectorKStream Definitions +    //////////////////////////////////////////////////////////////////////////////// + +    template<bool filter> +    struct Triangle4IntersectorStreamMoeller { +      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMIntersectorKMoeller<4 COMMA K COMMA true>>; +    }; + +    template<bool filter> +    struct Triangle4vIntersectorStreamPluecker { +      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMvIntersectorKPluecker<4 COMMA K COMMA true>>; +    }; + +    template<bool filter> +    struct Triangle4iIntersectorStreamMoeller { +      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKMoeller<4 COMMA K COMMA true>>; +    }; + +    template<bool filter> +    struct Triangle4iIntersectorStreamPluecker { +      template<int K> using Type = ArrayIntersectorKStream<K,TriangleMiIntersectorKPluecker<4 COMMA K COMMA true>>; +    }; + +    template<bool filter> +    struct Quad4vIntersectorStreamMoeller { +      template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKMoeller<4 COMMA K COMMA true>>; +    }; + +    template<bool filter> +    struct Quad4iIntersectorStreamMoeller { +      template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKMoeller<4 COMMA K COMMA true>>; +    }; + +    template<bool filter> +    struct Quad4vIntersectorStreamPluecker { +      template<int K> using Type = ArrayIntersectorKStream<K,QuadMvIntersectorKPluecker<4 COMMA K COMMA true>>; +    }; + +    template<bool filter> +    struct Quad4iIntersectorStreamPluecker { +      template<int K> using Type = ArrayIntersectorKStream<K,QuadMiIntersectorKPluecker<4 COMMA K COMMA true>>; +    }; + +    struct ObjectIntersectorStream { +      template<int K> using Type = ArrayIntersectorKStream<K,ObjectIntersectorK<K COMMA false>>; +    }; + +    struct InstanceIntersectorStream { +      template<int K> using Type = ArrayIntersectorKStream<K,InstanceIntersectorK<K>>; +    }; + +    // ===================================================================================================== +    // ===================================================================================================== +    // ===================================================================================================== + +    template<int N> +    void BVHNIntersectorStreamPacketFallback<N>::intersect(Accel::Intersectors* __restrict__ This, +                                                               RayHitN** inputRays, +                                                               size_t numTotalRays, +                                                               IntersectContext* context) +    { +      if (unlikely(context->isCoherent())) +        intersectK(This, (RayHitK<VSIZEL>**)inputRays, numTotalRays, context); +      else +        intersectK(This, (RayHitK<VSIZEX>**)inputRays, numTotalRays, context); +    } + +    template<int N> +    void BVHNIntersectorStreamPacketFallback<N>::occluded(Accel::Intersectors* __restrict__ This, +                                                              RayN** inputRays, +                                                              size_t numTotalRays, +                                                              IntersectContext* context) +    { +      if (unlikely(context->isCoherent())) +        occludedK(This, (RayK<VSIZEL>**)inputRays, numTotalRays, context); +      else +        occludedK(This, (RayK<VSIZEX>**)inputRays, numTotalRays, context); +    } + +    template<int N> +    template<int K> +    __noinline void BVHNIntersectorStreamPacketFallback<N>::intersectK(Accel::Intersectors* __restrict__ This, +                                                                              RayHitK<K>** inputRays, +                                                                              size_t numTotalRays, +                                                                              IntersectContext* context) +    { +      /* fallback to packets */ +      for (size_t i = 0; i < numTotalRays; i += K) +      { +        const vint<K> vi = vint<K>(int(i)) + vint<K>(step); +        vbool<K> valid = vi < vint<K>(int(numTotalRays)); +        RayHitK<K>& ray = *(inputRays[i / K]); +        valid &= ray.tnear() <= ray.tfar; +        This->intersect(valid, ray, context); +      } +    } + +    template<int N> +    template<int K> +    __noinline void BVHNIntersectorStreamPacketFallback<N>::occludedK(Accel::Intersectors* __restrict__ This, +                                                                             RayK<K>** inputRays, +                                                                             size_t numTotalRays, +                                                                             IntersectContext* context) +    { +      /* fallback to packets */ +      for (size_t i = 0; i < numTotalRays; i += K) +      { +        const vint<K> vi = vint<K>(int(i)) + vint<K>(step); +        vbool<K> valid = vi < vint<K>(int(numTotalRays)); +        RayK<K>& ray = *(inputRays[i / K]); +        valid &= ray.tnear() <= ray.tfar; +        This->occluded(valid, ray, context); +      } +    } +  } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp new file mode 100644 index 0000000000..c3e5f137b8 --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_bvh4.cpp @@ -0,0 +1,36 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream.cpp" + +namespace embree +{ +  namespace isa +  { + +    //////////////////////////////////////////////////////////////////////////////// +    /// General BVHIntersectorStreamPacketFallback Intersector +    //////////////////////////////////////////////////////////////////////////////// + +    DEFINE_INTERSECTORN(BVH4IntersectorStreamPacketFallback,BVHNIntersectorStreamPacketFallback<4>); + +    //////////////////////////////////////////////////////////////////////////////// +    /// BVH4IntersectorStream Definitions +    //////////////////////////////////////////////////////////////////////////////// + +    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamMoeller,        BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4iIntersectorStreamMoeller<true>>)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4vIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Triangle4vIntersectorStreamPluecker<true>>)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4iIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Triangle4iIntersectorStreamPluecker<true>>)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoeller,         BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<true>>)); +    IF_ENABLED_TRIS(DEFINE_INTERSECTORN(BVH4Triangle4IntersectorStreamMoellerNoFilter, BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Triangle4IntersectorStreamMoeller<false>>)); + +    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoeller,        BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<true>>)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamMoellerNoFilter,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4vIntersectorStreamMoeller<false>>)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamMoeller,        BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA Quad4iIntersectorStreamMoeller<true>>)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4vIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Quad4vIntersectorStreamPluecker<true>>)); +    IF_ENABLED_QUADS(DEFINE_INTERSECTORN(BVH4Quad4iIntersectorStreamPluecker,       BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA true  COMMA Quad4iIntersectorStreamPluecker<true>>)); + +    IF_ENABLED_USER(DEFINE_INTERSECTORN(BVH4VirtualIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA ObjectIntersectorStream>)); +    IF_ENABLED_INSTANCE(DEFINE_INTERSECTORN(BVH4InstanceIntersectorStream,BVHNIntersectorStream<4 COMMA BVH_AN1 COMMA false COMMA InstanceIntersectorStream>)); +  } +} diff --git a/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp new file mode 100644 index 0000000000..b858eb163f --- /dev/null +++ b/thirdparty/embree/kernels/bvh/bvh_intersector_stream_filters.cpp @@ -0,0 +1,657 @@ +// Copyright 2009-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "bvh_intersector_stream_filters.h" +#include "bvh_intersector_stream.h" + +namespace embree +{ +  namespace isa +  { +    template<int K, bool intersect> +    __noinline void RayStreamFilter::filterAOS(Scene* scene, void* _rayN, size_t N, size_t stride, IntersectContext* context) +    { +      RayStreamAOS rayN(_rayN); + +      /* use fast path for coherent ray mode */ +      if (unlikely(context->isCoherent())) +      { +        __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K]; +        __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +        for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) +        { +          const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + +          /* convert from AOS to SOA */ +          for (size_t j = 0; j < size; j += K) +          { +            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); +            const vbool<K> valid = vij < vint<K>(int(N)); +            const vint<K> offset = vij * int(stride); +            const size_t packetIndex = j / K; + +            RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); +            ray.tnear() = select(valid, ray.tnear(), zero); +            ray.tfar  = select(valid, ray.tfar,  neg_inf); + +            rays[packetIndex] = ray; +            rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN +          } + +          /* trace stream */ +          scene->intersectors.intersectN(rayPtrs, size, context); + +          /* convert from SOA to AOS */ +          for (size_t j = 0; j < size; j += K) +          { +            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); +            const vbool<K> valid = vij < vint<K>(int(N)); +            const vint<K> offset = vij * int(stride); +            const size_t packetIndex = j / K; +            rayN.setHitByOffset(valid, offset, rays[packetIndex]); +          } +        } +      } +      else if (unlikely(!intersect)) +      { +        /* octant sorting for occlusion rays */ +        __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; +        __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; +        __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +        unsigned int raysInOctant[8]; +        for (unsigned int i = 0; i < 8; i++) +          raysInOctant[i] = 0; +        size_t inputRayID = 0; + +        for (;;) +        { +          int curOctant = -1; + +          /* sort rays into octants */ +          for (; inputRayID < N;) +          { +            const Ray& ray = rayN.getRayByOffset(inputRayID * stride); + +            /* skip invalid rays */ +            if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) +            if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + +            const unsigned int octantID = movemask(vfloat4(Vec3fa(ray.dir)) < 0.0f) & 0x7; + +            assert(octantID < 8); +            octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; +            inputRayID++; +            if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) +            { +              curOctant = octantID; +              break; +            } +          } + +          /* need to flush rays in octant? */ +          if (unlikely(curOctant == -1)) +          { +            for (unsigned int i = 0; i < 8; i++) +              if (raysInOctant[i]) { curOctant = i; break; } +          } + +          /* all rays traced? */ +          if (unlikely(curOctant == -1)) +            break; +         +          unsigned int* const rayIDs = &octants[curOctant][0]; +          const unsigned int numOctantRays = raysInOctant[curOctant]; +          assert(numOctantRays); + +          for (unsigned int j = 0; j < numOctantRays; j += K) +          { +            const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +            const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +            const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride); +            RayK<K>& ray = rays[j/K]; +            rayPtrs[j/K] = &ray; +            ray = rayN.getRayByOffset<K>(valid, offset); +            ray.tnear() = select(valid, ray.tnear(), zero); +            ray.tfar  = select(valid, ray.tfar,  neg_inf); +          } + +          scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + +          for (unsigned int j = 0; j < numOctantRays; j += K) +          { +            const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +            const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +            const vint<K> offset = *(vint<K>*)&rayIDs[j] * int(stride); +            rayN.setHitByOffset<K>(valid, offset, rays[j/K]); +          } + +          raysInOctant[curOctant] = 0; +        } +      } +      else +      { +        /* fallback to packets */ +        for (size_t i = 0; i < N; i += K) +        { +          const vint<K> vi = vint<K>(int(i)) + vint<K>(step); +          vbool<K> valid = vi < vint<K>(int(N)); +          const vint<K> offset = vi * int(stride); + +          RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); +          valid &= ray.tnear() <= ray.tfar; + +          scene->intersectors.intersect(valid, ray, context); + +          rayN.setHitByOffset<K>(valid, offset, ray); +        } +      } +    } + +    template<int K, bool intersect> +    __noinline void RayStreamFilter::filterAOP(Scene* scene, void** _rayN, size_t N, IntersectContext* context) +    { +      RayStreamAOP rayN(_rayN); + +      /* use fast path for coherent ray mode */ +      if (unlikely(context->isCoherent())) +      { +        __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K]; +        __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +        for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) +        { +          const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + +          /* convert from AOP to SOA */ +          for (size_t j = 0; j < size; j += K) +          { +            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); +            const vbool<K> valid = vij < vint<K>(int(N)); +            const size_t packetIndex = j / K; + +            RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vij); +            ray.tnear() = select(valid, ray.tnear(), zero); +            ray.tfar  = select(valid, ray.tfar,  neg_inf); + +            rays[packetIndex] = ray; +            rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN +          } + +          /* trace stream */ +          scene->intersectors.intersectN(rayPtrs, size, context); + +          /* convert from SOA to AOP */ +          for (size_t j = 0; j < size; j += K) +          { +            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); +            const vbool<K> valid = vij < vint<K>(int(N)); +            const size_t packetIndex = j / K; + +            rayN.setHitByIndex<K>(valid, vij, rays[packetIndex]); +          } +        } +      } +      else if (unlikely(!intersect)) +      { +        /* octant sorting for occlusion rays */ +        __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; +        __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; +        __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +        unsigned int raysInOctant[8]; +        for (unsigned int i = 0; i < 8; i++) +          raysInOctant[i] = 0; +        size_t inputRayID = 0; + +        for (;;) +        { +          int curOctant = -1; + +          /* sort rays into octants */ +          for (; inputRayID < N;) +          { +            const Ray& ray = rayN.getRayByIndex(inputRayID); + +            /* skip invalid rays */ +            if (unlikely(ray.tnear() > ray.tfar || ray.tfar < 0.0f)) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) +            if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + +            const unsigned int octantID = movemask(lt_mask(ray.dir,Vec3fa(0.0f))); + +            assert(octantID < 8); +            octants[octantID][raysInOctant[octantID]++] = (unsigned int)inputRayID; +            inputRayID++; +            if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) +            { +              curOctant = octantID; +              break; +            } +          } + +          /* need to flush rays in octant? */ +          if (unlikely(curOctant == -1)) +          { +            for (unsigned int i = 0; i < 8; i++) +              if (raysInOctant[i]) { curOctant = i; break; } +          } + +          /* all rays traced? */ +          if (unlikely(curOctant == -1)) +            break; + +          unsigned int* const rayIDs = &octants[curOctant][0]; +          const unsigned int numOctantRays = raysInOctant[curOctant]; +          assert(numOctantRays); + +          for (unsigned int j = 0; j < numOctantRays; j += K) +          { +            const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +            const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +            const vint<K> index = *(vint<K>*)&rayIDs[j]; +            RayK<K>& ray = rays[j/K]; +            rayPtrs[j/K] = &ray; +            ray = rayN.getRayByIndex<K>(valid, index); +            ray.tnear() = select(valid, ray.tnear(), zero); +            ray.tfar  = select(valid, ray.tfar,  neg_inf); +          } + +          scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + +          for (unsigned int j = 0; j < numOctantRays; j += K) +          { +            const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +            const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +            const vint<K> index = *(vint<K>*)&rayIDs[j]; +            rayN.setHitByIndex<K>(valid, index, rays[j/K]); +          } + +          raysInOctant[curOctant] = 0; +        } +      } +      else +      { +        /* fallback to packets */ +        for (size_t i = 0; i < N; i += K) +        { +          const vint<K> vi = vint<K>(int(i)) + vint<K>(step); +          vbool<K> valid = vi < vint<K>(int(N)); + +          RayTypeK<K, intersect> ray = rayN.getRayByIndex<K>(valid, vi); +          valid &= ray.tnear() <= ray.tfar; + +          scene->intersectors.intersect(valid, ray, context); + +          rayN.setHitByIndex<K>(valid, vi, ray); +        } +      } +    } + +    template<int K, bool intersect> +    __noinline void RayStreamFilter::filterSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) +    { +      const size_t rayDataAlignment = (size_t)rayData % (K*sizeof(float)); +      const size_t offsetAlignment  = (size_t)stride  % (K*sizeof(float)); + +      /* fast path for packets with the correct width and data alignment */ +      if (likely(N == K && +                 !rayDataAlignment && +                 !offsetAlignment)) +      { +        if (unlikely(context->isCoherent())) +        { +          __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +          size_t packetIndex = 0; +          for (size_t i = 0; i < numPackets; i++) +          { +            const size_t offset = i * stride; +            RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset); +            rayPtrs[packetIndex++] = &ray; + +            /* trace as stream */ +            if (unlikely(packetIndex == MAX_INTERNAL_STREAM_SIZE / K)) +            { +              const size_t size = packetIndex*K; +              scene->intersectors.intersectN(rayPtrs, size, context); +              packetIndex = 0; +            } +          } + +          /* flush remaining packets */ +          if (unlikely(packetIndex > 0)) +          { +            const size_t size = packetIndex*K; +            scene->intersectors.intersectN(rayPtrs, size, context); +          } +        } +        else if (unlikely(!intersect)) +        { +          /* octant sorting for occlusion rays */ +          RayStreamSOA rayN(rayData, K); + +          __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; +          __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; +          __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +          unsigned int raysInOctant[8]; +          for (unsigned int i = 0; i < 8; i++) +            raysInOctant[i] = 0; +          size_t inputRayID = 0; + +          for (;;) +          { +            int curOctant = -1; + +            /* sort rays into octants */ +            for (; inputRayID < N*numPackets;) +            { +              const size_t offset = (inputRayID / K) * stride + (inputRayID % K) * sizeof(float); + +              /* skip invalid rays */ +              if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays +  #if defined(EMBREE_IGNORE_INVALID_RAYS) +              __aligned(64) Ray ray = rayN.getRayByOffset(offset); +              if (unlikely(!ray.valid())) { inputRayID++; continue; } +  #endif + +              const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset); + +              assert(octantID < 8); +              octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; +              inputRayID++; +              if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) +              { +                curOctant = octantID; +                break; +              } +            } + +            /* need to flush rays in octant? */ +            if (unlikely(curOctant == -1)) +            { +              for (unsigned int i = 0; i < 8; i++) +                if (raysInOctant[i]) { curOctant = i; break; } +            } + +            /* all rays traced? */ +            if (unlikely(curOctant == -1)) +              break; + +            unsigned int* const rayOffsets = &octants[curOctant][0]; +            const unsigned int numOctantRays = raysInOctant[curOctant]; +            assert(numOctantRays); + +            for (unsigned int j = 0; j < numOctantRays; j += K) +            { +              const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +              const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +              const vint<K> offset = *(vint<K>*)&rayOffsets[j]; +              RayK<K>& ray = rays[j/K]; +              rayPtrs[j/K] = &ray; +              ray = rayN.getRayByOffset<K>(valid, offset); +              ray.tnear() = select(valid, ray.tnear(), zero); +              ray.tfar  = select(valid, ray.tfar,  neg_inf); +            } + +            scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + +            for (unsigned int j = 0; j < numOctantRays; j += K) +            { +              const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +              const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +              const vint<K> offset = *(vint<K>*)&rayOffsets[j]; +              rayN.setHitByOffset(valid, offset, rays[j/K]); +            } +            raysInOctant[curOctant] = 0; +          } +        } +        else +        { +          /* fallback to packets */ +          for (size_t i = 0; i < numPackets; i++) +          { +            const size_t offset = i * stride; +            RayTypeK<K, intersect>& ray = *(RayTypeK<K, intersect>*)(rayData + offset); +            const vbool<K> valid = ray.tnear() <= ray.tfar; + +            scene->intersectors.intersect(valid, ray, context); +          } +        } +      } +      else +      { +        /* fallback to packets for arbitrary packet size and alignment */ +        for (size_t i = 0; i < numPackets; i++) +        { +          const size_t offsetN = i * stride; +          RayStreamSOA rayN(rayData + offsetN, N); + +          for (size_t j = 0; j < N; j += K) +          { +            const size_t offset = j * sizeof(float); +            vbool<K> valid = (vint<K>(int(j)) + vint<K>(step)) < vint<K>(int(N)); +            RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); +            valid &= ray.tnear() <= ray.tfar; + +            scene->intersectors.intersect(valid, ray, context); + +            rayN.setHitByOffset(valid, offset, ray); +          } +        } +      } +    } + +    template<int K, bool intersect> +    __noinline void RayStreamFilter::filterSOP(Scene* scene, const void* _rayN, size_t N, IntersectContext* context) +    {  +      RayStreamSOP& rayN = *(RayStreamSOP*)_rayN; + +      /* use fast path for coherent ray mode */ +      if (unlikely(context->isCoherent())) +      { +        __aligned(64) RayTypeK<K, intersect> rays[MAX_INTERNAL_STREAM_SIZE / K]; +        __aligned(64) RayTypeK<K, intersect>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +        for (size_t i = 0; i < N; i += MAX_INTERNAL_STREAM_SIZE) +        { +          const size_t size = min(N - i, MAX_INTERNAL_STREAM_SIZE); + +          /* convert from SOP to SOA */ +          for (size_t j = 0; j < size; j += K) +          { +            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); +            const vbool<K> valid = vij < vint<K>(int(N)); +            const size_t offset = (i+j) * sizeof(float); +            const size_t packetIndex = j / K; + +            RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); +            ray.tnear() = select(valid, ray.tnear(), zero); +            ray.tfar  = select(valid, ray.tfar,  neg_inf); + +            rays[packetIndex] = ray; +            rayPtrs[packetIndex] = &rays[packetIndex]; // rayPtrs might get reordered for occludedN +          } + +          /* trace stream */ +          scene->intersectors.intersectN(rayPtrs, size, context); + +          /* convert from SOA to SOP */ +          for (size_t j = 0; j < size; j += K) +          { +            const vint<K> vij = vint<K>(int(i+j)) + vint<K>(step); +            const vbool<K> valid = vij < vint<K>(int(N)); +            const size_t offset = (i+j) * sizeof(float); +            const size_t packetIndex = j / K; + +            rayN.setHitByOffset(valid, offset, rays[packetIndex]); +          } +        } +      } +      else if (unlikely(!intersect)) +      { +        /* octant sorting for occlusion rays */ +        __aligned(64) unsigned int octants[8][MAX_INTERNAL_STREAM_SIZE]; +        __aligned(64) RayK<K> rays[MAX_INTERNAL_STREAM_SIZE / K]; +        __aligned(64) RayK<K>* rayPtrs[MAX_INTERNAL_STREAM_SIZE / K]; + +        unsigned int raysInOctant[8]; +        for (unsigned int i = 0; i < 8; i++) +          raysInOctant[i] = 0; +        size_t inputRayID = 0; + +        for (;;) +        { +          int curOctant = -1; + +          /* sort rays into octants */ +          for (; inputRayID < N;) +          { +            const size_t offset = inputRayID * sizeof(float); +            /* skip invalid rays */ +            if (unlikely(!rayN.isValidByOffset(offset))) { inputRayID++; continue; } // ignore invalid or already occluded rays +#if defined(EMBREE_IGNORE_INVALID_RAYS) +            __aligned(64) Ray ray = rayN.getRayByOffset(offset); +            if (unlikely(!ray.valid())) { inputRayID++; continue; } +#endif + +            const unsigned int octantID = (unsigned int)rayN.getOctantByOffset(offset); + +            assert(octantID < 8); +            octants[octantID][raysInOctant[octantID]++] = (unsigned int)offset; +            inputRayID++; +            if (unlikely(raysInOctant[octantID] == MAX_INTERNAL_STREAM_SIZE)) +            { +              curOctant = octantID; +              break; +            } +          } + +          /* need to flush rays in octant? */ +          if (unlikely(curOctant == -1)) +          { +            for (unsigned int i = 0; i < 8; i++) +              if (raysInOctant[i]) { curOctant = i; break; } +          } + +          /* all rays traced? */ +          if (unlikely(curOctant == -1)) +            break; + +          unsigned int* const rayOffsets = &octants[curOctant][0]; +          const unsigned int numOctantRays = raysInOctant[curOctant]; +          assert(numOctantRays); + +          for (unsigned int j = 0; j < numOctantRays; j += K) +          { +            const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +            const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +            const vint<K> offset = *(vint<K>*)&rayOffsets[j]; +            RayK<K>& ray = rays[j/K]; +            rayPtrs[j/K] = &ray; +            ray = rayN.getRayByOffset<K>(valid, offset); +            ray.tnear() = select(valid, ray.tnear(), zero); +            ray.tfar  = select(valid, ray.tfar,  neg_inf); +          } + +          scene->intersectors.occludedN(rayPtrs, numOctantRays, context); + +          for (unsigned int j = 0; j < numOctantRays; j += K) +          { +            const vint<K> vi = vint<K>(int(j)) + vint<K>(step); +            const vbool<K> valid = vi < vint<K>(int(numOctantRays)); +            const vint<K> offset = *(vint<K>*)&rayOffsets[j]; +            rayN.setHitByOffset(valid, offset, rays[j/K]); +          } + +          raysInOctant[curOctant] = 0; +        } +      } +      else +      { +        /* fallback to packets */ +        for (size_t i = 0; i < N; i += K) +        { +          const vint<K> vi = vint<K>(int(i)) + vint<K>(step); +          vbool<K> valid = vi < vint<K>(int(N)); +          const size_t offset = i * sizeof(float); + +          RayTypeK<K, intersect> ray = rayN.getRayByOffset<K>(valid, offset); +          valid &= ray.tnear() <= ray.tfar; + +          scene->intersectors.intersect(valid, ray, context); + +          rayN.setHitByOffset(valid, offset, ray); +        } +      } +    } + + +    void RayStreamFilter::intersectAOS(Scene* scene, RTCRayHit* _rayN, size_t N, size_t stride, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterAOS<VSIZEL, true>(scene, _rayN, N, stride, context); +      else +        filterAOS<VSIZEX, true>(scene, _rayN, N, stride, context); +    } + +    void RayStreamFilter::occludedAOS(Scene* scene, RTCRay* _rayN, size_t N, size_t stride, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterAOS<VSIZEL, false>(scene, _rayN, N, stride, context); +      else +        filterAOS<VSIZEX, false>(scene, _rayN, N, stride, context); +    } + +    void RayStreamFilter::intersectAOP(Scene* scene, RTCRayHit** _rayN, size_t N, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterAOP<VSIZEL, true>(scene, (void**)_rayN, N, context); +      else +        filterAOP<VSIZEX, true>(scene, (void**)_rayN, N, context); +    } + +    void RayStreamFilter::occludedAOP(Scene* scene, RTCRay** _rayN, size_t N, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterAOP<VSIZEL, false>(scene, (void**)_rayN, N, context); +      else +        filterAOP<VSIZEX, false>(scene, (void**)_rayN, N, context); +    } + +    void RayStreamFilter::intersectSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterSOA<VSIZEL, true>(scene, rayData, N, numPackets, stride, context); +      else +        filterSOA<VSIZEX, true>(scene, rayData, N, numPackets, stride, context); +    } + +    void RayStreamFilter::occludedSOA(Scene* scene, char* rayData, size_t N, size_t numPackets, size_t stride, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterSOA<VSIZEL, false>(scene, rayData, N, numPackets, stride, context); +      else +        filterSOA<VSIZEX, false>(scene, rayData, N, numPackets, stride, context); +    } + +    void RayStreamFilter::intersectSOP(Scene* scene, const RTCRayHitNp* _rayN, size_t N, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterSOP<VSIZEL, true>(scene, _rayN, N, context); +      else +        filterSOP<VSIZEX, true>(scene, _rayN, N, context); +    } + +    void RayStreamFilter::occludedSOP(Scene* scene, const RTCRayNp* _rayN, size_t N, IntersectContext* context) { +      if (unlikely(context->isCoherent())) +        filterSOP<VSIZEL, false>(scene, _rayN, N, context); +      else +        filterSOP<VSIZEX, false>(scene, _rayN, N, context); +    } + + +    RayStreamFilterFuncs rayStreamFilterFuncs() { +      return RayStreamFilterFuncs(RayStreamFilter::intersectAOS, RayStreamFilter::intersectAOP, RayStreamFilter::intersectSOA, RayStreamFilter::intersectSOP, +                                  RayStreamFilter::occludedAOS,  RayStreamFilter::occludedAOP,  RayStreamFilter::occludedSOA,  RayStreamFilter::occludedSOP); +    } +  }; +}; diff --git a/thirdparty/embree/kernels/config.h b/thirdparty/embree/kernels/config.h index 80a8ab2a56..2bf7e93587 100644 --- a/thirdparty/embree/kernels/config.h +++ b/thirdparty/embree/kernels/config.h @@ -16,7 +16,7 @@  /* #undef EMBREE_GEOMETRY_INSTANCE */  /* #undef EMBREE_GEOMETRY_GRID */  /* #undef EMBREE_GEOMETRY_POINT */ -/* #undef EMBREE_RAY_PACKETS */ +#define EMBREE_RAY_PACKETS  /* #undef EMBREE_COMPACT_POLYS */  #define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0 diff --git a/thirdparty/embree/kernels/hash.h b/thirdparty/embree/kernels/hash.h index 10f315cee7..470e15f03e 100644 --- a/thirdparty/embree/kernels/hash.h +++ b/thirdparty/embree/kernels/hash.h @@ -2,4 +2,4 @@  // Copyright 2009-2020 Intel Corporation  // SPDX-License-Identifier: Apache-2.0 -#define RTC_HASH "7c53133eb21424f7f0ae1e25bf357e358feaf6ab" +#define RTC_HASH "12b99393438a4cc9e478e33459eed78bec6233fd"  |